[llvm-branch-commits] [llvm] [AMDGPU] Adopt new lowering sequence for `fdiv16` (PR #109295)

Shilei Tian via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Mon Sep 30 11:26:38 PDT 2024


https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/109295

>From 2eb020eea83b8b806b1e0e05d65a7a79f5bf0cea Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Thu, 19 Sep 2024 10:57:27 -0400
Subject: [PATCH] [AMDGPU] Adopt new lowering sequence for `fdiv16`

The current lowering of fdiv16 can generate incorrectly rounded results in some
cases.

Fixes SWDEV-47760.
---
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |   36 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |   53 +-
 .../CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll     | 3095 +++++++++++++----
 llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll   |   82 +-
 .../AMDGPU/GlobalISel/legalize-fdiv.mir       |  478 ++-
 llvm/test/CodeGen/AMDGPU/fdiv.f16.ll          |   54 +-
 .../AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll |   71 +-
 llvm/test/CodeGen/AMDGPU/frem.ll              |  670 +++-
 8 files changed, 3507 insertions(+), 1032 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 271c8d45fd4a21..53f096cf33b710 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4903,16 +4903,40 @@ bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
   LLT S16 = LLT::scalar(16);
   LLT S32 = LLT::scalar(32);
 
+  // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
+  // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
+  // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
+  // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
+  // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
+  // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = err * rcp + q
+  // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
+  // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
+  // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
+  // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
+  // q16.u = opx(V_CVT_F16_F32, q32.u);
+  // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
+
   auto LHSExt = B.buildFPExt(S32, LHS, Flags);
   auto RHSExt = B.buildFPExt(S32, RHS, Flags);
-
-  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
+  auto NegRHSExt = B.buildFNeg(S32, RHSExt);
+  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                  .addUse(RHSExt.getReg(0))
                  .setMIFlags(Flags);
-
-  auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
-  auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
-
+  auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags);
+  MachineInstrBuilder Err;
+  if (ST.hasMadMacF32Insts()) {
+    Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
+    Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags);
+    Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
+  } else {
+    Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
+    Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags);
+    Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
+  }
+  auto Tmp = B.buildFMul(S32, Err, Rcp, Flags);
+  Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
+  Quot = B.buildFAdd(S32, Tmp, Quot, Flags);
+  auto RDst = B.buildFPTrunc(S16, Quot, Flags);
   B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
       .addUse(RDst.getReg(0))
       .addUse(RHS)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index d559d0446b9d8f..ec082d64bf0d22 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -10619,19 +10619,48 @@ SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
     return FastLowered;
 
   SDLoc SL(Op);
-  SDValue Src0 = Op.getOperand(0);
-  SDValue Src1 = Op.getOperand(1);
-
-  SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
-  SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
-
-  SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
-  SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
-
-  SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
-  SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
 
-  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
+  // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
+  // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
+  // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
+  // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
+  // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
+  // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = err * rcp + q
+  // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
+  // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
+  // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
+  // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
+  // q16.u = opx(V_CVT_F16_F32, q32.u);
+  // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
+
+  // We will use ISD::FMA on targets that don't support ISD::FMAD.
+  unsigned FMADOpCode =
+      isOperationLegal(ISD::FMAD, MVT::f32) ? ISD::FMAD : ISD::FMA;
+
+  SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
+  SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
+  SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
+  SDValue Rcp =
+      DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
+  SDValue Quot =
+      DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
+  SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
+                            Op->getFlags());
+  Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
+  Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
+                    Op->getFlags());
+  SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
+  SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
+  TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
+                        DAG.getConstant(0xff800000, SL, MVT::i32));
+  Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
+  Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
+  SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
+                             DAG.getConstant(0, SL, MVT::i32));
+  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
+                     Op->getFlags());
 }
 
 // Faster 2.5 ULP division that does not support denormals.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
index 1a98285230b2cd..5ba036c386a402 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
@@ -57,24 +57,59 @@ define half @v_fdiv_f16(half %a, half %b) {
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: v_fdiv_f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v2, v1
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, v0
-; GFX8-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX8-NEXT:    v_mul_f32_e32 v2, v3, v2
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX8-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
+; GFX8-IEEE-LABEL: v_fdiv_f16:
+; GFX8-IEEE:       ; %bb.0:
+; GFX8-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX8-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v0
+; GFX8-IEEE-NEXT:    v_rcp_f32_e32 v4, v2
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v5, v3, v4
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v6, -v2, v5
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v6, v6, v3
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v5, v6, v5
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v2, -v2, v5
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v2, v2, v4
+; GFX8-IEEE-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v2, v2, v5
+; GFX8-IEEE-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX8-IEEE-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
+; GFX8-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-FLUSH-LABEL: v_fdiv_f16:
+; GFX8-FLUSH:       ; %bb.0:
+; GFX8-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX8-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v0
+; GFX8-FLUSH-NEXT:    v_rcp_f32_e32 v4, v2
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v5, v3, v4
+; GFX8-FLUSH-NEXT:    v_mad_f32 v6, -v2, v5, v3
+; GFX8-FLUSH-NEXT:    v_mac_f32_e32 v5, v6, v4
+; GFX8-FLUSH-NEXT:    v_mad_f32 v2, -v2, v5, v3
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v2, v2, v4
+; GFX8-FLUSH-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX8-FLUSH-NEXT:    v_add_f32_e32 v2, v2, v5
+; GFX8-FLUSH-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX8-FLUSH-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
+; GFX8-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-IEEE-LABEL: v_fdiv_f16:
 ; GFX9-IEEE:       ; %bb.0:
 ; GFX9-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v1
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v0
-; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX9-IEEE-NEXT:    v_mul_f32_e32 v2, v3, v2
+; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v4, v2
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v5, v3, v4
+; GFX9-IEEE-NEXT:    v_mul_f32_e64 v6, -v2, v5
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v6, v6, v3
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v5, v6, v5
+; GFX9-IEEE-NEXT:    v_mul_f32_e64 v2, -v2, v5
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v2, v2, v4
+; GFX9-IEEE-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v2, v2, v5
 ; GFX9-IEEE-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX9-IEEE-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
 ; GFX9-IEEE-NEXT:    s_setpc_b64 s[30:31]
@@ -83,27 +118,71 @@ define half @v_fdiv_f16(half %a, half %b) {
 ; GFX9-FLUSH:       ; %bb.0:
 ; GFX9-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v0
 ; GFX9-FLUSH-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX9-FLUSH-NEXT:    v_mad_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v3, v3, v2
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mac_f32_e32 v3, v4, v2
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v2, v4, v2
+; GFX9-FLUSH-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX9-FLUSH-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX9-FLUSH-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
 ; GFX9-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: v_fdiv_f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v2, v1
-; GFX10-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX10-NEXT:    v_fma_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-IEEE-LABEL: v_fdiv_f16:
+; GFX10-IEEE:       ; %bb.0:
+; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX10-IEEE-NEXT:    v_cvt_f32_f16_e32 v4, v0
+; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v5, v4, v3
+; GFX10-IEEE-NEXT:    v_mul_f32_e64 v6, -v2, v5
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v6, v6, v3
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v5, v6, v5
+; GFX10-IEEE-NEXT:    v_mul_f32_e64 v2, -v2, v5
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v2, v2, v3
+; GFX10-IEEE-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v2, v2, v5
+; GFX10-IEEE-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX10-IEEE-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
+; GFX10-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-FLUSH-LABEL: v_fdiv_f16:
+; GFX10-FLUSH:       ; %bb.0:
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX10-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, v0
+; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v5, v4, v3
+; GFX10-FLUSH-NEXT:    v_mad_f32 v6, -v2, v5, v4
+; GFX10-FLUSH-NEXT:    v_mac_f32_e32 v5, v6, v3
+; GFX10-FLUSH-NEXT:    v_mad_f32 v2, -v2, v5, v4
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v2, v2, v3
+; GFX10-FLUSH-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX10-FLUSH-NEXT:    v_add_f32_e32 v2, v2, v5
+; GFX10-FLUSH-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX10-FLUSH-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
+; GFX10-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v0
 ; GFX11-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_fma_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
+; GFX11-NEXT:    v_mul_f32_e32 v3, v3, v2
+; GFX11-NEXT:    v_fma_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_fmac_f32_e32 v3, v4, v2
+; GFX11-NEXT:    v_fma_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_mul_f32_e32 v2, v4, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX11-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX11-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv half %a, %b
@@ -188,24 +267,59 @@ define half @v_fdiv_f16_ulp25(half %a, half %b) {
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: v_fdiv_f16_ulp25:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v2, v1
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, v0
-; GFX8-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX8-NEXT:    v_mul_f32_e32 v2, v3, v2
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX8-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
+; GFX8-IEEE-LABEL: v_fdiv_f16_ulp25:
+; GFX8-IEEE:       ; %bb.0:
+; GFX8-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX8-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v0
+; GFX8-IEEE-NEXT:    v_rcp_f32_e32 v4, v2
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v5, v3, v4
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v6, -v2, v5
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v6, v6, v3
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v5, v6, v5
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v2, -v2, v5
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v2, v2, v4
+; GFX8-IEEE-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v2, v2, v5
+; GFX8-IEEE-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX8-IEEE-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
+; GFX8-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-FLUSH-LABEL: v_fdiv_f16_ulp25:
+; GFX8-FLUSH:       ; %bb.0:
+; GFX8-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX8-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v0
+; GFX8-FLUSH-NEXT:    v_rcp_f32_e32 v4, v2
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v5, v3, v4
+; GFX8-FLUSH-NEXT:    v_mad_f32 v6, -v2, v5, v3
+; GFX8-FLUSH-NEXT:    v_mac_f32_e32 v5, v6, v4
+; GFX8-FLUSH-NEXT:    v_mad_f32 v2, -v2, v5, v3
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v2, v2, v4
+; GFX8-FLUSH-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX8-FLUSH-NEXT:    v_add_f32_e32 v2, v2, v5
+; GFX8-FLUSH-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX8-FLUSH-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
+; GFX8-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-IEEE-LABEL: v_fdiv_f16_ulp25:
 ; GFX9-IEEE:       ; %bb.0:
 ; GFX9-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v1
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v0
-; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX9-IEEE-NEXT:    v_mul_f32_e32 v2, v3, v2
+; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v4, v2
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v5, v3, v4
+; GFX9-IEEE-NEXT:    v_mul_f32_e64 v6, -v2, v5
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v6, v6, v3
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v5, v6, v5
+; GFX9-IEEE-NEXT:    v_mul_f32_e64 v2, -v2, v5
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v2, v2, v4
+; GFX9-IEEE-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v2, v2, v5
 ; GFX9-IEEE-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX9-IEEE-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
 ; GFX9-IEEE-NEXT:    s_setpc_b64 s[30:31]
@@ -214,27 +328,71 @@ define half @v_fdiv_f16_ulp25(half %a, half %b) {
 ; GFX9-FLUSH:       ; %bb.0:
 ; GFX9-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v0
 ; GFX9-FLUSH-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX9-FLUSH-NEXT:    v_mad_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v3, v3, v2
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mac_f32_e32 v3, v4, v2
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v2, v4, v2
+; GFX9-FLUSH-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX9-FLUSH-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX9-FLUSH-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
 ; GFX9-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: v_fdiv_f16_ulp25:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v2, v1
-; GFX10-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX10-NEXT:    v_fma_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-IEEE-LABEL: v_fdiv_f16_ulp25:
+; GFX10-IEEE:       ; %bb.0:
+; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX10-IEEE-NEXT:    v_cvt_f32_f16_e32 v4, v0
+; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v5, v4, v3
+; GFX10-IEEE-NEXT:    v_mul_f32_e64 v6, -v2, v5
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v6, v6, v3
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v5, v6, v5
+; GFX10-IEEE-NEXT:    v_mul_f32_e64 v2, -v2, v5
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v2, v2, v3
+; GFX10-IEEE-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v2, v2, v5
+; GFX10-IEEE-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX10-IEEE-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
+; GFX10-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-FLUSH-LABEL: v_fdiv_f16_ulp25:
+; GFX10-FLUSH:       ; %bb.0:
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX10-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, v0
+; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v5, v4, v3
+; GFX10-FLUSH-NEXT:    v_mad_f32 v6, -v2, v5, v4
+; GFX10-FLUSH-NEXT:    v_mac_f32_e32 v5, v6, v3
+; GFX10-FLUSH-NEXT:    v_mad_f32 v2, -v2, v5, v4
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v2, v2, v3
+; GFX10-FLUSH-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX10-FLUSH-NEXT:    v_add_f32_e32 v2, v2, v5
+; GFX10-FLUSH-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX10-FLUSH-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
+; GFX10-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_f16_ulp25:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v0
 ; GFX11-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_fma_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
+; GFX11-NEXT:    v_mul_f32_e32 v3, v3, v2
+; GFX11-NEXT:    v_fma_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_fmac_f32_e32 v3, v4, v2
+; GFX11-NEXT:    v_fma_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_mul_f32_e32 v2, v4, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX11-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX11-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv half %a, %b
@@ -670,44 +828,113 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) {
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: v_fdiv_v2f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, v1
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v5, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v6, v0
-; GFX8-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v7, v2
-; GFX8-NEXT:    v_rcp_f32_e32 v5, v5
-; GFX8-NEXT:    v_mul_f32_e32 v3, v6, v3
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX8-NEXT:    v_mul_f32_e32 v5, v7, v5
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GFX8-NEXT:    v_div_fixup_f16 v0, v3, v1, v0
-; GFX8-NEXT:    v_div_fixup_f16 v1, v5, v4, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
+; GFX8-IEEE-LABEL: v_fdiv_v2f16:
+; GFX8-IEEE:       ; %bb.0:
+; GFX8-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX8-IEEE-NEXT:    v_cvt_f32_f16_e32 v4, v0
+; GFX8-IEEE-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX8-IEEE-NEXT:    v_cvt_f32_f16_e32 v8, v6
+; GFX8-IEEE-NEXT:    v_rcp_f32_e32 v5, v2
+; GFX8-IEEE-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX8-IEEE-NEXT:    v_cvt_f32_f16_e32 v7, v3
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v9, v4, v5
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v10, -v2, v9
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v10, v10, v4
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v10, v10, v5
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v9, v10, v9
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v2, -v2, v9
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX8-IEEE-NEXT:    v_rcp_f32_e32 v4, v8
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v2, v2, v5
+; GFX8-IEEE-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v2, v2, v9
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v5, v7, v4
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v9, -v8, v5
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v9, v9, v7
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v9, v9, v4
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v5, v9, v5
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v8, -v8, v5
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v7, v8, v7
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v4, v7, v4
+; GFX8-IEEE-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v4, v4, v5
+; GFX8-IEEE-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX8-IEEE-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX8-IEEE-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
+; GFX8-IEEE-NEXT:    v_div_fixup_f16 v1, v4, v6, v3
+; GFX8-IEEE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-IEEE-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-FLUSH-LABEL: v_fdiv_v2f16:
+; GFX8-FLUSH:       ; %bb.0:
+; GFX8-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX8-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, v0
+; GFX8-FLUSH-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX8-FLUSH-NEXT:    v_cvt_f32_f16_e32 v8, v6
+; GFX8-FLUSH-NEXT:    v_rcp_f32_e32 v5, v2
+; GFX8-FLUSH-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX8-FLUSH-NEXT:    v_cvt_f32_f16_e32 v7, v3
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v9, v4, v5
+; GFX8-FLUSH-NEXT:    v_mad_f32 v10, -v2, v9, v4
+; GFX8-FLUSH-NEXT:    v_mac_f32_e32 v9, v10, v5
+; GFX8-FLUSH-NEXT:    v_mad_f32 v2, -v2, v9, v4
+; GFX8-FLUSH-NEXT:    v_rcp_f32_e32 v4, v8
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v2, v2, v5
+; GFX8-FLUSH-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX8-FLUSH-NEXT:    v_add_f32_e32 v2, v2, v9
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v5, v7, v4
+; GFX8-FLUSH-NEXT:    v_mad_f32 v9, -v8, v5, v7
+; GFX8-FLUSH-NEXT:    v_mac_f32_e32 v5, v9, v4
+; GFX8-FLUSH-NEXT:    v_mad_f32 v7, -v8, v5, v7
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v4, v7, v4
+; GFX8-FLUSH-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
+; GFX8-FLUSH-NEXT:    v_add_f32_e32 v4, v4, v5
+; GFX8-FLUSH-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX8-FLUSH-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX8-FLUSH-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
+; GFX8-FLUSH-NEXT:    v_div_fixup_f16 v1, v4, v6, v3
+; GFX8-FLUSH-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-FLUSH-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-IEEE-LABEL: v_fdiv_v2f16:
 ; GFX9-IEEE:       ; %bb.0:
 ; GFX9-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-IEEE-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v1
-; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v5, v4
-; GFX9-IEEE-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v6, v0
-; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v7, v2
-; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v5, v5
-; GFX9-IEEE-NEXT:    v_mul_f32_e32 v3, v6, v3
-; GFX9-IEEE-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX9-IEEE-NEXT:    v_mul_f32_e32 v5, v7, v5
-; GFX9-IEEE-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GFX9-IEEE-NEXT:    v_div_fixup_f16 v0, v3, v1, v0
-; GFX9-IEEE-NEXT:    v_div_fixup_f16 v1, v5, v4, v2
+; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v4, v0
+; GFX9-IEEE-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v8, v6
+; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v5, v2
+; GFX9-IEEE-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v7, v3
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v9, v4, v5
+; GFX9-IEEE-NEXT:    v_mul_f32_e64 v10, -v2, v9
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v10, v10, v4
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v10, v10, v5
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v9, v10, v9
+; GFX9-IEEE-NEXT:    v_mul_f32_e64 v2, -v2, v9
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v4, v8
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v2, v2, v5
+; GFX9-IEEE-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v2, v2, v9
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v5, v7, v4
+; GFX9-IEEE-NEXT:    v_mul_f32_e64 v9, -v8, v5
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v9, v9, v7
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v9, v9, v4
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v5, v9, v5
+; GFX9-IEEE-NEXT:    v_mul_f32_e64 v8, -v8, v5
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v7, v8, v7
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v4, v7, v4
+; GFX9-IEEE-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v4, v4, v5
+; GFX9-IEEE-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX9-IEEE-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX9-IEEE-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
+; GFX9-IEEE-NEXT:    v_div_fixup_f16 v1, v4, v6, v3
 ; GFX9-IEEE-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -715,33 +942,103 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) {
 ; GFX9-FLUSH:       ; %bb.0:
 ; GFX9-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v1
-; GFX9-FLUSH-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, v3
-; GFX9-FLUSH-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, v0
+; GFX9-FLUSH-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v7, v5
 ; GFX9-FLUSH-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX9-FLUSH-NEXT:    v_rcp_f32_e32 v4, v4
-; GFX9-FLUSH-NEXT:    v_mad_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v1, v2, v1, v0
-; GFX9-FLUSH-NEXT:    v_mad_mixlo_f16 v0, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v0, v0, v3, v5
-; GFX9-FLUSH-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX9-FLUSH-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v6, v3
+; GFX9-FLUSH-NEXT:    v_rcp_f32_e32 v7, v7
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v4, v4, v2
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v8, -v1, v4, v0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mac_f32_e32 v4, v8, v2
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v8, -v1, v4, v0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v2, v8, v2
+; GFX9-FLUSH-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX9-FLUSH-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v4, v6, v7
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v6, -v1, v4, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mac_f32_e32 v4, v6, v7
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v6, -v1, v4, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v6, v6, v7
+; GFX9-FLUSH-NEXT:    v_and_b32_e32 v6, 0xff800000, v6
+; GFX9-FLUSH-NEXT:    v_add_f32_e32 v4, v6, v4
+; GFX9-FLUSH-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX9-FLUSH-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
+; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v1, v4, v5, v3
+; GFX9-FLUSH-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: v_fdiv_v2f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v4, v2
-; GFX10-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX10-NEXT:    v_rcp_f32_e32 v4, v4
-; GFX10-NEXT:    v_fma_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT:    v_fma_mixlo_f16 v4, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX10-NEXT:    v_div_fixup_f16 v0, v3, v1, v0
-; GFX10-NEXT:    v_div_fixup_f16 v1, v4, v2, v5
-; GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-IEEE-LABEL: v_fdiv_v2f16:
+; GFX10-IEEE:       ; %bb.0:
+; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-IEEE-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX10-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX10-IEEE-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX10-IEEE-NEXT:    v_cvt_f32_f16_e32 v8, v0
+; GFX10-IEEE-NEXT:    v_cvt_f32_f16_e32 v4, v2
+; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v6, v3
+; GFX10-IEEE-NEXT:    v_cvt_f32_f16_e32 v9, v5
+; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v7, v4
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v10, v8, v6
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v11, v9, v7
+; GFX10-IEEE-NEXT:    v_mul_f32_e64 v12, -v3, v10
+; GFX10-IEEE-NEXT:    v_mul_f32_e64 v13, -v4, v11
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v12, v12, v8
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v13, v13, v9
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v12, v12, v6
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v13, v13, v7
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v10, v12, v10
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v11, v13, v11
+; GFX10-IEEE-NEXT:    v_mul_f32_e64 v3, -v3, v10
+; GFX10-IEEE-NEXT:    v_mul_f32_e64 v4, -v4, v11
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v3, v3, v8
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v4, v4, v9
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v3, v3, v6
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v4, v4, v7
+; GFX10-IEEE-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-IEEE-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v3, v3, v10
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v4, v4, v11
+; GFX10-IEEE-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX10-IEEE-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX10-IEEE-NEXT:    v_div_fixup_f16 v0, v3, v1, v0
+; GFX10-IEEE-NEXT:    v_div_fixup_f16 v1, v4, v2, v5
+; GFX10-IEEE-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-FLUSH-LABEL: v_fdiv_v2f16:
+; GFX10-FLUSH:       ; %bb.0:
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-FLUSH-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX10-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX10-FLUSH-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX10-FLUSH-NEXT:    v_cvt_f32_f16_e32 v8, v0
+; GFX10-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, v2
+; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v6, v3
+; GFX10-FLUSH-NEXT:    v_cvt_f32_f16_e32 v9, v5
+; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v7, v4
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v10, v8, v6
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v11, v9, v7
+; GFX10-FLUSH-NEXT:    v_mad_f32 v12, -v3, v10, v8
+; GFX10-FLUSH-NEXT:    v_mad_f32 v13, -v4, v11, v9
+; GFX10-FLUSH-NEXT:    v_mac_f32_e32 v10, v12, v6
+; GFX10-FLUSH-NEXT:    v_mac_f32_e32 v11, v13, v7
+; GFX10-FLUSH-NEXT:    v_mad_f32 v3, -v3, v10, v8
+; GFX10-FLUSH-NEXT:    v_mad_f32 v4, -v4, v11, v9
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v3, v3, v6
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v4, v4, v7
+; GFX10-FLUSH-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-FLUSH-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
+; GFX10-FLUSH-NEXT:    v_add_f32_e32 v3, v3, v10
+; GFX10-FLUSH-NEXT:    v_add_f32_e32 v4, v4, v11
+; GFX10-FLUSH-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX10-FLUSH-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX10-FLUSH-NEXT:    v_div_fixup_f16 v0, v3, v1, v0
+; GFX10-FLUSH-NEXT:    v_div_fixup_f16 v1, v4, v2, v5
+; GFX10-FLUSH-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_v2f16:
 ; GFX11:       ; %bb.0:
@@ -749,12 +1046,24 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) {
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v6, v0
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v4, v2
 ; GFX11-NEXT:    v_rcp_f32_e32 v3, v3
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v7, v5
 ; GFX11-NEXT:    v_rcp_f32_e32 v4, v4
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_fma_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT:    v_fma_mixlo_f16 v4, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT:    v_dual_mul_f32 v6, v6, v3 :: v_dual_mul_f32 v7, v7, v4
+; GFX11-NEXT:    v_fma_mix_f32 v8, -v1, v6, v0 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_fma_mix_f32 v9, -v1, v7, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_dual_fmac_f32 v6, v8, v3 :: v_dual_fmac_f32 v7, v9, v4
+; GFX11-NEXT:    v_fma_mix_f32 v8, -v1, v6, v0 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_fma_mix_f32 v9, -v1, v7, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_dual_mul_f32 v3, v8, v3 :: v_dual_mul_f32 v4, v9, v4
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX11-NEXT:    v_dual_add_f32 v3, v3, v6 :: v_dual_and_b32 v4, 0xff800000, v4
+; GFX11-NEXT:    v_add_f32_e32 v4, v4, v7
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; GFX11-NEXT:    v_div_fixup_f16 v0, v3, v1, v0
 ; GFX11-NEXT:    v_div_fixup_f16 v1, v4, v2, v5
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
@@ -897,44 +1206,113 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: v_fdiv_v2f16_ulp25:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, v1
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v5, v4
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v6, v0
-; GFX8-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v7, v2
-; GFX8-NEXT:    v_rcp_f32_e32 v5, v5
-; GFX8-NEXT:    v_mul_f32_e32 v3, v6, v3
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX8-NEXT:    v_mul_f32_e32 v5, v7, v5
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GFX8-NEXT:    v_div_fixup_f16 v0, v3, v1, v0
-; GFX8-NEXT:    v_div_fixup_f16 v1, v5, v4, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
+; GFX8-IEEE-LABEL: v_fdiv_v2f16_ulp25:
+; GFX8-IEEE:       ; %bb.0:
+; GFX8-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX8-IEEE-NEXT:    v_cvt_f32_f16_e32 v4, v0
+; GFX8-IEEE-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX8-IEEE-NEXT:    v_cvt_f32_f16_e32 v8, v6
+; GFX8-IEEE-NEXT:    v_rcp_f32_e32 v5, v2
+; GFX8-IEEE-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX8-IEEE-NEXT:    v_cvt_f32_f16_e32 v7, v3
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v9, v4, v5
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v10, -v2, v9
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v10, v10, v4
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v10, v10, v5
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v9, v10, v9
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v2, -v2, v9
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX8-IEEE-NEXT:    v_rcp_f32_e32 v4, v8
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v2, v2, v5
+; GFX8-IEEE-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v2, v2, v9
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v5, v7, v4
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v9, -v8, v5
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v9, v9, v7
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v9, v9, v4
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v5, v9, v5
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v8, -v8, v5
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v7, v8, v7
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v4, v7, v4
+; GFX8-IEEE-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v4, v4, v5
+; GFX8-IEEE-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX8-IEEE-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX8-IEEE-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
+; GFX8-IEEE-NEXT:    v_div_fixup_f16 v1, v4, v6, v3
+; GFX8-IEEE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-IEEE-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-FLUSH-LABEL: v_fdiv_v2f16_ulp25:
+; GFX8-FLUSH:       ; %bb.0:
+; GFX8-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX8-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, v0
+; GFX8-FLUSH-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX8-FLUSH-NEXT:    v_cvt_f32_f16_e32 v8, v6
+; GFX8-FLUSH-NEXT:    v_rcp_f32_e32 v5, v2
+; GFX8-FLUSH-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX8-FLUSH-NEXT:    v_cvt_f32_f16_e32 v7, v3
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v9, v4, v5
+; GFX8-FLUSH-NEXT:    v_mad_f32 v10, -v2, v9, v4
+; GFX8-FLUSH-NEXT:    v_mac_f32_e32 v9, v10, v5
+; GFX8-FLUSH-NEXT:    v_mad_f32 v2, -v2, v9, v4
+; GFX8-FLUSH-NEXT:    v_rcp_f32_e32 v4, v8
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v2, v2, v5
+; GFX8-FLUSH-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX8-FLUSH-NEXT:    v_add_f32_e32 v2, v2, v9
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v5, v7, v4
+; GFX8-FLUSH-NEXT:    v_mad_f32 v9, -v8, v5, v7
+; GFX8-FLUSH-NEXT:    v_mac_f32_e32 v5, v9, v4
+; GFX8-FLUSH-NEXT:    v_mad_f32 v7, -v8, v5, v7
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v4, v7, v4
+; GFX8-FLUSH-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
+; GFX8-FLUSH-NEXT:    v_add_f32_e32 v4, v4, v5
+; GFX8-FLUSH-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX8-FLUSH-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX8-FLUSH-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
+; GFX8-FLUSH-NEXT:    v_div_fixup_f16 v1, v4, v6, v3
+; GFX8-FLUSH-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-FLUSH-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-IEEE-LABEL: v_fdiv_v2f16_ulp25:
 ; GFX9-IEEE:       ; %bb.0:
 ; GFX9-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-IEEE-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
-; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v1
-; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v5, v4
-; GFX9-IEEE-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v6, v0
-; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v7, v2
-; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v5, v5
-; GFX9-IEEE-NEXT:    v_mul_f32_e32 v3, v6, v3
-; GFX9-IEEE-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX9-IEEE-NEXT:    v_mul_f32_e32 v5, v7, v5
-; GFX9-IEEE-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GFX9-IEEE-NEXT:    v_div_fixup_f16 v0, v3, v1, v0
-; GFX9-IEEE-NEXT:    v_div_fixup_f16 v1, v5, v4, v2
+; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v4, v0
+; GFX9-IEEE-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v8, v6
+; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v5, v2
+; GFX9-IEEE-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v7, v3
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v9, v4, v5
+; GFX9-IEEE-NEXT:    v_mul_f32_e64 v10, -v2, v9
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v10, v10, v4
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v10, v10, v5
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v9, v10, v9
+; GFX9-IEEE-NEXT:    v_mul_f32_e64 v2, -v2, v9
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v4, v8
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v2, v2, v5
+; GFX9-IEEE-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v2, v2, v9
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v5, v7, v4
+; GFX9-IEEE-NEXT:    v_mul_f32_e64 v9, -v8, v5
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v9, v9, v7
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v9, v9, v4
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v5, v9, v5
+; GFX9-IEEE-NEXT:    v_mul_f32_e64 v8, -v8, v5
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v7, v8, v7
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v4, v7, v4
+; GFX9-IEEE-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v4, v4, v5
+; GFX9-IEEE-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX9-IEEE-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX9-IEEE-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
+; GFX9-IEEE-NEXT:    v_div_fixup_f16 v1, v4, v6, v3
 ; GFX9-IEEE-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -942,33 +1320,103 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX9-FLUSH:       ; %bb.0:
 ; GFX9-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v1
-; GFX9-FLUSH-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, v3
-; GFX9-FLUSH-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, v0
+; GFX9-FLUSH-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v7, v5
 ; GFX9-FLUSH-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX9-FLUSH-NEXT:    v_rcp_f32_e32 v4, v4
-; GFX9-FLUSH-NEXT:    v_mad_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v1, v2, v1, v0
-; GFX9-FLUSH-NEXT:    v_mad_mixlo_f16 v0, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v0, v0, v3, v5
-; GFX9-FLUSH-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX9-FLUSH-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
+; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v6, v3
+; GFX9-FLUSH-NEXT:    v_rcp_f32_e32 v7, v7
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v4, v4, v2
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v8, -v1, v4, v0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mac_f32_e32 v4, v8, v2
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v8, -v1, v4, v0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v2, v8, v2
+; GFX9-FLUSH-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX9-FLUSH-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v4, v6, v7
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v6, -v1, v4, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mac_f32_e32 v4, v6, v7
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v6, -v1, v4, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v6, v6, v7
+; GFX9-FLUSH-NEXT:    v_and_b32_e32 v6, 0xff800000, v6
+; GFX9-FLUSH-NEXT:    v_add_f32_e32 v4, v6, v4
+; GFX9-FLUSH-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX9-FLUSH-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
+; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v1, v4, v5, v3
+; GFX9-FLUSH-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: v_fdiv_v2f16_ulp25:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v4, v2
-; GFX10-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX10-NEXT:    v_rcp_f32_e32 v4, v4
-; GFX10-NEXT:    v_fma_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT:    v_fma_mixlo_f16 v4, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX10-NEXT:    v_div_fixup_f16 v0, v3, v1, v0
-; GFX10-NEXT:    v_div_fixup_f16 v1, v4, v2, v5
-; GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-IEEE-LABEL: v_fdiv_v2f16_ulp25:
+; GFX10-IEEE:       ; %bb.0:
+; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-IEEE-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX10-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX10-IEEE-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX10-IEEE-NEXT:    v_cvt_f32_f16_e32 v8, v0
+; GFX10-IEEE-NEXT:    v_cvt_f32_f16_e32 v4, v2
+; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v6, v3
+; GFX10-IEEE-NEXT:    v_cvt_f32_f16_e32 v9, v5
+; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v7, v4
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v10, v8, v6
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v11, v9, v7
+; GFX10-IEEE-NEXT:    v_mul_f32_e64 v12, -v3, v10
+; GFX10-IEEE-NEXT:    v_mul_f32_e64 v13, -v4, v11
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v12, v12, v8
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v13, v13, v9
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v12, v12, v6
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v13, v13, v7
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v10, v12, v10
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v11, v13, v11
+; GFX10-IEEE-NEXT:    v_mul_f32_e64 v3, -v3, v10
+; GFX10-IEEE-NEXT:    v_mul_f32_e64 v4, -v4, v11
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v3, v3, v8
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v4, v4, v9
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v3, v3, v6
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v4, v4, v7
+; GFX10-IEEE-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-IEEE-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v3, v3, v10
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v4, v4, v11
+; GFX10-IEEE-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX10-IEEE-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX10-IEEE-NEXT:    v_div_fixup_f16 v0, v3, v1, v0
+; GFX10-IEEE-NEXT:    v_div_fixup_f16 v1, v4, v2, v5
+; GFX10-IEEE-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-FLUSH-LABEL: v_fdiv_v2f16_ulp25:
+; GFX10-FLUSH:       ; %bb.0:
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-FLUSH-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX10-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX10-FLUSH-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX10-FLUSH-NEXT:    v_cvt_f32_f16_e32 v8, v0
+; GFX10-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, v2
+; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v6, v3
+; GFX10-FLUSH-NEXT:    v_cvt_f32_f16_e32 v9, v5
+; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v7, v4
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v10, v8, v6
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v11, v9, v7
+; GFX10-FLUSH-NEXT:    v_mad_f32 v12, -v3, v10, v8
+; GFX10-FLUSH-NEXT:    v_mad_f32 v13, -v4, v11, v9
+; GFX10-FLUSH-NEXT:    v_mac_f32_e32 v10, v12, v6
+; GFX10-FLUSH-NEXT:    v_mac_f32_e32 v11, v13, v7
+; GFX10-FLUSH-NEXT:    v_mad_f32 v3, -v3, v10, v8
+; GFX10-FLUSH-NEXT:    v_mad_f32 v4, -v4, v11, v9
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v3, v3, v6
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v4, v4, v7
+; GFX10-FLUSH-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-FLUSH-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
+; GFX10-FLUSH-NEXT:    v_add_f32_e32 v3, v3, v10
+; GFX10-FLUSH-NEXT:    v_add_f32_e32 v4, v4, v11
+; GFX10-FLUSH-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX10-FLUSH-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX10-FLUSH-NEXT:    v_div_fixup_f16 v0, v3, v1, v0
+; GFX10-FLUSH-NEXT:    v_div_fixup_f16 v1, v4, v2, v5
+; GFX10-FLUSH-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fdiv_v2f16_ulp25:
 ; GFX11:       ; %bb.0:
@@ -976,12 +1424,24 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v6, v0
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v4, v2
 ; GFX11-NEXT:    v_rcp_f32_e32 v3, v3
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v7, v5
 ; GFX11-NEXT:    v_rcp_f32_e32 v4, v4
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_fma_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT:    v_fma_mixlo_f16 v4, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT:    v_dual_mul_f32 v6, v6, v3 :: v_dual_mul_f32 v7, v7, v4
+; GFX11-NEXT:    v_fma_mix_f32 v8, -v1, v6, v0 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_fma_mix_f32 v9, -v1, v7, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_dual_fmac_f32 v6, v8, v3 :: v_dual_fmac_f32 v7, v9, v4
+; GFX11-NEXT:    v_fma_mix_f32 v8, -v1, v6, v0 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_fma_mix_f32 v9, -v1, v7, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_dual_mul_f32 v3, v8, v3 :: v_dual_mul_f32 v4, v9, v4
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX11-NEXT:    v_dual_add_f32 v3, v3, v6 :: v_dual_and_b32 v4, 0xff800000, v4
+; GFX11-NEXT:    v_add_f32_e32 v4, v4, v7
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; GFX11-NEXT:    v_div_fixup_f16 v0, v3, v1, v0
 ; GFX11-NEXT:    v_div_fixup_f16 v1, v4, v2, v5
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
@@ -1061,36 +1521,103 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: v_rcp_v2f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, v2
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v4, 1.0
-; GFX8-NEXT:    v_rcp_f32_e32 v1, v1
-; GFX8-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX8-NEXT:    v_mul_f32_e32 v1, v4, v1
-; GFX8-NEXT:    v_mul_f32_e32 v3, v4, v3
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX8-NEXT:    v_div_fixup_f16 v0, v1, v0, 1.0
-; GFX8-NEXT:    v_div_fixup_f16 v1, v3, v2, 1.0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
+; GFX8-IEEE-LABEL: v_rcp_v2f16:
+; GFX8-IEEE:       ; %bb.0:
+; GFX8-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX8-IEEE-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-IEEE-NEXT:    v_cvt_f32_f16_e32 v4, 1.0
+; GFX8-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v2
+; GFX8-IEEE-NEXT:    v_rcp_f32_e32 v5, v1
+; GFX8-IEEE-NEXT:    v_rcp_f32_e32 v6, v3
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v7, v4, v5
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v8, -v1, v7
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v8, v8, v4
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v8, v8, v5
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v7, v8, v7
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v9, v4, v6
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v1, -v1, v7
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v10, -v3, v9
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v1, v1, v4
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v1, v1, v5
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v5, v10, v4
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v5, v5, v6
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v5, v5, v9
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v3, -v3, v5
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v3, v3, v6
+; GFX8-IEEE-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; GFX8-IEEE-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v1, v1, v7
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX8-IEEE-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX8-IEEE-NEXT:    v_div_fixup_f16 v0, v1, v0, 1.0
+; GFX8-IEEE-NEXT:    v_div_fixup_f16 v1, v3, v2, 1.0
+; GFX8-IEEE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-IEEE-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-FLUSH-LABEL: v_rcp_v2f16:
+; GFX8-FLUSH:       ; %bb.0:
+; GFX8-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX8-FLUSH-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v2
+; GFX8-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, 1.0
+; GFX8-FLUSH-NEXT:    v_rcp_f32_e32 v5, v1
+; GFX8-FLUSH-NEXT:    v_rcp_f32_e32 v6, v3
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v7, v4, v5
+; GFX8-FLUSH-NEXT:    v_mad_f32 v8, -v1, v7, v4
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v9, v4, v6
+; GFX8-FLUSH-NEXT:    v_mac_f32_e32 v7, v8, v5
+; GFX8-FLUSH-NEXT:    v_mad_f32 v8, -v3, v9, v4
+; GFX8-FLUSH-NEXT:    v_mac_f32_e32 v9, v8, v6
+; GFX8-FLUSH-NEXT:    v_mad_f32 v1, -v1, v7, v4
+; GFX8-FLUSH-NEXT:    v_mad_f32 v3, -v3, v9, v4
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v1, v1, v5
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v3, v3, v6
+; GFX8-FLUSH-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; GFX8-FLUSH-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX8-FLUSH-NEXT:    v_add_f32_e32 v1, v1, v7
+; GFX8-FLUSH-NEXT:    v_add_f32_e32 v3, v3, v9
+; GFX8-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX8-FLUSH-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX8-FLUSH-NEXT:    v_div_fixup_f16 v0, v1, v0, 1.0
+; GFX8-FLUSH-NEXT:    v_div_fixup_f16 v1, v3, v2, 1.0
+; GFX8-FLUSH-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-FLUSH-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-IEEE-LABEL: v_rcp_v2f16:
 ; GFX9-IEEE:       ; %bb.0:
 ; GFX9-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-IEEE-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v2
+; GFX9-IEEE-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v4, 1.0
-; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v1, v1
-; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX9-IEEE-NEXT:    v_mul_f32_e32 v1, v4, v1
-; GFX9-IEEE-NEXT:    v_mul_f32_e32 v3, v4, v3
+; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v2
+; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v5, v1
+; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v6, v3
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v7, v4, v5
+; GFX9-IEEE-NEXT:    v_mul_f32_e64 v8, -v1, v7
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v8, v8, v4
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v8, v8, v5
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v7, v8, v7
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v9, v4, v6
+; GFX9-IEEE-NEXT:    v_mul_f32_e64 v1, -v1, v7
+; GFX9-IEEE-NEXT:    v_mul_f32_e64 v10, -v3, v9
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v1, v1, v4
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v1, v1, v5
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v5, v10, v4
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v5, v5, v6
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v5, v5, v9
+; GFX9-IEEE-NEXT:    v_mul_f32_e64 v3, -v3, v5
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; GFX9-IEEE-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v1, v1, v7
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v3, v3, v5
 ; GFX9-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX9-IEEE-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX9-IEEE-NEXT:    v_div_fixup_f16 v0, v1, v0, 1.0
@@ -1104,43 +1631,122 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
 ; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; GFX9-FLUSH-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v2
+; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, 1.0
 ; GFX9-FLUSH-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX9-FLUSH-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT:    v_mad_mixlo_f16 v1, 1.0, v1, 0 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v5, v4, v1
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mac_f32_e32 v5, v6, v1
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v4, v4, v3
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v7, -v0, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v6, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v1, v7, v1
+; GFX9-FLUSH-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; GFX9-FLUSH-NEXT:    v_mac_f32_e32 v4, v6, v3
+; GFX9-FLUSH-NEXT:    v_add_f32_e32 v1, v1, v5
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v5, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v3, v5, v3
+; GFX9-FLUSH-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX9-FLUSH-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX9-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX9-FLUSH-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v0, v1, v0, 1.0
-; GFX9-FLUSH-NEXT:    v_mad_mixlo_f16 v1, 1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v1, v1, v2, 1.0
+; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v1, v3, v2, 1.0
 ; GFX9-FLUSH-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: v_rcp_v2f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v2, v0
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v1
-; GFX10-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX10-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX10-NEXT:    v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT:    v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT:    v_div_fixup_f16 v0, v2, v0, 1.0
-; GFX10-NEXT:    v_div_fixup_f16 v1, v3, v1, 1.0
-; GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-IEEE-LABEL: v_rcp_v2f16:
+; GFX10-IEEE:       ; %bb.0:
+; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-IEEE-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX10-IEEE-NEXT:    v_cvt_f32_f16_e32 v6, 1.0
+; GFX10-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v4, v2
+; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v5, v3
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v7, v6, v4
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v8, v6, v5
+; GFX10-IEEE-NEXT:    v_mul_f32_e64 v9, -v2, v7
+; GFX10-IEEE-NEXT:    v_mul_f32_e64 v10, -v3, v8
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v9, v9, v6
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v10, v10, v6
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v9, v9, v4
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v10, v10, v5
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v7, v9, v7
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v8, v10, v8
+; GFX10-IEEE-NEXT:    v_mul_f32_e64 v2, -v2, v7
+; GFX10-IEEE-NEXT:    v_mul_f32_e64 v3, -v3, v8
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v2, v2, v6
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v3, v3, v6
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v2, v2, v4
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v3, v3, v5
+; GFX10-IEEE-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX10-IEEE-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v2, v2, v7
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v3, v3, v8
+; GFX10-IEEE-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX10-IEEE-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX10-IEEE-NEXT:    v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX10-IEEE-NEXT:    v_div_fixup_f16 v1, v3, v1, 1.0
+; GFX10-IEEE-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-FLUSH-LABEL: v_rcp_v2f16:
+; GFX10-FLUSH:       ; %bb.0:
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-FLUSH-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX10-FLUSH-NEXT:    v_cvt_f32_f16_e32 v6, 1.0
+; GFX10-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v4, v2
+; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v5, v3
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v7, v6, v4
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v8, v6, v5
+; GFX10-FLUSH-NEXT:    v_mad_f32 v9, -v2, v7, v6
+; GFX10-FLUSH-NEXT:    v_mad_f32 v10, -v3, v8, v6
+; GFX10-FLUSH-NEXT:    v_mac_f32_e32 v7, v9, v4
+; GFX10-FLUSH-NEXT:    v_mac_f32_e32 v8, v10, v5
+; GFX10-FLUSH-NEXT:    v_mad_f32 v2, -v2, v7, v6
+; GFX10-FLUSH-NEXT:    v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v2, v2, v4
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v3, v3, v5
+; GFX10-FLUSH-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX10-FLUSH-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-FLUSH-NEXT:    v_add_f32_e32 v2, v2, v7
+; GFX10-FLUSH-NEXT:    v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX10-FLUSH-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX10-FLUSH-NEXT:    v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX10-FLUSH-NEXT:    v_div_fixup_f16 v1, v3, v1, 1.0
+; GFX10-FLUSH-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_rcp_v2f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v4, 1.0
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v1
 ; GFX11-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX11-NEXT:    v_rcp_f32_e32 v3, v3
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT:    v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT:    v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX11-NEXT:    v_mul_f32_e32 v5, v4, v2
+; GFX11-NEXT:    v_mul_f32_e32 v4, v4, v3
+; GFX11-NEXT:    v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_fmac_f32_e32 v4, v7, v3
+; GFX11-NEXT:    v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_fmac_f32_e32 v5, v6, v2
+; GFX11-NEXT:    v_mul_f32_e32 v3, v7, v3
+; GFX11-NEXT:    v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT:    v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT:    v_add_f32_e32 v2, v2, v5
 ; GFX11-NEXT:    v_div_fixup_f16 v1, v3, v1, 1.0
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT:    v_div_fixup_f16 v0, v2, v0, 1.0
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x
@@ -1218,36 +1824,103 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: v_neg_rcp_v2f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, v2
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v4, -1.0
-; GFX8-NEXT:    v_rcp_f32_e32 v1, v1
-; GFX8-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX8-NEXT:    v_mul_f32_e32 v1, v4, v1
-; GFX8-NEXT:    v_mul_f32_e32 v3, v4, v3
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX8-NEXT:    v_div_fixup_f16 v0, v1, v0, -1.0
-; GFX8-NEXT:    v_div_fixup_f16 v1, v3, v2, -1.0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
+; GFX8-IEEE-LABEL: v_neg_rcp_v2f16:
+; GFX8-IEEE:       ; %bb.0:
+; GFX8-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX8-IEEE-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-IEEE-NEXT:    v_cvt_f32_f16_e32 v4, -1.0
+; GFX8-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v2
+; GFX8-IEEE-NEXT:    v_rcp_f32_e32 v5, v1
+; GFX8-IEEE-NEXT:    v_rcp_f32_e32 v6, v3
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v7, v4, v5
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v8, -v1, v7
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v8, v8, v4
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v8, v8, v5
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v7, v8, v7
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v9, v4, v6
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v1, -v1, v7
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v10, -v3, v9
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v1, v1, v4
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v1, v1, v5
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v5, v10, v4
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v5, v5, v6
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v5, v5, v9
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v3, -v3, v5
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v3, v3, v6
+; GFX8-IEEE-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; GFX8-IEEE-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v1, v1, v7
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX8-IEEE-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX8-IEEE-NEXT:    v_div_fixup_f16 v0, v1, v0, -1.0
+; GFX8-IEEE-NEXT:    v_div_fixup_f16 v1, v3, v2, -1.0
+; GFX8-IEEE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-IEEE-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-FLUSH-LABEL: v_neg_rcp_v2f16:
+; GFX8-FLUSH:       ; %bb.0:
+; GFX8-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX8-FLUSH-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v2
+; GFX8-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, -1.0
+; GFX8-FLUSH-NEXT:    v_rcp_f32_e32 v5, v1
+; GFX8-FLUSH-NEXT:    v_rcp_f32_e32 v6, v3
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v7, v4, v5
+; GFX8-FLUSH-NEXT:    v_mad_f32 v8, -v1, v7, v4
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v9, v4, v6
+; GFX8-FLUSH-NEXT:    v_mac_f32_e32 v7, v8, v5
+; GFX8-FLUSH-NEXT:    v_mad_f32 v8, -v3, v9, v4
+; GFX8-FLUSH-NEXT:    v_mac_f32_e32 v9, v8, v6
+; GFX8-FLUSH-NEXT:    v_mad_f32 v1, -v1, v7, v4
+; GFX8-FLUSH-NEXT:    v_mad_f32 v3, -v3, v9, v4
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v1, v1, v5
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v3, v3, v6
+; GFX8-FLUSH-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; GFX8-FLUSH-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX8-FLUSH-NEXT:    v_add_f32_e32 v1, v1, v7
+; GFX8-FLUSH-NEXT:    v_add_f32_e32 v3, v3, v9
+; GFX8-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX8-FLUSH-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX8-FLUSH-NEXT:    v_div_fixup_f16 v0, v1, v0, -1.0
+; GFX8-FLUSH-NEXT:    v_div_fixup_f16 v1, v3, v2, -1.0
+; GFX8-FLUSH-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-FLUSH-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-IEEE-LABEL: v_neg_rcp_v2f16:
 ; GFX9-IEEE:       ; %bb.0:
 ; GFX9-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-IEEE-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v2
+; GFX9-IEEE-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v4, -1.0
-; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v1, v1
-; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX9-IEEE-NEXT:    v_mul_f32_e32 v1, v4, v1
-; GFX9-IEEE-NEXT:    v_mul_f32_e32 v3, v4, v3
+; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v2
+; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v5, v1
+; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v6, v3
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v7, v4, v5
+; GFX9-IEEE-NEXT:    v_mul_f32_e64 v8, -v1, v7
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v8, v8, v4
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v8, v8, v5
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v7, v8, v7
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v9, v4, v6
+; GFX9-IEEE-NEXT:    v_mul_f32_e64 v1, -v1, v7
+; GFX9-IEEE-NEXT:    v_mul_f32_e64 v10, -v3, v9
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v1, v1, v4
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v1, v1, v5
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v5, v10, v4
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v5, v5, v6
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v5, v5, v9
+; GFX9-IEEE-NEXT:    v_mul_f32_e64 v3, -v3, v5
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; GFX9-IEEE-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v1, v1, v7
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v3, v3, v5
 ; GFX9-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX9-IEEE-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX9-IEEE-NEXT:    v_div_fixup_f16 v0, v1, v0, -1.0
@@ -1261,43 +1934,122 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
 ; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; GFX9-FLUSH-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v2
+; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, -1.0
 ; GFX9-FLUSH-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX9-FLUSH-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT:    v_mad_mixlo_f16 v1, -1.0, v1, 0 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v5, v4, v1
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mac_f32_e32 v5, v6, v1
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v4, v4, v3
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v7, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v6, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v1, v7, v1
+; GFX9-FLUSH-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; GFX9-FLUSH-NEXT:    v_mac_f32_e32 v4, v6, v3
+; GFX9-FLUSH-NEXT:    v_add_f32_e32 v1, v1, v5
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v5, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v3, v5, v3
+; GFX9-FLUSH-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX9-FLUSH-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX9-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX9-FLUSH-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v0, v1, v0, -1.0
-; GFX9-FLUSH-NEXT:    v_mad_mixlo_f16 v1, -1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v1, v1, v2, -1.0
+; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v1, v3, v2, -1.0
 ; GFX9-FLUSH-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: v_neg_rcp_v2f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v2, v0
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v1
-; GFX10-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX10-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX10-NEXT:    v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT:    v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT:    v_div_fixup_f16 v0, v2, v0, -1.0
-; GFX10-NEXT:    v_div_fixup_f16 v1, v3, v1, -1.0
-; GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-IEEE-LABEL: v_neg_rcp_v2f16:
+; GFX10-IEEE:       ; %bb.0:
+; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-IEEE-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX10-IEEE-NEXT:    v_cvt_f32_f16_e32 v6, -1.0
+; GFX10-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v4, v2
+; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v5, v3
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v7, v6, v4
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v8, v6, v5
+; GFX10-IEEE-NEXT:    v_mul_f32_e64 v9, -v2, v7
+; GFX10-IEEE-NEXT:    v_mul_f32_e64 v10, -v3, v8
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v9, v9, v6
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v10, v10, v6
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v9, v9, v4
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v10, v10, v5
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v7, v9, v7
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v8, v10, v8
+; GFX10-IEEE-NEXT:    v_mul_f32_e64 v2, -v2, v7
+; GFX10-IEEE-NEXT:    v_mul_f32_e64 v3, -v3, v8
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v2, v2, v6
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v3, v3, v6
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v2, v2, v4
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v3, v3, v5
+; GFX10-IEEE-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX10-IEEE-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v2, v2, v7
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v3, v3, v8
+; GFX10-IEEE-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX10-IEEE-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX10-IEEE-NEXT:    v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX10-IEEE-NEXT:    v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX10-IEEE-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-FLUSH-LABEL: v_neg_rcp_v2f16:
+; GFX10-FLUSH:       ; %bb.0:
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-FLUSH-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX10-FLUSH-NEXT:    v_cvt_f32_f16_e32 v6, -1.0
+; GFX10-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v4, v2
+; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v5, v3
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v7, v6, v4
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v8, v6, v5
+; GFX10-FLUSH-NEXT:    v_mad_f32 v9, -v2, v7, v6
+; GFX10-FLUSH-NEXT:    v_mad_f32 v10, -v3, v8, v6
+; GFX10-FLUSH-NEXT:    v_mac_f32_e32 v7, v9, v4
+; GFX10-FLUSH-NEXT:    v_mac_f32_e32 v8, v10, v5
+; GFX10-FLUSH-NEXT:    v_mad_f32 v2, -v2, v7, v6
+; GFX10-FLUSH-NEXT:    v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v2, v2, v4
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v3, v3, v5
+; GFX10-FLUSH-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX10-FLUSH-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-FLUSH-NEXT:    v_add_f32_e32 v2, v2, v7
+; GFX10-FLUSH-NEXT:    v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX10-FLUSH-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX10-FLUSH-NEXT:    v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX10-FLUSH-NEXT:    v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX10-FLUSH-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_neg_rcp_v2f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v4, -1.0
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v1
 ; GFX11-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX11-NEXT:    v_rcp_f32_e32 v3, v3
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT:    v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT:    v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX11-NEXT:    v_mul_f32_e32 v5, v4, v2
+; GFX11-NEXT:    v_mul_f32_e32 v4, v4, v3
+; GFX11-NEXT:    v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_fmac_f32_e32 v4, v7, v3
+; GFX11-NEXT:    v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_fmac_f32_e32 v5, v6, v2
+; GFX11-NEXT:    v_mul_f32_e32 v3, v7, v3
+; GFX11-NEXT:    v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT:    v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT:    v_add_f32_e32 v2, v2, v5
 ; GFX11-NEXT:    v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT:    v_div_fixup_f16 v0, v2, v0, -1.0
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv <2 x half> <half -1.0, half -1.0>, %x
@@ -1385,38 +2137,106 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: v_rcp_v2f16_fabs:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, v2
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v4, 1.0
-; GFX8-NEXT:    v_rcp_f32_e32 v1, v1
-; GFX8-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX8-NEXT:    v_mul_f32_e32 v1, v4, v1
-; GFX8-NEXT:    v_mul_f32_e32 v3, v4, v3
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX8-NEXT:    v_div_fixup_f16 v0, v1, v0, 1.0
-; GFX8-NEXT:    v_div_fixup_f16 v1, v3, v2, 1.0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
+; GFX8-IEEE-LABEL: v_rcp_v2f16_fabs:
+; GFX8-IEEE:       ; %bb.0:
+; GFX8-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-IEEE-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX8-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX8-IEEE-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-IEEE-NEXT:    v_cvt_f32_f16_e32 v4, 1.0
+; GFX8-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v2
+; GFX8-IEEE-NEXT:    v_rcp_f32_e32 v5, v1
+; GFX8-IEEE-NEXT:    v_rcp_f32_e32 v6, v3
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v7, v4, v5
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v8, -v1, v7
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v8, v8, v4
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v8, v8, v5
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v7, v8, v7
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v9, v4, v6
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v1, -v1, v7
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v10, -v3, v9
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v1, v1, v4
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v1, v1, v5
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v5, v10, v4
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v5, v5, v6
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v5, v5, v9
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v3, -v3, v5
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v3, v3, v6
+; GFX8-IEEE-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; GFX8-IEEE-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v1, v1, v7
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX8-IEEE-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX8-IEEE-NEXT:    v_div_fixup_f16 v0, v1, v0, 1.0
+; GFX8-IEEE-NEXT:    v_div_fixup_f16 v1, v3, v2, 1.0
+; GFX8-IEEE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-IEEE-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-FLUSH-LABEL: v_rcp_v2f16_fabs:
+; GFX8-FLUSH:       ; %bb.0:
+; GFX8-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-FLUSH-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX8-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX8-FLUSH-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v2
+; GFX8-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, 1.0
+; GFX8-FLUSH-NEXT:    v_rcp_f32_e32 v5, v1
+; GFX8-FLUSH-NEXT:    v_rcp_f32_e32 v6, v3
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v7, v4, v5
+; GFX8-FLUSH-NEXT:    v_mad_f32 v8, -v1, v7, v4
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v9, v4, v6
+; GFX8-FLUSH-NEXT:    v_mac_f32_e32 v7, v8, v5
+; GFX8-FLUSH-NEXT:    v_mad_f32 v8, -v3, v9, v4
+; GFX8-FLUSH-NEXT:    v_mac_f32_e32 v9, v8, v6
+; GFX8-FLUSH-NEXT:    v_mad_f32 v1, -v1, v7, v4
+; GFX8-FLUSH-NEXT:    v_mad_f32 v3, -v3, v9, v4
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v1, v1, v5
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v3, v3, v6
+; GFX8-FLUSH-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; GFX8-FLUSH-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX8-FLUSH-NEXT:    v_add_f32_e32 v1, v1, v7
+; GFX8-FLUSH-NEXT:    v_add_f32_e32 v3, v3, v9
+; GFX8-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX8-FLUSH-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX8-FLUSH-NEXT:    v_div_fixup_f16 v0, v1, v0, 1.0
+; GFX8-FLUSH-NEXT:    v_div_fixup_f16 v1, v3, v2, 1.0
+; GFX8-FLUSH-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-FLUSH-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-IEEE-LABEL: v_rcp_v2f16_fabs:
 ; GFX9-IEEE:       ; %bb.0:
 ; GFX9-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-IEEE-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX9-IEEE-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v2
+; GFX9-IEEE-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v4, 1.0
-; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v1, v1
-; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX9-IEEE-NEXT:    v_mul_f32_e32 v1, v4, v1
-; GFX9-IEEE-NEXT:    v_mul_f32_e32 v3, v4, v3
+; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v2
+; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v5, v1
+; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v6, v3
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v7, v4, v5
+; GFX9-IEEE-NEXT:    v_mul_f32_e64 v8, -v1, v7
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v8, v8, v4
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v8, v8, v5
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v7, v8, v7
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v9, v4, v6
+; GFX9-IEEE-NEXT:    v_mul_f32_e64 v1, -v1, v7
+; GFX9-IEEE-NEXT:    v_mul_f32_e64 v10, -v3, v9
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v1, v1, v4
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v1, v1, v5
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v5, v10, v4
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v5, v5, v6
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v5, v5, v9
+; GFX9-IEEE-NEXT:    v_mul_f32_e64 v3, -v3, v5
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; GFX9-IEEE-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v1, v1, v7
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v3, v3, v5
 ; GFX9-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX9-IEEE-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX9-IEEE-NEXT:    v_div_fixup_f16 v0, v1, v0, 1.0
@@ -1427,50 +2247,131 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
 ; GFX9-FLUSH-LABEL: v_rcp_v2f16_fabs:
 ; GFX9-FLUSH:       ; %bb.0:
 ; GFX9-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-FLUSH-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; GFX9-FLUSH-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v2
-; GFX9-FLUSH-NEXT:    v_rcp_f32_e32 v1, v1
-; GFX9-FLUSH-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT:    v_mad_mixlo_f16 v1, 1.0, v1, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v0, v1, v0, 1.0
-; GFX9-FLUSH-NEXT:    v_mad_mixlo_f16 v1, 1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v1, v1, v2, 1.0
-; GFX9-FLUSH-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-FLUSH-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v0
+; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX9-FLUSH-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, v3
+; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v5, 1.0
+; GFX9-FLUSH-NEXT:    v_rcp_f32_e32 v2, v2
+; GFX9-FLUSH-NEXT:    v_rcp_f32_e32 v4, v4
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v6, v5, v2
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v7, -v1, v6, 1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v5, v5, v4
+; GFX9-FLUSH-NEXT:    v_mac_f32_e32 v6, v7, v2
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v7, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mac_f32_e32 v5, v7, v4
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v8, -v1, v6, 1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v0, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v2, v8, v2
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v0, v0, v4
+; GFX9-FLUSH-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX9-FLUSH-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
+; GFX9-FLUSH-NEXT:    v_add_f32_e32 v2, v2, v6
+; GFX9-FLUSH-NEXT:    v_add_f32_e32 v0, v0, v5
+; GFX9-FLUSH-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX9-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v1, v2, v1, 1.0
+; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v0, v0, v3, 1.0
+; GFX9-FLUSH-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX9-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: v_rcp_v2f16_fabs:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v2, v0
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v1
-; GFX10-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX10-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX10-NEXT:    v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT:    v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT:    v_div_fixup_f16 v0, v2, v0, 1.0
-; GFX10-NEXT:    v_div_fixup_f16 v1, v3, v1, 1.0
-; GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-IEEE-LABEL: v_rcp_v2f16_fabs:
+; GFX10-IEEE:       ; %bb.0:
+; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-IEEE-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX10-IEEE-NEXT:    v_cvt_f32_f16_e32 v6, 1.0
+; GFX10-IEEE-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX10-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v4, v2
+; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v5, v3
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v7, v6, v4
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v8, v6, v5
+; GFX10-IEEE-NEXT:    v_mul_f32_e64 v9, -v2, v7
+; GFX10-IEEE-NEXT:    v_mul_f32_e64 v10, -v3, v8
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v9, v9, v6
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v10, v10, v6
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v9, v9, v4
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v10, v10, v5
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v7, v9, v7
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v8, v10, v8
+; GFX10-IEEE-NEXT:    v_mul_f32_e64 v2, -v2, v7
+; GFX10-IEEE-NEXT:    v_mul_f32_e64 v3, -v3, v8
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v2, v2, v6
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v3, v3, v6
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v2, v2, v4
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v3, v3, v5
+; GFX10-IEEE-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX10-IEEE-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v2, v2, v7
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v3, v3, v8
+; GFX10-IEEE-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX10-IEEE-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX10-IEEE-NEXT:    v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX10-IEEE-NEXT:    v_div_fixup_f16 v1, v3, v1, 1.0
+; GFX10-IEEE-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-FLUSH-LABEL: v_rcp_v2f16_fabs:
+; GFX10-FLUSH:       ; %bb.0:
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-FLUSH-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX10-FLUSH-NEXT:    v_cvt_f32_f16_e32 v6, 1.0
+; GFX10-FLUSH-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX10-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v4, v2
+; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v5, v3
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v7, v6, v4
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v8, v6, v5
+; GFX10-FLUSH-NEXT:    v_mad_f32 v9, -v2, v7, v6
+; GFX10-FLUSH-NEXT:    v_mad_f32 v10, -v3, v8, v6
+; GFX10-FLUSH-NEXT:    v_mac_f32_e32 v7, v9, v4
+; GFX10-FLUSH-NEXT:    v_mac_f32_e32 v8, v10, v5
+; GFX10-FLUSH-NEXT:    v_mad_f32 v2, -v2, v7, v6
+; GFX10-FLUSH-NEXT:    v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v2, v2, v4
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v3, v3, v5
+; GFX10-FLUSH-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX10-FLUSH-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-FLUSH-NEXT:    v_add_f32_e32 v2, v2, v7
+; GFX10-FLUSH-NEXT:    v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX10-FLUSH-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX10-FLUSH-NEXT:    v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX10-FLUSH-NEXT:    v_div_fixup_f16 v1, v3, v1, 1.0
+; GFX10-FLUSH-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_rcp_v2f16_fabs:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v5, 1.0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v0
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v1
-; GFX11-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX11-NEXT:    v_rcp_f32_e32 v3, v3
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT:    v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT:    v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX11-NEXT:    v_mul_f32_e32 v6, v5, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-NEXT:    v_fma_mix_f32 v7, -v1, v6, 1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v4, v2
+; GFX11-NEXT:    v_fmac_f32_e32 v6, v7, v3
+; GFX11-NEXT:    v_rcp_f32_e32 v4, v4
+; GFX11-NEXT:    v_fma_mix_f32 v7, -v1, v6, 1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_mul_f32_e32 v5, v5, v4
+; GFX11-NEXT:    v_fma_mix_f32 v8, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_fmac_f32_e32 v5, v8, v4
+; GFX11-NEXT:    v_fma_mix_f32 v0, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_dual_mul_f32 v3, v7, v3 :: v_dual_mul_f32 v0, v0, v4
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
+; GFX11-NEXT:    v_dual_add_f32 v0, v0, v5 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT:    v_add_f32_e32 v3, v3, v6
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT:    v_div_fixup_f16 v0, v0, v2, 1.0
 ; GFX11-NEXT:    v_div_fixup_f16 v1, v3, v1, 1.0
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %x.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x)
   %fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x.fabs
@@ -1558,38 +2459,106 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: v_neg_rcp_v2f16_fabs:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, v2
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v4, -1.0
-; GFX8-NEXT:    v_rcp_f32_e32 v1, v1
-; GFX8-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX8-NEXT:    v_mul_f32_e32 v1, v4, v1
-; GFX8-NEXT:    v_mul_f32_e32 v3, v4, v3
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX8-NEXT:    v_div_fixup_f16 v0, v1, v0, -1.0
-; GFX8-NEXT:    v_div_fixup_f16 v1, v3, v2, -1.0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
+; GFX8-IEEE-LABEL: v_neg_rcp_v2f16_fabs:
+; GFX8-IEEE:       ; %bb.0:
+; GFX8-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-IEEE-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX8-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX8-IEEE-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-IEEE-NEXT:    v_cvt_f32_f16_e32 v4, -1.0
+; GFX8-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v2
+; GFX8-IEEE-NEXT:    v_rcp_f32_e32 v5, v1
+; GFX8-IEEE-NEXT:    v_rcp_f32_e32 v6, v3
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v7, v4, v5
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v8, -v1, v7
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v8, v8, v4
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v8, v8, v5
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v7, v8, v7
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v9, v4, v6
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v1, -v1, v7
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v10, -v3, v9
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v1, v1, v4
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v1, v1, v5
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v5, v10, v4
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v5, v5, v6
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v5, v5, v9
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v3, -v3, v5
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v3, v3, v6
+; GFX8-IEEE-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; GFX8-IEEE-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v1, v1, v7
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX8-IEEE-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX8-IEEE-NEXT:    v_div_fixup_f16 v0, v1, v0, -1.0
+; GFX8-IEEE-NEXT:    v_div_fixup_f16 v1, v3, v2, -1.0
+; GFX8-IEEE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-IEEE-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-FLUSH-LABEL: v_neg_rcp_v2f16_fabs:
+; GFX8-FLUSH:       ; %bb.0:
+; GFX8-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-FLUSH-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX8-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX8-FLUSH-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v2
+; GFX8-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, -1.0
+; GFX8-FLUSH-NEXT:    v_rcp_f32_e32 v5, v1
+; GFX8-FLUSH-NEXT:    v_rcp_f32_e32 v6, v3
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v7, v4, v5
+; GFX8-FLUSH-NEXT:    v_mad_f32 v8, -v1, v7, v4
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v9, v4, v6
+; GFX8-FLUSH-NEXT:    v_mac_f32_e32 v7, v8, v5
+; GFX8-FLUSH-NEXT:    v_mad_f32 v8, -v3, v9, v4
+; GFX8-FLUSH-NEXT:    v_mac_f32_e32 v9, v8, v6
+; GFX8-FLUSH-NEXT:    v_mad_f32 v1, -v1, v7, v4
+; GFX8-FLUSH-NEXT:    v_mad_f32 v3, -v3, v9, v4
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v1, v1, v5
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v3, v3, v6
+; GFX8-FLUSH-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; GFX8-FLUSH-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX8-FLUSH-NEXT:    v_add_f32_e32 v1, v1, v7
+; GFX8-FLUSH-NEXT:    v_add_f32_e32 v3, v3, v9
+; GFX8-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX8-FLUSH-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX8-FLUSH-NEXT:    v_div_fixup_f16 v0, v1, v0, -1.0
+; GFX8-FLUSH-NEXT:    v_div_fixup_f16 v1, v3, v2, -1.0
+; GFX8-FLUSH-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-FLUSH-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-IEEE-LABEL: v_neg_rcp_v2f16_fabs:
 ; GFX9-IEEE:       ; %bb.0:
 ; GFX9-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-IEEE-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX9-IEEE-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v2
+; GFX9-IEEE-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v4, -1.0
-; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v1, v1
-; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX9-IEEE-NEXT:    v_mul_f32_e32 v1, v4, v1
-; GFX9-IEEE-NEXT:    v_mul_f32_e32 v3, v4, v3
+; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v2
+; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v5, v1
+; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v6, v3
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v7, v4, v5
+; GFX9-IEEE-NEXT:    v_mul_f32_e64 v8, -v1, v7
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v8, v8, v4
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v8, v8, v5
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v7, v8, v7
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v9, v4, v6
+; GFX9-IEEE-NEXT:    v_mul_f32_e64 v1, -v1, v7
+; GFX9-IEEE-NEXT:    v_mul_f32_e64 v10, -v3, v9
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v1, v1, v4
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v1, v1, v5
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v5, v10, v4
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v5, v5, v6
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v5, v5, v9
+; GFX9-IEEE-NEXT:    v_mul_f32_e64 v3, -v3, v5
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; GFX9-IEEE-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v1, v1, v7
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v3, v3, v5
 ; GFX9-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX9-IEEE-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX9-IEEE-NEXT:    v_div_fixup_f16 v0, v1, v0, -1.0
@@ -1600,50 +2569,131 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
 ; GFX9-FLUSH-LABEL: v_neg_rcp_v2f16_fabs:
 ; GFX9-FLUSH:       ; %bb.0:
 ; GFX9-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-FLUSH-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; GFX9-FLUSH-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v2
-; GFX9-FLUSH-NEXT:    v_rcp_f32_e32 v1, v1
-; GFX9-FLUSH-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT:    v_mad_mixlo_f16 v1, -1.0, v1, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v0, v1, v0, -1.0
-; GFX9-FLUSH-NEXT:    v_mad_mixlo_f16 v1, -1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v1, v1, v2, -1.0
-; GFX9-FLUSH-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-FLUSH-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v0
+; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX9-FLUSH-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, v3
+; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v5, -1.0
+; GFX9-FLUSH-NEXT:    v_rcp_f32_e32 v2, v2
+; GFX9-FLUSH-NEXT:    v_rcp_f32_e32 v4, v4
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v6, v5, v2
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v7, -v1, v6, -1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v5, v5, v4
+; GFX9-FLUSH-NEXT:    v_mac_f32_e32 v6, v7, v2
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v7, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mac_f32_e32 v5, v7, v4
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v8, -v1, v6, -1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v0, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v2, v8, v2
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v0, v0, v4
+; GFX9-FLUSH-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX9-FLUSH-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
+; GFX9-FLUSH-NEXT:    v_add_f32_e32 v2, v2, v6
+; GFX9-FLUSH-NEXT:    v_add_f32_e32 v0, v0, v5
+; GFX9-FLUSH-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX9-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v1, v2, v1, -1.0
+; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v0, v0, v3, -1.0
+; GFX9-FLUSH-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX9-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: v_neg_rcp_v2f16_fabs:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v2, v0
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v1
-; GFX10-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX10-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX10-NEXT:    v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT:    v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT:    v_div_fixup_f16 v0, v2, v0, -1.0
-; GFX10-NEXT:    v_div_fixup_f16 v1, v3, v1, -1.0
-; GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-IEEE-LABEL: v_neg_rcp_v2f16_fabs:
+; GFX10-IEEE:       ; %bb.0:
+; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-IEEE-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX10-IEEE-NEXT:    v_cvt_f32_f16_e32 v6, -1.0
+; GFX10-IEEE-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX10-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v4, v2
+; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v5, v3
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v7, v6, v4
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v8, v6, v5
+; GFX10-IEEE-NEXT:    v_mul_f32_e64 v9, -v2, v7
+; GFX10-IEEE-NEXT:    v_mul_f32_e64 v10, -v3, v8
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v9, v9, v6
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v10, v10, v6
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v9, v9, v4
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v10, v10, v5
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v7, v9, v7
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v8, v10, v8
+; GFX10-IEEE-NEXT:    v_mul_f32_e64 v2, -v2, v7
+; GFX10-IEEE-NEXT:    v_mul_f32_e64 v3, -v3, v8
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v2, v2, v6
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v3, v3, v6
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v2, v2, v4
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v3, v3, v5
+; GFX10-IEEE-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX10-IEEE-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v2, v2, v7
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v3, v3, v8
+; GFX10-IEEE-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX10-IEEE-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX10-IEEE-NEXT:    v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX10-IEEE-NEXT:    v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX10-IEEE-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-FLUSH-LABEL: v_neg_rcp_v2f16_fabs:
+; GFX10-FLUSH:       ; %bb.0:
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-FLUSH-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX10-FLUSH-NEXT:    v_cvt_f32_f16_e32 v6, -1.0
+; GFX10-FLUSH-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX10-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v4, v2
+; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v5, v3
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v7, v6, v4
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v8, v6, v5
+; GFX10-FLUSH-NEXT:    v_mad_f32 v9, -v2, v7, v6
+; GFX10-FLUSH-NEXT:    v_mad_f32 v10, -v3, v8, v6
+; GFX10-FLUSH-NEXT:    v_mac_f32_e32 v7, v9, v4
+; GFX10-FLUSH-NEXT:    v_mac_f32_e32 v8, v10, v5
+; GFX10-FLUSH-NEXT:    v_mad_f32 v2, -v2, v7, v6
+; GFX10-FLUSH-NEXT:    v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v2, v2, v4
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v3, v3, v5
+; GFX10-FLUSH-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX10-FLUSH-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-FLUSH-NEXT:    v_add_f32_e32 v2, v2, v7
+; GFX10-FLUSH-NEXT:    v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX10-FLUSH-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX10-FLUSH-NEXT:    v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX10-FLUSH-NEXT:    v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX10-FLUSH-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_neg_rcp_v2f16_fabs:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v5, -1.0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0x7fff7fff, v0
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v1
-; GFX11-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX11-NEXT:    v_rcp_f32_e32 v3, v3
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT:    v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT:    v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX11-NEXT:    v_mul_f32_e32 v6, v5, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-NEXT:    v_fma_mix_f32 v7, -v1, v6, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v4, v2
+; GFX11-NEXT:    v_fmac_f32_e32 v6, v7, v3
+; GFX11-NEXT:    v_rcp_f32_e32 v4, v4
+; GFX11-NEXT:    v_fma_mix_f32 v7, -v1, v6, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_mul_f32_e32 v5, v5, v4
+; GFX11-NEXT:    v_fma_mix_f32 v8, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_fmac_f32_e32 v5, v8, v4
+; GFX11-NEXT:    v_fma_mix_f32 v0, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_dual_mul_f32 v3, v7, v3 :: v_dual_mul_f32 v0, v0, v4
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
+; GFX11-NEXT:    v_dual_add_f32 v0, v0, v5 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT:    v_add_f32_e32 v3, v3, v6
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT:    v_div_fixup_f16 v0, v0, v2, -1.0
 ; GFX11-NEXT:    v_div_fixup_f16 v1, v3, v1, -1.0
-; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %x.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x)
   %fdiv = fdiv <2 x half> <half -1.0, half -1.0>, %x.fabs
@@ -1881,36 +2931,103 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: v_rcp_v2f16_ulp25:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, v2
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v4, 1.0
-; GFX8-NEXT:    v_rcp_f32_e32 v1, v1
-; GFX8-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX8-NEXT:    v_mul_f32_e32 v1, v4, v1
-; GFX8-NEXT:    v_mul_f32_e32 v3, v4, v3
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX8-NEXT:    v_div_fixup_f16 v0, v1, v0, 1.0
-; GFX8-NEXT:    v_div_fixup_f16 v1, v3, v2, 1.0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
+; GFX8-IEEE-LABEL: v_rcp_v2f16_ulp25:
+; GFX8-IEEE:       ; %bb.0:
+; GFX8-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX8-IEEE-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-IEEE-NEXT:    v_cvt_f32_f16_e32 v4, 1.0
+; GFX8-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v2
+; GFX8-IEEE-NEXT:    v_rcp_f32_e32 v5, v1
+; GFX8-IEEE-NEXT:    v_rcp_f32_e32 v6, v3
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v7, v4, v5
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v8, -v1, v7
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v8, v8, v4
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v8, v8, v5
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v7, v8, v7
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v9, v4, v6
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v1, -v1, v7
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v10, -v3, v9
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v1, v1, v4
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v1, v1, v5
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v5, v10, v4
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v5, v5, v6
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v5, v5, v9
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v3, -v3, v5
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v3, v3, v6
+; GFX8-IEEE-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; GFX8-IEEE-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v1, v1, v7
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX8-IEEE-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX8-IEEE-NEXT:    v_div_fixup_f16 v0, v1, v0, 1.0
+; GFX8-IEEE-NEXT:    v_div_fixup_f16 v1, v3, v2, 1.0
+; GFX8-IEEE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-IEEE-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-FLUSH-LABEL: v_rcp_v2f16_ulp25:
+; GFX8-FLUSH:       ; %bb.0:
+; GFX8-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v0
+; GFX8-FLUSH-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v2
+; GFX8-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, 1.0
+; GFX8-FLUSH-NEXT:    v_rcp_f32_e32 v5, v1
+; GFX8-FLUSH-NEXT:    v_rcp_f32_e32 v6, v3
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v7, v4, v5
+; GFX8-FLUSH-NEXT:    v_mad_f32 v8, -v1, v7, v4
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v9, v4, v6
+; GFX8-FLUSH-NEXT:    v_mac_f32_e32 v7, v8, v5
+; GFX8-FLUSH-NEXT:    v_mad_f32 v8, -v3, v9, v4
+; GFX8-FLUSH-NEXT:    v_mac_f32_e32 v9, v8, v6
+; GFX8-FLUSH-NEXT:    v_mad_f32 v1, -v1, v7, v4
+; GFX8-FLUSH-NEXT:    v_mad_f32 v3, -v3, v9, v4
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v1, v1, v5
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v3, v3, v6
+; GFX8-FLUSH-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; GFX8-FLUSH-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX8-FLUSH-NEXT:    v_add_f32_e32 v1, v1, v7
+; GFX8-FLUSH-NEXT:    v_add_f32_e32 v3, v3, v9
+; GFX8-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX8-FLUSH-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX8-FLUSH-NEXT:    v_div_fixup_f16 v0, v1, v0, 1.0
+; GFX8-FLUSH-NEXT:    v_div_fixup_f16 v1, v3, v2, 1.0
+; GFX8-FLUSH-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-FLUSH-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-IEEE-LABEL: v_rcp_v2f16_ulp25:
 ; GFX9-IEEE:       ; %bb.0:
 ; GFX9-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-IEEE-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v2
+; GFX9-IEEE-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v4, 1.0
-; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v1, v1
-; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX9-IEEE-NEXT:    v_mul_f32_e32 v1, v4, v1
-; GFX9-IEEE-NEXT:    v_mul_f32_e32 v3, v4, v3
+; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v2
+; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v5, v1
+; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v6, v3
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v7, v4, v5
+; GFX9-IEEE-NEXT:    v_mul_f32_e64 v8, -v1, v7
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v8, v8, v4
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v8, v8, v5
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v7, v8, v7
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v9, v4, v6
+; GFX9-IEEE-NEXT:    v_mul_f32_e64 v1, -v1, v7
+; GFX9-IEEE-NEXT:    v_mul_f32_e64 v10, -v3, v9
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v1, v1, v4
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v1, v1, v5
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v5, v10, v4
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v5, v5, v6
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v5, v5, v9
+; GFX9-IEEE-NEXT:    v_mul_f32_e64 v3, -v3, v5
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; GFX9-IEEE-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v1, v1, v7
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v3, v3, v5
 ; GFX9-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX9-IEEE-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX9-IEEE-NEXT:    v_div_fixup_f16 v0, v1, v0, 1.0
@@ -1924,43 +3041,122 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
 ; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; GFX9-FLUSH-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
 ; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v2
+; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, 1.0
 ; GFX9-FLUSH-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX9-FLUSH-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT:    v_mad_mixlo_f16 v1, 1.0, v1, 0 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v5, v4, v1
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mac_f32_e32 v5, v6, v1
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v4, v4, v3
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v7, -v0, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v6, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v1, v7, v1
+; GFX9-FLUSH-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; GFX9-FLUSH-NEXT:    v_mac_f32_e32 v4, v6, v3
+; GFX9-FLUSH-NEXT:    v_add_f32_e32 v1, v1, v5
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v5, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v3, v5, v3
+; GFX9-FLUSH-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX9-FLUSH-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX9-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX9-FLUSH-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v0, v1, v0, 1.0
-; GFX9-FLUSH-NEXT:    v_mad_mixlo_f16 v1, 1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v1, v1, v2, 1.0
+; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v1, v3, v2, 1.0
 ; GFX9-FLUSH-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: v_rcp_v2f16_ulp25:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v2, v0
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v1
-; GFX10-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX10-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX10-NEXT:    v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT:    v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT:    v_div_fixup_f16 v0, v2, v0, 1.0
-; GFX10-NEXT:    v_div_fixup_f16 v1, v3, v1, 1.0
-; GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-IEEE-LABEL: v_rcp_v2f16_ulp25:
+; GFX10-IEEE:       ; %bb.0:
+; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-IEEE-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX10-IEEE-NEXT:    v_cvt_f32_f16_e32 v6, 1.0
+; GFX10-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v4, v2
+; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v5, v3
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v7, v6, v4
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v8, v6, v5
+; GFX10-IEEE-NEXT:    v_mul_f32_e64 v9, -v2, v7
+; GFX10-IEEE-NEXT:    v_mul_f32_e64 v10, -v3, v8
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v9, v9, v6
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v10, v10, v6
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v9, v9, v4
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v10, v10, v5
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v7, v9, v7
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v8, v10, v8
+; GFX10-IEEE-NEXT:    v_mul_f32_e64 v2, -v2, v7
+; GFX10-IEEE-NEXT:    v_mul_f32_e64 v3, -v3, v8
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v2, v2, v6
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v3, v3, v6
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v2, v2, v4
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v3, v3, v5
+; GFX10-IEEE-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX10-IEEE-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v2, v2, v7
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v3, v3, v8
+; GFX10-IEEE-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX10-IEEE-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX10-IEEE-NEXT:    v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX10-IEEE-NEXT:    v_div_fixup_f16 v1, v3, v1, 1.0
+; GFX10-IEEE-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-FLUSH-LABEL: v_rcp_v2f16_ulp25:
+; GFX10-FLUSH:       ; %bb.0:
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-FLUSH-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX10-FLUSH-NEXT:    v_cvt_f32_f16_e32 v6, 1.0
+; GFX10-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v4, v2
+; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v5, v3
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v7, v6, v4
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v8, v6, v5
+; GFX10-FLUSH-NEXT:    v_mad_f32 v9, -v2, v7, v6
+; GFX10-FLUSH-NEXT:    v_mad_f32 v10, -v3, v8, v6
+; GFX10-FLUSH-NEXT:    v_mac_f32_e32 v7, v9, v4
+; GFX10-FLUSH-NEXT:    v_mac_f32_e32 v8, v10, v5
+; GFX10-FLUSH-NEXT:    v_mad_f32 v2, -v2, v7, v6
+; GFX10-FLUSH-NEXT:    v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v2, v2, v4
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v3, v3, v5
+; GFX10-FLUSH-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX10-FLUSH-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-FLUSH-NEXT:    v_add_f32_e32 v2, v2, v7
+; GFX10-FLUSH-NEXT:    v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX10-FLUSH-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX10-FLUSH-NEXT:    v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX10-FLUSH-NEXT:    v_div_fixup_f16 v1, v3, v1, 1.0
+; GFX10-FLUSH-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_rcp_v2f16_ulp25:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v4, 1.0
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v1
 ; GFX11-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX11-NEXT:    v_rcp_f32_e32 v3, v3
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT:    v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT:    v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX11-NEXT:    v_mul_f32_e32 v5, v4, v2
+; GFX11-NEXT:    v_mul_f32_e32 v4, v4, v3
+; GFX11-NEXT:    v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_fmac_f32_e32 v4, v7, v3
+; GFX11-NEXT:    v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_fmac_f32_e32 v5, v6, v2
+; GFX11-NEXT:    v_mul_f32_e32 v3, v7, v3
+; GFX11-NEXT:    v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT:    v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT:    v_add_f32_e32 v2, v2, v5
 ; GFX11-NEXT:    v_div_fixup_f16 v1, v3, v1, 1.0
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT:    v_div_fixup_f16 v0, v2, v0, 1.0
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x
@@ -2251,24 +3447,60 @@ define amdgpu_ps i16 @s_fdiv_f16(i16 inreg %a.arg, i16 inreg %b.arg) {
 ; GFX6-FLUSH-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX6-FLUSH-NEXT:    ; return to shader part epilog
 ;
-; GFX8-LABEL: s_fdiv_f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v0, s1
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v1, s0
-; GFX8-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX8-NEXT:    v_mul_f32_e32 v0, v1, v0
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX8-NEXT:    v_mov_b32_e32 v1, s1
-; GFX8-NEXT:    v_div_fixup_f16 v0, v0, v1, s0
-; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX8-NEXT:    ; return to shader part epilog
+; GFX8-IEEE-LABEL: s_fdiv_f16:
+; GFX8-IEEE:       ; %bb.0:
+; GFX8-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, s1
+; GFX8-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, s0
+; GFX8-IEEE-NEXT:    v_rcp_f32_e32 v2, v0
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v3, v1, v2
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v4, -v0, v3
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v4, v4, v1
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v4, v4, v2
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v0, -v0, v3
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX8-IEEE-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v0, v0, v3
+; GFX8-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX8-IEEE-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-IEEE-NEXT:    v_div_fixup_f16 v0, v0, v1, s0
+; GFX8-IEEE-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-IEEE-NEXT:    ; return to shader part epilog
+;
+; GFX8-FLUSH-LABEL: s_fdiv_f16:
+; GFX8-FLUSH:       ; %bb.0:
+; GFX8-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, s1
+; GFX8-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, s0
+; GFX8-FLUSH-NEXT:    v_rcp_f32_e32 v2, v0
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v3, v1, v2
+; GFX8-FLUSH-NEXT:    v_mad_f32 v4, -v0, v3, v1
+; GFX8-FLUSH-NEXT:    v_mac_f32_e32 v3, v4, v2
+; GFX8-FLUSH-NEXT:    v_mad_f32 v0, -v0, v3, v1
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX8-FLUSH-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
+; GFX8-FLUSH-NEXT:    v_add_f32_e32 v0, v0, v3
+; GFX8-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX8-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-FLUSH-NEXT:    v_div_fixup_f16 v0, v0, v1, s0
+; GFX8-FLUSH-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-FLUSH-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-IEEE-LABEL: s_fdiv_f16:
 ; GFX9-IEEE:       ; %bb.0:
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, s1
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, s0
-; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX9-IEEE-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v2, v0
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v3, v1, v2
+; GFX9-IEEE-NEXT:    v_mul_f32_e64 v4, -v0, v3
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v4, v4, v1
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v4, v4, v2
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX9-IEEE-NEXT:    v_mul_f32_e64 v0, -v0, v3
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-IEEE-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v0, v0, v3
 ; GFX9-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX9-IEEE-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-IEEE-NEXT:    v_div_fixup_f16 v0, v0, v1, s0
@@ -2278,28 +3510,72 @@ define amdgpu_ps i16 @s_fdiv_f16(i16 inreg %a.arg, i16 inreg %b.arg) {
 ; GFX9-FLUSH-LABEL: s_fdiv_f16:
 ; GFX9-FLUSH:       ; %bb.0:
 ; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, s1
-; GFX9-FLUSH-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, s0
+; GFX9-FLUSH-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-FLUSH-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX9-FLUSH-NEXT:    v_mad_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v0, v0, v1, s0
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v3, -v2, v1, s0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mac_f32_e32 v1, v3, v0
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v3, -v2, v1, s0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v0, v3, v0
+; GFX9-FLUSH-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
+; GFX9-FLUSH-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX9-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v0, v0, v2, s0
 ; GFX9-FLUSH-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-FLUSH-NEXT:    ; return to shader part epilog
 ;
-; GFX10-LABEL: s_fdiv_f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v0, s1
-; GFX10-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX10-NEXT:    v_fma_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT:    v_div_fixup_f16 v0, v0, s1, s0
-; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-NEXT:    ; return to shader part epilog
+; GFX10-IEEE-LABEL: s_fdiv_f16:
+; GFX10-IEEE:       ; %bb.0:
+; GFX10-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, s1
+; GFX10-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, s0
+; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v1, v0
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v3, v2, v1
+; GFX10-IEEE-NEXT:    v_mul_f32_e64 v4, -v0, v3
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v4, v4, v2
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v4, v4, v1
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX10-IEEE-NEXT:    v_mul_f32_e64 v0, -v0, v3
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-IEEE-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v0, v0, v3
+; GFX10-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX10-IEEE-NEXT:    v_div_fixup_f16 v0, v0, s1, s0
+; GFX10-IEEE-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-IEEE-NEXT:    ; return to shader part epilog
+;
+; GFX10-FLUSH-LABEL: s_fdiv_f16:
+; GFX10-FLUSH:       ; %bb.0:
+; GFX10-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, s1
+; GFX10-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, s0
+; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v1, v0
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v3, v2, v1
+; GFX10-FLUSH-NEXT:    v_mad_f32 v4, -v0, v3, v2
+; GFX10-FLUSH-NEXT:    v_mac_f32_e32 v3, v4, v1
+; GFX10-FLUSH-NEXT:    v_mad_f32 v0, -v0, v3, v2
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX10-FLUSH-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
+; GFX10-FLUSH-NEXT:    v_add_f32_e32 v0, v0, v3
+; GFX10-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX10-FLUSH-NEXT:    v_div_fixup_f16 v0, v0, s1, s0
+; GFX10-FLUSH-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-FLUSH-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_fdiv_f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, s1
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, s0
 ; GFX11-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_fma_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT:    v_fma_mix_f32 v2, -s1, v1, s0 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_fmac_f32_e32 v1, v2, v0
+; GFX11-NEXT:    v_fma_mix_f32 v2, -s1, v1, s0 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_mul_f32_e32 v0, v2, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
+; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX11-NEXT:    v_div_fixup_f16 v0, v0, s1, s0
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
@@ -2499,42 +3775,113 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) {
 ; GFX6-FLUSH-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX6-FLUSH-NEXT:    ; return to shader part epilog
 ;
-; GFX8-LABEL: s_fdiv_v2f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v0, s1
-; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v1, s3
-; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v2, s0
-; GFX8-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, s2
-; GFX8-NEXT:    v_rcp_f32_e32 v1, v1
-; GFX8-NEXT:    v_mul_f32_e32 v0, v2, v0
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX8-NEXT:    v_mul_f32_e32 v1, v3, v1
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX8-NEXT:    v_mov_b32_e32 v2, s1
-; GFX8-NEXT:    v_div_fixup_f16 v0, v0, v2, s0
-; GFX8-NEXT:    v_mov_b32_e32 v2, s3
-; GFX8-NEXT:    v_div_fixup_f16 v1, v1, v2, s2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX8-NEXT:    ; return to shader part epilog
+; GFX8-IEEE-LABEL: s_fdiv_v2f16:
+; GFX8-IEEE:       ; %bb.0:
+; GFX8-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, s1
+; GFX8-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, s0
+; GFX8-IEEE-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX8-IEEE-NEXT:    v_cvt_f32_f16_e32 v4, s3
+; GFX8-IEEE-NEXT:    v_rcp_f32_e32 v2, v0
+; GFX8-IEEE-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX8-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, s2
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v5, v1, v2
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v6, -v0, v5
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v6, v6, v1
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v6, v6, v2
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v5, v6, v5
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v0, -v0, v5
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX8-IEEE-NEXT:    v_rcp_f32_e32 v1, v4
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX8-IEEE-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v0, v0, v5
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v2, v3, v1
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v5, -v4, v2
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v5, v5, v3
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v5, v5, v1
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v2, v5, v2
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v4, -v4, v2
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX8-IEEE-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; GFX8-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX8-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX8-IEEE-NEXT:    v_mov_b32_e32 v2, s1
+; GFX8-IEEE-NEXT:    v_div_fixup_f16 v0, v0, v2, s0
+; GFX8-IEEE-NEXT:    v_mov_b32_e32 v2, s3
+; GFX8-IEEE-NEXT:    v_div_fixup_f16 v1, v1, v2, s2
+; GFX8-IEEE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-IEEE-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-IEEE-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-IEEE-NEXT:    ; return to shader part epilog
+;
+; GFX8-FLUSH-LABEL: s_fdiv_v2f16:
+; GFX8-FLUSH:       ; %bb.0:
+; GFX8-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, s1
+; GFX8-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, s0
+; GFX8-FLUSH-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX8-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, s3
+; GFX8-FLUSH-NEXT:    v_rcp_f32_e32 v2, v0
+; GFX8-FLUSH-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX8-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, s2
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v5, v1, v2
+; GFX8-FLUSH-NEXT:    v_mad_f32 v6, -v0, v5, v1
+; GFX8-FLUSH-NEXT:    v_mac_f32_e32 v5, v6, v2
+; GFX8-FLUSH-NEXT:    v_mad_f32 v0, -v0, v5, v1
+; GFX8-FLUSH-NEXT:    v_rcp_f32_e32 v1, v4
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX8-FLUSH-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
+; GFX8-FLUSH-NEXT:    v_add_f32_e32 v0, v0, v5
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v2, v3, v1
+; GFX8-FLUSH-NEXT:    v_mad_f32 v5, -v4, v2, v3
+; GFX8-FLUSH-NEXT:    v_mac_f32_e32 v2, v5, v1
+; GFX8-FLUSH-NEXT:    v_mad_f32 v3, -v4, v2, v3
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX8-FLUSH-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; GFX8-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX8-FLUSH-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX8-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX8-FLUSH-NEXT:    v_mov_b32_e32 v2, s1
+; GFX8-FLUSH-NEXT:    v_div_fixup_f16 v0, v0, v2, s0
+; GFX8-FLUSH-NEXT:    v_mov_b32_e32 v2, s3
+; GFX8-FLUSH-NEXT:    v_div_fixup_f16 v1, v1, v2, s2
+; GFX8-FLUSH-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-FLUSH-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-FLUSH-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-FLUSH-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-IEEE-LABEL: s_fdiv_v2f16:
 ; GFX9-IEEE:       ; %bb.0:
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, s1
+; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, s0
 ; GFX9-IEEE-NEXT:    s_lshr_b32 s3, s1, 16
-; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, s3
+; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v4, s3
+; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v2, v0
 ; GFX9-IEEE-NEXT:    s_lshr_b32 s2, s0, 16
-; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, s0
-; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, s2
-; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v1, v1
-; GFX9-IEEE-NEXT:    v_mul_f32_e32 v0, v2, v0
-; GFX9-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v5, v1, v2
+; GFX9-IEEE-NEXT:    v_mul_f32_e64 v6, -v0, v5
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v6, v6, v1
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v6, v6, v2
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v5, v6, v5
+; GFX9-IEEE-NEXT:    v_mul_f32_e64 v0, -v0, v5
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v1, v4
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX9-IEEE-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v0, v0, v5
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v2, v3, v1
+; GFX9-IEEE-NEXT:    v_mul_f32_e64 v5, -v4, v2
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v5, v5, v3
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v5, v5, v1
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v2, v5, v2
+; GFX9-IEEE-NEXT:    v_mul_f32_e64 v4, -v4, v2
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v3, v4, v3
 ; GFX9-IEEE-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX9-IEEE-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; GFX9-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX9-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX9-IEEE-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-IEEE-NEXT:    v_div_fixup_f16 v0, v0, v2, s0
@@ -2547,36 +3894,106 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) {
 ; GFX9-FLUSH-LABEL: s_fdiv_v2f16:
 ; GFX9-FLUSH:       ; %bb.0:
 ; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, s1
-; GFX9-FLUSH-NEXT:    s_lshr_b32 s2, s1, 16
-; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, s2
-; GFX9-FLUSH-NEXT:    s_lshr_b32 s3, s0, 16
-; GFX9-FLUSH-NEXT:    v_rcp_f32_e32 v0, v0
+; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, s0
+; GFX9-FLUSH-NEXT:    s_lshr_b32 s3, s1, 16
 ; GFX9-FLUSH-NEXT:    v_mov_b32_e32 v2, s1
-; GFX9-FLUSH-NEXT:    v_rcp_f32_e32 v1, v1
-; GFX9-FLUSH-NEXT:    v_mad_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT:    v_rcp_f32_e32 v0, v0
+; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, s3
+; GFX9-FLUSH-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v1, v1, v0
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v4, -v2, v1, s0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mac_f32_e32 v1, v4, v0
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v4, -v2, v1, s0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v0, v4, v0
+; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, s2
+; GFX9-FLUSH-NEXT:    v_rcp_f32_e32 v3, v3
+; GFX9-FLUSH-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
+; GFX9-FLUSH-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX9-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v4, v4, v3
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v5, -v1, v4, s2 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mac_f32_e32 v4, v5, v3
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v5, -v1, v4, s2 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v3, v5, v3
+; GFX9-FLUSH-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX9-FLUSH-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX9-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-FLUSH-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v0, v0, v2, s0
-; GFX9-FLUSH-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-FLUSH-NEXT:    v_mad_mixlo_f16 v1, s3, v1, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v1, v1, v2, s3
+; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v1, v3, v1, s2
 ; GFX9-FLUSH-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-FLUSH-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-FLUSH-NEXT:    ; return to shader part epilog
 ;
-; GFX10-LABEL: s_fdiv_v2f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_lshr_b32 s2, s1, 16
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v0, s1
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v1, s2
-; GFX10-NEXT:    s_lshr_b32 s3, s0, 16
-; GFX10-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX10-NEXT:    v_rcp_f32_e32 v1, v1
-; GFX10-NEXT:    v_fma_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT:    v_fma_mixlo_f16 v1, s3, v1, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT:    v_div_fixup_f16 v0, v0, s1, s0
-; GFX10-NEXT:    v_div_fixup_f16 v1, v1, s2, s3
-; GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-NEXT:    ; return to shader part epilog
+; GFX10-IEEE-LABEL: s_fdiv_v2f16:
+; GFX10-IEEE:       ; %bb.0:
+; GFX10-IEEE-NEXT:    s_lshr_b32 s2, s1, 16
+; GFX10-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, s1
+; GFX10-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, s2
+; GFX10-IEEE-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX10-IEEE-NEXT:    v_cvt_f32_f16_e32 v4, s0
+; GFX10-IEEE-NEXT:    v_cvt_f32_f16_e32 v5, s3
+; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v2, v0
+; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v3, v1
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v6, v4, v2
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v7, v5, v3
+; GFX10-IEEE-NEXT:    v_mul_f32_e64 v8, -v0, v6
+; GFX10-IEEE-NEXT:    v_mul_f32_e64 v9, -v1, v7
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v8, v8, v4
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v9, v9, v5
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v8, v8, v2
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v9, v9, v3
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v6, v8, v6
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v7, v9, v7
+; GFX10-IEEE-NEXT:    v_mul_f32_e64 v0, -v0, v6
+; GFX10-IEEE-NEXT:    v_mul_f32_e64 v1, -v1, v7
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v0, v0, v4
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v1, v1, v5
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX10-IEEE-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
+; GFX10-IEEE-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v0, v0, v6
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v1, v1, v7
+; GFX10-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX10-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX10-IEEE-NEXT:    v_div_fixup_f16 v0, v0, s1, s0
+; GFX10-IEEE-NEXT:    v_div_fixup_f16 v1, v1, s2, s3
+; GFX10-IEEE-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-IEEE-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-IEEE-NEXT:    ; return to shader part epilog
+;
+; GFX10-FLUSH-LABEL: s_fdiv_v2f16:
+; GFX10-FLUSH:       ; %bb.0:
+; GFX10-FLUSH-NEXT:    s_lshr_b32 s2, s1, 16
+; GFX10-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, s1
+; GFX10-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, s2
+; GFX10-FLUSH-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX10-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, s0
+; GFX10-FLUSH-NEXT:    v_cvt_f32_f16_e32 v5, s3
+; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v2, v0
+; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v3, v1
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v6, v4, v2
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v7, v5, v3
+; GFX10-FLUSH-NEXT:    v_mad_f32 v8, -v0, v6, v4
+; GFX10-FLUSH-NEXT:    v_mad_f32 v9, -v1, v7, v5
+; GFX10-FLUSH-NEXT:    v_mac_f32_e32 v6, v8, v2
+; GFX10-FLUSH-NEXT:    v_mac_f32_e32 v7, v9, v3
+; GFX10-FLUSH-NEXT:    v_mad_f32 v0, -v0, v6, v4
+; GFX10-FLUSH-NEXT:    v_mad_f32 v1, -v1, v7, v5
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GFX10-FLUSH-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
+; GFX10-FLUSH-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; GFX10-FLUSH-NEXT:    v_add_f32_e32 v0, v0, v6
+; GFX10-FLUSH-NEXT:    v_add_f32_e32 v1, v1, v7
+; GFX10-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX10-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX10-FLUSH-NEXT:    v_div_fixup_f16 v0, v0, s1, s0
+; GFX10-FLUSH-NEXT:    v_div_fixup_f16 v1, v1, s2, s3
+; GFX10-FLUSH-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-FLUSH-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-FLUSH-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_fdiv_v2f16:
 ; GFX11:       ; %bb.0:
@@ -2584,13 +4001,25 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) {
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v0, s1
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, s2
 ; GFX11-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, s0
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, s3
 ; GFX11-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX11-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_fma_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT:    v_fma_mixlo_f16 v1, s3, v1, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT:    v_div_fixup_f16 v0, v0, s1, s0
+; GFX11-NEXT:    v_dual_mul_f32 v2, v2, v0 :: v_dual_mul_f32 v3, v3, v1
+; GFX11-NEXT:    v_fma_mix_f32 v4, -s1, v2, s0 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_fma_mix_f32 v5, -s2, v3, s3 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_dual_fmac_f32 v2, v4, v0 :: v_dual_fmac_f32 v3, v5, v1
+; GFX11-NEXT:    v_fma_mix_f32 v4, -s1, v2, s0 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_fma_mix_f32 v5, -s2, v3, s3 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_dual_mul_f32 v0, v4, v0 :: v_dual_mul_f32 v1, v5, v1
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; GFX11-NEXT:    v_dual_add_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xff800000, v0
+; GFX11-NEXT:    v_add_f32_e32 v0, v0, v2
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX11-NEXT:    v_div_fixup_f16 v1, v1, s2, s3
+; GFX11-NEXT:    v_div_fixup_f16 v0, v0, s1, s0
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
@@ -2896,26 +4325,77 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
 ; GFX6-FLUSH-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX6-FLUSH-NEXT:    ; return to shader part epilog
 ;
-; GFX8-LABEL: s_rsq_v2f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    v_sqrt_f16_e32 v0, s0
-; GFX8-NEXT:    s_lshr_b32 s0, s0, 16
-; GFX8-NEXT:    v_sqrt_f16_e32 v1, s0
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v4, -1.0
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v2, v0
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, v1
-; GFX8-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX8-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX8-NEXT:    v_mul_f32_e32 v2, v4, v2
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX8-NEXT:    v_mul_f32_e32 v3, v4, v3
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX8-NEXT:    v_div_fixup_f16 v0, v2, v0, -1.0
-; GFX8-NEXT:    v_div_fixup_f16 v1, v3, v1, -1.0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX8-NEXT:    ; return to shader part epilog
+; GFX8-IEEE-LABEL: s_rsq_v2f16:
+; GFX8-IEEE:       ; %bb.0:
+; GFX8-IEEE-NEXT:    v_sqrt_f16_e32 v0, s0
+; GFX8-IEEE-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX8-IEEE-NEXT:    v_sqrt_f16_e32 v1, s0
+; GFX8-IEEE-NEXT:    v_cvt_f32_f16_e32 v4, -1.0
+; GFX8-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX8-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX8-IEEE-NEXT:    v_rcp_f32_e32 v5, v2
+; GFX8-IEEE-NEXT:    v_rcp_f32_e32 v6, v3
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v7, v4, v5
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v8, -v2, v7
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v8, v8, v4
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v9, v4, v6
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v8, v8, v5
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v10, -v3, v9
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v7, v8, v7
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v10, v10, v4
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v2, -v2, v7
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v8, v10, v6
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v2, v2, v5
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v5, v8, v9
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v3, -v3, v5
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v3, v3, v6
+; GFX8-IEEE-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX8-IEEE-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v2, v2, v7
+; GFX8-IEEE-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX8-IEEE-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX8-IEEE-NEXT:    v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX8-IEEE-NEXT:    v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX8-IEEE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-IEEE-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-IEEE-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-IEEE-NEXT:    ; return to shader part epilog
+;
+; GFX8-FLUSH-LABEL: s_rsq_v2f16:
+; GFX8-FLUSH:       ; %bb.0:
+; GFX8-FLUSH-NEXT:    v_sqrt_f16_e32 v0, s0
+; GFX8-FLUSH-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX8-FLUSH-NEXT:    v_sqrt_f16_e32 v1, s0
+; GFX8-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, -1.0
+; GFX8-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX8-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX8-FLUSH-NEXT:    v_rcp_f32_e32 v5, v2
+; GFX8-FLUSH-NEXT:    v_rcp_f32_e32 v6, v3
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v7, v4, v5
+; GFX8-FLUSH-NEXT:    v_mad_f32 v8, -v2, v7, v4
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v9, v4, v6
+; GFX8-FLUSH-NEXT:    v_mac_f32_e32 v7, v8, v5
+; GFX8-FLUSH-NEXT:    v_mad_f32 v8, -v3, v9, v4
+; GFX8-FLUSH-NEXT:    v_mac_f32_e32 v9, v8, v6
+; GFX8-FLUSH-NEXT:    v_mad_f32 v3, -v3, v9, v4
+; GFX8-FLUSH-NEXT:    v_mad_f32 v2, -v2, v7, v4
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v3, v3, v6
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v2, v2, v5
+; GFX8-FLUSH-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX8-FLUSH-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX8-FLUSH-NEXT:    v_add_f32_e32 v3, v3, v9
+; GFX8-FLUSH-NEXT:    v_add_f32_e32 v2, v2, v7
+; GFX8-FLUSH-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX8-FLUSH-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX8-FLUSH-NEXT:    v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX8-FLUSH-NEXT:    v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX8-FLUSH-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-FLUSH-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-FLUSH-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX8-FLUSH-NEXT:    ; return to shader part epilog
 ;
 ; GFX9-IEEE-LABEL: s_rsq_v2f16:
 ; GFX9-IEEE:       ; %bb.0:
@@ -2925,11 +4405,23 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v4, -1.0
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v0
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v1
-; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX9-IEEE-NEXT:    v_mul_f32_e32 v2, v4, v2
+; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v5, v2
+; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v6, v3
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v7, v4, v5
+; GFX9-IEEE-NEXT:    v_fma_f32 v8, -v2, v7, v4
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v9, v4, v6
+; GFX9-IEEE-NEXT:    v_fma_f32 v7, v8, v5, v7
+; GFX9-IEEE-NEXT:    v_fma_f32 v8, -v3, v9, v4
+; GFX9-IEEE-NEXT:    v_fma_f32 v8, v8, v6, v9
+; GFX9-IEEE-NEXT:    v_fma_f32 v2, -v2, v7, v4
+; GFX9-IEEE-NEXT:    v_fma_f32 v3, -v3, v8, v4
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v2, v2, v5
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX9-IEEE-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v2, v2, v7
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v3, v3, v8
 ; GFX9-IEEE-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX9-IEEE-NEXT:    v_mul_f32_e32 v3, v4, v3
 ; GFX9-IEEE-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX9-IEEE-NEXT:    v_div_fixup_f16 v0, v2, v0, -1.0
 ; GFX9-IEEE-NEXT:    v_div_fixup_f16 v1, v3, v1, -1.0
@@ -2942,50 +4434,125 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
 ; GFX9-FLUSH-NEXT:    v_sqrt_f16_e32 v0, s0
 ; GFX9-FLUSH-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX9-FLUSH-NEXT:    v_sqrt_f16_e32 v1, s0
+; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, -1.0
 ; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v0
 ; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v1
 ; GFX9-FLUSH-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX9-FLUSH-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT:    v_mad_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v5, v4, v2
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v4, v4, v3
+; GFX9-FLUSH-NEXT:    v_mac_f32_e32 v5, v6, v2
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v6, -v1, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mac_f32_e32 v4, v6, v3
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v7, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v6, -v1, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v2, v7, v2
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v3, v6, v3
+; GFX9-FLUSH-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX9-FLUSH-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX9-FLUSH-NEXT:    v_add_f32_e32 v2, v2, v5
+; GFX9-FLUSH-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX9-FLUSH-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX9-FLUSH-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v0, v2, v0, -1.0
-; GFX9-FLUSH-NEXT:    v_mad_mixlo_f16 v2, -1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v1, v2, v1, -1.0
+; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v1, v3, v1, -1.0
 ; GFX9-FLUSH-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX9-FLUSH-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-FLUSH-NEXT:    ; return to shader part epilog
 ;
-; GFX10-LABEL: s_rsq_v2f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX10-NEXT:    v_sqrt_f16_e32 v0, s0
-; GFX10-NEXT:    v_sqrt_f16_e32 v1, s1
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v2, v0
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v1
-; GFX10-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX10-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX10-NEXT:    v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT:    v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT:    v_div_fixup_f16 v0, v2, v0, -1.0
-; GFX10-NEXT:    v_div_fixup_f16 v1, v3, v1, -1.0
-; GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX10-NEXT:    ; return to shader part epilog
+; GFX10-IEEE-LABEL: s_rsq_v2f16:
+; GFX10-IEEE:       ; %bb.0:
+; GFX10-IEEE-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX10-IEEE-NEXT:    v_sqrt_f16_e32 v0, s0
+; GFX10-IEEE-NEXT:    v_sqrt_f16_e32 v1, s1
+; GFX10-IEEE-NEXT:    v_cvt_f32_f16_e32 v4, -1.0
+; GFX10-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX10-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v2, v2
+; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v3, v3
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v5, v4, v2
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v4, v4, v3
+; GFX10-IEEE-NEXT:    v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX10-IEEE-NEXT:    v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v5, v6, v2
+; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v4, v7, v3
+; GFX10-IEEE-NEXT:    v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX10-IEEE-NEXT:    v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v2, v6, v2
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v3, v7, v3
+; GFX10-IEEE-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX10-IEEE-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v2, v2, v5
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX10-IEEE-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX10-IEEE-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX10-IEEE-NEXT:    v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX10-IEEE-NEXT:    v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX10-IEEE-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-IEEE-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-IEEE-NEXT:    ; return to shader part epilog
+;
+; GFX10-FLUSH-LABEL: s_rsq_v2f16:
+; GFX10-FLUSH:       ; %bb.0:
+; GFX10-FLUSH-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX10-FLUSH-NEXT:    v_sqrt_f16_e32 v0, s0
+; GFX10-FLUSH-NEXT:    v_sqrt_f16_e32 v1, s1
+; GFX10-FLUSH-NEXT:    v_cvt_f32_f16_e32 v6, -1.0
+; GFX10-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX10-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v4, v2
+; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v5, v3
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v7, v6, v4
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v8, v6, v5
+; GFX10-FLUSH-NEXT:    v_mad_f32 v9, -v2, v7, v6
+; GFX10-FLUSH-NEXT:    v_mad_f32 v10, -v3, v8, v6
+; GFX10-FLUSH-NEXT:    v_mac_f32_e32 v7, v9, v4
+; GFX10-FLUSH-NEXT:    v_mac_f32_e32 v8, v10, v5
+; GFX10-FLUSH-NEXT:    v_mad_f32 v2, -v2, v7, v6
+; GFX10-FLUSH-NEXT:    v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v2, v2, v4
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v3, v3, v5
+; GFX10-FLUSH-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX10-FLUSH-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-FLUSH-NEXT:    v_add_f32_e32 v2, v2, v7
+; GFX10-FLUSH-NEXT:    v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX10-FLUSH-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX10-FLUSH-NEXT:    v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX10-FLUSH-NEXT:    v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX10-FLUSH-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-FLUSH-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-FLUSH-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_rsq_v2f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_lshr_b32 s1, s0, 16
 ; GFX11-NEXT:    v_sqrt_f16_e32 v0, s0
 ; GFX11-NEXT:    v_sqrt_f16_e32 v1, s1
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v4, -1.0
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v0
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v1
 ; GFX11-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX11-NEXT:    v_rcp_f32_e32 v3, v3
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT:    v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT:    v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX11-NEXT:    v_mul_f32_e32 v5, v4, v2
+; GFX11-NEXT:    v_mul_f32_e32 v4, v4, v3
+; GFX11-NEXT:    v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_fmac_f32_e32 v4, v7, v3
+; GFX11-NEXT:    v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_fmac_f32_e32 v5, v6, v2
+; GFX11-NEXT:    v_mul_f32_e32 v3, v7, v3
+; GFX11-NEXT:    v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT:    v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT:    v_add_f32_e32 v2, v2, v5
 ; GFX11-NEXT:    v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT:    v_div_fixup_f16 v0, v2, v0, -1.0
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
@@ -3876,25 +5443,75 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: v_rsq_v2f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_sqrt_f16_e32 v1, v0
-; GFX8-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v4, 1.0
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v2, v1
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, v0
-; GFX8-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX8-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX8-NEXT:    v_mul_f32_e32 v2, v4, v2
-; GFX8-NEXT:    v_mul_f32_e32 v3, v4, v3
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX8-NEXT:    v_div_fixup_f16 v0, v3, v0, 1.0
-; GFX8-NEXT:    v_div_fixup_f16 v1, v2, v1, 1.0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
+; GFX8-IEEE-LABEL: v_rsq_v2f16:
+; GFX8-IEEE:       ; %bb.0:
+; GFX8-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-IEEE-NEXT:    v_sqrt_f16_e32 v1, v0
+; GFX8-IEEE-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-IEEE-NEXT:    v_cvt_f32_f16_e32 v4, 1.0
+; GFX8-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX8-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v0
+; GFX8-IEEE-NEXT:    v_rcp_f32_e32 v5, v2
+; GFX8-IEEE-NEXT:    v_rcp_f32_e32 v6, v3
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v7, v4, v5
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v8, v4, v6
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v10, -v3, v8
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v9, -v2, v7
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v10, v10, v4
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v9, v9, v4
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v10, v10, v6
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v9, v9, v5
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v8, v10, v8
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v7, v9, v7
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v3, -v3, v8
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v2, -v2, v7
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v3, v3, v6
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v2, v2, v5
+; GFX8-IEEE-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX8-IEEE-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v3, v3, v8
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v2, v2, v7
+; GFX8-IEEE-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX8-IEEE-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX8-IEEE-NEXT:    v_div_fixup_f16 v0, v3, v0, 1.0
+; GFX8-IEEE-NEXT:    v_div_fixup_f16 v1, v2, v1, 1.0
+; GFX8-IEEE-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-IEEE-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-FLUSH-LABEL: v_rsq_v2f16:
+; GFX8-FLUSH:       ; %bb.0:
+; GFX8-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-FLUSH-NEXT:    v_sqrt_f16_e32 v1, v0
+; GFX8-FLUSH-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, 1.0
+; GFX8-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX8-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v0
+; GFX8-FLUSH-NEXT:    v_rcp_f32_e32 v5, v2
+; GFX8-FLUSH-NEXT:    v_rcp_f32_e32 v6, v3
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v7, v4, v5
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v8, v4, v6
+; GFX8-FLUSH-NEXT:    v_mad_f32 v10, -v3, v8, v4
+; GFX8-FLUSH-NEXT:    v_mad_f32 v9, -v2, v7, v4
+; GFX8-FLUSH-NEXT:    v_mac_f32_e32 v8, v10, v6
+; GFX8-FLUSH-NEXT:    v_mac_f32_e32 v7, v9, v5
+; GFX8-FLUSH-NEXT:    v_mad_f32 v3, -v3, v8, v4
+; GFX8-FLUSH-NEXT:    v_mad_f32 v2, -v2, v7, v4
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v3, v3, v6
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v2, v2, v5
+; GFX8-FLUSH-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX8-FLUSH-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX8-FLUSH-NEXT:    v_add_f32_e32 v3, v3, v8
+; GFX8-FLUSH-NEXT:    v_add_f32_e32 v2, v2, v7
+; GFX8-FLUSH-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX8-FLUSH-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX8-FLUSH-NEXT:    v_div_fixup_f16 v0, v3, v0, 1.0
+; GFX8-FLUSH-NEXT:    v_div_fixup_f16 v1, v2, v1, 1.0
+; GFX8-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-FLUSH-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-IEEE-LABEL: v_rsq_v2f16:
 ; GFX9-IEEE:       ; %bb.0:
@@ -3904,10 +5521,22 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v4, 1.0
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v1
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v0
-; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX9-IEEE-NEXT:    v_mul_f32_e32 v2, v4, v2
-; GFX9-IEEE-NEXT:    v_mul_f32_e32 v3, v4, v3
+; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v5, v2
+; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v6, v3
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v7, v4, v5
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v8, v4, v6
+; GFX9-IEEE-NEXT:    v_fma_f32 v9, -v2, v7, v4
+; GFX9-IEEE-NEXT:    v_fma_f32 v10, -v3, v8, v4
+; GFX9-IEEE-NEXT:    v_fma_f32 v7, v9, v5, v7
+; GFX9-IEEE-NEXT:    v_fma_f32 v8, v10, v6, v8
+; GFX9-IEEE-NEXT:    v_fma_f32 v2, -v2, v7, v4
+; GFX9-IEEE-NEXT:    v_fma_f32 v3, -v3, v8, v4
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v2, v2, v5
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX9-IEEE-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v2, v2, v7
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v3, v3, v8
 ; GFX9-IEEE-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX9-IEEE-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX9-IEEE-NEXT:    v_div_fixup_f16 v1, v2, v1, 1.0
@@ -3920,38 +5549,100 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
 ; GFX9-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-FLUSH-NEXT:    v_sqrt_f16_e32 v1, v0
 ; GFX9-FLUSH-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, 1.0
 ; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v1
 ; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v0
 ; GFX9-FLUSH-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX9-FLUSH-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT:    v_mad_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT:    v_mad_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v5, v4, v2
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v4, v4, v3
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mac_f32_e32 v5, v6, v2
+; GFX9-FLUSH-NEXT:    v_mac_f32_e32 v4, v7, v3
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v2, v6, v2
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v3, v7, v3
+; GFX9-FLUSH-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX9-FLUSH-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX9-FLUSH-NEXT:    v_add_f32_e32 v2, v2, v5
+; GFX9-FLUSH-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX9-FLUSH-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX9-FLUSH-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v1, v2, v1, 1.0
 ; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v0, v3, v0, 1.0
 ; GFX9-FLUSH-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX9-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: v_rsq_v2f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_sqrt_f16_e32 v1, v0
-; GFX10-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v2, v1
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v0
-; GFX10-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX10-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX10-NEXT:    v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT:    v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT:    v_div_fixup_f16 v1, v2, v1, 1.0
-; GFX10-NEXT:    v_div_fixup_f16 v0, v3, v0, 1.0
-; GFX10-NEXT:    v_pack_b32_f16 v0, v1, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-IEEE-LABEL: v_rsq_v2f16:
+; GFX10-IEEE:       ; %bb.0:
+; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-IEEE-NEXT:    v_sqrt_f16_e32 v1, v0
+; GFX10-IEEE-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-IEEE-NEXT:    v_cvt_f32_f16_e32 v4, 1.0
+; GFX10-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX10-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v0
+; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v2, v2
+; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v3, v3
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v5, v4, v2
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v4, v4, v3
+; GFX10-IEEE-NEXT:    v_fma_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX10-IEEE-NEXT:    v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1]
+; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v5, v6, v2
+; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v4, v7, v3
+; GFX10-IEEE-NEXT:    v_fma_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX10-IEEE-NEXT:    v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1]
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v2, v6, v2
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v3, v7, v3
+; GFX10-IEEE-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX10-IEEE-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v2, v2, v5
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX10-IEEE-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX10-IEEE-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX10-IEEE-NEXT:    v_div_fixup_f16 v1, v2, v1, 1.0
+; GFX10-IEEE-NEXT:    v_div_fixup_f16 v0, v3, v0, 1.0
+; GFX10-IEEE-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX10-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-FLUSH-LABEL: v_rsq_v2f16:
+; GFX10-FLUSH:       ; %bb.0:
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-FLUSH-NEXT:    v_sqrt_f16_e32 v1, v0
+; GFX10-FLUSH-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-FLUSH-NEXT:    v_cvt_f32_f16_e32 v6, 1.0
+; GFX10-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX10-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v0
+; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v4, v2
+; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v5, v3
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v7, v6, v4
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v8, v6, v5
+; GFX10-FLUSH-NEXT:    v_mad_f32 v9, -v2, v7, v6
+; GFX10-FLUSH-NEXT:    v_mad_f32 v10, -v3, v8, v6
+; GFX10-FLUSH-NEXT:    v_mac_f32_e32 v7, v9, v4
+; GFX10-FLUSH-NEXT:    v_mac_f32_e32 v8, v10, v5
+; GFX10-FLUSH-NEXT:    v_mad_f32 v2, -v2, v7, v6
+; GFX10-FLUSH-NEXT:    v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v2, v2, v4
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v3, v3, v5
+; GFX10-FLUSH-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX10-FLUSH-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-FLUSH-NEXT:    v_add_f32_e32 v2, v2, v7
+; GFX10-FLUSH-NEXT:    v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX10-FLUSH-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX10-FLUSH-NEXT:    v_div_fixup_f16 v1, v2, v1, 1.0
+; GFX10-FLUSH-NEXT:    v_div_fixup_f16 v0, v3, v0, 1.0
+; GFX10-FLUSH-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX10-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_rsq_v2f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_sqrt_f16_e32 v0, v0
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v4, 1.0
 ; GFX11-NEXT:    v_sqrt_f16_e32 v1, v1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v0
@@ -3959,10 +5650,22 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
 ; GFX11-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX11-NEXT:    v_rcp_f32_e32 v3, v3
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT:    v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT:    v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX11-NEXT:    v_mul_f32_e32 v5, v4, v2
+; GFX11-NEXT:    v_mul_f32_e32 v4, v4, v3
+; GFX11-NEXT:    v_fma_mix_f32 v7, -v1, v4, 1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_fmac_f32_e32 v4, v7, v3
+; GFX11-NEXT:    v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_fma_mix_f32 v7, -v1, v4, 1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_fmac_f32_e32 v5, v6, v2
+; GFX11-NEXT:    v_mul_f32_e32 v3, v7, v3
+; GFX11-NEXT:    v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT:    v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT:    v_add_f32_e32 v2, v2, v5
 ; GFX11-NEXT:    v_div_fixup_f16 v1, v3, v1, 1.0
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT:    v_div_fixup_f16 v0, v2, v0, 1.0
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
@@ -4054,25 +5757,75 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
 ; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX8-LABEL: v_neg_rsq_v2f16:
-; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_sqrt_f16_e32 v1, v0
-; GFX8-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v4, -1.0
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v2, v1
-; GFX8-NEXT:    v_cvt_f32_f16_e32 v3, v0
-; GFX8-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX8-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX8-NEXT:    v_mul_f32_e32 v2, v4, v2
-; GFX8-NEXT:    v_mul_f32_e32 v3, v4, v3
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX8-NEXT:    v_div_fixup_f16 v0, v3, v0, -1.0
-; GFX8-NEXT:    v_div_fixup_f16 v1, v2, v1, -1.0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
-; GFX8-NEXT:    s_setpc_b64 s[30:31]
+; GFX8-IEEE-LABEL: v_neg_rsq_v2f16:
+; GFX8-IEEE:       ; %bb.0:
+; GFX8-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-IEEE-NEXT:    v_sqrt_f16_e32 v1, v0
+; GFX8-IEEE-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-IEEE-NEXT:    v_cvt_f32_f16_e32 v4, -1.0
+; GFX8-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX8-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v0
+; GFX8-IEEE-NEXT:    v_rcp_f32_e32 v5, v2
+; GFX8-IEEE-NEXT:    v_rcp_f32_e32 v6, v3
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v7, v4, v5
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v8, v4, v6
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v10, -v3, v8
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v9, -v2, v7
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v10, v10, v4
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v9, v9, v4
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v10, v10, v6
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v9, v9, v5
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v8, v10, v8
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v7, v9, v7
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v3, -v3, v8
+; GFX8-IEEE-NEXT:    v_mul_f32_e64 v2, -v2, v7
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v3, v3, v6
+; GFX8-IEEE-NEXT:    v_mul_f32_e32 v2, v2, v5
+; GFX8-IEEE-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX8-IEEE-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v3, v3, v8
+; GFX8-IEEE-NEXT:    v_add_f32_e32 v2, v2, v7
+; GFX8-IEEE-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX8-IEEE-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX8-IEEE-NEXT:    v_div_fixup_f16 v0, v3, v0, -1.0
+; GFX8-IEEE-NEXT:    v_div_fixup_f16 v1, v2, v1, -1.0
+; GFX8-IEEE-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-IEEE-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-FLUSH-LABEL: v_neg_rsq_v2f16:
+; GFX8-FLUSH:       ; %bb.0:
+; GFX8-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-FLUSH-NEXT:    v_sqrt_f16_e32 v1, v0
+; GFX8-FLUSH-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, -1.0
+; GFX8-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX8-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v0
+; GFX8-FLUSH-NEXT:    v_rcp_f32_e32 v5, v2
+; GFX8-FLUSH-NEXT:    v_rcp_f32_e32 v6, v3
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v7, v4, v5
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v8, v4, v6
+; GFX8-FLUSH-NEXT:    v_mad_f32 v10, -v3, v8, v4
+; GFX8-FLUSH-NEXT:    v_mad_f32 v9, -v2, v7, v4
+; GFX8-FLUSH-NEXT:    v_mac_f32_e32 v8, v10, v6
+; GFX8-FLUSH-NEXT:    v_mac_f32_e32 v7, v9, v5
+; GFX8-FLUSH-NEXT:    v_mad_f32 v3, -v3, v8, v4
+; GFX8-FLUSH-NEXT:    v_mad_f32 v2, -v2, v7, v4
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v3, v3, v6
+; GFX8-FLUSH-NEXT:    v_mul_f32_e32 v2, v2, v5
+; GFX8-FLUSH-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX8-FLUSH-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX8-FLUSH-NEXT:    v_add_f32_e32 v3, v3, v8
+; GFX8-FLUSH-NEXT:    v_add_f32_e32 v2, v2, v7
+; GFX8-FLUSH-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX8-FLUSH-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX8-FLUSH-NEXT:    v_div_fixup_f16 v0, v3, v0, -1.0
+; GFX8-FLUSH-NEXT:    v_div_fixup_f16 v1, v2, v1, -1.0
+; GFX8-FLUSH-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-FLUSH-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-IEEE-LABEL: v_neg_rsq_v2f16:
 ; GFX9-IEEE:       ; %bb.0:
@@ -4082,10 +5835,22 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v4, -1.0
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v1
 ; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v0
-; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX9-IEEE-NEXT:    v_mul_f32_e32 v2, v4, v2
-; GFX9-IEEE-NEXT:    v_mul_f32_e32 v3, v4, v3
+; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v5, v2
+; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v6, v3
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v7, v4, v5
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v8, v4, v6
+; GFX9-IEEE-NEXT:    v_fma_f32 v9, -v2, v7, v4
+; GFX9-IEEE-NEXT:    v_fma_f32 v10, -v3, v8, v4
+; GFX9-IEEE-NEXT:    v_fma_f32 v7, v9, v5, v7
+; GFX9-IEEE-NEXT:    v_fma_f32 v8, v10, v6, v8
+; GFX9-IEEE-NEXT:    v_fma_f32 v2, -v2, v7, v4
+; GFX9-IEEE-NEXT:    v_fma_f32 v3, -v3, v8, v4
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v2, v2, v5
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX9-IEEE-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v2, v2, v7
+; GFX9-IEEE-NEXT:    v_add_f32_e32 v3, v3, v8
 ; GFX9-IEEE-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX9-IEEE-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX9-IEEE-NEXT:    v_div_fixup_f16 v1, v2, v1, -1.0
@@ -4098,38 +5863,100 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
 ; GFX9-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-FLUSH-NEXT:    v_sqrt_f16_e32 v1, v0
 ; GFX9-FLUSH-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, -1.0
 ; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v1
 ; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v0
 ; GFX9-FLUSH-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX9-FLUSH-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT:    v_mad_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT:    v_mad_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v5, v4, v2
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v4, v4, v3
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mac_f32_e32 v5, v6, v2
+; GFX9-FLUSH-NEXT:    v_mac_f32_e32 v4, v7, v3
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mad_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v2, v6, v2
+; GFX9-FLUSH-NEXT:    v_mul_f32_e32 v3, v7, v3
+; GFX9-FLUSH-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX9-FLUSH-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX9-FLUSH-NEXT:    v_add_f32_e32 v2, v2, v5
+; GFX9-FLUSH-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX9-FLUSH-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX9-FLUSH-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v1, v2, v1, -1.0
 ; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v0, v3, v0, -1.0
 ; GFX9-FLUSH-NEXT:    v_pack_b32_f16 v0, v1, v0
 ; GFX9-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: v_neg_rsq_v2f16:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_sqrt_f16_e32 v1, v0
-; GFX10-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v2, v1
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v0
-; GFX10-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX10-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX10-NEXT:    v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT:    v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT:    v_div_fixup_f16 v1, v2, v1, -1.0
-; GFX10-NEXT:    v_div_fixup_f16 v0, v3, v0, -1.0
-; GFX10-NEXT:    v_pack_b32_f16 v0, v1, v0
-; GFX10-NEXT:    s_setpc_b64 s[30:31]
+; GFX10-IEEE-LABEL: v_neg_rsq_v2f16:
+; GFX10-IEEE:       ; %bb.0:
+; GFX10-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-IEEE-NEXT:    v_sqrt_f16_e32 v1, v0
+; GFX10-IEEE-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-IEEE-NEXT:    v_cvt_f32_f16_e32 v4, -1.0
+; GFX10-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX10-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v0
+; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v2, v2
+; GFX10-IEEE-NEXT:    v_rcp_f32_e32 v3, v3
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v5, v4, v2
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v4, v4, v3
+; GFX10-IEEE-NEXT:    v_fma_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX10-IEEE-NEXT:    v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v5, v6, v2
+; GFX10-IEEE-NEXT:    v_fmac_f32_e32 v4, v7, v3
+; GFX10-IEEE-NEXT:    v_fma_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX10-IEEE-NEXT:    v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v2, v6, v2
+; GFX10-IEEE-NEXT:    v_mul_f32_e32 v3, v7, v3
+; GFX10-IEEE-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX10-IEEE-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v2, v2, v5
+; GFX10-IEEE-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX10-IEEE-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX10-IEEE-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX10-IEEE-NEXT:    v_div_fixup_f16 v1, v2, v1, -1.0
+; GFX10-IEEE-NEXT:    v_div_fixup_f16 v0, v3, v0, -1.0
+; GFX10-IEEE-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX10-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-FLUSH-LABEL: v_neg_rsq_v2f16:
+; GFX10-FLUSH:       ; %bb.0:
+; GFX10-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-FLUSH-NEXT:    v_sqrt_f16_e32 v1, v0
+; GFX10-FLUSH-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-FLUSH-NEXT:    v_cvt_f32_f16_e32 v6, -1.0
+; GFX10-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX10-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v0
+; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v4, v2
+; GFX10-FLUSH-NEXT:    v_rcp_f32_e32 v5, v3
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v7, v6, v4
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v8, v6, v5
+; GFX10-FLUSH-NEXT:    v_mad_f32 v9, -v2, v7, v6
+; GFX10-FLUSH-NEXT:    v_mad_f32 v10, -v3, v8, v6
+; GFX10-FLUSH-NEXT:    v_mac_f32_e32 v7, v9, v4
+; GFX10-FLUSH-NEXT:    v_mac_f32_e32 v8, v10, v5
+; GFX10-FLUSH-NEXT:    v_mad_f32 v2, -v2, v7, v6
+; GFX10-FLUSH-NEXT:    v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v2, v2, v4
+; GFX10-FLUSH-NEXT:    v_mul_f32_e32 v3, v3, v5
+; GFX10-FLUSH-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; GFX10-FLUSH-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-FLUSH-NEXT:    v_add_f32_e32 v2, v2, v7
+; GFX10-FLUSH-NEXT:    v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX10-FLUSH-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX10-FLUSH-NEXT:    v_div_fixup_f16 v1, v2, v1, -1.0
+; GFX10-FLUSH-NEXT:    v_div_fixup_f16 v0, v3, v0, -1.0
+; GFX10-FLUSH-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX10-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_neg_rsq_v2f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_sqrt_f16_e32 v0, v0
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v4, -1.0
 ; GFX11-NEXT:    v_sqrt_f16_e32 v1, v1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v0
@@ -4137,10 +5964,22 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
 ; GFX11-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX11-NEXT:    v_rcp_f32_e32 v3, v3
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT:    v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT:    v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX11-NEXT:    v_mul_f32_e32 v5, v4, v2
+; GFX11-NEXT:    v_mul_f32_e32 v4, v4, v3
+; GFX11-NEXT:    v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_fmac_f32_e32 v4, v7, v3
+; GFX11-NEXT:    v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_fmac_f32_e32 v5, v6, v2
+; GFX11-NEXT:    v_mul_f32_e32 v3, v7, v3
+; GFX11-NEXT:    v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT:    v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT:    v_add_f32_e32 v2, v2, v5
 ; GFX11-NEXT:    v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT:    v_div_fixup_f16 v0, v2, v0, -1.0
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
@@ -4154,9 +5993,5 @@ declare <2 x half> @llvm.fabs.v2f16(<2 x half>)
 declare <2 x half> @llvm.sqrt.v2f16(<2 x half>)
 
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX10-FLUSH: {{.*}}
-; GFX10-IEEE: {{.*}}
 ; GFX11-FLUSH: {{.*}}
 ; GFX11-IEEE: {{.*}}
-; GFX8-FLUSH: {{.*}}
-; GFX8-IEEE: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
index e051cc28469fae..8409e9c88aadaa 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
@@ -46,8 +46,14 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, s2
 ; VI-NEXT:    v_cvt_f32_f16_e32 v2, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s0
-; VI-NEXT:    v_rcp_f32_e32 v2, v2
-; VI-NEXT:    v_mul_f32_e32 v0, v0, v2
+; VI-NEXT:    v_rcp_f32_e32 v3, v2
+; VI-NEXT:    v_mul_f32_e32 v4, v0, v3
+; VI-NEXT:    v_mad_f32 v5, -v2, v4, v0
+; VI-NEXT:    v_mac_f32_e32 v4, v5, v3
+; VI-NEXT:    v_mad_f32 v0, -v2, v4, v0
+; VI-NEXT:    v_mul_f32_e32 v0, v0, v3
+; VI-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
+; VI-NEXT:    v_add_f32_e32 v0, v0, v4
 ; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; VI-NEXT:    v_div_fixup_f16 v0, v0, v1, s2
 ; VI-NEXT:    v_trunc_f16_e32 v0, v0
@@ -554,19 +560,31 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, s2
 ; VI-NEXT:    v_cvt_f32_f16_e32 v2, s0
 ; VI-NEXT:    s_lshr_b32 s3, s0, 16
-; VI-NEXT:    v_cvt_f32_f16_e32 v3, s3
 ; VI-NEXT:    v_mov_b32_e32 v1, s0
-; VI-NEXT:    v_rcp_f32_e32 v2, v2
 ; VI-NEXT:    s_lshr_b32 s1, s2, 16
-; VI-NEXT:    v_rcp_f32_e32 v3, v3
-; VI-NEXT:    v_mul_f32_e32 v0, v0, v2
+; VI-NEXT:    v_rcp_f32_e32 v3, v2
+; VI-NEXT:    v_mul_f32_e32 v4, v0, v3
+; VI-NEXT:    v_mad_f32 v5, -v2, v4, v0
+; VI-NEXT:    v_mac_f32_e32 v4, v5, v3
+; VI-NEXT:    v_mad_f32 v0, -v2, v4, v0
+; VI-NEXT:    v_mul_f32_e32 v0, v0, v3
+; VI-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
+; VI-NEXT:    v_add_f32_e32 v0, v0, v4
 ; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; VI-NEXT:    v_cvt_f32_f16_e32 v3, s3
 ; VI-NEXT:    v_mov_b32_e32 v2, s3
 ; VI-NEXT:    v_div_fixup_f16 v0, v0, v1, s2
 ; VI-NEXT:    v_trunc_f16_e32 v0, v0
 ; VI-NEXT:    v_fma_f16 v0, -v0, v1, s2
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, s1
-; VI-NEXT:    v_mul_f32_e32 v1, v1, v3
+; VI-NEXT:    v_rcp_f32_e32 v4, v3
+; VI-NEXT:    v_mul_f32_e32 v5, v1, v4
+; VI-NEXT:    v_mad_f32 v6, -v3, v5, v1
+; VI-NEXT:    v_mac_f32_e32 v5, v6, v4
+; VI-NEXT:    v_mad_f32 v1, -v3, v5, v1
+; VI-NEXT:    v_mul_f32_e32 v1, v1, v4
+; VI-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; VI-NEXT:    v_add_f32_e32 v1, v1, v5
 ; VI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; VI-NEXT:    v_div_fixup_f16 v1, v1, v2, s1
 ; VI-NEXT:    v_trunc_f16_e32 v1, v1
@@ -691,41 +709,65 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, s2
 ; VI-NEXT:    v_cvt_f32_f16_e32 v2, s0
 ; VI-NEXT:    s_lshr_b32 s8, s0, 16
-; VI-NEXT:    v_cvt_f32_f16_e32 v3, s8
 ; VI-NEXT:    v_mov_b32_e32 v1, s0
-; VI-NEXT:    v_rcp_f32_e32 v2, v2
 ; VI-NEXT:    s_lshr_b32 s6, s2, 16
-; VI-NEXT:    v_rcp_f32_e32 v3, v3
-; VI-NEXT:    v_cvt_f32_f16_e32 v4, s1
-; VI-NEXT:    v_mul_f32_e32 v0, v0, v2
+; VI-NEXT:    v_rcp_f32_e32 v3, v2
+; VI-NEXT:    s_lshr_b32 s9, s1, 16
+; VI-NEXT:    s_lshr_b32 s7, s3, 16
+; VI-NEXT:    v_mul_f32_e32 v4, v0, v3
+; VI-NEXT:    v_mad_f32 v5, -v2, v4, v0
+; VI-NEXT:    v_mac_f32_e32 v4, v5, v3
+; VI-NEXT:    v_mad_f32 v0, -v2, v4, v0
+; VI-NEXT:    v_mul_f32_e32 v0, v0, v3
+; VI-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
+; VI-NEXT:    v_add_f32_e32 v0, v0, v4
 ; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; VI-NEXT:    v_cvt_f32_f16_e32 v3, s8
 ; VI-NEXT:    v_mov_b32_e32 v2, s8
-; VI-NEXT:    v_rcp_f32_e32 v4, v4
-; VI-NEXT:    s_lshr_b32 s9, s1, 16
 ; VI-NEXT:    v_div_fixup_f16 v0, v0, v1, s2
 ; VI-NEXT:    v_trunc_f16_e32 v0, v0
 ; VI-NEXT:    v_fma_f16 v0, -v0, v1, s2
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, s6
-; VI-NEXT:    v_cvt_f32_f16_e32 v5, s9
-; VI-NEXT:    s_lshr_b32 s7, s3, 16
-; VI-NEXT:    v_mul_f32_e32 v1, v1, v3
+; VI-NEXT:    v_rcp_f32_e32 v4, v3
+; VI-NEXT:    v_mul_f32_e32 v5, v1, v4
+; VI-NEXT:    v_mad_f32 v6, -v3, v5, v1
+; VI-NEXT:    v_mac_f32_e32 v5, v6, v4
+; VI-NEXT:    v_mad_f32 v1, -v3, v5, v1
+; VI-NEXT:    v_mul_f32_e32 v1, v1, v4
+; VI-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; VI-NEXT:    v_add_f32_e32 v1, v1, v5
 ; VI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; VI-NEXT:    v_cvt_f32_f16_e32 v4, s1
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_rcp_f32_e32 v5, v5
 ; VI-NEXT:    v_div_fixup_f16 v1, v1, v2, s6
 ; VI-NEXT:    v_trunc_f16_e32 v1, v1
 ; VI-NEXT:    v_fma_f16 v1, -v1, v2, s6
 ; VI-NEXT:    v_cvt_f32_f16_e32 v2, s3
+; VI-NEXT:    v_rcp_f32_e32 v5, v4
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
-; VI-NEXT:    v_mul_f32_e32 v2, v2, v4
+; VI-NEXT:    v_mul_f32_e32 v6, v2, v5
+; VI-NEXT:    v_mad_f32 v7, -v4, v6, v2
+; VI-NEXT:    v_mac_f32_e32 v6, v7, v5
+; VI-NEXT:    v_mad_f32 v2, -v4, v6, v2
+; VI-NEXT:    v_mul_f32_e32 v2, v2, v5
+; VI-NEXT:    v_and_b32_e32 v2, 0xff800000, v2
+; VI-NEXT:    v_add_f32_e32 v2, v2, v6
 ; VI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; VI-NEXT:    v_cvt_f32_f16_e32 v5, s9
 ; VI-NEXT:    v_mov_b32_e32 v4, s9
 ; VI-NEXT:    v_div_fixup_f16 v2, v2, v3, s3
 ; VI-NEXT:    v_trunc_f16_e32 v2, v2
 ; VI-NEXT:    v_fma_f16 v2, -v2, v3, s3
 ; VI-NEXT:    v_cvt_f32_f16_e32 v3, s7
-; VI-NEXT:    v_mul_f32_e32 v3, v3, v5
+; VI-NEXT:    v_rcp_f32_e32 v6, v5
+; VI-NEXT:    v_mul_f32_e32 v7, v3, v6
+; VI-NEXT:    v_mad_f32 v8, -v5, v7, v3
+; VI-NEXT:    v_mac_f32_e32 v7, v8, v6
+; VI-NEXT:    v_mad_f32 v3, -v5, v7, v3
+; VI-NEXT:    v_mul_f32_e32 v3, v3, v6
+; VI-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; VI-NEXT:    v_add_f32_e32 v3, v3, v7
 ; VI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; VI-NEXT:    v_div_fixup_f16 v3, v3, v4, s7
 ; VI-NEXT:    v_trunc_f16_e32 v3, v3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir
index e774c2c83dfd8e..1f9c059c2ac60b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir
@@ -44,6 +44,7 @@ body: |
     ; SI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT6]](s32)
     ; SI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16)
     ; SI-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+    ;
     ; VI-LABEL: name: test_fdiv_s16
     ; VI: liveins: $vgpr0, $vgpr1
     ; VI-NEXT: {{  $}}
@@ -53,12 +54,24 @@ body: |
     ; VI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; VI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
     ; VI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
+    ; VI-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]]
     ; VI-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
     ; VI-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
-    ; VI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
+    ; VI-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FMUL]]
+    ; VI-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[FPEXT]]
+    ; VI-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FADD]], [[INT]]
+    ; VI-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[FMUL]]
+    ; VI-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FADD1]]
+    ; VI-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FPEXT]]
+    ; VI-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[FADD2]], [[INT]]
+    ; VI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -8388608
+    ; VI-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[FMUL4]], [[C]]
+    ; VI-NEXT: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[AND]], [[FADD1]]
+    ; VI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD3]](s32)
     ; VI-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC1]](s16), [[TRUNC]](s16)
     ; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16)
     ; VI-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+    ;
     ; GFX9-LABEL: name: test_fdiv_s16
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9-NEXT: {{  $}}
@@ -68,12 +81,24 @@ body: |
     ; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
     ; GFX9-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
     ; GFX9-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
+    ; GFX9-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]]
     ; GFX9-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
     ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
-    ; GFX9-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
+    ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FMUL]]
+    ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[FPEXT]]
+    ; GFX9-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FADD]], [[INT]]
+    ; GFX9-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[FMUL]]
+    ; GFX9-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FADD1]]
+    ; GFX9-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FPEXT]]
+    ; GFX9-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[FADD2]], [[INT]]
+    ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -8388608
+    ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[FMUL4]], [[C]]
+    ; GFX9-NEXT: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[AND]], [[FADD1]]
+    ; GFX9-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD3]](s32)
     ; GFX9-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC1]](s16), [[TRUNC]](s16)
     ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16)
     ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+    ;
     ; GFX9-UNSAFE-LABEL: name: test_fdiv_s16
     ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1
     ; GFX9-UNSAFE-NEXT: {{  $}}
@@ -85,21 +110,6 @@ body: |
     ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC]], [[INT]]
     ; GFX9-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMUL]](s16)
     ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
-    ; GFX10-LABEL: name: test_fdiv_s16
-    ; GFX10: liveins: $vgpr0, $vgpr1
-    ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
-    ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
-    ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
-    ; GFX10-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
-    ; GFX10-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
-    ; GFX10-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
-    ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
-    ; GFX10-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
-    ; GFX10-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC1]](s16), [[TRUNC]](s16)
-    ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16)
-    ; GFX10-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
     %2:_(s16) = G_TRUNC %0
@@ -141,6 +151,7 @@ body: |
     ; SI-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
     ; SI-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32)
     ; SI-NEXT: $vgpr0 = COPY [[INT6]](s32)
+    ;
     ; VI-LABEL: name: test_fdiv_s32_denorms_on
     ; VI: liveins: $vgpr0, $vgpr1
     ; VI-NEXT: {{  $}}
@@ -160,6 +171,7 @@ body: |
     ; VI-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
     ; VI-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32)
     ; VI-NEXT: $vgpr0 = COPY [[INT6]](s32)
+    ;
     ; GFX9-LABEL: name: test_fdiv_s32_denorms_on
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9-NEXT: {{  $}}
@@ -179,6 +191,7 @@ body: |
     ; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
     ; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32)
     ; GFX9-NEXT: $vgpr0 = COPY [[INT6]](s32)
+    ;
     ; GFX9-UNSAFE-LABEL: name: test_fdiv_s32_denorms_on
     ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1
     ; GFX9-UNSAFE-NEXT: {{  $}}
@@ -187,6 +200,7 @@ body: |
     ; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32)
     ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[INT]]
     ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[FMUL]](s32)
+    ;
     ; GFX10-LABEL: name: test_fdiv_s32_denorms_on
     ; GFX10: liveins: $vgpr0, $vgpr1
     ; GFX10-NEXT: {{  $}}
@@ -246,6 +260,7 @@ body: |
     ; SI-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
     ; SI-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32)
     ; SI-NEXT: $vgpr0 = COPY [[INT6]](s32)
+    ;
     ; VI-LABEL: name: test_fdiv_s32_denorms_off
     ; VI: liveins: $vgpr0, $vgpr1
     ; VI-NEXT: {{  $}}
@@ -267,6 +282,7 @@ body: |
     ; VI-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
     ; VI-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32)
     ; VI-NEXT: $vgpr0 = COPY [[INT6]](s32)
+    ;
     ; GFX9-LABEL: name: test_fdiv_s32_denorms_off
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9-NEXT: {{  $}}
@@ -288,6 +304,7 @@ body: |
     ; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
     ; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32)
     ; GFX9-NEXT: $vgpr0 = COPY [[INT6]](s32)
+    ;
     ; GFX9-UNSAFE-LABEL: name: test_fdiv_s32_denorms_off
     ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1
     ; GFX9-UNSAFE-NEXT: {{  $}}
@@ -296,6 +313,7 @@ body: |
     ; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32)
     ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[INT]]
     ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[FMUL]](s32)
+    ;
     ; GFX10-LABEL: name: test_fdiv_s32_denorms_off
     ; GFX10: liveins: $vgpr0, $vgpr1
     ; GFX10-NEXT: {{  $}}
@@ -357,6 +375,7 @@ body: |
     ; SI-NEXT: [[INT5:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
     ; SI-NEXT: [[INT6:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32)
     ; SI-NEXT: $vgpr0 = COPY [[INT6]](s32)
+    ;
     ; VI-LABEL: name: test_fdiv_s32_denorms_off_arcp
     ; VI: liveins: $vgpr0, $vgpr1
     ; VI-NEXT: {{  $}}
@@ -378,6 +397,7 @@ body: |
     ; VI-NEXT: [[INT5:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
     ; VI-NEXT: [[INT6:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32)
     ; VI-NEXT: $vgpr0 = COPY [[INT6]](s32)
+    ;
     ; GFX9-LABEL: name: test_fdiv_s32_denorms_off_arcp
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9-NEXT: {{  $}}
@@ -399,6 +419,7 @@ body: |
     ; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
     ; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32)
     ; GFX9-NEXT: $vgpr0 = COPY [[INT6]](s32)
+    ;
     ; GFX9-UNSAFE-LABEL: name: test_fdiv_s32_denorms_off_arcp
     ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1
     ; GFX9-UNSAFE-NEXT: {{  $}}
@@ -407,6 +428,7 @@ body: |
     ; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32)
     ; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[COPY]], [[INT]]
     ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[FMUL]](s32)
+    ;
     ; GFX10-LABEL: name: test_fdiv_s32_denorms_off_arcp
     ; GFX10: liveins: $vgpr0, $vgpr1
     ; GFX10-NEXT: {{  $}}
@@ -473,6 +495,7 @@ body: |
     ; SI-NEXT: [[INT5:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s64), [[FMA3]](s64), [[FMUL]](s64), [[XOR]](s1)
     ; SI-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY1]](s64), [[COPY]](s64)
     ; SI-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64)
+    ;
     ; VI-LABEL: name: test_fdiv_s64
     ; VI: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; VI-NEXT: {{  $}}
@@ -492,6 +515,7 @@ body: |
     ; VI-NEXT: [[INT5:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s64), [[FMA3]](s64), [[FMUL]](s64), [[INT4]](s1)
     ; VI-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY1]](s64), [[COPY]](s64)
     ; VI-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64)
+    ;
     ; GFX9-LABEL: name: test_fdiv_s64
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9-NEXT: {{  $}}
@@ -511,6 +535,7 @@ body: |
     ; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s64), [[FMA3]](s64), [[FMUL]](s64), [[INT4]](s1)
     ; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY1]](s64), [[COPY]](s64)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64)
+    ;
     ; GFX9-UNSAFE-LABEL: name: test_fdiv_s64
     ; GFX9-UNSAFE: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9-UNSAFE-NEXT: {{  $}}
@@ -527,6 +552,7 @@ body: |
     ; GFX9-UNSAFE-NEXT: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL]], [[COPY]]
     ; GFX9-UNSAFE-NEXT: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FMA4]], [[FMA3]], [[FMUL]]
     ; GFX9-UNSAFE-NEXT: $vgpr0_vgpr1 = COPY [[FMA5]](s64)
+    ;
     ; GFX10-LABEL: name: test_fdiv_s64
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10-NEXT: {{  $}}
@@ -603,6 +629,7 @@ body: |
     ; SI-NEXT: [[INT13:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT12]](s32), [[UV3]](s32), [[UV1]](s32)
     ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32)
     ; SI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+    ;
     ; VI-LABEL: name: test_fdiv_v2s32
     ; VI: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; VI-NEXT: {{  $}}
@@ -641,6 +668,7 @@ body: |
     ; VI-NEXT: [[INT13:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT12]](s32), [[UV3]](s32), [[UV1]](s32)
     ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32)
     ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+    ;
     ; GFX9-LABEL: name: test_fdiv_v2s32
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9-NEXT: {{  $}}
@@ -679,6 +707,7 @@ body: |
     ; GFX9-NEXT: [[INT13:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT12]](s32), [[UV3]](s32), [[UV1]](s32)
     ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+    ;
     ; GFX9-UNSAFE-LABEL: name: test_fdiv_v2s32
     ; GFX9-UNSAFE: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9-UNSAFE-NEXT: {{  $}}
@@ -692,6 +721,7 @@ body: |
     ; GFX9-UNSAFE-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[INT1]]
     ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FMUL]](s32), [[FMUL1]](s32)
     ; GFX9-UNSAFE-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+    ;
     ; GFX10-LABEL: name: test_fdiv_v2s32
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10-NEXT: {{  $}}
@@ -776,6 +806,7 @@ body: |
     ; SI-NEXT: [[INT13:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT12]](s32), [[UV3]](s32), [[UV1]](s32)
     ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32)
     ; SI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+    ;
     ; VI-LABEL: name: test_fdiv_v2s32_flags
     ; VI: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; VI-NEXT: {{  $}}
@@ -810,6 +841,7 @@ body: |
     ; VI-NEXT: [[INT13:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT12]](s32), [[UV3]](s32), [[UV1]](s32)
     ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32)
     ; VI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+    ;
     ; GFX9-LABEL: name: test_fdiv_v2s32_flags
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9-NEXT: {{  $}}
@@ -844,6 +876,7 @@ body: |
     ; GFX9-NEXT: [[INT13:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT12]](s32), [[UV3]](s32), [[UV1]](s32)
     ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+    ;
     ; GFX9-UNSAFE-LABEL: name: test_fdiv_v2s32_flags
     ; GFX9-UNSAFE: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9-UNSAFE-NEXT: {{  $}}
@@ -857,6 +890,7 @@ body: |
     ; GFX9-UNSAFE-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = nnan G_FMUL [[UV1]], [[INT1]]
     ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FMUL]](s32), [[FMUL1]](s32)
     ; GFX9-UNSAFE-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+    ;
     ; GFX10-LABEL: name: test_fdiv_v2s32_flags
     ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX10-NEXT: {{  $}}
@@ -949,6 +983,7 @@ body: |
     ; SI-NEXT: [[INT20:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT19]](s32), [[UV5]](s32), [[UV2]](s32)
     ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32), [[INT20]](s32)
     ; SI-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
+    ;
     ; VI-LABEL: name: test_fdiv_v3s32
     ; VI: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5
     ; VI-NEXT: {{  $}}
@@ -995,6 +1030,7 @@ body: |
     ; VI-NEXT: [[INT20:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT19]](s32), [[UV5]](s32), [[UV2]](s32)
     ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32), [[INT20]](s32)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
+    ;
     ; GFX9-LABEL: name: test_fdiv_v3s32
     ; GFX9: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5
     ; GFX9-NEXT: {{  $}}
@@ -1041,6 +1077,7 @@ body: |
     ; GFX9-NEXT: [[INT20:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT19]](s32), [[UV5]](s32), [[UV2]](s32)
     ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32), [[INT20]](s32)
     ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
+    ;
     ; GFX9-UNSAFE-LABEL: name: test_fdiv_v3s32
     ; GFX9-UNSAFE: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5
     ; GFX9-UNSAFE-NEXT: {{  $}}
@@ -1056,6 +1093,7 @@ body: |
     ; GFX9-UNSAFE-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV2]], [[INT2]]
     ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FMUL]](s32), [[FMUL1]](s32), [[FMUL2]](s32)
     ; GFX9-UNSAFE-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
+    ;
     ; GFX10-LABEL: name: test_fdiv_v3s32
     ; GFX10: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5
     ; GFX10-NEXT: {{  $}}
@@ -1162,6 +1200,7 @@ body: |
     ; SI-NEXT: [[INT13:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT12]](s64), [[UV3]](s64), [[UV1]](s64)
     ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[INT6]](s64), [[INT13]](s64)
     ; SI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
+    ;
     ; VI-LABEL: name: test_fdiv_v2s64
     ; VI: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7
     ; VI-NEXT: {{  $}}
@@ -1196,6 +1235,7 @@ body: |
     ; VI-NEXT: [[INT13:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT12]](s64), [[UV3]](s64), [[UV1]](s64)
     ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[INT6]](s64), [[INT13]](s64)
     ; VI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
+    ;
     ; GFX9-LABEL: name: test_fdiv_v2s64
     ; GFX9: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7
     ; GFX9-NEXT: {{  $}}
@@ -1230,6 +1270,7 @@ body: |
     ; GFX9-NEXT: [[INT13:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT12]](s64), [[UV3]](s64), [[UV1]](s64)
     ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[INT6]](s64), [[INT13]](s64)
     ; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
+    ;
     ; GFX9-UNSAFE-LABEL: name: test_fdiv_v2s64
     ; GFX9-UNSAFE: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7
     ; GFX9-UNSAFE-NEXT: {{  $}}
@@ -1258,6 +1299,7 @@ body: |
     ; GFX9-UNSAFE-NEXT: [[FMA11:%[0-9]+]]:_(s64) = G_FMA [[FMA10]], [[FMA9]], [[FMUL1]]
     ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[FMA5]](s64), [[FMA11]](s64)
     ; GFX9-UNSAFE-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
+    ;
     ; GFX10-LABEL: name: test_fdiv_v2s64
     ; GFX10: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7
     ; GFX10-NEXT: {{  $}}
@@ -1355,6 +1397,7 @@ body: |
     ; SI-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]]
     ; SI-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
     ; SI-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
+    ;
     ; VI-LABEL: name: test_fdiv_v2s16
     ; VI: liveins: $vgpr0, $vgpr1
     ; VI-NEXT: {{  $}}
@@ -1371,15 +1414,36 @@ body: |
     ; VI-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
     ; VI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
     ; VI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16)
+    ; VI-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]]
     ; VI-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
     ; VI-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
-    ; VI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
+    ; VI-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FMUL]]
+    ; VI-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[FPEXT]]
+    ; VI-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FADD]], [[INT]]
+    ; VI-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[FMUL]]
+    ; VI-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FADD1]]
+    ; VI-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FPEXT]]
+    ; VI-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[FADD2]], [[INT]]
+    ; VI-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -8388608
+    ; VI-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[FMUL4]], [[C1]]
+    ; VI-NEXT: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[AND]], [[FADD1]]
+    ; VI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD3]](s32)
     ; VI-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC2]](s16), [[TRUNC]](s16)
     ; VI-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
     ; VI-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16)
+    ; VI-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT3]]
     ; VI-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32)
-    ; VI-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]]
-    ; VI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32)
+    ; VI-NEXT: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]]
+    ; VI-NEXT: [[FMUL6:%[0-9]+]]:_(s32) = G_FMUL [[FNEG1]], [[FMUL5]]
+    ; VI-NEXT: [[FADD4:%[0-9]+]]:_(s32) = G_FADD [[FMUL6]], [[FPEXT2]]
+    ; VI-NEXT: [[FMUL7:%[0-9]+]]:_(s32) = G_FMUL [[FADD4]], [[INT2]]
+    ; VI-NEXT: [[FADD5:%[0-9]+]]:_(s32) = G_FADD [[FMUL7]], [[FMUL5]]
+    ; VI-NEXT: [[FMUL8:%[0-9]+]]:_(s32) = G_FMUL [[FNEG1]], [[FADD5]]
+    ; VI-NEXT: [[FADD6:%[0-9]+]]:_(s32) = G_FADD [[FMUL8]], [[FPEXT2]]
+    ; VI-NEXT: [[FMUL9:%[0-9]+]]:_(s32) = G_FMUL [[FADD6]], [[INT2]]
+    ; VI-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[FMUL9]], [[C1]]
+    ; VI-NEXT: [[FADD7:%[0-9]+]]:_(s32) = G_FADD [[AND1]], [[FADD5]]
+    ; VI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD7]](s32)
     ; VI-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC3]](s16), [[TRUNC1]](s16)
     ; VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[INT1]](s16)
     ; VI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[INT3]](s16)
@@ -1387,6 +1451,7 @@ body: |
     ; VI-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]]
     ; VI-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
     ; VI-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
+    ;
     ; GFX9-LABEL: name: test_fdiv_v2s16
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9-NEXT: {{  $}}
@@ -1403,18 +1468,40 @@ body: |
     ; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
     ; GFX9-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
     ; GFX9-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16)
+    ; GFX9-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]]
     ; GFX9-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
     ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
-    ; GFX9-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
+    ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FMUL]]
+    ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[FPEXT]]
+    ; GFX9-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FADD]], [[INT]]
+    ; GFX9-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[FMUL]]
+    ; GFX9-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FADD1]]
+    ; GFX9-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FPEXT]]
+    ; GFX9-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[FADD2]], [[INT]]
+    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -8388608
+    ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[FMUL4]], [[C1]]
+    ; GFX9-NEXT: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[AND]], [[FADD1]]
+    ; GFX9-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD3]](s32)
     ; GFX9-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC2]](s16), [[TRUNC]](s16)
     ; GFX9-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
     ; GFX9-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16)
+    ; GFX9-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT3]]
     ; GFX9-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32)
-    ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]]
-    ; GFX9-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32)
+    ; GFX9-NEXT: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]]
+    ; GFX9-NEXT: [[FMUL6:%[0-9]+]]:_(s32) = G_FMUL [[FNEG1]], [[FMUL5]]
+    ; GFX9-NEXT: [[FADD4:%[0-9]+]]:_(s32) = G_FADD [[FMUL6]], [[FPEXT2]]
+    ; GFX9-NEXT: [[FMUL7:%[0-9]+]]:_(s32) = G_FMUL [[FADD4]], [[INT2]]
+    ; GFX9-NEXT: [[FADD5:%[0-9]+]]:_(s32) = G_FADD [[FMUL7]], [[FMUL5]]
+    ; GFX9-NEXT: [[FMUL8:%[0-9]+]]:_(s32) = G_FMUL [[FNEG1]], [[FADD5]]
+    ; GFX9-NEXT: [[FADD6:%[0-9]+]]:_(s32) = G_FADD [[FMUL8]], [[FPEXT2]]
+    ; GFX9-NEXT: [[FMUL9:%[0-9]+]]:_(s32) = G_FMUL [[FADD6]], [[INT2]]
+    ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[FMUL9]], [[C1]]
+    ; GFX9-NEXT: [[FADD7:%[0-9]+]]:_(s32) = G_FADD [[AND1]], [[FADD5]]
+    ; GFX9-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD7]](s32)
     ; GFX9-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC3]](s16), [[TRUNC1]](s16)
     ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT1]](s16), [[INT3]](s16)
     ; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
+    ;
     ; GFX9-UNSAFE-LABEL: name: test_fdiv_v2s16
     ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1
     ; GFX9-UNSAFE-NEXT: {{  $}}
@@ -1435,34 +1522,6 @@ body: |
     ; GFX9-UNSAFE-NEXT: [[FMUL1:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC1]], [[INT1]]
     ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FMUL]](s16), [[FMUL1]](s16)
     ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
-    ; GFX10-LABEL: name: test_fdiv_v2s16
-    ; GFX10: liveins: $vgpr0, $vgpr1
-    ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
-    ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
-    ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
-    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
-    ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
-    ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
-    ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
-    ; GFX10-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
-    ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
-    ; GFX10-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
-    ; GFX10-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16)
-    ; GFX10-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
-    ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
-    ; GFX10-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
-    ; GFX10-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC2]](s16), [[TRUNC]](s16)
-    ; GFX10-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
-    ; GFX10-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16)
-    ; GFX10-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32)
-    ; GFX10-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]]
-    ; GFX10-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32)
-    ; GFX10-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC3]](s16), [[TRUNC1]](s16)
-    ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT1]](s16), [[INT3]](s16)
-    ; GFX10-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
     %0:_(<2 x s16>) = COPY $vgpr0
     %1:_(<2 x s16>) = COPY $vgpr1
     %2:_(<2 x s16>) = G_FDIV %0, %1
@@ -1546,6 +1605,7 @@ body: |
     ; SI-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC2]](s16)
     ; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32)
     ; SI-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>)
+    ;
     ; VI-LABEL: name: test_fdiv_v3s16
     ; VI: liveins: $vgpr0, $vgpr1
     ; VI-NEXT: {{  $}}
@@ -1568,27 +1628,59 @@ body: |
     ; VI-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32)
     ; VI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
     ; VI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16)
+    ; VI-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]]
     ; VI-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
     ; VI-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
-    ; VI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
+    ; VI-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FMUL]]
+    ; VI-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[FPEXT]]
+    ; VI-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FADD]], [[INT]]
+    ; VI-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[FMUL]]
+    ; VI-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FADD1]]
+    ; VI-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FPEXT]]
+    ; VI-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[FADD2]], [[INT]]
+    ; VI-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -8388608
+    ; VI-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[FMUL4]], [[C1]]
+    ; VI-NEXT: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[AND]], [[FADD1]]
+    ; VI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD3]](s32)
     ; VI-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC3]](s16), [[TRUNC]](s16)
     ; VI-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
     ; VI-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC4]](s16)
+    ; VI-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT3]]
     ; VI-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32)
-    ; VI-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]]
-    ; VI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32)
+    ; VI-NEXT: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]]
+    ; VI-NEXT: [[FMUL6:%[0-9]+]]:_(s32) = G_FMUL [[FNEG1]], [[FMUL5]]
+    ; VI-NEXT: [[FADD4:%[0-9]+]]:_(s32) = G_FADD [[FMUL6]], [[FPEXT2]]
+    ; VI-NEXT: [[FMUL7:%[0-9]+]]:_(s32) = G_FMUL [[FADD4]], [[INT2]]
+    ; VI-NEXT: [[FADD5:%[0-9]+]]:_(s32) = G_FADD [[FMUL7]], [[FMUL5]]
+    ; VI-NEXT: [[FMUL8:%[0-9]+]]:_(s32) = G_FMUL [[FNEG1]], [[FADD5]]
+    ; VI-NEXT: [[FADD6:%[0-9]+]]:_(s32) = G_FADD [[FMUL8]], [[FPEXT2]]
+    ; VI-NEXT: [[FMUL9:%[0-9]+]]:_(s32) = G_FMUL [[FADD6]], [[INT2]]
+    ; VI-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[FMUL9]], [[C1]]
+    ; VI-NEXT: [[FADD7:%[0-9]+]]:_(s32) = G_FADD [[AND1]], [[FADD5]]
+    ; VI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD7]](s32)
     ; VI-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC4]](s16), [[TRUNC1]](s16)
     ; VI-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16)
     ; VI-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC5]](s16)
+    ; VI-NEXT: [[FNEG2:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT5]]
     ; VI-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT5]](s32)
-    ; VI-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT4]]
-    ; VI-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL2]](s32)
+    ; VI-NEXT: [[FMUL10:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT4]]
+    ; VI-NEXT: [[FMUL11:%[0-9]+]]:_(s32) = G_FMUL [[FNEG2]], [[FMUL10]]
+    ; VI-NEXT: [[FADD8:%[0-9]+]]:_(s32) = G_FADD [[FMUL11]], [[FPEXT4]]
+    ; VI-NEXT: [[FMUL12:%[0-9]+]]:_(s32) = G_FMUL [[FADD8]], [[INT4]]
+    ; VI-NEXT: [[FADD9:%[0-9]+]]:_(s32) = G_FADD [[FMUL12]], [[FMUL10]]
+    ; VI-NEXT: [[FMUL13:%[0-9]+]]:_(s32) = G_FMUL [[FNEG2]], [[FADD9]]
+    ; VI-NEXT: [[FADD10:%[0-9]+]]:_(s32) = G_FADD [[FMUL13]], [[FPEXT4]]
+    ; VI-NEXT: [[FMUL14:%[0-9]+]]:_(s32) = G_FMUL [[FADD10]], [[INT4]]
+    ; VI-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[FMUL14]], [[C1]]
+    ; VI-NEXT: [[FADD11:%[0-9]+]]:_(s32) = G_FADD [[AND2]], [[FADD9]]
+    ; VI-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD11]](s32)
     ; VI-NEXT: [[INT5:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC2]](s16), [[TRUNC5]](s16), [[TRUNC2]](s16)
     ; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16)
     ; VI-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[INT3]](s16)
     ; VI-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[INT5]](s16)
     ; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32)
     ; VI-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>)
+    ;
     ; GFX9-LABEL: name: test_fdiv_v3s16
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9-NEXT: {{  $}}
@@ -1611,27 +1703,59 @@ body: |
     ; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32)
     ; GFX9-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
     ; GFX9-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16)
+    ; GFX9-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]]
     ; GFX9-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
     ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
-    ; GFX9-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
+    ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FMUL]]
+    ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[FPEXT]]
+    ; GFX9-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FADD]], [[INT]]
+    ; GFX9-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[FMUL]]
+    ; GFX9-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FADD1]]
+    ; GFX9-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FPEXT]]
+    ; GFX9-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[FADD2]], [[INT]]
+    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -8388608
+    ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[FMUL4]], [[C1]]
+    ; GFX9-NEXT: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[AND]], [[FADD1]]
+    ; GFX9-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD3]](s32)
     ; GFX9-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC3]](s16), [[TRUNC]](s16)
     ; GFX9-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
     ; GFX9-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC4]](s16)
+    ; GFX9-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT3]]
     ; GFX9-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32)
-    ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]]
-    ; GFX9-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32)
+    ; GFX9-NEXT: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]]
+    ; GFX9-NEXT: [[FMUL6:%[0-9]+]]:_(s32) = G_FMUL [[FNEG1]], [[FMUL5]]
+    ; GFX9-NEXT: [[FADD4:%[0-9]+]]:_(s32) = G_FADD [[FMUL6]], [[FPEXT2]]
+    ; GFX9-NEXT: [[FMUL7:%[0-9]+]]:_(s32) = G_FMUL [[FADD4]], [[INT2]]
+    ; GFX9-NEXT: [[FADD5:%[0-9]+]]:_(s32) = G_FADD [[FMUL7]], [[FMUL5]]
+    ; GFX9-NEXT: [[FMUL8:%[0-9]+]]:_(s32) = G_FMUL [[FNEG1]], [[FADD5]]
+    ; GFX9-NEXT: [[FADD6:%[0-9]+]]:_(s32) = G_FADD [[FMUL8]], [[FPEXT2]]
+    ; GFX9-NEXT: [[FMUL9:%[0-9]+]]:_(s32) = G_FMUL [[FADD6]], [[INT2]]
+    ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[FMUL9]], [[C1]]
+    ; GFX9-NEXT: [[FADD7:%[0-9]+]]:_(s32) = G_FADD [[AND1]], [[FADD5]]
+    ; GFX9-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD7]](s32)
     ; GFX9-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC4]](s16), [[TRUNC1]](s16)
     ; GFX9-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16)
     ; GFX9-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC5]](s16)
+    ; GFX9-NEXT: [[FNEG2:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT5]]
     ; GFX9-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT5]](s32)
-    ; GFX9-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT4]]
-    ; GFX9-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL2]](s32)
+    ; GFX9-NEXT: [[FMUL10:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT4]]
+    ; GFX9-NEXT: [[FMUL11:%[0-9]+]]:_(s32) = G_FMUL [[FNEG2]], [[FMUL10]]
+    ; GFX9-NEXT: [[FADD8:%[0-9]+]]:_(s32) = G_FADD [[FMUL11]], [[FPEXT4]]
+    ; GFX9-NEXT: [[FMUL12:%[0-9]+]]:_(s32) = G_FMUL [[FADD8]], [[INT4]]
+    ; GFX9-NEXT: [[FADD9:%[0-9]+]]:_(s32) = G_FADD [[FMUL12]], [[FMUL10]]
+    ; GFX9-NEXT: [[FMUL13:%[0-9]+]]:_(s32) = G_FMUL [[FNEG2]], [[FADD9]]
+    ; GFX9-NEXT: [[FADD10:%[0-9]+]]:_(s32) = G_FADD [[FMUL13]], [[FPEXT4]]
+    ; GFX9-NEXT: [[FMUL14:%[0-9]+]]:_(s32) = G_FMUL [[FADD10]], [[INT4]]
+    ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[FMUL14]], [[C1]]
+    ; GFX9-NEXT: [[FADD11:%[0-9]+]]:_(s32) = G_FADD [[AND2]], [[FADD9]]
+    ; GFX9-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD11]](s32)
     ; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC2]](s16), [[TRUNC5]](s16), [[TRUNC2]](s16)
     ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16)
     ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[INT3]](s16)
     ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[INT5]](s16)
     ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32)
     ; GFX9-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>)
+    ;
     ; GFX9-UNSAFE-LABEL: name: test_fdiv_v3s16
     ; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1
     ; GFX9-UNSAFE-NEXT: {{  $}}
@@ -1663,49 +1787,6 @@ body: |
     ; GFX9-UNSAFE-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[FMUL2]](s16)
     ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32)
     ; GFX9-UNSAFE-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>)
-    ; GFX10-LABEL: name: test_fdiv_v3s16
-    ; GFX10: liveins: $vgpr0, $vgpr1
-    ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
-    ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
-    ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
-    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
-    ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
-    ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
-    ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
-    ; GFX10-NEXT: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
-    ; GFX10-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF1]](<4 x s16>)
-    ; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>)
-    ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
-    ; GFX10-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
-    ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
-    ; GFX10-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
-    ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32)
-    ; GFX10-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
-    ; GFX10-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16)
-    ; GFX10-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
-    ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
-    ; GFX10-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
-    ; GFX10-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC3]](s16), [[TRUNC]](s16)
-    ; GFX10-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
-    ; GFX10-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC4]](s16)
-    ; GFX10-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32)
-    ; GFX10-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]]
-    ; GFX10-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32)
-    ; GFX10-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC4]](s16), [[TRUNC1]](s16)
-    ; GFX10-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16)
-    ; GFX10-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC5]](s16)
-    ; GFX10-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT5]](s32)
-    ; GFX10-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT4]]
-    ; GFX10-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL2]](s32)
-    ; GFX10-NEXT: [[INT5:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC2]](s16), [[TRUNC5]](s16), [[TRUNC2]](s16)
-    ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16)
-    ; GFX10-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[INT3]](s16)
-    ; GFX10-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[INT5]](s16)
-    ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32)
-    ; GFX10-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>)
     %0:_(<3 x s16>) = G_IMPLICIT_DEF
     %1:_(<3 x s16>) = G_IMPLICIT_DEF
     %2:_(<3 x s16>) = G_FDIV %0, %1
@@ -1816,6 +1897,7 @@ body: |
     ; SI-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
     ; SI-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>)
     ; SI-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+    ;
     ; VI-LABEL: name: test_fdiv_v4s16
     ; VI: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; VI-NEXT: {{  $}}
@@ -1842,27 +1924,68 @@ body: |
     ; VI-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32)
     ; VI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
     ; VI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC4]](s16)
+    ; VI-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]]
     ; VI-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
     ; VI-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
-    ; VI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
+    ; VI-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FMUL]]
+    ; VI-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[FPEXT]]
+    ; VI-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FADD]], [[INT]]
+    ; VI-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[FMUL]]
+    ; VI-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FADD1]]
+    ; VI-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FPEXT]]
+    ; VI-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[FADD2]], [[INT]]
+    ; VI-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -8388608
+    ; VI-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[FMUL4]], [[C1]]
+    ; VI-NEXT: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[AND]], [[FADD1]]
+    ; VI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD3]](s32)
     ; VI-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC4]](s16), [[TRUNC]](s16)
     ; VI-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
     ; VI-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC5]](s16)
+    ; VI-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT3]]
     ; VI-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32)
-    ; VI-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]]
-    ; VI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32)
+    ; VI-NEXT: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]]
+    ; VI-NEXT: [[FMUL6:%[0-9]+]]:_(s32) = G_FMUL [[FNEG1]], [[FMUL5]]
+    ; VI-NEXT: [[FADD4:%[0-9]+]]:_(s32) = G_FADD [[FMUL6]], [[FPEXT2]]
+    ; VI-NEXT: [[FMUL7:%[0-9]+]]:_(s32) = G_FMUL [[FADD4]], [[INT2]]
+    ; VI-NEXT: [[FADD5:%[0-9]+]]:_(s32) = G_FADD [[FMUL7]], [[FMUL5]]
+    ; VI-NEXT: [[FMUL8:%[0-9]+]]:_(s32) = G_FMUL [[FNEG1]], [[FADD5]]
+    ; VI-NEXT: [[FADD6:%[0-9]+]]:_(s32) = G_FADD [[FMUL8]], [[FPEXT2]]
+    ; VI-NEXT: [[FMUL9:%[0-9]+]]:_(s32) = G_FMUL [[FADD6]], [[INT2]]
+    ; VI-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[FMUL9]], [[C1]]
+    ; VI-NEXT: [[FADD7:%[0-9]+]]:_(s32) = G_FADD [[AND1]], [[FADD5]]
+    ; VI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD7]](s32)
     ; VI-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC5]](s16), [[TRUNC1]](s16)
     ; VI-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16)
     ; VI-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC6]](s16)
+    ; VI-NEXT: [[FNEG2:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT5]]
     ; VI-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT5]](s32)
-    ; VI-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT4]]
-    ; VI-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL2]](s32)
+    ; VI-NEXT: [[FMUL10:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT4]]
+    ; VI-NEXT: [[FMUL11:%[0-9]+]]:_(s32) = G_FMUL [[FNEG2]], [[FMUL10]]
+    ; VI-NEXT: [[FADD8:%[0-9]+]]:_(s32) = G_FADD [[FMUL11]], [[FPEXT4]]
+    ; VI-NEXT: [[FMUL12:%[0-9]+]]:_(s32) = G_FMUL [[FADD8]], [[INT4]]
+    ; VI-NEXT: [[FADD9:%[0-9]+]]:_(s32) = G_FADD [[FMUL12]], [[FMUL10]]
+    ; VI-NEXT: [[FMUL13:%[0-9]+]]:_(s32) = G_FMUL [[FNEG2]], [[FADD9]]
+    ; VI-NEXT: [[FADD10:%[0-9]+]]:_(s32) = G_FADD [[FMUL13]], [[FPEXT4]]
+    ; VI-NEXT: [[FMUL14:%[0-9]+]]:_(s32) = G_FMUL [[FADD10]], [[INT4]]
+    ; VI-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[FMUL14]], [[C1]]
+    ; VI-NEXT: [[FADD11:%[0-9]+]]:_(s32) = G_FADD [[AND2]], [[FADD9]]
+    ; VI-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD11]](s32)
     ; VI-NEXT: [[INT5:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC2]](s16), [[TRUNC6]](s16), [[TRUNC2]](s16)
     ; VI-NEXT: [[FPEXT6:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16)
     ; VI-NEXT: [[FPEXT7:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC7]](s16)
+    ; VI-NEXT: [[FNEG3:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT7]]
     ; VI-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT7]](s32)
-    ; VI-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT6]], [[INT6]]
-    ; VI-NEXT: [[FPTRUNC3:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL3]](s32)
+    ; VI-NEXT: [[FMUL15:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT6]], [[INT6]]
+    ; VI-NEXT: [[FMUL16:%[0-9]+]]:_(s32) = G_FMUL [[FNEG3]], [[FMUL15]]
+    ; VI-NEXT: [[FADD12:%[0-9]+]]:_(s32) = G_FADD [[FMUL16]], [[FPEXT6]]
+    ; VI-NEXT: [[FMUL17:%[0-9]+]]:_(s32) = G_FMUL [[FADD12]], [[INT6]]
+    ; VI-NEXT: [[FADD13:%[0-9]+]]:_(s32) = G_FADD [[FMUL17]], [[FMUL15]]
+    ; VI-NEXT: [[FMUL18:%[0-9]+]]:_(s32) = G_FMUL [[FNEG3]], [[FADD13]]
+    ; VI-NEXT: [[FADD14:%[0-9]+]]:_(s32) = G_FADD [[FMUL18]], [[FPEXT6]]
+    ; VI-NEXT: [[FMUL19:%[0-9]+]]:_(s32) = G_FMUL [[FADD14]], [[INT6]]
+    ; VI-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[FMUL19]], [[C1]]
+    ; VI-NEXT: [[FADD15:%[0-9]+]]:_(s32) = G_FADD [[AND3]], [[FADD13]]
+    ; VI-NEXT: [[FPTRUNC3:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD15]](s32)
     ; VI-NEXT: [[INT7:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC3]](s16), [[TRUNC7]](s16), [[TRUNC3]](s16)
     ; VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[INT1]](s16)
     ; VI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[INT3]](s16)
@@ -1876,6 +1999,7 @@ body: |
     ; VI-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
     ; VI-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>)
     ; VI-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+    ;
     ; GFX9-LABEL: name: test_fdiv_v4s16
     ; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9-NEXT: {{  $}}
@@ -1902,32 +2026,74 @@ body: |
     ; GFX9-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32)
     ; GFX9-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
     ; GFX9-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC4]](s16)
+    ; GFX9-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]]
     ; GFX9-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
     ; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
-    ; GFX9-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
+    ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FMUL]]
+    ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[FPEXT]]
+    ; GFX9-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FADD]], [[INT]]
+    ; GFX9-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[FMUL]]
+    ; GFX9-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FADD1]]
+    ; GFX9-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FPEXT]]
+    ; GFX9-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[FADD2]], [[INT]]
+    ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -8388608
+    ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[FMUL4]], [[C1]]
+    ; GFX9-NEXT: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[AND]], [[FADD1]]
+    ; GFX9-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD3]](s32)
     ; GFX9-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC4]](s16), [[TRUNC]](s16)
     ; GFX9-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
     ; GFX9-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC5]](s16)
+    ; GFX9-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT3]]
     ; GFX9-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32)
-    ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]]
-    ; GFX9-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32)
+    ; GFX9-NEXT: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]]
+    ; GFX9-NEXT: [[FMUL6:%[0-9]+]]:_(s32) = G_FMUL [[FNEG1]], [[FMUL5]]
+    ; GFX9-NEXT: [[FADD4:%[0-9]+]]:_(s32) = G_FADD [[FMUL6]], [[FPEXT2]]
+    ; GFX9-NEXT: [[FMUL7:%[0-9]+]]:_(s32) = G_FMUL [[FADD4]], [[INT2]]
+    ; GFX9-NEXT: [[FADD5:%[0-9]+]]:_(s32) = G_FADD [[FMUL7]], [[FMUL5]]
+    ; GFX9-NEXT: [[FMUL8:%[0-9]+]]:_(s32) = G_FMUL [[FNEG1]], [[FADD5]]
+    ; GFX9-NEXT: [[FADD6:%[0-9]+]]:_(s32) = G_FADD [[FMUL8]], [[FPEXT2]]
+    ; GFX9-NEXT: [[FMUL9:%[0-9]+]]:_(s32) = G_FMUL [[FADD6]], [[INT2]]
+    ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[FMUL9]], [[C1]]
+    ; GFX9-NEXT: [[FADD7:%[0-9]+]]:_(s32) = G_FADD [[AND1]], [[FADD5]]
+    ; GFX9-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD7]](s32)
     ; GFX9-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC5]](s16), [[TRUNC1]](s16)
     ; GFX9-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16)
     ; GFX9-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC6]](s16)
+    ; GFX9-NEXT: [[FNEG2:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT5]]
     ; GFX9-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT5]](s32)
-    ; GFX9-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT4]]
-    ; GFX9-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL2]](s32)
+    ; GFX9-NEXT: [[FMUL10:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT4]]
+    ; GFX9-NEXT: [[FMUL11:%[0-9]+]]:_(s32) = G_FMUL [[FNEG2]], [[FMUL10]]
+    ; GFX9-NEXT: [[FADD8:%[0-9]+]]:_(s32) = G_FADD [[FMUL11]], [[FPEXT4]]
+    ; GFX9-NEXT: [[FMUL12:%[0-9]+]]:_(s32) = G_FMUL [[FADD8]], [[INT4]]
+    ; GFX9-NEXT: [[FADD9:%[0-9]+]]:_(s32) = G_FADD [[FMUL12]], [[FMUL10]]
+    ; GFX9-NEXT: [[FMUL13:%[0-9]+]]:_(s32) = G_FMUL [[FNEG2]], [[FADD9]]
+    ; GFX9-NEXT: [[FADD10:%[0-9]+]]:_(s32) = G_FADD [[FMUL13]], [[FPEXT4]]
+    ; GFX9-NEXT: [[FMUL14:%[0-9]+]]:_(s32) = G_FMUL [[FADD10]], [[INT4]]
+    ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[FMUL14]], [[C1]]
+    ; GFX9-NEXT: [[FADD11:%[0-9]+]]:_(s32) = G_FADD [[AND2]], [[FADD9]]
+    ; GFX9-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD11]](s32)
     ; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC2]](s16), [[TRUNC6]](s16), [[TRUNC2]](s16)
     ; GFX9-NEXT: [[FPEXT6:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16)
     ; GFX9-NEXT: [[FPEXT7:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC7]](s16)
+    ; GFX9-NEXT: [[FNEG3:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT7]]
     ; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT7]](s32)
-    ; GFX9-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT6]], [[INT6]]
-    ; GFX9-NEXT: [[FPTRUNC3:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL3]](s32)
+    ; GFX9-NEXT: [[FMUL15:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT6]], [[INT6]]
+    ; GFX9-NEXT: [[FMUL16:%[0-9]+]]:_(s32) = G_FMUL [[FNEG3]], [[FMUL15]]
+    ; GFX9-NEXT: [[FADD12:%[0-9]+]]:_(s32) = G_FADD [[FMUL16]], [[FPEXT6]]
+    ; GFX9-NEXT: [[FMUL17:%[0-9]+]]:_(s32) = G_FMUL [[FADD12]], [[INT6]]
+    ; GFX9-NEXT: [[FADD13:%[0-9]+]]:_(s32) = G_FADD [[FMUL17]], [[FMUL15]]
+    ; GFX9-NEXT: [[FMUL18:%[0-9]+]]:_(s32) = G_FMUL [[FNEG3]], [[FADD13]]
+    ; GFX9-NEXT: [[FADD14:%[0-9]+]]:_(s32) = G_FADD [[FMUL18]], [[FPEXT6]]
+    ; GFX9-NEXT: [[FMUL19:%[0-9]+]]:_(s32) = G_FMUL [[FADD14]], [[INT6]]
+    ; GFX9-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[FMUL19]], [[C1]]
+    ; GFX9-NEXT: [[FADD15:%[0-9]+]]:_(s32) = G_FADD [[AND3]], [[FADD13]]
+    ; GFX9-NEXT: [[FPTRUNC3:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD15]](s32)
     ; GFX9-NEXT: [[INT7:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC3]](s16), [[TRUNC7]](s16), [[TRUNC3]](s16)
     ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT1]](s16), [[INT3]](s16)
     ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT5]](s16), [[INT7]](s16)
     ; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+    ;
     ; GFX9-UNSAFE-LABEL: name: test_fdiv_v4s16
     ; GFX9-UNSAFE: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
     ; GFX9-UNSAFE-NEXT: {{  $}}
@@ -1964,58 +2130,6 @@ body: |
     ; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FMUL2]](s16), [[FMUL3]](s16)
     ; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
     ; GFX9-UNSAFE-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
-    ; GFX10-LABEL: name: test_fdiv_v4s16
-    ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
-    ; GFX10-NEXT: {{  $}}
-    ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1
-    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3
-    ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>)
-    ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
-    ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
-    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-    ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
-    ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
-    ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
-    ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
-    ; GFX10-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
-    ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
-    ; GFX10-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>)
-    ; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>)
-    ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
-    ; GFX10-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
-    ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
-    ; GFX10-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
-    ; GFX10-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32)
-    ; GFX10-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
-    ; GFX10-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32)
-    ; GFX10-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
-    ; GFX10-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC4]](s16)
-    ; GFX10-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
-    ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
-    ; GFX10-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
-    ; GFX10-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC4]](s16), [[TRUNC]](s16)
-    ; GFX10-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
-    ; GFX10-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC5]](s16)
-    ; GFX10-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32)
-    ; GFX10-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]]
-    ; GFX10-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32)
-    ; GFX10-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC5]](s16), [[TRUNC1]](s16)
-    ; GFX10-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16)
-    ; GFX10-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC6]](s16)
-    ; GFX10-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT5]](s32)
-    ; GFX10-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT4]]
-    ; GFX10-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL2]](s32)
-    ; GFX10-NEXT: [[INT5:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC2]](s16), [[TRUNC6]](s16), [[TRUNC2]](s16)
-    ; GFX10-NEXT: [[FPEXT6:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16)
-    ; GFX10-NEXT: [[FPEXT7:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC7]](s16)
-    ; GFX10-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT7]](s32)
-    ; GFX10-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT6]], [[INT6]]
-    ; GFX10-NEXT: [[FPTRUNC3:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL3]](s32)
-    ; GFX10-NEXT: [[INT7:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC3]](s16), [[TRUNC7]](s16), [[TRUNC3]](s16)
-    ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT1]](s16), [[INT3]](s16)
-    ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT5]](s16), [[INT7]](s16)
-    ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
-    ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
     %0:_(<4 x s16>) = COPY $vgpr0_vgpr1
     %1:_(<4 x s16>) = COPY $vgpr2_vgpr3
     %2:_(<4 x s16>) = G_FDIV %0, %1
@@ -2052,6 +2166,7 @@ body: |
     ; SI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT6]](s32)
     ; SI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16)
     ; SI-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+    ;
     ; VI-LABEL: name: test_fdiv_s16_constant_one_rcp
     ; VI: liveins: $vgpr0
     ; VI-NEXT: {{  $}}
@@ -2060,6 +2175,7 @@ body: |
     ; VI-NEXT: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC]](s16)
     ; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
     ; VI-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+    ;
     ; GFX9-LABEL: name: test_fdiv_s16_constant_one_rcp
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
@@ -2068,6 +2184,7 @@ body: |
     ; GFX9-NEXT: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC]](s16)
     ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
     ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+    ;
     ; GFX9-UNSAFE-LABEL: name: test_fdiv_s16_constant_one_rcp
     ; GFX9-UNSAFE: liveins: $vgpr0
     ; GFX9-UNSAFE-NEXT: {{  $}}
@@ -2076,6 +2193,7 @@ body: |
     ; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC]](s16)
     ; GFX9-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
     ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+    ;
     ; GFX10-LABEL: name: test_fdiv_s16_constant_one_rcp
     ; GFX10: liveins: $vgpr0
     ; GFX10-NEXT: {{  $}}
@@ -2122,6 +2240,7 @@ body: |
     ; SI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT6]](s32)
     ; SI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16)
     ; SI-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+    ;
     ; VI-LABEL: name: test_fdiv_s16_constant_negative_one_rcp
     ; VI: liveins: $vgpr0
     ; VI-NEXT: {{  $}}
@@ -2131,6 +2250,7 @@ body: |
     ; VI-NEXT: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FNEG]](s16)
     ; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
     ; VI-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+    ;
     ; GFX9-LABEL: name: test_fdiv_s16_constant_negative_one_rcp
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
@@ -2140,6 +2260,7 @@ body: |
     ; GFX9-NEXT: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FNEG]](s16)
     ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
     ; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+    ;
     ; GFX9-UNSAFE-LABEL: name: test_fdiv_s16_constant_negative_one_rcp
     ; GFX9-UNSAFE: liveins: $vgpr0
     ; GFX9-UNSAFE-NEXT: {{  $}}
@@ -2149,6 +2270,7 @@ body: |
     ; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FNEG]](s16)
     ; GFX9-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
     ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+    ;
     ; GFX10-LABEL: name: test_fdiv_s16_constant_negative_one_rcp
     ; GFX10: liveins: $vgpr0
     ; GFX10-NEXT: {{  $}}
@@ -2190,6 +2312,7 @@ body: |
     ; SI-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
     ; SI-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY]](s32), [[C]](s32)
     ; SI-NEXT: $vgpr0 = COPY [[INT6]](s32)
+    ;
     ; VI-LABEL: name: test_fdiv_s32_constant_one_rcp
     ; VI: liveins: $vgpr0
     ; VI-NEXT: {{  $}}
@@ -2208,6 +2331,7 @@ body: |
     ; VI-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
     ; VI-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY]](s32), [[C]](s32)
     ; VI-NEXT: $vgpr0 = COPY [[INT6]](s32)
+    ;
     ; GFX9-LABEL: name: test_fdiv_s32_constant_one_rcp
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
@@ -2226,12 +2350,14 @@ body: |
     ; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
     ; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY]](s32), [[C]](s32)
     ; GFX9-NEXT: $vgpr0 = COPY [[INT6]](s32)
+    ;
     ; GFX9-UNSAFE-LABEL: name: test_fdiv_s32_constant_one_rcp
     ; GFX9-UNSAFE: liveins: $vgpr0
     ; GFX9-UNSAFE-NEXT: {{  $}}
     ; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY]](s32)
     ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[INT]](s32)
+    ;
     ; GFX10-LABEL: name: test_fdiv_s32_constant_one_rcp
     ; GFX10: liveins: $vgpr0
     ; GFX10-NEXT: {{  $}}
@@ -2281,6 +2407,7 @@ body: |
     ; SI-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
     ; SI-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY]](s32), [[C]](s32)
     ; SI-NEXT: $vgpr0 = COPY [[INT6]](s32)
+    ;
     ; VI-LABEL: name: test_fdiv_s32_constant_negative_one_rcp
     ; VI: liveins: $vgpr0
     ; VI-NEXT: {{  $}}
@@ -2300,6 +2427,7 @@ body: |
     ; VI-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
     ; VI-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY]](s32), [[C]](s32)
     ; VI-NEXT: $vgpr0 = COPY [[INT6]](s32)
+    ;
     ; GFX9-LABEL: name: test_fdiv_s32_constant_negative_one_rcp
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
@@ -2319,6 +2447,7 @@ body: |
     ; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
     ; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY]](s32), [[C]](s32)
     ; GFX9-NEXT: $vgpr0 = COPY [[INT6]](s32)
+    ;
     ; GFX9-UNSAFE-LABEL: name: test_fdiv_s32_constant_negative_one_rcp
     ; GFX9-UNSAFE: liveins: $vgpr0
     ; GFX9-UNSAFE-NEXT: {{  $}}
@@ -2326,6 +2455,7 @@ body: |
     ; GFX9-UNSAFE-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[COPY]]
     ; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FNEG]](s32)
     ; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[INT]](s32)
+    ;
     ; GFX10-LABEL: name: test_fdiv_s32_constant_negative_one_rcp
     ; GFX10: liveins: $vgpr0
     ; GFX10-NEXT: {{  $}}
@@ -2389,6 +2519,7 @@ body: |
     ; SI-NEXT: [[INT5:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s64), [[FMA3]](s64), [[FMUL]](s64), [[XOR]](s1)
     ; SI-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY]](s64), [[C]](s64)
     ; SI-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64)
+    ;
     ; VI-LABEL: name: test_fdiv_s64_constant_one_rcp
     ; VI: liveins: $vgpr0_vgpr1
     ; VI-NEXT: {{  $}}
@@ -2407,6 +2538,7 @@ body: |
     ; VI-NEXT: [[INT5:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s64), [[FMA3]](s64), [[FMUL]](s64), [[INT4]](s1)
     ; VI-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY]](s64), [[C]](s64)
     ; VI-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64)
+    ;
     ; GFX9-LABEL: name: test_fdiv_s64_constant_one_rcp
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
@@ -2425,6 +2557,7 @@ body: |
     ; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s64), [[FMA3]](s64), [[FMUL]](s64), [[INT4]](s1)
     ; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY]](s64), [[C]](s64)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64)
+    ;
     ; GFX9-UNSAFE-LABEL: name: test_fdiv_s64_constant_one_rcp
     ; GFX9-UNSAFE: liveins: $vgpr0_vgpr1
     ; GFX9-UNSAFE-NEXT: {{  $}}
@@ -2440,6 +2573,7 @@ body: |
     ; GFX9-UNSAFE-NEXT: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL]], [[C]]
     ; GFX9-UNSAFE-NEXT: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FMA4]], [[FMA3]], [[FMUL]]
     ; GFX9-UNSAFE-NEXT: $vgpr0_vgpr1 = COPY [[FMA5]](s64)
+    ;
     ; GFX10-LABEL: name: test_fdiv_s64_constant_one_rcp
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
@@ -2503,6 +2637,7 @@ body: |
     ; SI-NEXT: [[INT5:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s64), [[FMA3]](s64), [[FMUL]](s64), [[XOR]](s1)
     ; SI-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY]](s64), [[C]](s64)
     ; SI-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64)
+    ;
     ; VI-LABEL: name: test_fdiv_s64_constant_negative_one_rcp
     ; VI: liveins: $vgpr0_vgpr1
     ; VI-NEXT: {{  $}}
@@ -2522,6 +2657,7 @@ body: |
     ; VI-NEXT: [[INT5:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s64), [[FMA3]](s64), [[FMUL]](s64), [[INT4]](s1)
     ; VI-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY]](s64), [[C]](s64)
     ; VI-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64)
+    ;
     ; GFX9-LABEL: name: test_fdiv_s64_constant_negative_one_rcp
     ; GFX9: liveins: $vgpr0_vgpr1
     ; GFX9-NEXT: {{  $}}
@@ -2541,6 +2677,7 @@ body: |
     ; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s64), [[FMA3]](s64), [[FMUL]](s64), [[INT4]](s1)
     ; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY]](s64), [[C]](s64)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64)
+    ;
     ; GFX9-UNSAFE-LABEL: name: test_fdiv_s64_constant_negative_one_rcp
     ; GFX9-UNSAFE: liveins: $vgpr0_vgpr1
     ; GFX9-UNSAFE-NEXT: {{  $}}
@@ -2557,6 +2694,7 @@ body: |
     ; GFX9-UNSAFE-NEXT: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL]], [[C]]
     ; GFX9-UNSAFE-NEXT: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FMA4]], [[FMA3]], [[FMUL]]
     ; GFX9-UNSAFE-NEXT: $vgpr0_vgpr1 = COPY [[FMA5]](s64)
+    ;
     ; GFX10-LABEL: name: test_fdiv_s64_constant_negative_one_rcp
     ; GFX10: liveins: $vgpr0_vgpr1
     ; GFX10-NEXT: {{  $}}
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
index 7c89efd0a713c1..0c6805e3eba598 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -60,15 +60,21 @@ define amdgpu_kernel void @v_fdiv_f16(
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    flat_load_ushort v2, v[2:3] glc
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v3, s5
+; GFX8-NEXT:    v_mov_b32_e32 v6, s5
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v1, v5
 ; GFX8-NEXT:    v_cvt_f32_f16_e32 v0, v2
-; GFX8-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX8-NEXT:    v_mul_f32_e32 v0, v1, v0
-; GFX8-NEXT:    v_cvt_f16_f32_e32 v6, v0
+; GFX8-NEXT:    v_rcp_f32_e32 v3, v0
+; GFX8-NEXT:    v_mul_f32_e32 v7, v1, v3
+; GFX8-NEXT:    v_mad_f32 v8, -v0, v7, v1
+; GFX8-NEXT:    v_mac_f32_e32 v7, v8, v3
+; GFX8-NEXT:    v_mad_f32 v0, -v0, v7, v1
+; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v3
+; GFX8-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v7
+; GFX8-NEXT:    v_cvt_f16_f32_e32 v3, v0
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v4
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
-; GFX8-NEXT:    v_div_fixup_f16 v2, v6, v2, v5
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v6, vcc
+; GFX8-NEXT:    v_div_fixup_f16 v2, v3, v2, v5
 ; GFX8-NEXT:    flat_store_short v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -82,9 +88,17 @@ define amdgpu_kernel void @v_fdiv_f16(
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    global_load_ushort v2, v0, s[0:1] glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v4, v1
 ; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v2
 ; GFX9-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX9-NEXT:    v_mad_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
+; GFX9-NEXT:    v_mul_f32_e32 v4, v4, v3
+; GFX9-NEXT:    v_mad_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1]
+; GFX9-NEXT:    v_mac_f32_e32 v4, v5, v3
+; GFX9-NEXT:    v_mad_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1]
+; GFX9-NEXT:    v_mul_f32_e32 v3, v5, v3
+; GFX9-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX9-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX9-NEXT:    v_div_fixup_f16 v1, v3, v2, v1
 ; GFX9-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
@@ -100,9 +114,17 @@ define amdgpu_kernel void @v_fdiv_f16(
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    global_load_ushort v2, v0, s[0:1] glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v5, v1
 ; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v2
-; GFX10-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX10-NEXT:    v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
+; GFX10-NEXT:    v_rcp_f32_e32 v4, v3
+; GFX10-NEXT:    v_mul_f32_e32 v6, v5, v4
+; GFX10-NEXT:    v_mad_f32 v7, -v3, v6, v5
+; GFX10-NEXT:    v_mac_f32_e32 v6, v7, v4
+; GFX10-NEXT:    v_mad_f32 v3, -v3, v6, v5
+; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-NEXT:    v_add_f32_e32 v3, v3, v6
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX10-NEXT:    v_div_fixup_f16 v1, v3, v2, v1
 ; GFX10-NEXT:    global_store_short v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
@@ -120,11 +142,23 @@ define amdgpu_kernel void @v_fdiv_f16(
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    global_load_u16 v2, v0, s[0:1] glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v4, v1
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_rcp_f32_e32 v3, v3
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
+; GFX11-NEXT:    v_mul_f32_e32 v4, v4, v3
+; GFX11-NEXT:    v_fma_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fmac_f32_e32 v4, v5, v3
+; GFX11-NEXT:    v_fma_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v3, v5, v3
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_div_fixup_f16 v1, v3, v2, v1
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[4:5]
 ; GFX11-NEXT:    s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
index 301299daaa61f4..2eb35977b8160b 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -1444,12 +1444,19 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind {
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
 ; VI-NEXT:    v_cvt_f32_u32_e32 v0, v0
-; VI-NEXT:    s_movk_i32 s4, 0x7000
+; VI-NEXT:    s_mov_b32 s4, 0x46000000
 ; VI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; VI-NEXT:    v_rcp_f32_e32 v1, v1
-; VI-NEXT:    v_mul_f32_e32 v1, 0x46000000, v1
+; VI-NEXT:    v_rcp_f32_e32 v2, v1
+; VI-NEXT:    v_mul_f32_e32 v3, 0x46000000, v2
+; VI-NEXT:    v_mad_f32 v4, -v1, v3, s4
+; VI-NEXT:    v_mac_f32_e32 v3, v4, v2
+; VI-NEXT:    v_mad_f32 v1, -v1, v3, s4
+; VI-NEXT:    v_mul_f32_e32 v1, v1, v2
+; VI-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; VI-NEXT:    v_add_f32_e32 v1, v1, v3
 ; VI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; VI-NEXT:    s_movk_i32 s4, 0x7000
 ; VI-NEXT:    v_div_fixup_f16 v0, v1, v0, s4
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1457,12 +1464,18 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e64 v0, v0, 1
-; GFX10-NEXT:    s_mov_b32 s4, 0x46000000
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; GFX10-NEXT:    v_rcp_f32_e32 v1, v1
-; GFX10-NEXT:    v_fma_mixlo_f16 v1, v1, s4, 0
+; GFX10-NEXT:    v_rcp_f32_e32 v2, v1
+; GFX10-NEXT:    v_mul_f32_e32 v3, 0x46000000, v2
+; GFX10-NEXT:    v_mad_f32 v4, -v1, v3, 0x46000000
+; GFX10-NEXT:    v_mac_f32_e32 v3, v4, v2
+; GFX10-NEXT:    v_mad_f32 v1, -v1, v3, 0x46000000
+; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX10-NEXT:    v_div_fixup_f16 v0, v1, v0, 0x7000
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1478,8 +1491,18 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind {
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; GFX11-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_fma_mixlo_f16 v1, v1, s0, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v2, 0x46000000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_mix_f32 v3, -v0, v2, s0 op_sel_hi:[1,0,0]
+; GFX11-NEXT:    v_fmac_f32_e32 v2, v3, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_mix_f32 v3, -v0, v2, s0 op_sel_hi:[1,0,0]
+; GFX11-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; GFX11-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX11-NEXT:    v_div_fixup_f16 v0, v1, v0, 0x7000
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl nuw i32 1, %cnt
@@ -1551,8 +1574,14 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind {
 ; VI-NEXT:    v_lshlrev_b16_e64 v0, v0, 1
 ; VI-NEXT:    v_cvt_f16_u16_e32 v0, v0
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; VI-NEXT:    v_rcp_f32_e32 v1, v1
-; VI-NEXT:    v_add_f32_e32 v1, v1, v1
+; VI-NEXT:    v_rcp_f32_e32 v2, v1
+; VI-NEXT:    v_add_f32_e32 v3, v2, v2
+; VI-NEXT:    v_mad_f32 v4, -v1, v3, 2.0
+; VI-NEXT:    v_mac_f32_e32 v3, v4, v2
+; VI-NEXT:    v_mad_f32 v1, -v1, v3, 2.0
+; VI-NEXT:    v_mul_f32_e32 v1, v1, v2
+; VI-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; VI-NEXT:    v_add_f32_e32 v1, v1, v3
 ; VI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; VI-NEXT:    v_div_fixup_f16 v0, v1, v0, 2.0
 ; VI-NEXT:    s_setpc_b64 s[30:31]
@@ -1563,8 +1592,14 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind {
 ; GFX10-NEXT:    v_lshlrev_b16 v0, v0, 1
 ; GFX10-NEXT:    v_cvt_f16_u16_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; GFX10-NEXT:    v_rcp_f32_e32 v1, v1
-; GFX10-NEXT:    v_add_f32_e32 v1, v1, v1
+; GFX10-NEXT:    v_rcp_f32_e32 v2, v1
+; GFX10-NEXT:    v_add_f32_e32 v3, v2, v2
+; GFX10-NEXT:    v_mad_f32 v4, -v1, v3, 2.0
+; GFX10-NEXT:    v_mac_f32_e32 v3, v4, v2
+; GFX10-NEXT:    v_mad_f32 v1, -v1, v3, 2.0
+; GFX10-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GFX10-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX10-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX10-NEXT:    v_div_fixup_f16 v0, v1, v0, 2.0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -1573,13 +1608,23 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b16 v0, v0, 1
+; GFX11-NEXT:    s_mov_b32 s0, 2.0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_f16_u16_e32 v0, v0
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_rcp_f32_e32 v1, v1
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_add_f32_e32 v1, v1, v1
+; GFX11-NEXT:    v_add_f32_e32 v2, v1, v1
+; GFX11-NEXT:    v_fma_mix_f32 v3, -v0, v2, s0 op_sel_hi:[1,0,0]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fmac_f32_e32 v2, v3, v1
+; GFX11-NEXT:    v_fma_mix_f32 v3, -v0, v2, s0 op_sel_hi:[1,0,0]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_div_fixup_f16 v0, v1, v0, 2.0
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 7c5d73ab66b47a..b3432c457d9a45 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -109,8 +109,14 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; VI-NEXT:    v_cvt_f32_f16_e32 v3, v4
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v5, v2
-; VI-NEXT:    v_rcp_f32_e32 v5, v5
-; VI-NEXT:    v_mul_f32_e32 v3, v3, v5
+; VI-NEXT:    v_rcp_f32_e32 v6, v5
+; VI-NEXT:    v_mul_f32_e32 v7, v3, v6
+; VI-NEXT:    v_mad_f32 v8, -v5, v7, v3
+; VI-NEXT:    v_mac_f32_e32 v7, v8, v6
+; VI-NEXT:    v_mad_f32 v3, -v5, v7, v3
+; VI-NEXT:    v_mul_f32_e32 v3, v3, v6
+; VI-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; VI-NEXT:    v_add_f32_e32 v3, v3, v7
 ; VI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; VI-NEXT:    v_div_fixup_f16 v3, v3, v2, v4
 ; VI-NEXT:    v_trunc_f16_e32 v3, v3
@@ -126,10 +132,19 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_ushort v1, v0, s[6:7]
 ; GFX9-NEXT:    global_load_ushort v2, v0, s[0:1] offset:8
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v2
-; GFX9-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX9-NEXT:    v_mad_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v4, v2
+; GFX9-NEXT:    v_rcp_f32_e32 v4, v4
+; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GFX9-NEXT:    v_mad_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX9-NEXT:    v_mac_f32_e32 v3, v5, v4
+; GFX9-NEXT:    v_mad_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX9-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
+; GFX9-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX9-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
 ; GFX9-NEXT:    v_trunc_f16_e32 v3, v3
 ; GFX9-NEXT:    v_fma_f16 v1, -v3, v2, v1
@@ -146,10 +161,19 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    global_load_ushort v1, v0, s[6:7]
 ; GFX10-NEXT:    global_load_ushort v2, v0, s[0:1] offset:8
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v2
-; GFX10-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX10-NEXT:    v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v4, v2
+; GFX10-NEXT:    v_rcp_f32_e32 v5, v4
+; GFX10-NEXT:    v_mul_f32_e32 v6, v3, v5
+; GFX10-NEXT:    v_mad_f32 v7, -v4, v6, v3
+; GFX10-NEXT:    v_mac_f32_e32 v6, v7, v5
+; GFX10-NEXT:    v_mad_f32 v3, -v4, v6, v3
+; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v5
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-NEXT:    v_add_f32_e32 v3, v3, v6
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX10-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
 ; GFX10-NEXT:    v_trunc_f16_e32 v3, v3
 ; GFX10-NEXT:    v_fma_f16 v1, -v3, v2, v1
@@ -166,15 +190,28 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_load_u16 v1, v0, s[6:7]
 ; GFX11-NEXT:    global_load_u16 v2, v0, s[0:1] offset:8
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v2
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v4, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_rcp_f32_e32 v3, v3
+; GFX11-NEXT:    v_rcp_f32_e32 v4, v4
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
+; GFX11-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GFX11-NEXT:    v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fmac_f32_e32 v3, v5, v4
+; GFX11-NEXT:    v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
 ; GFX11-NEXT:    v_trunc_f16_e32 v3, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_fma_f16 v1, -v3, v2, v1
 ; GFX11-NEXT:    global_store_b16 v0, v1, s[4:5]
 ; GFX11-NEXT:    s_nop 0
@@ -191,16 +228,29 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX1150-NEXT:    s_clause 0x1
 ; GFX1150-NEXT:    global_load_u16 v1, v0, s[6:7]
 ; GFX1150-NEXT:    global_load_u16 v2, v0, s[0:1] offset:8
+; GFX1150-NEXT:    s_waitcnt vmcnt(1)
+; GFX1150-NEXT:    v_cvt_f32_f16_e32 v3, v1
 ; GFX1150-NEXT:    s_waitcnt vmcnt(0)
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v3, v2
+; GFX1150-NEXT:    v_cvt_f32_f16_e32 v4, v2
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX1150-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX1150-NEXT:    v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
+; GFX1150-NEXT:    v_rcp_f32_e32 v4, v4
+; GFX1150-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX1150-NEXT:    v_fmac_f32_e32 v3, v5, v4
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX1150-NEXT:    v_mul_f32_e32 v4, v5, v4
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
+; GFX1150-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX1150-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
-; GFX1150-NEXT:    v_trunc_f16_e32 v3, v3
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_trunc_f16_e32 v3, v3
 ; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1150-NEXT:    v_fmac_f16_e32 v1, v3, v2
 ; GFX1150-NEXT:    global_store_b16 v0, v1, s[4:5]
 ; GFX1150-NEXT:    s_nop 0
@@ -1974,8 +2024,14 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; VI-NEXT:    v_cvt_f32_f16_e32 v7, v6
-; VI-NEXT:    v_rcp_f32_e32 v7, v7
-; VI-NEXT:    v_mul_f32_e32 v5, v5, v7
+; VI-NEXT:    v_rcp_f32_e32 v8, v7
+; VI-NEXT:    v_mul_f32_e32 v9, v5, v8
+; VI-NEXT:    v_mad_f32 v10, -v7, v9, v5
+; VI-NEXT:    v_mac_f32_e32 v9, v10, v8
+; VI-NEXT:    v_mad_f32 v5, -v7, v9, v5
+; VI-NEXT:    v_mul_f32_e32 v5, v5, v8
+; VI-NEXT:    v_and_b32_e32 v5, 0xff800000, v5
+; VI-NEXT:    v_add_f32_e32 v5, v5, v9
 ; VI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; VI-NEXT:    v_div_fixup_f16 v5, v5, v6, v3
 ; VI-NEXT:    v_trunc_f16_e32 v5, v5
@@ -1983,8 +2039,14 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    v_cvt_f32_f16_e32 v6, v2
 ; VI-NEXT:    v_cvt_f32_f16_e32 v5, v4
 ; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; VI-NEXT:    v_rcp_f32_e32 v6, v6
-; VI-NEXT:    v_mul_f32_e32 v5, v5, v6
+; VI-NEXT:    v_rcp_f32_e32 v7, v6
+; VI-NEXT:    v_mul_f32_e32 v8, v5, v7
+; VI-NEXT:    v_mad_f32 v9, -v6, v8, v5
+; VI-NEXT:    v_mac_f32_e32 v8, v9, v7
+; VI-NEXT:    v_mad_f32 v5, -v6, v8, v5
+; VI-NEXT:    v_mul_f32_e32 v5, v5, v7
+; VI-NEXT:    v_and_b32_e32 v5, 0xff800000, v5
+; VI-NEXT:    v_add_f32_e32 v5, v5, v8
 ; VI-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; VI-NEXT:    v_div_fixup_f16 v5, v5, v2, v4
 ; VI-NEXT:    v_trunc_f16_e32 v5, v5
@@ -2001,21 +2063,38 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX9-NEXT:    global_load_dword v2, v0, s[0:1] offset:16
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v2
-; GFX9-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX9-NEXT:    v_mad_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v4, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v7, v6
+; GFX9-NEXT:    v_rcp_f32_e32 v4, v4
+; GFX9-NEXT:    v_rcp_f32_e32 v7, v7
+; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GFX9-NEXT:    v_mad_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX9-NEXT:    v_mac_f32_e32 v3, v5, v4
+; GFX9-NEXT:    v_mad_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX9-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX9-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
+; GFX9-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v5, v4
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX9-NEXT:    v_mul_f32_e32 v5, v5, v7
 ; GFX9-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
+; GFX9-NEXT:    v_mad_mix_f32 v8, -v2, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
 ; GFX9-NEXT:    v_trunc_f16_e32 v3, v3
+; GFX9-NEXT:    v_mac_f32_e32 v5, v8, v7
 ; GFX9-NEXT:    v_fma_f16 v3, -v3, v2, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v4, v2
-; GFX9-NEXT:    v_rcp_f32_e32 v4, v4
-; GFX9-NEXT:    v_mad_mixlo_f16 v4, v1, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_div_fixup_f16 v4, v4, v2, v1
-; GFX9-NEXT:    v_trunc_f16_e32 v4, v4
-; GFX9-NEXT:    v_fma_f16 v1, -v4, v2, v1
+; GFX9-NEXT:    v_mad_mix_f32 v1, -v2, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v7
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; GFX9-NEXT:    v_add_f32_e32 v1, v1, v5
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT:    v_div_fixup_f16 v1, v1, v6, v4
+; GFX9-NEXT:    v_trunc_f16_e32 v1, v1
+; GFX9-NEXT:    v_fma_f16 v1, -v1, v6, v4
 ; GFX9-NEXT:    v_pack_b32_f16 v1, v3, v1
 ; GFX9-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX9-NEXT:    s_endpgm
@@ -2030,18 +2109,35 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    global_load_dword v1, v0, s[6:7]
 ; GFX10-NEXT:    global_load_dword v2, v0, s[0:1] offset:16
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v2
-; GFX10-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX10-NEXT:    v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v4, v2
+; GFX10-NEXT:    v_rcp_f32_e32 v5, v4
+; GFX10-NEXT:    v_mul_f32_e32 v6, v3, v5
+; GFX10-NEXT:    v_mad_f32 v7, -v4, v6, v3
+; GFX10-NEXT:    v_mac_f32_e32 v6, v7, v5
+; GFX10-NEXT:    v_mad_f32 v3, -v4, v6, v3
+; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v5
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-NEXT:    v_add_f32_e32 v3, v3, v6
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX10-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
 ; GFX10-NEXT:    v_trunc_f16_e32 v3, v3
 ; GFX10-NEXT:    v_fma_f16 v3, -v3, v2, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v4, v2
-; GFX10-NEXT:    v_rcp_f32_e32 v4, v4
-; GFX10-NEXT:    v_fma_mixlo_f16 v4, v1, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v5, v2
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v4, v1
+; GFX10-NEXT:    v_rcp_f32_e32 v6, v5
+; GFX10-NEXT:    v_mul_f32_e32 v7, v4, v6
+; GFX10-NEXT:    v_mad_f32 v8, -v5, v7, v4
+; GFX10-NEXT:    v_mac_f32_e32 v7, v8, v6
+; GFX10-NEXT:    v_mad_f32 v4, -v5, v7, v4
+; GFX10-NEXT:    v_mul_f32_e32 v4, v4, v6
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
+; GFX10-NEXT:    v_add_f32_e32 v4, v4, v7
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; GFX10-NEXT:    v_div_fixup_f16 v4, v4, v2, v1
 ; GFX10-NEXT:    v_trunc_f16_e32 v4, v4
 ; GFX10-NEXT:    v_fma_f16 v1, -v4, v2, v1
@@ -2059,28 +2155,52 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_load_b32 v1, v0, s[6:7]
 ; GFX11-NEXT:    global_load_b32 v2, v0, s[0:1] offset:16
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v2
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v4, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_rcp_f32_e32 v4, v4
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v7, v6
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_rcp_f32_e32 v3, v3
+; GFX11-NEXT:    v_rcp_f32_e32 v7, v7
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
+; GFX11-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GFX11-NEXT:    v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fmac_f32_e32 v3, v5, v4
+; GFX11-NEXT:    v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v5, v4
+; GFX11-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_mul_f32_e32 v5, v5, v7
 ; GFX11-NEXT:    v_trunc_f16_e32 v3, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_fma_mix_f32 v8, -v2, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
 ; GFX11-NEXT:    v_fma_f16 v3, -v3, v2, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fmac_f32_e32 v5, v8, v7
+; GFX11-NEXT:    v_fma_mix_f32 v1, -v2, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v4, v2
-; GFX11-NEXT:    v_rcp_f32_e32 v4, v4
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_fma_mixlo_f16 v4, v1, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v7
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_f32_e32 v1, v1, v5
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_div_fixup_f16 v4, v4, v2, v1
-; GFX11-NEXT:    v_trunc_f16_e32 v4, v4
+; GFX11-NEXT:    v_div_fixup_f16 v1, v1, v6, v4
+; GFX11-NEXT:    v_trunc_f16_e32 v1, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fma_f16 v1, -v4, v2, v1
+; GFX11-NEXT:    v_fma_f16 v1, -v1, v6, v4
 ; GFX11-NEXT:    v_pack_b32_f16 v1, v3, v1
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX11-NEXT:    s_nop 0
@@ -2098,31 +2218,55 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-NEXT:    global_load_b32 v1, v0, s[6:7]
 ; GFX1150-NEXT:    global_load_b32 v2, v0, s[0:1] offset:16
 ; GFX1150-NEXT:    s_waitcnt vmcnt(1)
-; GFX1150-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX1150-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; GFX1150-NEXT:    s_waitcnt vmcnt(0)
-; GFX1150-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1150-NEXT:    v_cvt_f32_f16_e32 v4, v3
-; GFX1150-NEXT:    v_rcp_f32_e32 v4, v4
-; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_mixlo_f16 v4, v1, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX1150-NEXT:    v_div_fixup_f16 v4, v4, v3, v5
+; GFX1150-NEXT:    v_cvt_f32_f16_e32 v6, v5
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-NEXT:    v_rcp_f32_e32 v6, v6
+; GFX1150-NEXT:    v_mul_f32_e32 v4, v4, v6
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-NEXT:    v_fmac_f32_e32 v4, v7, v6
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-NEXT:    v_mul_f32_e32 v6, v7, v6
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_and_b32_e32 v6, 0xff800000, v6
+; GFX1150-NEXT:    v_add_f32_e32 v4, v6, v4
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX1150-NEXT:    v_div_fixup_f16 v4, v4, v5, v3
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1150-NEXT:    v_trunc_f16_e32 v4, v4
 ; GFX1150-NEXT:    v_xor_b32_e32 v4, 0x8000, v4
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fmac_f16_e32 v5, v4, v3
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v3, v2
-; GFX1150-NEXT:    v_rcp_f32_e32 v3, v3
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1150-NEXT:    v_fmac_f16_e32 v3, v4, v5
+; GFX1150-NEXT:    v_cvt_f32_f16_e32 v5, v2
+; GFX1150-NEXT:    v_cvt_f32_f16_e32 v4, v1
+; GFX1150-NEXT:    v_rcp_f32_e32 v5, v5
 ; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
-; GFX1150-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
+; GFX1150-NEXT:    v_mul_f32_e32 v4, v4, v5
+; GFX1150-NEXT:    v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1]
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_trunc_f16_e32 v3, v3
-; GFX1150-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-NEXT:    v_fmac_f32_e32 v4, v6, v5
+; GFX1150-NEXT:    v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1]
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fmac_f16_e32 v1, v3, v2
-; GFX1150-NEXT:    v_pack_b32_f16 v1, v1, v5
+; GFX1150-NEXT:    v_mul_f32_e32 v5, v6, v5
+; GFX1150-NEXT:    v_and_b32_e32 v5, 0xff800000, v5
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_add_f32_e32 v4, v5, v4
+; GFX1150-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_div_fixup_f16 v4, v4, v2, v1
+; GFX1150-NEXT:    v_trunc_f16_e32 v4, v4
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v4, 0x8000, v4
+; GFX1150-NEXT:    v_fmac_f16_e32 v1, v4, v2
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_pack_b32_f16 v1, v1, v3
 ; GFX1150-NEXT:    global_store_b32 v0, v1, s[4:5]
 ; GFX1150-NEXT:    s_nop 0
 ; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2364,8 +2508,14 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
 ; VI-NEXT:    v_cvt_f32_f16_e32 v7, v6
-; VI-NEXT:    v_rcp_f32_e32 v9, v9
-; VI-NEXT:    v_mul_f32_e32 v7, v7, v9
+; VI-NEXT:    v_rcp_f32_e32 v10, v9
+; VI-NEXT:    v_mul_f32_e32 v11, v7, v10
+; VI-NEXT:    v_mad_f32 v12, -v9, v11, v7
+; VI-NEXT:    v_mac_f32_e32 v11, v12, v10
+; VI-NEXT:    v_mad_f32 v7, -v9, v11, v7
+; VI-NEXT:    v_mul_f32_e32 v7, v7, v10
+; VI-NEXT:    v_and_b32_e32 v7, 0xff800000, v7
+; VI-NEXT:    v_add_f32_e32 v7, v7, v11
 ; VI-NEXT:    v_cvt_f16_f32_e32 v7, v7
 ; VI-NEXT:    v_div_fixup_f16 v7, v7, v8, v6
 ; VI-NEXT:    v_trunc_f16_e32 v7, v7
@@ -2373,8 +2523,14 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    v_cvt_f32_f16_e32 v8, v5
 ; VI-NEXT:    v_cvt_f32_f16_e32 v7, v3
 ; VI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; VI-NEXT:    v_rcp_f32_e32 v8, v8
-; VI-NEXT:    v_mul_f32_e32 v7, v7, v8
+; VI-NEXT:    v_rcp_f32_e32 v9, v8
+; VI-NEXT:    v_mul_f32_e32 v10, v7, v9
+; VI-NEXT:    v_mad_f32 v11, -v8, v10, v7
+; VI-NEXT:    v_mac_f32_e32 v10, v11, v9
+; VI-NEXT:    v_mad_f32 v7, -v8, v10, v7
+; VI-NEXT:    v_mul_f32_e32 v7, v7, v9
+; VI-NEXT:    v_and_b32_e32 v7, 0xff800000, v7
+; VI-NEXT:    v_add_f32_e32 v7, v7, v10
 ; VI-NEXT:    v_cvt_f16_f32_e32 v7, v7
 ; VI-NEXT:    v_div_fixup_f16 v7, v7, v5, v3
 ; VI-NEXT:    v_trunc_f16_e32 v7, v7
@@ -2384,8 +2540,14 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; VI-NEXT:    v_or_b32_e32 v3, v3, v6
 ; VI-NEXT:    v_cvt_f32_f16_e32 v6, v5
-; VI-NEXT:    v_rcp_f32_e32 v8, v8
-; VI-NEXT:    v_mul_f32_e32 v6, v6, v8
+; VI-NEXT:    v_rcp_f32_e32 v9, v8
+; VI-NEXT:    v_mul_f32_e32 v10, v6, v9
+; VI-NEXT:    v_mad_f32 v11, -v8, v10, v6
+; VI-NEXT:    v_mac_f32_e32 v10, v11, v9
+; VI-NEXT:    v_mad_f32 v6, -v8, v10, v6
+; VI-NEXT:    v_mul_f32_e32 v6, v6, v9
+; VI-NEXT:    v_and_b32_e32 v6, 0xff800000, v6
+; VI-NEXT:    v_add_f32_e32 v6, v6, v10
 ; VI-NEXT:    v_cvt_f16_f32_e32 v6, v6
 ; VI-NEXT:    v_div_fixup_f16 v6, v6, v7, v5
 ; VI-NEXT:    v_trunc_f16_e32 v6, v6
@@ -2393,8 +2555,14 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    v_cvt_f32_f16_e32 v7, v4
 ; VI-NEXT:    v_cvt_f32_f16_e32 v6, v2
 ; VI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; VI-NEXT:    v_rcp_f32_e32 v7, v7
-; VI-NEXT:    v_mul_f32_e32 v6, v6, v7
+; VI-NEXT:    v_rcp_f32_e32 v8, v7
+; VI-NEXT:    v_mul_f32_e32 v9, v6, v8
+; VI-NEXT:    v_mad_f32 v10, -v7, v9, v6
+; VI-NEXT:    v_mac_f32_e32 v9, v10, v8
+; VI-NEXT:    v_mad_f32 v6, -v7, v9, v6
+; VI-NEXT:    v_mul_f32_e32 v6, v6, v8
+; VI-NEXT:    v_and_b32_e32 v6, 0xff800000, v6
+; VI-NEXT:    v_add_f32_e32 v6, v6, v9
 ; VI-NEXT:    v_cvt_f16_f32_e32 v6, v6
 ; VI-NEXT:    v_div_fixup_f16 v6, v6, v4, v2
 ; VI-NEXT:    v_trunc_f16_e32 v6, v6
@@ -2411,36 +2579,69 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v4, s[6:7]
 ; GFX9-NEXT:    global_load_dwordx2 v[2:3], v4, s[0:1] offset:32
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v5, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v5, v3
-; GFX9-NEXT:    v_rcp_f32_e32 v5, v5
-; GFX9-NEXT:    v_mad_mixlo_f16 v5, v1, v5, 0 op_sel_hi:[1,0,0]
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v6, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 16, v3
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v9, v8
+; GFX9-NEXT:    v_rcp_f32_e32 v6, v6
+; GFX9-NEXT:    v_rcp_f32_e32 v9, v9
+; GFX9-NEXT:    v_mul_f32_e32 v5, v5, v6
+; GFX9-NEXT:    v_mad_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
+; GFX9-NEXT:    v_mac_f32_e32 v5, v7, v6
+; GFX9-NEXT:    v_mad_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
+; GFX9-NEXT:    v_mul_f32_e32 v6, v7, v6
+; GFX9-NEXT:    v_and_b32_e32 v6, 0xff800000, v6
+; GFX9-NEXT:    v_add_f32_e32 v5, v6, v5
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v7, v6
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX9-NEXT:    v_mul_f32_e32 v7, v7, v9
 ; GFX9-NEXT:    v_div_fixup_f16 v5, v5, v3, v1
+; GFX9-NEXT:    v_mad_mix_f32 v10, -v3, v7, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
 ; GFX9-NEXT:    v_trunc_f16_e32 v5, v5
+; GFX9-NEXT:    v_mac_f32_e32 v7, v10, v9
 ; GFX9-NEXT:    v_fma_f16 v5, -v5, v3, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v6, v3
-; GFX9-NEXT:    v_rcp_f32_e32 v6, v6
-; GFX9-NEXT:    v_mad_mixlo_f16 v6, v1, v6, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT:    v_div_fixup_f16 v6, v6, v3, v1
-; GFX9-NEXT:    v_trunc_f16_e32 v6, v6
-; GFX9-NEXT:    v_fma_f16 v1, -v6, v3, v1
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v2
+; GFX9-NEXT:    v_mad_mix_f32 v1, -v3, v7, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v9
+; GFX9-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; GFX9-NEXT:    v_add_f32_e32 v1, v1, v7
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v3, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
+; GFX9-NEXT:    v_div_fixup_f16 v1, v1, v8, v6
+; GFX9-NEXT:    v_trunc_f16_e32 v1, v1
+; GFX9-NEXT:    v_fma_f16 v1, -v1, v8, v6
 ; GFX9-NEXT:    v_pack_b32_f16 v1, v5, v1
-; GFX9-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX9-NEXT:    v_mad_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0]
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v5, v2
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v8, v7
+; GFX9-NEXT:    v_rcp_f32_e32 v5, v5
+; GFX9-NEXT:    v_rcp_f32_e32 v8, v8
+; GFX9-NEXT:    v_mul_f32_e32 v3, v3, v5
+; GFX9-NEXT:    v_mad_mix_f32 v6, -v2, v3, v0 op_sel_hi:[1,0,1]
+; GFX9-NEXT:    v_mac_f32_e32 v3, v6, v5
+; GFX9-NEXT:    v_mad_mix_f32 v6, -v2, v3, v0 op_sel_hi:[1,0,1]
+; GFX9-NEXT:    v_mul_f32_e32 v5, v6, v5
+; GFX9-NEXT:    v_and_b32_e32 v5, 0xff800000, v5
+; GFX9-NEXT:    v_add_f32_e32 v3, v5, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX9-NEXT:    v_cvt_f32_f16_e32 v6, v5
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX9-NEXT:    v_mul_f32_e32 v6, v6, v8
 ; GFX9-NEXT:    v_div_fixup_f16 v3, v3, v2, v0
+; GFX9-NEXT:    v_mad_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
 ; GFX9-NEXT:    v_trunc_f16_e32 v3, v3
+; GFX9-NEXT:    v_mac_f32_e32 v6, v9, v8
 ; GFX9-NEXT:    v_fma_f16 v3, -v3, v2, v0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_cvt_f32_f16_e32 v5, v2
-; GFX9-NEXT:    v_rcp_f32_e32 v5, v5
-; GFX9-NEXT:    v_mad_mixlo_f16 v5, v0, v5, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    v_div_fixup_f16 v5, v5, v2, v0
-; GFX9-NEXT:    v_trunc_f16_e32 v5, v5
-; GFX9-NEXT:    v_fma_f16 v0, -v5, v2, v0
+; GFX9-NEXT:    v_mad_mix_f32 v0, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v8
+; GFX9-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v6
+; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT:    v_div_fixup_f16 v0, v0, v7, v5
+; GFX9-NEXT:    v_trunc_f16_e32 v0, v0
+; GFX9-NEXT:    v_fma_f16 v0, -v0, v7, v5
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v3, v0
 ; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
 ; GFX9-NEXT:    s_endpgm
@@ -2455,33 +2656,66 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v4, s[6:7]
 ; GFX10-NEXT:    global_load_dwordx2 v[2:3], v4, s[0:1] offset:32
+; GFX10-NEXT:    s_waitcnt vmcnt(1)
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v5, v1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v5, v3
-; GFX10-NEXT:    v_rcp_f32_e32 v5, v5
-; GFX10-NEXT:    v_fma_mixlo_f16 v5, v1, v5, 0 op_sel_hi:[1,0,0]
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v6, v3
+; GFX10-NEXT:    v_rcp_f32_e32 v7, v6
+; GFX10-NEXT:    v_mul_f32_e32 v8, v5, v7
+; GFX10-NEXT:    v_mad_f32 v9, -v6, v8, v5
+; GFX10-NEXT:    v_mac_f32_e32 v8, v9, v7
+; GFX10-NEXT:    v_mad_f32 v5, -v6, v8, v5
+; GFX10-NEXT:    v_mul_f32_e32 v5, v5, v7
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xff800000, v5
+; GFX10-NEXT:    v_add_f32_e32 v5, v5, v8
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX10-NEXT:    v_div_fixup_f16 v5, v5, v3, v1
 ; GFX10-NEXT:    v_trunc_f16_e32 v5, v5
 ; GFX10-NEXT:    v_fma_f16 v5, -v5, v3, v1
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v6, v3
-; GFX10-NEXT:    v_rcp_f32_e32 v6, v6
-; GFX10-NEXT:    v_fma_mixlo_f16 v6, v1, v6, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v7, v3
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v6, v1
+; GFX10-NEXT:    v_rcp_f32_e32 v8, v7
+; GFX10-NEXT:    v_mul_f32_e32 v9, v6, v8
+; GFX10-NEXT:    v_mad_f32 v10, -v7, v9, v6
+; GFX10-NEXT:    v_mac_f32_e32 v9, v10, v8
+; GFX10-NEXT:    v_mad_f32 v6, -v7, v9, v6
+; GFX10-NEXT:    v_mul_f32_e32 v6, v6, v8
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xff800000, v6
+; GFX10-NEXT:    v_add_f32_e32 v6, v6, v9
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v6, v6
 ; GFX10-NEXT:    v_div_fixup_f16 v6, v6, v3, v1
 ; GFX10-NEXT:    v_trunc_f16_e32 v6, v6
 ; GFX10-NEXT:    v_fma_f16 v1, -v6, v3, v1
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v2
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v3, v0
 ; GFX10-NEXT:    v_pack_b32_f16 v1, v5, v1
-; GFX10-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX10-NEXT:    v_fma_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0]
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v5, v2
+; GFX10-NEXT:    v_rcp_f32_e32 v6, v5
+; GFX10-NEXT:    v_mul_f32_e32 v7, v3, v6
+; GFX10-NEXT:    v_mad_f32 v8, -v5, v7, v3
+; GFX10-NEXT:    v_mac_f32_e32 v7, v8, v6
+; GFX10-NEXT:    v_mad_f32 v3, -v5, v7, v3
+; GFX10-NEXT:    v_mul_f32_e32 v3, v3, v6
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-NEXT:    v_add_f32_e32 v3, v3, v7
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX10-NEXT:    v_div_fixup_f16 v3, v3, v2, v0
 ; GFX10-NEXT:    v_trunc_f16_e32 v3, v3
 ; GFX10-NEXT:    v_fma_f16 v3, -v3, v2, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX10-NEXT:    v_cvt_f32_f16_e32 v5, v2
-; GFX10-NEXT:    v_rcp_f32_e32 v5, v5
-; GFX10-NEXT:    v_fma_mixlo_f16 v5, v0, v5, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v6, v2
+; GFX10-NEXT:    v_cvt_f32_f16_e32 v5, v0
+; GFX10-NEXT:    v_rcp_f32_e32 v7, v6
+; GFX10-NEXT:    v_mul_f32_e32 v8, v5, v7
+; GFX10-NEXT:    v_mad_f32 v9, -v6, v8, v5
+; GFX10-NEXT:    v_mac_f32_e32 v8, v9, v7
+; GFX10-NEXT:    v_mad_f32 v5, -v6, v8, v5
+; GFX10-NEXT:    v_mul_f32_e32 v5, v5, v7
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xff800000, v5
+; GFX10-NEXT:    v_add_f32_e32 v5, v5, v8
+; GFX10-NEXT:    v_cvt_f16_f32_e32 v5, v5
 ; GFX10-NEXT:    v_div_fixup_f16 v5, v5, v2, v0
 ; GFX10-NEXT:    v_trunc_f16_e32 v5, v5
 ; GFX10-NEXT:    v_fma_f16 v0, -v5, v2, v0
@@ -2499,50 +2733,97 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_load_b64 v[0:1], v4, s[6:7]
 ; GFX11-NEXT:    global_load_b64 v[2:3], v4, s[0:1] offset:32
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v5, v1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v5, v3
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v6, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_rcp_f32_e32 v6, v6
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v9, v8
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_rcp_f32_e32 v5, v5
+; GFX11-NEXT:    v_rcp_f32_e32 v9, v9
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_fma_mixlo_f16 v5, v1, v5, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT:    v_div_fixup_f16 v5, v5, v3, v1
+; GFX11-NEXT:    v_mul_f32_e32 v5, v5, v6
+; GFX11-NEXT:    v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fmac_f32_e32 v5, v7, v6
+; GFX11-NEXT:    v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v6, v7, v6
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff800000, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_f32_e32 v5, v6, v5
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v7, v6
+; GFX11-NEXT:    v_div_fixup_f16 v5, v5, v3, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_mul_f32_e32 v7, v7, v9
 ; GFX11-NEXT:    v_trunc_f16_e32 v5, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_fma_mix_f32 v10, -v3, v7, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
 ; GFX11-NEXT:    v_fma_f16 v5, -v5, v3, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v6, v3
-; GFX11-NEXT:    v_rcp_f32_e32 v6, v6
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_fma_mixlo_f16 v6, v1, v6, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fmac_f32_e32 v7, v10, v9
+; GFX11-NEXT:    v_fma_mix_f32 v1, -v3, v7, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_f32_e32 v1, v1, v9
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xff800000, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_f32_e32 v1, v1, v7
+; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_div_fixup_f16 v6, v6, v3, v1
-; GFX11-NEXT:    v_trunc_f16_e32 v6, v6
+; GFX11-NEXT:    v_div_fixup_f16 v1, v1, v8, v6
+; GFX11-NEXT:    v_trunc_f16_e32 v1, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_fma_f16 v1, -v6, v3, v1
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v3, v2
+; GFX11-NEXT:    v_fma_f16 v1, -v1, v8, v6
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v8, v7
 ; GFX11-NEXT:    v_pack_b32_f16 v1, v5, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_rcp_f32_e32 v3, v3
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v5, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_rcp_f32_e32 v8, v8
+; GFX11-NEXT:    v_rcp_f32_e32 v5, v5
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_fma_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT:    v_div_fixup_f16 v3, v3, v2, v0
+; GFX11-NEXT:    v_mul_f32_e32 v3, v3, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_mix_f32 v6, -v2, v3, v0 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_fmac_f32_e32 v3, v6, v5
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_fma_mix_f32 v6, -v2, v3, v0 op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_mul_f32_e32 v5, v6, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xff800000, v5
+; GFX11-NEXT:    v_add_f32_e32 v3, v5, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT:    v_cvt_f32_f16_e32 v6, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_div_fixup_f16 v3, v3, v2, v0
+; GFX11-NEXT:    v_mul_f32_e32 v6, v6, v8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_trunc_f16_e32 v3, v3
+; GFX11-NEXT:    v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_fma_f16 v3, -v3, v2, v0
-; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    v_fmac_f32_e32 v6, v9, v8
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cvt_f32_f16_e32 v5, v2
-; GFX11-NEXT:    v_rcp_f32_e32 v5, v5
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_fma_mixlo_f16 v5, v0, v5, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    v_fma_mix_f32 v0, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX11-NEXT:    v_mul_f32_e32 v0, v0, v8
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_div_fixup_f16 v5, v5, v2, v0
-; GFX11-NEXT:    v_trunc_f16_e32 v5, v5
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xff800000, v0
+; GFX11-NEXT:    v_add_f32_e32 v0, v0, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT:    v_div_fixup_f16 v0, v0, v7, v5
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_fma_f16 v0, -v5, v2, v0
+; GFX11-NEXT:    v_trunc_f16_e32 v0, v0
+; GFX11-NEXT:    v_fma_f16 v0, -v0, v7, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v3, v0
 ; GFX11-NEXT:    global_store_b64 v4, v[0:1], s[4:5]
 ; GFX11-NEXT:    s_nop 0
@@ -2560,55 +2841,102 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-NEXT:    global_load_b64 v[0:1], v4, s[6:7]
 ; GFX1150-NEXT:    global_load_b64 v[2:3], v4, s[0:1] offset:32
 ; GFX1150-NEXT:    s_waitcnt vmcnt(1)
-; GFX1150-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
+; GFX1150-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
 ; GFX1150-NEXT:    s_waitcnt vmcnt(0)
-; GFX1150-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1150-NEXT:    v_cvt_f32_f16_e32 v6, v5
-; GFX1150-NEXT:    v_rcp_f32_e32 v6, v6
-; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_mixlo_f16 v6, v0, v6, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX1150-NEXT:    v_div_fixup_f16 v6, v6, v5, v7
+; GFX1150-NEXT:    v_cvt_f32_f16_e32 v8, v7
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-NEXT:    v_rcp_f32_e32 v8, v8
+; GFX1150-NEXT:    v_mul_f32_e32 v6, v6, v8
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-NEXT:    v_fmac_f32_e32 v6, v9, v8
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-NEXT:    v_mul_f32_e32 v8, v9, v8
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_and_b32_e32 v8, 0xff800000, v8
+; GFX1150-NEXT:    v_add_f32_e32 v6, v8, v6
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX1150-NEXT:    v_div_fixup_f16 v6, v6, v7, v5
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1150-NEXT:    v_trunc_f16_e32 v6, v6
 ; GFX1150-NEXT:    v_xor_b32_e32 v6, 0x8000, v6
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1150-NEXT:    v_fmac_f16_e32 v7, v6, v5
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v5, v2
-; GFX1150-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
-; GFX1150-NEXT:    v_rcp_f32_e32 v5, v5
+; GFX1150-NEXT:    v_fmac_f16_e32 v5, v6, v7
+; GFX1150-NEXT:    v_cvt_f32_f16_e32 v7, v2
+; GFX1150-NEXT:    v_cvt_f32_f16_e32 v6, v0
+; GFX1150-NEXT:    v_rcp_f32_e32 v7, v7
 ; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_mixlo_f16 v5, v0, v5, 0 op_sel_hi:[1,0,0]
-; GFX1150-NEXT:    v_div_fixup_f16 v5, v5, v2, v0
+; GFX1150-NEXT:    v_mul_f32_e32 v6, v6, v7
+; GFX1150-NEXT:    v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1]
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_trunc_f16_e32 v5, v5
-; GFX1150-NEXT:    v_xor_b32_e32 v5, 0x8000, v5
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1150-NEXT:    v_fma_f16 v0, v5, v2, v0
-; GFX1150-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
-; GFX1150-NEXT:    v_pack_b32_f16 v0, v0, v7
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fmac_f32_e32 v6, v8, v7
+; GFX1150-NEXT:    v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f32_e32 v7, v8, v7
+; GFX1150-NEXT:    v_and_b32_e32 v7, 0xff800000, v7
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_add_f32_e32 v6, v7, v6
+; GFX1150-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_div_fixup_f16 v6, v6, v2, v0
+; GFX1150-NEXT:    v_trunc_f16_e32 v6, v6
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_xor_b32_e32 v6, 0x8000, v6
+; GFX1150-NEXT:    v_fma_f16 v0, v6, v2, v0
+; GFX1150-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
+; GFX1150-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1150-NEXT:    v_pack_b32_f16 v0, v0, v5
+; GFX1150-NEXT:    v_cvt_f32_f16_e32 v7, v6
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1150-NEXT:    v_cvt_f32_f16_e32 v5, v2
-; GFX1150-NEXT:    v_rcp_f32_e32 v5, v5
+; GFX1150-NEXT:    v_rcp_f32_e32 v7, v7
 ; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_mixlo_f16 v5, v1, v5, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX1150-NEXT:    v_div_fixup_f16 v5, v5, v2, v6
+; GFX1150-NEXT:    v_mul_f32_e32 v5, v5, v7
+; GFX1150-NEXT:    v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fmac_f32_e32 v5, v8, v7
+; GFX1150-NEXT:    v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_mul_f32_e32 v7, v8, v7
+; GFX1150-NEXT:    v_and_b32_e32 v7, 0xff800000, v7
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_add_f32_e32 v5, v7, v5
+; GFX1150-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_div_fixup_f16 v5, v5, v6, v2
 ; GFX1150-NEXT:    v_trunc_f16_e32 v5, v5
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1150-NEXT:    v_xor_b32_e32 v5, 0x8000, v5
-; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fmac_f16_e32 v6, v5, v2
-; GFX1150-NEXT:    v_cvt_f32_f16_e32 v2, v3
-; GFX1150-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX1150-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fma_mixlo_f16 v2, v1, v2, 0 op_sel_hi:[1,0,0]
-; GFX1150-NEXT:    v_div_fixup_f16 v2, v2, v3, v1
+; GFX1150-NEXT:    v_fmac_f16_e32 v2, v5, v6
+; GFX1150-NEXT:    v_cvt_f32_f16_e32 v6, v3
+; GFX1150-NEXT:    v_cvt_f32_f16_e32 v5, v1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-NEXT:    v_rcp_f32_e32 v6, v6
+; GFX1150-NEXT:    v_mul_f32_e32 v5, v5, v6
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
+; GFX1150-NEXT:    v_fmac_f32_e32 v5, v7, v6
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
+; GFX1150-NEXT:    v_mul_f32_e32 v6, v7, v6
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_trunc_f16_e32 v2, v2
-; GFX1150-NEXT:    v_xor_b32_e32 v2, 0x8000, v2
+; GFX1150-NEXT:    v_and_b32_e32 v6, 0xff800000, v6
+; GFX1150-NEXT:    v_add_f32_e32 v5, v6, v5
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX1150-NEXT:    v_div_fixup_f16 v5, v5, v3, v1
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT:    v_trunc_f16_e32 v5, v5
+; GFX1150-NEXT:    v_xor_b32_e32 v5, 0x8000, v5
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT:    v_fmac_f16_e32 v1, v2, v3
-; GFX1150-NEXT:    v_pack_b32_f16 v1, v1, v6
+; GFX1150-NEXT:    v_fmac_f16_e32 v1, v5, v3
+; GFX1150-NEXT:    v_pack_b32_f16 v1, v1, v2
 ; GFX1150-NEXT:    global_store_b64 v4, v[0:1], s[4:5]
 ; GFX1150-NEXT:    s_nop 0
 ; GFX1150-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)



More information about the llvm-branch-commits mailing list