[llvm-branch-commits] [llvm] [AMDGPU] Adopt new lowering sequence for `fdiv16` (PR #109295)
Shilei Tian via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Sep 30 11:26:38 PDT 2024
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/109295
>From 2eb020eea83b8b806b1e0e05d65a7a79f5bf0cea Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Thu, 19 Sep 2024 10:57:27 -0400
Subject: [PATCH] [AMDGPU] Adopt new lowering sequence for `fdiv16`
The current lowering of fdiv16 can generate incorrectly rounded results in some
cases.
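The new sequence converts both operands to f32, refines the rcp-based quotient with two fused multiply-add error-correction steps, masks the final correction down to its sign and exponent bits before adding it back, and only then truncates to f16 and applies div_fixup. A minimal host-side sketch of the numeric sequence (illustrative only; the names are mine, and 1.0f / b32 stands in for the hardware V_RCP_F32 approximation):

  #include <bit>
  #include <cmath>
  #include <cstdint>

  // a32/b32 are the f16 numerator/denominator already extended to f32.
  float fdiv16_model(float a32, float b32) {
    float r32 = 1.0f / b32;                 // rcp = 1 / d (hardware uses V_RCP_F32)
    float q32 = a32 * r32;                  // q = n * rcp
    float e32 = std::fmaf(-b32, q32, a32);  // err = -d * q + n
    q32 = std::fmaf(e32, r32, q32);         // first refinement of q
    e32 = std::fmaf(-b32, q32, a32);        // recompute err for the refined q
    float tmp = e32 * r32;
    // Keep only the sign and exponent of the correction (the V_AND_B32 with 0xff800000).
    uint32_t bits = std::bit_cast<uint32_t>(tmp) & 0xff800000u;
    return std::bit_cast<float>(bits) + q32; // caller truncates to f16 and runs V_DIV_FIXUP_F16
  }

The actual lowering uses V_MAD_F32/V_MAC_F32 when the subtarget has mad/mac f32 instructions and V_FMA_F32 (or the mixed-precision mix forms) otherwise, then finishes with V_CVT_F16_F32 followed by V_DIV_FIXUP_F16.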
Fixes SWDEV-47760.
---
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 36 +-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 53 +-
.../CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll | 3095 +++++++++++++----
llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll | 82 +-
.../AMDGPU/GlobalISel/legalize-fdiv.mir | 478 ++-
llvm/test/CodeGen/AMDGPU/fdiv.f16.ll | 54 +-
.../AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll | 71 +-
llvm/test/CodeGen/AMDGPU/frem.ll | 670 +++-
8 files changed, 3507 insertions(+), 1032 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 271c8d45fd4a21..53f096cf33b710 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4903,16 +4903,40 @@ bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
LLT S16 = LLT::scalar(16);
LLT S32 = LLT::scalar(32);
+ // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
+ // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
+ // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
+ // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
+ // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
+ // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
+ // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
+ // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
+ // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
+ // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
+ // q16.u = opx(V_CVT_F16_F32, q32.u);
+ // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
+
auto LHSExt = B.buildFPExt(S32, LHS, Flags);
auto RHSExt = B.buildFPExt(S32, RHS, Flags);
-
- auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
+ auto NegRHSExt = B.buildFNeg(S32, RHSExt);
+ auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
.addUse(RHSExt.getReg(0))
.setMIFlags(Flags);
-
- auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
- auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
-
+ auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags);
+ MachineInstrBuilder Err;
+ if (ST.hasMadMacF32Insts()) {
+ Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
+ Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags);
+ Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
+ } else {
+ Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
+ Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags);
+ Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
+ }
+ auto Tmp = B.buildFMul(S32, Err, Rcp, Flags);
+ Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
+ Quot = B.buildFAdd(S32, Tmp, Quot, Flags);
+ auto RDst = B.buildFPTrunc(S16, Quot, Flags);
B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
.addUse(RDst.getReg(0))
.addUse(RHS)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index d559d0446b9d8f..ec082d64bf0d22 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -10619,19 +10619,48 @@ SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
return FastLowered;
SDLoc SL(Op);
- SDValue Src0 = Op.getOperand(0);
- SDValue Src1 = Op.getOperand(1);
-
- SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
- SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
-
- SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
- SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
-
- SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
- SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
- return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
+ // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
+ // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
+ // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
+ // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
+ // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
+ // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
+ // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
+ // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
+ // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
+ // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
+ // q16.u = opx(V_CVT_F16_F32, q32.u);
+ // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
+
+ // We will use ISD::FMA on targets that don't support ISD::FMAD.
+ unsigned FMADOpCode =
+ isOperationLegal(ISD::FMAD, MVT::f32) ? ISD::FMAD : ISD::FMA;
+
+ SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
+ SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
+ SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
+ SDValue Rcp =
+ DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
+ SDValue Quot =
+ DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
+ SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
+ Op->getFlags());
+ Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
+ Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
+ Op->getFlags());
+ SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
+ SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
+ TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
+ DAG.getConstant(0xff800000, SL, MVT::i32));
+ Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
+ Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
+ SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
+ DAG.getConstant(0, SL, MVT::i32));
+ return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
+ Op->getFlags());
}
// Faster 2.5 ULP division that does not support denormals.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
index 1a98285230b2cd..5ba036c386a402 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
@@ -57,24 +57,59 @@ define half @v_fdiv_f16(half %a, half %b) {
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: v_fdiv_f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX8-NEXT: v_rcp_f32_e32 v2, v2
-; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2
-; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
+; GFX8-IEEE-LABEL: v_fdiv_f16:
+; GFX8-IEEE: ; %bb.0:
+; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v3, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v6, -v2, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, v6, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v6, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX8-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v2, v1, v0
+; GFX8-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-FLUSH-LABEL: v_fdiv_f16:
+; GFX8-FLUSH: ; %bb.0:
+; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v5, v3, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, -v2, v5, v3
+; GFX8-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v5, v3
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX8-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v1, v0
+; GFX8-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-IEEE-LABEL: v_fdiv_f16:
; GFX9-IEEE: ; %bb.0:
; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v2, v2
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v3, v2
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v3, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v6, -v2, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v6, v6, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v6, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX9-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v5
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v2, v1, v0
; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31]
@@ -83,27 +118,71 @@ define half @v_fdiv_f16(half %a, half %b) {
; GFX9-FLUSH: ; %bb.0:
; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v2
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v3, v4, v2
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v4, v2
+; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v3
+; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v1, v0
; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: v_fdiv_f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX10-NEXT: v_rcp_f32_e32 v2, v2
-; GFX10-NEXT: v_fma_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_div_fixup_f16 v0, v2, v1, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
+; GFX10-IEEE-LABEL: v_fdiv_f16:
+; GFX10-IEEE: ; %bb.0:
+; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, v0
+; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v6, -v2, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v3
+; GFX10-IEEE-NEXT: v_add_f32_e32 v5, v6, v5
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v3
+; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v1, v0
+; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-FLUSH-LABEL: v_fdiv_f16:
+; GFX10-FLUSH: ; %bb.0:
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v0
+; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, -v2, v5, v4
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v3
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v5, v4
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v3
+; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v1, v0
+; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fdiv_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_mul_f32_e32 v3, v3, v2
+; GFX11-NEXT: v_fma_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v2
+; GFX11-NEXT: v_fma_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_mul_f32_e32 v2, v4, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v3
+; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX11-NEXT: v_div_fixup_f16 v0, v2, v1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv half %a, %b
@@ -188,24 +267,59 @@ define half @v_fdiv_f16_ulp25(half %a, half %b) {
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: v_fdiv_f16_ulp25:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX8-NEXT: v_rcp_f32_e32 v2, v2
-; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2
-; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
+; GFX8-IEEE-LABEL: v_fdiv_f16_ulp25:
+; GFX8-IEEE: ; %bb.0:
+; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v3, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v6, -v2, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, v6, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v6, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX8-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v2, v1, v0
+; GFX8-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-FLUSH-LABEL: v_fdiv_f16_ulp25:
+; GFX8-FLUSH: ; %bb.0:
+; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v5, v3, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, -v2, v5, v3
+; GFX8-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v5, v3
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX8-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v1, v0
+; GFX8-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-IEEE-LABEL: v_fdiv_f16_ulp25:
; GFX9-IEEE: ; %bb.0:
; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v2, v2
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v3, v2
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v3, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v6, -v2, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v6, v6, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v6, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX9-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v5
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v2, v1, v0
; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31]
@@ -214,27 +328,71 @@ define half @v_fdiv_f16_ulp25(half %a, half %b) {
; GFX9-FLUSH: ; %bb.0:
; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v2
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v3, v4, v2
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v4, v2
+; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v3
+; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v1, v0
; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: v_fdiv_f16_ulp25:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX10-NEXT: v_rcp_f32_e32 v2, v2
-; GFX10-NEXT: v_fma_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_div_fixup_f16 v0, v2, v1, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
+; GFX10-IEEE-LABEL: v_fdiv_f16_ulp25:
+; GFX10-IEEE: ; %bb.0:
+; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, v0
+; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v6, -v2, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v3
+; GFX10-IEEE-NEXT: v_add_f32_e32 v5, v6, v5
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v3
+; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v1, v0
+; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-FLUSH-LABEL: v_fdiv_f16_ulp25:
+; GFX10-FLUSH: ; %bb.0:
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v0
+; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, -v2, v5, v4
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v3
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v5, v4
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v3
+; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v1, v0
+; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fdiv_f16_ulp25:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_mul_f32_e32 v3, v3, v2
+; GFX11-NEXT: v_fma_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v2
+; GFX11-NEXT: v_fma_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_mul_f32_e32 v2, v4, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v3
+; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX11-NEXT: v_div_fixup_f16 v0, v2, v1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv half %a, %b
@@ -670,44 +828,113 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) {
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: v_fdiv_v2f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1
-; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX8-NEXT: v_cvt_f32_f16_e32 v5, v4
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-NEXT: v_cvt_f32_f16_e32 v6, v0
-; GFX8-NEXT: v_rcp_f32_e32 v3, v3
-; GFX8-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX8-NEXT: v_rcp_f32_e32 v5, v5
-; GFX8-NEXT: v_mul_f32_e32 v3, v6, v3
-; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX8-NEXT: v_mul_f32_e32 v5, v7, v5
-; GFX8-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX8-NEXT: v_div_fixup_f16 v0, v3, v1, v0
-; GFX8-NEXT: v_div_fixup_f16 v1, v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
+; GFX8-IEEE-LABEL: v_fdiv_v2f16:
+; GFX8-IEEE: ; %bb.0:
+; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, v0
+; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v8, v6
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v2
+; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v7, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v2, v9
+; GFX8-IEEE-NEXT: v_add_f32_e32 v10, v10, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v9, v10, v9
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v9
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v8
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX8-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v9
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v7, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v9, -v8, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v9, v9, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v9, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v8, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v4, v7, v4
+; GFX8-IEEE-NEXT: v_and_b32_e32 v4, 0xff800000, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v4, v4, v5
+; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v2, v1, v0
+; GFX8-IEEE-NEXT: v_div_fixup_f16 v1, v4, v6, v3
+; GFX8-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-IEEE-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-FLUSH-LABEL: v_fdiv_v2f16:
+; GFX8-FLUSH: ; %bb.0:
+; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v0
+; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v8, v6
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v2
+; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v7, v3
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v10, -v2, v9, v4
+; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v10, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v9, v4
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v8
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX8-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v9
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v5, v7, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v9, -v8, v5, v7
+; GFX8-FLUSH-NEXT: v_mac_f32_e32 v5, v9, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, -v8, v5, v7
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v4, v7, v4
+; GFX8-FLUSH-NEXT: v_and_b32_e32 v4, 0xff800000, v4
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v4, v4, v5
+; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v1, v0
+; GFX8-FLUSH-NEXT: v_div_fixup_f16 v1, v4, v6, v3
+; GFX8-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-IEEE-LABEL: v_fdiv_v2f16:
; GFX9-IEEE: ; %bb.0:
; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v4, 16, v1
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v5, v4
-; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v6, v0
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v6, v3
-; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v7, v5
-; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v3, v1, v0
-; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v5, v4, v2
+; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, v0
+; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v8, v6
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v2
+; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v7, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v2, v9
+; GFX9-IEEE-NEXT: v_add_f32_e32 v10, v10, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v9, v10, v9
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v9
+; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v8
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX9-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v9
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v7, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v9, -v8, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v9, v9, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v9, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v8, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v4, v7, v4
+; GFX9-IEEE-NEXT: v_and_b32_e32 v4, 0xff800000, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v4, v4, v5
+; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v2, v1, v0
+; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v4, v6, v3
; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -715,33 +942,103 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) {
; GFX9-FLUSH: ; %bb.0:
; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v3
-; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v0
+; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v7, v5
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
-; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v4, v4
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, v0
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v0, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v3, v5
-; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v7, v7
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v2
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v4, v0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v8, v2
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v4, v0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v8, v2
+; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v6, v7
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v4, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v7
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v4, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v6, v6, v7
+; GFX9-FLUSH-NEXT: v_and_b32_e32 v6, 0xff800000, v6
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v4, v6, v4
+; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v1, v0
+; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v4, v5, v3
+; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: v_fdiv_v2f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0
-; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2
-; GFX10-NEXT: v_rcp_f32_e32 v3, v3
-; GFX10-NEXT: v_rcp_f32_e32 v4, v4
-; GFX10-NEXT: v_fma_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_fma_mixlo_f16 v4, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_div_fixup_f16 v0, v3, v1, v0
-; GFX10-NEXT: v_div_fixup_f16 v1, v4, v2, v5
-; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
+; GFX10-IEEE-LABEL: v_fdiv_v2f16:
+; GFX10-IEEE: ; %bb.0:
+; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v8, v0
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GFX10-IEEE-NEXT: v_rcp_f32_e32 v6, v3
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v9, v5
+; GFX10-IEEE-NEXT: v_rcp_f32_e32 v7, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v8, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v9, v7
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v12, -v3, v10
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v13, -v4, v11
+; GFX10-IEEE-NEXT: v_add_f32_e32 v12, v12, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v13, v13, v9
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v12, v12, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v13, v13, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v12, v10
+; GFX10-IEEE-NEXT: v_add_f32_e32 v11, v13, v11
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v10
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v4, -v4, v11
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v4, v4, v9
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v4, v7
+; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-IEEE-NEXT: v_and_b32_e32 v4, 0xff800000, v4
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v10
+; GFX10-IEEE-NEXT: v_add_f32_e32 v4, v4, v11
+; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v3, v1, v0
+; GFX10-IEEE-NEXT: v_div_fixup_f16 v1, v4, v2, v5
+; GFX10-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-FLUSH-LABEL: v_fdiv_v2f16:
+; GFX10-FLUSH: ; %bb.0:
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v8, v0
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v9, v5
+; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v7, v4
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v10, v8, v6
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v11, v9, v7
+; GFX10-FLUSH-NEXT: v_mad_f32 v12, -v3, v10, v8
+; GFX10-FLUSH-NEXT: v_mad_f32 v13, -v4, v11, v9
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v10, v12, v6
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v11, v13, v7
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v10, v8
+; GFX10-FLUSH-NEXT: v_mad_f32 v4, -v4, v11, v9
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v7
+; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-FLUSH-NEXT: v_and_b32_e32 v4, 0xff800000, v4
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v10
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v4, v4, v11
+; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v3, v1, v0
+; GFX10-FLUSH-NEXT: v_div_fixup_f16 v1, v4, v2, v5
+; GFX10-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fdiv_v2f16:
; GFX11: ; %bb.0:
@@ -749,12 +1046,24 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) {
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
+; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v5
; GFX11-NEXT: v_rcp_f32_e32 v4, v4
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_mixlo_f16 v4, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_dual_mul_f32 v6, v6, v3 :: v_dual_mul_f32 v7, v7, v4
+; GFX11-NEXT: v_fma_mix_f32 v8, -v1, v6, v0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fma_mix_f32 v9, -v1, v7, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_dual_fmac_f32 v6, v8, v3 :: v_dual_fmac_f32 v7, v9, v4
+; GFX11-NEXT: v_fma_mix_f32 v8, -v1, v6, v0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fma_mix_f32 v9, -v1, v7, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_dual_mul_f32 v3, v8, v3 :: v_dual_mul_f32 v4, v9, v4
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX11-NEXT: v_dual_add_f32 v3, v3, v6 :: v_dual_and_b32 v4, 0xff800000, v4
+; GFX11-NEXT: v_add_f32_e32 v4, v4, v7
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX11-NEXT: v_div_fixup_f16 v0, v3, v1, v0
; GFX11-NEXT: v_div_fixup_f16 v1, v4, v2, v5
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
@@ -897,44 +1206,113 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: v_fdiv_v2f16_ulp25:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1
-; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX8-NEXT: v_cvt_f32_f16_e32 v5, v4
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-NEXT: v_cvt_f32_f16_e32 v6, v0
-; GFX8-NEXT: v_rcp_f32_e32 v3, v3
-; GFX8-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX8-NEXT: v_rcp_f32_e32 v5, v5
-; GFX8-NEXT: v_mul_f32_e32 v3, v6, v3
-; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX8-NEXT: v_mul_f32_e32 v5, v7, v5
-; GFX8-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX8-NEXT: v_div_fixup_f16 v0, v3, v1, v0
-; GFX8-NEXT: v_div_fixup_f16 v1, v5, v4, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
+; GFX8-IEEE-LABEL: v_fdiv_v2f16_ulp25:
+; GFX8-IEEE: ; %bb.0:
+; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, v0
+; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v8, v6
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v2
+; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v7, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v2, v9
+; GFX8-IEEE-NEXT: v_add_f32_e32 v10, v10, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v9, v10, v9
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v9
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v8
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX8-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v9
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v7, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v9, -v8, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v9, v9, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v9, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v8, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v4, v7, v4
+; GFX8-IEEE-NEXT: v_and_b32_e32 v4, 0xff800000, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v4, v4, v5
+; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v2, v1, v0
+; GFX8-IEEE-NEXT: v_div_fixup_f16 v1, v4, v6, v3
+; GFX8-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-IEEE-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-FLUSH-LABEL: v_fdiv_v2f16_ulp25:
+; GFX8-FLUSH: ; %bb.0:
+; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v0
+; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v8, v6
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v2
+; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v7, v3
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v10, -v2, v9, v4
+; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v10, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v9, v4
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v8
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX8-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v9
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v5, v7, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v9, -v8, v5, v7
+; GFX8-FLUSH-NEXT: v_mac_f32_e32 v5, v9, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, -v8, v5, v7
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v4, v7, v4
+; GFX8-FLUSH-NEXT: v_and_b32_e32 v4, 0xff800000, v4
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v4, v4, v5
+; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v1, v0
+; GFX8-FLUSH-NEXT: v_div_fixup_f16 v1, v4, v6, v3
+; GFX8-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-IEEE-LABEL: v_fdiv_v2f16_ulp25:
; GFX9-IEEE: ; %bb.0:
; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v4, 16, v1
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v5, v4
-; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v6, v0
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v6, v3
-; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v7, v5
-; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v3, v1, v0
-; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v5, v4, v2
+; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, v0
+; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v8, v6
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v2
+; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v7, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v2, v9
+; GFX9-IEEE-NEXT: v_add_f32_e32 v10, v10, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v9, v10, v9
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v9
+; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v8
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX9-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v9
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v7, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v9, -v8, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v9, v9, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v9, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v8, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v4, v7, v4
+; GFX9-IEEE-NEXT: v_and_b32_e32 v4, 0xff800000, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v4, v4, v5
+; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v2, v1, v0
+; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v4, v6, v3
; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -942,33 +1320,103 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX9-FLUSH: ; %bb.0:
; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v3
-; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v0
+; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v7, v5
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
-; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v4, v4
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, v0
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v0, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v3, v5
-; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v7, v7
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v2
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v4, v0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v8, v2
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v4, v0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v8, v2
+; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v6, v7
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v4, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v7
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v4, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v6, v6, v7
+; GFX9-FLUSH-NEXT: v_and_b32_e32 v6, 0xff800000, v6
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v4, v6, v4
+; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v1, v0
+; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v4, v5, v3
+; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: v_fdiv_v2f16_ulp25:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0
-; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2
-; GFX10-NEXT: v_rcp_f32_e32 v3, v3
-; GFX10-NEXT: v_rcp_f32_e32 v4, v4
-; GFX10-NEXT: v_fma_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_fma_mixlo_f16 v4, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_div_fixup_f16 v0, v3, v1, v0
-; GFX10-NEXT: v_div_fixup_f16 v1, v4, v2, v5
-; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
+; GFX10-IEEE-LABEL: v_fdiv_v2f16_ulp25:
+; GFX10-IEEE: ; %bb.0:
+; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v8, v0
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GFX10-IEEE-NEXT: v_rcp_f32_e32 v6, v3
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v9, v5
+; GFX10-IEEE-NEXT: v_rcp_f32_e32 v7, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v8, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v9, v7
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v12, -v3, v10
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v13, -v4, v11
+; GFX10-IEEE-NEXT: v_add_f32_e32 v12, v12, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v13, v13, v9
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v12, v12, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v13, v13, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v12, v10
+; GFX10-IEEE-NEXT: v_add_f32_e32 v11, v13, v11
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v10
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v4, -v4, v11
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v4, v4, v9
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v4, v7
+; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-IEEE-NEXT: v_and_b32_e32 v4, 0xff800000, v4
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v10
+; GFX10-IEEE-NEXT: v_add_f32_e32 v4, v4, v11
+; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v3, v1, v0
+; GFX10-IEEE-NEXT: v_div_fixup_f16 v1, v4, v2, v5
+; GFX10-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-FLUSH-LABEL: v_fdiv_v2f16_ulp25:
+; GFX10-FLUSH: ; %bb.0:
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v8, v0
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v9, v5
+; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v7, v4
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v10, v8, v6
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v11, v9, v7
+; GFX10-FLUSH-NEXT: v_mad_f32 v12, -v3, v10, v8
+; GFX10-FLUSH-NEXT: v_mad_f32 v13, -v4, v11, v9
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v10, v12, v6
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v11, v13, v7
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v10, v8
+; GFX10-FLUSH-NEXT: v_mad_f32 v4, -v4, v11, v9
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v7
+; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-FLUSH-NEXT: v_and_b32_e32 v4, 0xff800000, v4
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v10
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v4, v4, v11
+; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v3, v1, v0
+; GFX10-FLUSH-NEXT: v_div_fixup_f16 v1, v4, v2, v5
+; GFX10-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fdiv_v2f16_ulp25:
; GFX11: ; %bb.0:
@@ -976,12 +1424,24 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
+; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v5
; GFX11-NEXT: v_rcp_f32_e32 v4, v4
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_mixlo_f16 v4, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_dual_mul_f32 v6, v6, v3 :: v_dual_mul_f32 v7, v7, v4
+; GFX11-NEXT: v_fma_mix_f32 v8, -v1, v6, v0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fma_mix_f32 v9, -v1, v7, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_dual_fmac_f32 v6, v8, v3 :: v_dual_fmac_f32 v7, v9, v4
+; GFX11-NEXT: v_fma_mix_f32 v8, -v1, v6, v0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fma_mix_f32 v9, -v1, v7, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_dual_mul_f32 v3, v8, v3 :: v_dual_mul_f32 v4, v9, v4
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX11-NEXT: v_dual_add_f32 v3, v3, v6 :: v_dual_and_b32 v4, 0xff800000, v4
+; GFX11-NEXT: v_add_f32_e32 v4, v4, v7
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX11-NEXT: v_div_fixup_f16 v0, v3, v1, v0
; GFX11-NEXT: v_div_fixup_f16 v1, v4, v2, v5
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
@@ -1061,36 +1521,103 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: v_rcp_v2f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-NEXT: v_cvt_f32_f16_e32 v4, 1.0
-; GFX8-NEXT: v_rcp_f32_e32 v1, v1
-; GFX8-NEXT: v_rcp_f32_e32 v3, v3
-; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1
-; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
-; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
-; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
+; GFX8-IEEE-LABEL: v_rcp_v2f16:
+; GFX8-IEEE: ; %bb.0:
+; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
+; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
+; GFX8-IEEE-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
+; GFX8-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-IEEE-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-FLUSH-LABEL: v_rcp_v2f16:
+; GFX8-FLUSH: ; %bb.0:
+; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4
+; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6
+; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
+; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9
+; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
+; GFX8-FLUSH-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
+; GFX8-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-IEEE-LABEL: v_rcp_v2f16:
; GFX9-IEEE: ; %bb.0:
; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v1, v1
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v4, v1
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
+; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
+; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
+; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -1104,43 +1631,122 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, 1.0, v1, 0 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v1
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v1
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v1, v7, v1
+; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v3
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v5, v3
+; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, 1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, v2, 1.0
+; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: v_rcp_v2f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX10-NEXT: v_rcp_f32_e32 v2, v2
-; GFX10-NEXT: v_rcp_f32_e32 v3, v3
-; GFX10-NEXT: v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
-; GFX10-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
-; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
+; GFX10-IEEE-LABEL: v_rcp_v2f16:
+; GFX10-IEEE: ; %bb.0:
+; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, 1.0
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX10-IEEE-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
+; GFX10-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-FLUSH-LABEL: v_rcp_v2f16:
+; GFX10-FLUSH: ; %bb.0:
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, 1.0
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
+; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX10-FLUSH-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
+; GFX10-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_rcp_v2f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2
+; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v5
; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
+; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x
@@ -1218,36 +1824,103 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: v_neg_rcp_v2f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-NEXT: v_cvt_f32_f16_e32 v4, -1.0
-; GFX8-NEXT: v_rcp_f32_e32 v1, v1
-; GFX8-NEXT: v_rcp_f32_e32 v3, v3
-; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1
-; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
-; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
-; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, -1.0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
+; GFX8-IEEE-LABEL: v_neg_rcp_v2f16:
+; GFX8-IEEE: ; %bb.0:
+; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
+; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
+; GFX8-IEEE-NEXT: v_div_fixup_f16 v1, v3, v2, -1.0
+; GFX8-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-IEEE-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-FLUSH-LABEL: v_neg_rcp_v2f16:
+; GFX8-FLUSH: ; %bb.0:
+; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4
+; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6
+; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
+; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9
+; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
+; GFX8-FLUSH-NEXT: v_div_fixup_f16 v1, v3, v2, -1.0
+; GFX8-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-IEEE-LABEL: v_neg_rcp_v2f16:
; GFX9-IEEE: ; %bb.0:
; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v1, v1
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v4, v1
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
+; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
+; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
+; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
@@ -1261,43 +1934,122 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, -1.0, v1, 0 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v1
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v1
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v1, v7, v1
+; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v3
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v5, v3
+; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, -1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, v2, -1.0
+; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v3, v2, -1.0
; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: v_neg_rcp_v2f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX10-NEXT: v_rcp_f32_e32 v2, v2
-; GFX10-NEXT: v_rcp_f32_e32 v3, v3
-; GFX10-NEXT: v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
-; GFX10-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
-; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
+; GFX10-IEEE-LABEL: v_neg_rcp_v2f16:
+; GFX10-IEEE: ; %bb.0:
+; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, -1.0
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX10-IEEE-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX10-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-FLUSH-LABEL: v_neg_rcp_v2f16:
+; GFX10-FLUSH: ; %bb.0:
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, -1.0
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
+; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX10-FLUSH-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX10-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_neg_rcp_v2f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2
+; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v5
; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv <2 x half> <half -1.0, half -1.0>, %x
@@ -1385,38 +2137,106 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: v_rcp_v2f16_fabs:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-NEXT: v_cvt_f32_f16_e32 v4, 1.0
-; GFX8-NEXT: v_rcp_f32_e32 v1, v1
-; GFX8-NEXT: v_rcp_f32_e32 v3, v3
-; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1
-; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
-; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
-; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
+; GFX8-IEEE-LABEL: v_rcp_v2f16_fabs:
+; GFX8-IEEE: ; %bb.0:
+; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
+; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
+; GFX8-IEEE-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
+; GFX8-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-IEEE-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-FLUSH-LABEL: v_rcp_v2f16_fabs:
+; GFX8-FLUSH: ; %bb.0:
+; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4
+; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6
+; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
+; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9
+; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
+; GFX8-FLUSH-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
+; GFX8-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-IEEE-LABEL: v_rcp_v2f16_fabs:
; GFX9-IEEE: ; %bb.0:
; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v1, v1
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v4, v1
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
+; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
+; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
+; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -1427,50 +2247,131 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX9-FLUSH-LABEL: v_rcp_v2f16_fabs:
; GFX9-FLUSH: ; %bb.0:
; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
-; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, 1.0, v1, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, 1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, v2, 1.0
-; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v3
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, 1.0
+; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
+; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v4, v4
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v2
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v1, v6, 1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v5, v4
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v6, v7, v2
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v7, v4
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v6, 1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v0, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v8, v2
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v4
+; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX9-FLUSH-NEXT: v_and_b32_e32 v0, 0xff800000, v0
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v5
+; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
+; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v3, 1.0
+; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: v_rcp_v2f16_fabs:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX10-NEXT: v_rcp_f32_e32 v2, v2
-; GFX10-NEXT: v_rcp_f32_e32 v3, v3
-; GFX10-NEXT: v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
-; GFX10-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
-; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
+; GFX10-IEEE-LABEL: v_rcp_v2f16_fabs:
+; GFX10-IEEE: ; %bb.0:
+; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, 1.0
+; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX10-IEEE-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
+; GFX10-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-FLUSH-LABEL: v_rcp_v2f16_fabs:
+; GFX10-FLUSH: ; %bb.0:
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, 1.0
+; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
+; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX10-FLUSH-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
+; GFX10-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_rcp_v2f16_fabs:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v5, 1.0
+; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX11-NEXT: v_mul_f32_e32 v6, v5, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, 1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v3
+; GFX11-NEXT: v_rcp_f32_e32 v4, v4
+; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, 1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f32_e32 v5, v5, v4
+; GFX11-NEXT: v_fma_mix_f32 v8, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v5, v8, v4
+; GFX11-NEXT: v_fma_mix_f32 v0, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_dual_mul_f32 v3, v7, v3 :: v_dual_mul_f32 v0, v0, v4
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff800000, v0
+; GFX11-NEXT: v_dual_add_f32 v0, v0, v5 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_div_fixup_f16 v0, v0, v2, 1.0
; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
-; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%x.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x)
%fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x.fabs
@@ -1558,38 +2459,106 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: v_neg_rcp_v2f16_fabs:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-NEXT: v_cvt_f32_f16_e32 v4, -1.0
-; GFX8-NEXT: v_rcp_f32_e32 v1, v1
-; GFX8-NEXT: v_rcp_f32_e32 v3, v3
-; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1
-; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
-; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
-; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, -1.0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
+; GFX8-IEEE-LABEL: v_neg_rcp_v2f16_fabs:
+; GFX8-IEEE: ; %bb.0:
+; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
+; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
+; GFX8-IEEE-NEXT: v_div_fixup_f16 v1, v3, v2, -1.0
+; GFX8-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-IEEE-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-FLUSH-LABEL: v_neg_rcp_v2f16_fabs:
+; GFX8-FLUSH: ; %bb.0:
+; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4
+; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6
+; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
+; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9
+; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
+; GFX8-FLUSH-NEXT: v_div_fixup_f16 v1, v3, v2, -1.0
+; GFX8-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-IEEE-LABEL: v_neg_rcp_v2f16_fabs:
; GFX9-IEEE: ; %bb.0:
; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v1, v1
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v4, v1
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
+; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
+; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
+; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
@@ -1600,50 +2569,131 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX9-FLUSH-LABEL: v_neg_rcp_v2f16_fabs:
; GFX9-FLUSH: ; %bb.0:
; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
-; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, -1.0, v1, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, -1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, v2, -1.0
-; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v3
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, -1.0
+; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
+; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v4, v4
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v2
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v1, v6, -1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v5, v4
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v6, v7, v2
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v7, v4
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v6, -1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v0, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v8, v2
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v4
+; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX9-FLUSH-NEXT: v_and_b32_e32 v0, 0xff800000, v0
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v5
+; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
+; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v3, -1.0
+; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: v_neg_rcp_v2f16_fabs:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX10-NEXT: v_rcp_f32_e32 v2, v2
-; GFX10-NEXT: v_rcp_f32_e32 v3, v3
-; GFX10-NEXT: v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
-; GFX10-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
-; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
+; GFX10-IEEE-LABEL: v_neg_rcp_v2f16_fabs:
+; GFX10-IEEE: ; %bb.0:
+; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, -1.0
+; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX10-IEEE-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX10-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-FLUSH-LABEL: v_neg_rcp_v2f16_fabs:
+; GFX10-FLUSH: ; %bb.0:
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, -1.0
+; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
+; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX10-FLUSH-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX10-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_neg_rcp_v2f16_fabs:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v5, -1.0
+; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX11-NEXT: v_mul_f32_e32 v6, v5, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v3
+; GFX11-NEXT: v_rcp_f32_e32 v4, v4
+; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f32_e32 v5, v5, v4
+; GFX11-NEXT: v_fma_mix_f32 v8, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v5, v8, v4
+; GFX11-NEXT: v_fma_mix_f32 v0, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_dual_mul_f32 v3, v7, v3 :: v_dual_mul_f32 v0, v0, v4
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff800000, v0
+; GFX11-NEXT: v_dual_add_f32 v0, v0, v5 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_div_fixup_f16 v0, v0, v2, -1.0
; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
-; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%x.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x)
%fdiv = fdiv <2 x half> <half -1.0, half -1.0>, %x.fabs
@@ -1881,36 +2931,103 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: v_rcp_v2f16_ulp25:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-NEXT: v_cvt_f32_f16_e32 v4, 1.0
-; GFX8-NEXT: v_rcp_f32_e32 v1, v1
-; GFX8-NEXT: v_rcp_f32_e32 v3, v3
-; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1
-; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
-; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
-; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: s_setpc_b64 s[30:31]
+; GFX8-IEEE-LABEL: v_rcp_v2f16_ulp25:
+; GFX8-IEEE: ; %bb.0:
+; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
+; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
+; GFX8-IEEE-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
+; GFX8-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-IEEE-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-FLUSH-LABEL: v_rcp_v2f16_ulp25:
+; GFX8-FLUSH: ; %bb.0:
+; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4
+; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6
+; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
+; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9
+; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
+; GFX8-FLUSH-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
+; GFX8-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-IEEE-LABEL: v_rcp_v2f16_ulp25:
; GFX9-IEEE: ; %bb.0:
; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v1, v1
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v4, v1
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
+; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
+; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
+; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -1924,43 +3041,122 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, 1.0, v1, 0 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v1
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v1
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v1, v7, v1
+; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v3
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v5, v3
+; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, 1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, v2, 1.0
+; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: v_rcp_v2f16_ulp25:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX10-NEXT: v_rcp_f32_e32 v2, v2
-; GFX10-NEXT: v_rcp_f32_e32 v3, v3
-; GFX10-NEXT: v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
-; GFX10-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
-; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX10-NEXT: s_setpc_b64 s[30:31]
+; GFX10-IEEE-LABEL: v_rcp_v2f16_ulp25:
+; GFX10-IEEE: ; %bb.0:
+; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, 1.0
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX10-IEEE-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
+; GFX10-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-FLUSH-LABEL: v_rcp_v2f16_ulp25:
+; GFX10-FLUSH: ; %bb.0:
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, 1.0
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
+; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX10-FLUSH-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
+; GFX10-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_rcp_v2f16_ulp25:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2
+; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v5
; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
+; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x
@@ -2251,24 +3447,60 @@ define amdgpu_ps i16 @s_fdiv_f16(i16 inreg %a.arg, i16 inreg %b.arg) {
; GFX6-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
; GFX6-FLUSH-NEXT: ; return to shader part epilog
;
-; GFX8-LABEL: s_fdiv_f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: v_cvt_f32_f16_e32 v0, s1
-; GFX8-NEXT: v_cvt_f32_f16_e32 v1, s0
-; GFX8-NEXT: v_rcp_f32_e32 v0, v0
-; GFX8-NEXT: v_mul_f32_e32 v0, v1, v0
-; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_div_fixup_f16 v0, v0, v1, s0
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: ; return to shader part epilog
+; GFX8-IEEE-LABEL: s_fdiv_f16:
+; GFX8-IEEE: ; %bb.0:
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s1
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s0
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v2, v0
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v1, v2
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v4, -v0, v3
+; GFX8-IEEE-NEXT: v_add_f32_e32 v4, v4, v1
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v4, v4, v2
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v0, -v0, v3
+; GFX8-IEEE-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX8-IEEE-NEXT: v_and_b32_e32 v0, 0xff800000, v0
+; GFX8-IEEE-NEXT: v_add_f32_e32 v0, v0, v3
+; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX8-IEEE-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v0, v1, s0
+; GFX8-IEEE-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-IEEE-NEXT: ; return to shader part epilog
+;
+; GFX8-FLUSH-LABEL: s_fdiv_f16:
+; GFX8-FLUSH: ; %bb.0:
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s1
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s0
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v2, v0
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v1, v2
+; GFX8-FLUSH-NEXT: v_mad_f32 v4, -v0, v3, v1
+; GFX8-FLUSH-NEXT: v_mac_f32_e32 v3, v4, v2
+; GFX8-FLUSH-NEXT: v_mad_f32 v0, -v0, v3, v1
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX8-FLUSH-NEXT: v_and_b32_e32 v0, 0xff800000, v0
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v0, v0, v3
+; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX8-FLUSH-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v1, s0
+; GFX8-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-FLUSH-NEXT: ; return to shader part epilog
;
; GFX9-IEEE-LABEL: s_fdiv_f16:
; GFX9-IEEE: ; %bb.0:
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s1
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s0
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v0, v0
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v2, v0
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v1, v2
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v4, -v0, v3
+; GFX9-IEEE-NEXT: v_add_f32_e32 v4, v4, v1
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v4, v4, v2
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v0, -v0, v3
+; GFX9-IEEE-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX9-IEEE-NEXT: v_and_b32_e32 v0, 0xff800000, v0
+; GFX9-IEEE-NEXT: v_add_f32_e32 v0, v0, v3
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-IEEE-NEXT: v_mov_b32_e32 v1, s1
; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v0, v1, s0
@@ -2278,28 +3510,72 @@ define amdgpu_ps i16 @s_fdiv_f16(i16 inreg %a.arg, i16 inreg %b.arg) {
; GFX9-FLUSH-LABEL: s_fdiv_f16:
; GFX9-FLUSH: ; %bb.0:
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s1
-; GFX9-FLUSH-NEXT: v_mov_b32_e32 v1, s1
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s0
+; GFX9-FLUSH-NEXT: v_mov_b32_e32 v2, s1
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v0, v0
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v1, s0
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v3, -v2, v1, s0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v1, v3, v0
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v3, -v2, v1, s0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v0, v3, v0
+; GFX9-FLUSH-NEXT: v_and_b32_e32 v0, 0xff800000, v0
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v2, s0
; GFX9-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-FLUSH-NEXT: ; return to shader part epilog
;
-; GFX10-LABEL: s_fdiv_f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: v_cvt_f32_f16_e32 v0, s1
-; GFX10-NEXT: v_rcp_f32_e32 v0, v0
-; GFX10-NEXT: v_fma_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_div_fixup_f16 v0, v0, s1, s0
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: ; return to shader part epilog
+; GFX10-IEEE-LABEL: s_fdiv_f16:
+; GFX10-IEEE: ; %bb.0:
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s1
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, s0
+; GFX10-IEEE-NEXT: v_rcp_f32_e32 v1, v0
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v2, v1
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v4, -v0, v3
+; GFX10-IEEE-NEXT: v_add_f32_e32 v4, v4, v2
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v4, v1
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v0, -v0, v3
+; GFX10-IEEE-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX10-IEEE-NEXT: v_and_b32_e32 v0, 0xff800000, v0
+; GFX10-IEEE-NEXT: v_add_f32_e32 v0, v0, v3
+; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v0, s1, s0
+; GFX10-IEEE-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-IEEE-NEXT: ; return to shader part epilog
+;
+; GFX10-FLUSH-LABEL: s_fdiv_f16:
+; GFX10-FLUSH: ; %bb.0:
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s1
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, s0
+; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v1, v0
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v2, v1
+; GFX10-FLUSH-NEXT: v_mad_f32 v4, -v0, v3, v2
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v3, v4, v1
+; GFX10-FLUSH-NEXT: v_mad_f32 v0, -v0, v3, v2
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX10-FLUSH-NEXT: v_and_b32_e32 v0, 0xff800000, v0
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v0, v0, v3
+; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v0, s1, s0
+; GFX10-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-FLUSH-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fdiv_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s1
+; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s0
; GFX11-NEXT: v_rcp_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_fma_mix_f32 v2, -s1, v1, s0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v1, v2, v0
+; GFX11-NEXT: v_fma_mix_f32 v2, -s1, v1, s0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_mul_f32_e32 v0, v2, v0
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff800000, v0
+; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-NEXT: v_div_fixup_f16 v0, v0, s1, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
@@ -2499,42 +3775,113 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) {
; GFX6-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
; GFX6-FLUSH-NEXT: ; return to shader part epilog
;
-; GFX8-LABEL: s_fdiv_v2f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: v_cvt_f32_f16_e32 v0, s1
-; GFX8-NEXT: s_lshr_b32 s3, s1, 16
-; GFX8-NEXT: v_cvt_f32_f16_e32 v1, s3
-; GFX8-NEXT: s_lshr_b32 s2, s0, 16
-; GFX8-NEXT: v_cvt_f32_f16_e32 v2, s0
-; GFX8-NEXT: v_rcp_f32_e32 v0, v0
-; GFX8-NEXT: v_cvt_f32_f16_e32 v3, s2
-; GFX8-NEXT: v_rcp_f32_e32 v1, v1
-; GFX8-NEXT: v_mul_f32_e32 v0, v2, v0
-; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX8-NEXT: v_mul_f32_e32 v1, v3, v1
-; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, s1
-; GFX8-NEXT: v_div_fixup_f16 v0, v0, v2, s0
-; GFX8-NEXT: v_mov_b32_e32 v2, s3
-; GFX8-NEXT: v_div_fixup_f16 v1, v1, v2, s2
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: ; return to shader part epilog
+; GFX8-IEEE-LABEL: s_fdiv_v2f16:
+; GFX8-IEEE: ; %bb.0:
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s1
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s0
+; GFX8-IEEE-NEXT: s_lshr_b32 s3, s1, 16
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, s3
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v2, v0
+; GFX8-IEEE-NEXT: s_lshr_b32 s2, s0, 16
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, s2
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v1, v2
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v6, -v0, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, v6, v1
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v2
+; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v6, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v0, -v0, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v1, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX8-IEEE-NEXT: v_and_b32_e32 v0, 0xff800000, v0
+; GFX8-IEEE-NEXT: v_add_f32_e32 v0, v0, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v3, v1
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v5, -v4, v2
+; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v1
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v5, v2
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v4, -v4, v2
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v3, v1
+; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
+; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX8-IEEE-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v0, v2, s0
+; GFX8-IEEE-NEXT: v_mov_b32_e32 v2, s3
+; GFX8-IEEE-NEXT: v_div_fixup_f16 v1, v1, v2, s2
+; GFX8-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-IEEE-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-IEEE-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-IEEE-NEXT: ; return to shader part epilog
+;
+; GFX8-FLUSH-LABEL: s_fdiv_v2f16:
+; GFX8-FLUSH: ; %bb.0:
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s1
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s0
+; GFX8-FLUSH-NEXT: s_lshr_b32 s3, s1, 16
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, s3
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v2, v0
+; GFX8-FLUSH-NEXT: s_lshr_b32 s2, s0, 16
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, s2
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v5, v1, v2
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, -v0, v5, v1
+; GFX8-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v2
+; GFX8-FLUSH-NEXT: v_mad_f32 v0, -v0, v5, v1
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v1, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX8-FLUSH-NEXT: v_and_b32_e32 v0, 0xff800000, v0
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v0, v0, v5
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v3, v1
+; GFX8-FLUSH-NEXT: v_mad_f32 v5, -v4, v2, v3
+; GFX8-FLUSH-NEXT: v_mac_f32_e32 v2, v5, v1
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v4, v2, v3
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v3, v1
+; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
+; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX8-FLUSH-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v2, s0
+; GFX8-FLUSH-NEXT: v_mov_b32_e32 v2, s3
+; GFX8-FLUSH-NEXT: v_div_fixup_f16 v1, v1, v2, s2
+; GFX8-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-FLUSH-NEXT: ; return to shader part epilog
;
; GFX9-IEEE-LABEL: s_fdiv_v2f16:
; GFX9-IEEE: ; %bb.0:
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s1
+; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s0
; GFX9-IEEE-NEXT: s_lshr_b32 s3, s1, 16
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s3
+; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, s3
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v2, v0
; GFX9-IEEE-NEXT: s_lshr_b32 s2, s0, 16
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, s0
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v0, v0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, s2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v1, v1
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v0, v2, v0
-; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v1, v2
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v6, -v0, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v6, v6, v1
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v6, v2
+; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v6, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v0, -v0, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v1, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX9-IEEE-NEXT: v_and_b32_e32 v0, 0xff800000, v0
+; GFX9-IEEE-NEXT: v_add_f32_e32 v0, v0, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v3, v1
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v5, -v4, v2
+; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v1
+; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v5, v2
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v4, -v4, v2
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v4, v3
; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v3, v1
+; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
+; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v2
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-IEEE-NEXT: v_mov_b32_e32 v2, s1
; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v0, v2, s0
@@ -2547,36 +3894,106 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) {
; GFX9-FLUSH-LABEL: s_fdiv_v2f16:
; GFX9-FLUSH: ; %bb.0:
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s1
-; GFX9-FLUSH-NEXT: s_lshr_b32 s2, s1, 16
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s2
-; GFX9-FLUSH-NEXT: s_lshr_b32 s3, s0, 16
-; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v0, v0
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s0
+; GFX9-FLUSH-NEXT: s_lshr_b32 s3, s1, 16
; GFX9-FLUSH-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v0, v0
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, s3
+; GFX9-FLUSH-NEXT: s_lshr_b32 s2, s0, 16
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v2, v1, s0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v1, v4, v0
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v2, v1, s0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v0, v4, v0
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, s2
+; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
+; GFX9-FLUSH-NEXT: v_and_b32_e32 v0, 0xff800000, v0
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX9-FLUSH-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v1, v4, s2 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v5, v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v1, v4, s2 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v5, v3
+; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v2, s0
-; GFX9-FLUSH-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, s3, v1, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, v2, s3
+; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v3, v1, s2
; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX9-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-FLUSH-NEXT: ; return to shader part epilog
;
-; GFX10-LABEL: s_fdiv_v2f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_lshr_b32 s2, s1, 16
-; GFX10-NEXT: v_cvt_f32_f16_e32 v0, s1
-; GFX10-NEXT: v_cvt_f32_f16_e32 v1, s2
-; GFX10-NEXT: s_lshr_b32 s3, s0, 16
-; GFX10-NEXT: v_rcp_f32_e32 v0, v0
-; GFX10-NEXT: v_rcp_f32_e32 v1, v1
-; GFX10-NEXT: v_fma_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_fma_mixlo_f16 v1, s3, v1, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_div_fixup_f16 v0, v0, s1, s0
-; GFX10-NEXT: v_div_fixup_f16 v1, v1, s2, s3
-; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: ; return to shader part epilog
+; GFX10-IEEE-LABEL: s_fdiv_v2f16:
+; GFX10-IEEE: ; %bb.0:
+; GFX10-IEEE-NEXT: s_lshr_b32 s2, s1, 16
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s1
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s2
+; GFX10-IEEE-NEXT: s_lshr_b32 s3, s0, 16
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, s0
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v5, s3
+; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v0
+; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v1
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v4, v2
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v5, v3
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v8, -v0, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v1, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
+; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v5
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v8, v2
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v3
+; GFX10-IEEE-NEXT: v_add_f32_e32 v6, v8, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v0, -v0, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v0, v0, v4
+; GFX10-IEEE-NEXT: v_add_f32_e32 v1, v1, v5
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX10-IEEE-NEXT: v_and_b32_e32 v0, 0xff800000, v0
+; GFX10-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
+; GFX10-IEEE-NEXT: v_add_f32_e32 v0, v0, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v0, s1, s0
+; GFX10-IEEE-NEXT: v_div_fixup_f16 v1, v1, s2, s3
+; GFX10-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-IEEE-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-IEEE-NEXT: ; return to shader part epilog
+;
+; GFX10-FLUSH-LABEL: s_fdiv_v2f16:
+; GFX10-FLUSH: ; %bb.0:
+; GFX10-FLUSH-NEXT: s_lshr_b32 s2, s1, 16
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s1
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s2
+; GFX10-FLUSH-NEXT: s_lshr_b32 s3, s0, 16
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, s0
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, s3
+; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v2, v0
+; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v1
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v6, v4, v2
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v5, v3
+; GFX10-FLUSH-NEXT: v_mad_f32 v8, -v0, v6, v4
+; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v1, v7, v5
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v6, v8, v2
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v3
+; GFX10-FLUSH-NEXT: v_mad_f32 v0, -v0, v6, v4
+; GFX10-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v5
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX10-FLUSH-NEXT: v_and_b32_e32 v0, 0xff800000, v0
+; GFX10-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v0, v0, v6
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v0, s1, s0
+; GFX10-FLUSH-NEXT: v_div_fixup_f16 v1, v1, s2, s3
+; GFX10-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-FLUSH-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_fdiv_v2f16:
; GFX11: ; %bb.0:
@@ -2584,13 +4001,25 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) {
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s1
; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s2
; GFX11-NEXT: s_lshr_b32 s3, s0, 16
+; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v3, s3
; GFX11-NEXT: v_rcp_f32_e32 v0, v0
; GFX11-NEXT: v_rcp_f32_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_mixlo_f16 v1, s3, v1, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_div_fixup_f16 v0, v0, s1, s0
+; GFX11-NEXT: v_dual_mul_f32 v2, v2, v0 :: v_dual_mul_f32 v3, v3, v1
+; GFX11-NEXT: v_fma_mix_f32 v4, -s1, v2, s0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fma_mix_f32 v5, -s2, v3, s3 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_dual_fmac_f32 v2, v4, v0 :: v_dual_fmac_f32 v3, v5, v1
+; GFX11-NEXT: v_fma_mix_f32 v4, -s1, v2, s0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fma_mix_f32 v5, -s2, v3, s3 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_dual_mul_f32 v0, v4, v0 :: v_dual_mul_f32 v1, v5, v1
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff800000, v1
+; GFX11-NEXT: v_dual_add_f32 v1, v1, v3 :: v_dual_and_b32 v0, 0xff800000, v0
+; GFX11-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-NEXT: v_div_fixup_f16 v1, v1, s2, s3
+; GFX11-NEXT: v_div_fixup_f16 v0, v0, s1, s0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
@@ -2896,26 +4325,77 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX6-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
; GFX6-FLUSH-NEXT: ; return to shader part epilog
;
-; GFX8-LABEL: s_rsq_v2f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: v_sqrt_f16_e32 v0, s0
-; GFX8-NEXT: s_lshr_b32 s0, s0, 16
-; GFX8-NEXT: v_sqrt_f16_e32 v1, s0
-; GFX8-NEXT: v_cvt_f32_f16_e32 v4, -1.0
-; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX8-NEXT: v_rcp_f32_e32 v2, v2
-; GFX8-NEXT: v_rcp_f32_e32 v3, v3
-; GFX8-NEXT: v_mul_f32_e32 v2, v4, v2
-; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
-; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX8-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
-; GFX8-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: ; return to shader part epilog
+; GFX8-IEEE-LABEL: s_rsq_v2f16:
+; GFX8-IEEE: ; %bb.0:
+; GFX8-IEEE-NEXT: v_sqrt_f16_e32 v0, s0
+; GFX8-IEEE-NEXT: s_lshr_b32 s0, s0, 16
+; GFX8-IEEE-NEXT: v_sqrt_f16_e32 v1, s0
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v2
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v2, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v10, v10, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v10, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v8, v9
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX8-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX8-IEEE-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX8-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-IEEE-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-IEEE-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-IEEE-NEXT: ; return to shader part epilog
+;
+; GFX8-FLUSH-LABEL: s_rsq_v2f16:
+; GFX8-FLUSH: ; %bb.0:
+; GFX8-FLUSH-NEXT: v_sqrt_f16_e32 v0, s0
+; GFX8-FLUSH-NEXT: s_lshr_b32 s0, s0, 16
+; GFX8-FLUSH-NEXT: v_sqrt_f16_e32 v1, s0
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v2
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v2, v7, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4
+; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX8-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX8-FLUSH-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX8-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-FLUSH-NEXT: ; return to shader part epilog
;
; GFX9-IEEE-LABEL: s_rsq_v2f16:
; GFX9-IEEE: ; %bb.0:
@@ -2925,11 +4405,23 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v2, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v4, v2
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v2
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX9-IEEE-NEXT: v_fma_f32 v8, -v2, v7, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX9-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7
+; GFX9-IEEE-NEXT: v_fma_f32 v8, -v3, v9, v4
+; GFX9-IEEE-NEXT: v_fma_f32 v8, v8, v6, v9
+; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v4
+; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v8, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
@@ -2942,50 +4434,125 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v0, s0
; GFX9-FLUSH-NEXT: s_lshr_b32 s0, s0, 16
; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v1, s0
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v2
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v7, v2
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v6, v3
+; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, -1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
+; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX9-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
; GFX9-FLUSH-NEXT: ; return to shader part epilog
;
-; GFX10-LABEL: s_rsq_v2f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_lshr_b32 s1, s0, 16
-; GFX10-NEXT: v_sqrt_f16_e32 v0, s0
-; GFX10-NEXT: v_sqrt_f16_e32 v1, s1
-; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX10-NEXT: v_rcp_f32_e32 v2, v2
-; GFX10-NEXT: v_rcp_f32_e32 v3, v3
-; GFX10-NEXT: v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
-; GFX10-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
-; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: ; return to shader part epilog
+; GFX10-IEEE-LABEL: s_rsq_v2f16:
+; GFX10-IEEE: ; %bb.0:
+; GFX10-IEEE-NEXT: s_lshr_b32 s1, s0, 16
+; GFX10-IEEE-NEXT: v_sqrt_f16_e32 v0, s0
+; GFX10-IEEE-NEXT: v_sqrt_f16_e32 v1, s1
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v2
+; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v3
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v2
+; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v7, v3
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v6, v2
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX10-IEEE-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX10-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-IEEE-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-IEEE-NEXT: ; return to shader part epilog
+;
+; GFX10-FLUSH-LABEL: s_rsq_v2f16:
+; GFX10-FLUSH: ; %bb.0:
+; GFX10-FLUSH-NEXT: s_lshr_b32 s1, s0, 16
+; GFX10-FLUSH-NEXT: v_sqrt_f16_e32 v0, s0
+; GFX10-FLUSH-NEXT: v_sqrt_f16_e32 v1, s1
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, -1.0
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
+; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX10-FLUSH-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX10-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-FLUSH-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_rsq_v2f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_lshr_b32 s1, s0, 16
; GFX11-NEXT: v_sqrt_f16_e32 v0, s0
; GFX11-NEXT: v_sqrt_f16_e32 v1, s1
+; GFX11-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2
+; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v5
; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
@@ -3876,25 +5443,75 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: v_rsq_v2f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_sqrt_f16_e32 v1, v0
-; GFX8-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_cvt_f32_f16_e32 v4, 1.0
-; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX8-NEXT: v_rcp_f32_e32 v2, v2
-; GFX8-NEXT: v_rcp_f32_e32 v3, v3
-; GFX8-NEXT: v_mul_f32_e32 v2, v4, v2
-; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
-; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX8-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0
-; GFX8-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
+; GFX8-IEEE-LABEL: v_rsq_v2f16:
+; GFX8-IEEE: ; %bb.0:
+; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-IEEE-NEXT: v_sqrt_f16_e32 v1, v0
+; GFX8-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v2
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v10, v10, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v9, v9, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v10, v10, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v9, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v10, v8
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v9, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX8-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0
+; GFX8-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
+; GFX8-IEEE-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-IEEE-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX8-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-FLUSH-LABEL: v_rsq_v2f16:
+; GFX8-FLUSH: ; %bb.0:
+; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0
+; GFX8-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v2
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v8, v4, v6
+; GFX8-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v4
+; GFX8-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v6
+; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX8-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0
+; GFX8-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
+; GFX8-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX8-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-IEEE-LABEL: v_rsq_v2f16:
; GFX9-IEEE: ; %bb.0:
@@ -3904,10 +5521,22 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v2, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v4, v2
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v2
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6
+; GFX9-IEEE-NEXT: v_fma_f32 v9, -v2, v7, v4
+; GFX9-IEEE-NEXT: v_fma_f32 v10, -v3, v8, v4
+; GFX9-IEEE-NEXT: v_fma_f32 v7, v9, v5, v7
+; GFX9-IEEE-NEXT: v_fma_f32 v8, v10, v6, v8
+; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v4
+; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v8, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
@@ -3920,38 +5549,100 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0
; GFX9-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v2
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v7, v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v6, v2
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0
; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: v_rsq_v2f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_sqrt_f16_e32 v1, v0
-; GFX10-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX10-NEXT: v_rcp_f32_e32 v2, v2
-; GFX10-NEXT: v_rcp_f32_e32 v3, v3
-; GFX10-NEXT: v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
-; GFX10-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0
-; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
+; GFX10-IEEE-LABEL: v_rsq_v2f16:
+; GFX10-IEEE: ; %bb.0:
+; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-IEEE-NEXT: v_sqrt_f16_e32 v1, v0
+; GFX10-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v2
+; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v3
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1]
+; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v2
+; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v7, v3
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1]
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v6, v2
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
+; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0
+; GFX10-IEEE-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-FLUSH-LABEL: v_rsq_v2f16:
+; GFX10-FLUSH: ; %bb.0:
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0
+; GFX10-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, 1.0
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
+; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
+; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0
+; GFX10-FLUSH-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_rsq_v2f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
@@ -3959,10 +5650,22 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, 1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, 1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2
+; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v5
; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
+; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
@@ -4054,25 +5757,75 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: v_neg_rsq_v2f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_sqrt_f16_e32 v1, v0
-; GFX8-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_cvt_f32_f16_e32 v4, -1.0
-; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX8-NEXT: v_rcp_f32_e32 v2, v2
-; GFX8-NEXT: v_rcp_f32_e32 v3, v3
-; GFX8-NEXT: v_mul_f32_e32 v2, v4, v2
-; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
-; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX8-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0
-; GFX8-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
+; GFX8-IEEE-LABEL: v_neg_rsq_v2f16:
+; GFX8-IEEE: ; %bb.0:
+; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-IEEE-NEXT: v_sqrt_f16_e32 v1, v0
+; GFX8-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v2
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v10, v10, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v9, v9, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v10, v10, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v9, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v10, v8
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v9, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX8-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0
+; GFX8-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
+; GFX8-IEEE-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-IEEE-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX8-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-FLUSH-LABEL: v_neg_rsq_v2f16:
+; GFX8-FLUSH: ; %bb.0:
+; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0
+; GFX8-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v2
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v8, v4, v6
+; GFX8-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v4
+; GFX8-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v6
+; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX8-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0
+; GFX8-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
+; GFX8-FLUSH-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX8-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-IEEE-LABEL: v_neg_rsq_v2f16:
; GFX9-IEEE: ; %bb.0:
@@ -4082,10 +5835,22 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v2, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v4, v2
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v2
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6
+; GFX9-IEEE-NEXT: v_fma_f32 v9, -v2, v7, v4
+; GFX9-IEEE-NEXT: v_fma_f32 v10, -v3, v8, v4
+; GFX9-IEEE-NEXT: v_fma_f32 v7, v9, v5, v7
+; GFX9-IEEE-NEXT: v_fma_f32 v8, v10, v6, v8
+; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v4
+; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v8, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
@@ -4098,38 +5863,100 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0
; GFX9-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v2
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v7, v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v6, v2
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0
; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: v_neg_rsq_v2f16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_sqrt_f16_e32 v1, v0
-; GFX10-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX10-NEXT: v_rcp_f32_e32 v2, v2
-; GFX10-NEXT: v_rcp_f32_e32 v3, v3
-; GFX10-NEXT: v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
-; GFX10-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0
-; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
+; GFX10-IEEE-LABEL: v_neg_rsq_v2f16:
+; GFX10-IEEE: ; %bb.0:
+; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-IEEE-NEXT: v_sqrt_f16_e32 v1, v0
+; GFX10-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v2
+; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v3
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v2
+; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v7, v3
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v6, v2
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
+; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0
+; GFX10-IEEE-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-FLUSH-LABEL: v_neg_rsq_v2f16:
+; GFX10-FLUSH: ; %bb.0:
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0
+; GFX10-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, -1.0
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
+; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
+; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0
+; GFX10-FLUSH-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_neg_rsq_v2f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
@@ -4137,10 +5964,22 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2
+; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v5
; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
@@ -4154,9 +5993,5 @@ declare <2 x half> @llvm.fabs.v2f16(<2 x half>)
declare <2 x half> @llvm.sqrt.v2f16(<2 x half>)
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX10-FLUSH: {{.*}}
-; GFX10-IEEE: {{.*}}
; GFX11-FLUSH: {{.*}}
; GFX11-IEEE: {{.*}}
-; GFX8-FLUSH: {{.*}}
-; GFX8-IEEE: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
index e051cc28469fae..8409e9c88aadaa 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
@@ -46,8 +46,14 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
; VI-NEXT: v_cvt_f32_f16_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_rcp_f32_e32 v2, v2
-; VI-NEXT: v_mul_f32_e32 v0, v0, v2
+; VI-NEXT: v_rcp_f32_e32 v3, v2
+; VI-NEXT: v_mul_f32_e32 v4, v0, v3
+; VI-NEXT: v_mad_f32 v5, -v2, v4, v0
+; VI-NEXT: v_mac_f32_e32 v4, v5, v3
+; VI-NEXT: v_mad_f32 v0, -v2, v4, v0
+; VI-NEXT: v_mul_f32_e32 v0, v0, v3
+; VI-NEXT: v_and_b32_e32 v0, 0xff800000, v0
+; VI-NEXT: v_add_f32_e32 v0, v0, v4
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s2
; VI-NEXT: v_trunc_f16_e32 v0, v0
@@ -554,19 +560,31 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
; VI-NEXT: v_cvt_f32_f16_e32 v2, s0
; VI-NEXT: s_lshr_b32 s3, s0, 16
-; VI-NEXT: v_cvt_f32_f16_e32 v3, s3
; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_rcp_f32_e32 v2, v2
; VI-NEXT: s_lshr_b32 s1, s2, 16
-; VI-NEXT: v_rcp_f32_e32 v3, v3
-; VI-NEXT: v_mul_f32_e32 v0, v0, v2
+; VI-NEXT: v_rcp_f32_e32 v3, v2
+; VI-NEXT: v_mul_f32_e32 v4, v0, v3
+; VI-NEXT: v_mad_f32 v5, -v2, v4, v0
+; VI-NEXT: v_mac_f32_e32 v4, v5, v3
+; VI-NEXT: v_mad_f32 v0, -v2, v4, v0
+; VI-NEXT: v_mul_f32_e32 v0, v0, v3
+; VI-NEXT: v_and_b32_e32 v0, 0xff800000, v0
+; VI-NEXT: v_add_f32_e32 v0, v0, v4
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; VI-NEXT: v_cvt_f32_f16_e32 v3, s3
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s2
; VI-NEXT: v_trunc_f16_e32 v0, v0
; VI-NEXT: v_fma_f16 v0, -v0, v1, s2
; VI-NEXT: v_cvt_f32_f16_e32 v1, s1
-; VI-NEXT: v_mul_f32_e32 v1, v1, v3
+; VI-NEXT: v_rcp_f32_e32 v4, v3
+; VI-NEXT: v_mul_f32_e32 v5, v1, v4
+; VI-NEXT: v_mad_f32 v6, -v3, v5, v1
+; VI-NEXT: v_mac_f32_e32 v5, v6, v4
+; VI-NEXT: v_mad_f32 v1, -v3, v5, v1
+; VI-NEXT: v_mul_f32_e32 v1, v1, v4
+; VI-NEXT: v_and_b32_e32 v1, 0xff800000, v1
+; VI-NEXT: v_add_f32_e32 v1, v1, v5
; VI-NEXT: v_cvt_f16_f32_e32 v1, v1
; VI-NEXT: v_div_fixup_f16 v1, v1, v2, s1
; VI-NEXT: v_trunc_f16_e32 v1, v1
@@ -691,41 +709,65 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
; VI-NEXT: v_cvt_f32_f16_e32 v2, s0
; VI-NEXT: s_lshr_b32 s8, s0, 16
-; VI-NEXT: v_cvt_f32_f16_e32 v3, s8
; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_rcp_f32_e32 v2, v2
; VI-NEXT: s_lshr_b32 s6, s2, 16
-; VI-NEXT: v_rcp_f32_e32 v3, v3
-; VI-NEXT: v_cvt_f32_f16_e32 v4, s1
-; VI-NEXT: v_mul_f32_e32 v0, v0, v2
+; VI-NEXT: v_rcp_f32_e32 v3, v2
+; VI-NEXT: s_lshr_b32 s9, s1, 16
+; VI-NEXT: s_lshr_b32 s7, s3, 16
+; VI-NEXT: v_mul_f32_e32 v4, v0, v3
+; VI-NEXT: v_mad_f32 v5, -v2, v4, v0
+; VI-NEXT: v_mac_f32_e32 v4, v5, v3
+; VI-NEXT: v_mad_f32 v0, -v2, v4, v0
+; VI-NEXT: v_mul_f32_e32 v0, v0, v3
+; VI-NEXT: v_and_b32_e32 v0, 0xff800000, v0
+; VI-NEXT: v_add_f32_e32 v0, v0, v4
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; VI-NEXT: v_cvt_f32_f16_e32 v3, s8
; VI-NEXT: v_mov_b32_e32 v2, s8
-; VI-NEXT: v_rcp_f32_e32 v4, v4
-; VI-NEXT: s_lshr_b32 s9, s1, 16
; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s2
; VI-NEXT: v_trunc_f16_e32 v0, v0
; VI-NEXT: v_fma_f16 v0, -v0, v1, s2
; VI-NEXT: v_cvt_f32_f16_e32 v1, s6
-; VI-NEXT: v_cvt_f32_f16_e32 v5, s9
-; VI-NEXT: s_lshr_b32 s7, s3, 16
-; VI-NEXT: v_mul_f32_e32 v1, v1, v3
+; VI-NEXT: v_rcp_f32_e32 v4, v3
+; VI-NEXT: v_mul_f32_e32 v5, v1, v4
+; VI-NEXT: v_mad_f32 v6, -v3, v5, v1
+; VI-NEXT: v_mac_f32_e32 v5, v6, v4
+; VI-NEXT: v_mad_f32 v1, -v3, v5, v1
+; VI-NEXT: v_mul_f32_e32 v1, v1, v4
+; VI-NEXT: v_and_b32_e32 v1, 0xff800000, v1
+; VI-NEXT: v_add_f32_e32 v1, v1, v5
; VI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; VI-NEXT: v_cvt_f32_f16_e32 v4, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_rcp_f32_e32 v5, v5
; VI-NEXT: v_div_fixup_f16 v1, v1, v2, s6
; VI-NEXT: v_trunc_f16_e32 v1, v1
; VI-NEXT: v_fma_f16 v1, -v1, v2, s6
; VI-NEXT: v_cvt_f32_f16_e32 v2, s3
+; VI-NEXT: v_rcp_f32_e32 v5, v4
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: v_mul_f32_e32 v2, v2, v4
+; VI-NEXT: v_mul_f32_e32 v6, v2, v5
+; VI-NEXT: v_mad_f32 v7, -v4, v6, v2
+; VI-NEXT: v_mac_f32_e32 v6, v7, v5
+; VI-NEXT: v_mad_f32 v2, -v4, v6, v2
+; VI-NEXT: v_mul_f32_e32 v2, v2, v5
+; VI-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; VI-NEXT: v_add_f32_e32 v2, v2, v6
; VI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; VI-NEXT: v_cvt_f32_f16_e32 v5, s9
; VI-NEXT: v_mov_b32_e32 v4, s9
; VI-NEXT: v_div_fixup_f16 v2, v2, v3, s3
; VI-NEXT: v_trunc_f16_e32 v2, v2
; VI-NEXT: v_fma_f16 v2, -v2, v3, s3
; VI-NEXT: v_cvt_f32_f16_e32 v3, s7
-; VI-NEXT: v_mul_f32_e32 v3, v3, v5
+; VI-NEXT: v_rcp_f32_e32 v6, v5
+; VI-NEXT: v_mul_f32_e32 v7, v3, v6
+; VI-NEXT: v_mad_f32 v8, -v5, v7, v3
+; VI-NEXT: v_mac_f32_e32 v7, v8, v6
+; VI-NEXT: v_mad_f32 v3, -v5, v7, v3
+; VI-NEXT: v_mul_f32_e32 v3, v3, v6
+; VI-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; VI-NEXT: v_add_f32_e32 v3, v3, v7
; VI-NEXT: v_cvt_f16_f32_e32 v3, v3
; VI-NEXT: v_div_fixup_f16 v3, v3, v4, s7
; VI-NEXT: v_trunc_f16_e32 v3, v3
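The frem updates in this file follow directly from the fdiv change: each half element goes through the same rcp/mad refinement and masked correction, and the remainder is then formed as n - trunc(n / d) * d via v_trunc_f16 and a negated v_fma_f16, as the VI checks above show. Below is a minimal scalar sketch of that outer pattern only; fremF16Model is an illustrative name, and an ordinary float division stands in for the refined fdiv16 expansion.

#include <cmath>

// Rough model of the frem_f16 pattern checked above.
float fremF16Model(float N, float D) {
  float Q = N / D;            // placeholder for the refined fdiv16 sequence
  Q = std::trunc(Q);          // v_trunc_f16
  return std::fma(-Q, D, N);  // v_fma_f16 with the negated quotient
}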
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir
index e774c2c83dfd8e..1f9c059c2ac60b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir
@@ -44,6 +44,7 @@ body: |
; SI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT6]](s32)
; SI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16)
; SI-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ;
; VI-LABEL: name: test_fdiv_s16
; VI: liveins: $vgpr0, $vgpr1
; VI-NEXT: {{ $}}
@@ -53,12 +54,24 @@ body: |
; VI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
; VI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
; VI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
+ ; VI-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]]
; VI-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
; VI-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
- ; VI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
+ ; VI-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FMUL]]
+ ; VI-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[FPEXT]]
+ ; VI-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FADD]], [[INT]]
+ ; VI-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[FMUL]]
+ ; VI-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FADD1]]
+ ; VI-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FPEXT]]
+ ; VI-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[FADD2]], [[INT]]
+ ; VI-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -8388608
+ ; VI-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[FMUL4]], [[C]]
+ ; VI-NEXT: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[AND]], [[FADD1]]
+ ; VI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD3]](s32)
; VI-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC1]](s16), [[TRUNC]](s16)
; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16)
; VI-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ;
; GFX9-LABEL: name: test_fdiv_s16
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
@@ -68,12 +81,24 @@ body: |
; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
; GFX9-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
; GFX9-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]]
; GFX9-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
- ; GFX9-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
+ ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FMUL]]
+ ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[FPEXT]]
+ ; GFX9-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FADD]], [[INT]]
+ ; GFX9-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[FMUL]]
+ ; GFX9-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FADD1]]
+ ; GFX9-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FPEXT]]
+ ; GFX9-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[FADD2]], [[INT]]
+ ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -8388608
+ ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[FMUL4]], [[C]]
+ ; GFX9-NEXT: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[AND]], [[FADD1]]
+ ; GFX9-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD3]](s32)
; GFX9-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC1]](s16), [[TRUNC]](s16)
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16)
; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ;
; GFX9-UNSAFE-LABEL: name: test_fdiv_s16
; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1
; GFX9-UNSAFE-NEXT: {{ $}}
@@ -85,21 +110,6 @@ body: |
; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC]], [[INT]]
; GFX9-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMUL]](s16)
; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
- ; GFX10-LABEL: name: test_fdiv_s16
- ; GFX10: liveins: $vgpr0, $vgpr1
- ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
- ; GFX10-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
- ; GFX10-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
- ; GFX10-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
- ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
- ; GFX10-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
- ; GFX10-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC1]](s16), [[TRUNC]](s16)
- ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16)
- ; GFX10-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s16) = G_TRUNC %0
@@ -141,6 +151,7 @@ body: |
; SI-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
; SI-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32)
; SI-NEXT: $vgpr0 = COPY [[INT6]](s32)
+ ;
; VI-LABEL: name: test_fdiv_s32_denorms_on
; VI: liveins: $vgpr0, $vgpr1
; VI-NEXT: {{ $}}
@@ -160,6 +171,7 @@ body: |
; VI-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
; VI-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32)
; VI-NEXT: $vgpr0 = COPY [[INT6]](s32)
+ ;
; GFX9-LABEL: name: test_fdiv_s32_denorms_on
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
@@ -179,6 +191,7 @@ body: |
; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32)
; GFX9-NEXT: $vgpr0 = COPY [[INT6]](s32)
+ ;
; GFX9-UNSAFE-LABEL: name: test_fdiv_s32_denorms_on
; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1
; GFX9-UNSAFE-NEXT: {{ $}}
@@ -187,6 +200,7 @@ body: |
; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32)
; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[INT]]
; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[FMUL]](s32)
+ ;
; GFX10-LABEL: name: test_fdiv_s32_denorms_on
; GFX10: liveins: $vgpr0, $vgpr1
; GFX10-NEXT: {{ $}}
@@ -246,6 +260,7 @@ body: |
; SI-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
; SI-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32)
; SI-NEXT: $vgpr0 = COPY [[INT6]](s32)
+ ;
; VI-LABEL: name: test_fdiv_s32_denorms_off
; VI: liveins: $vgpr0, $vgpr1
; VI-NEXT: {{ $}}
@@ -267,6 +282,7 @@ body: |
; VI-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
; VI-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32)
; VI-NEXT: $vgpr0 = COPY [[INT6]](s32)
+ ;
; GFX9-LABEL: name: test_fdiv_s32_denorms_off
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
@@ -288,6 +304,7 @@ body: |
; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32)
; GFX9-NEXT: $vgpr0 = COPY [[INT6]](s32)
+ ;
; GFX9-UNSAFE-LABEL: name: test_fdiv_s32_denorms_off
; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1
; GFX9-UNSAFE-NEXT: {{ $}}
@@ -296,6 +313,7 @@ body: |
; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32)
; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[INT]]
; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[FMUL]](s32)
+ ;
; GFX10-LABEL: name: test_fdiv_s32_denorms_off
; GFX10: liveins: $vgpr0, $vgpr1
; GFX10-NEXT: {{ $}}
@@ -357,6 +375,7 @@ body: |
; SI-NEXT: [[INT5:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
; SI-NEXT: [[INT6:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32)
; SI-NEXT: $vgpr0 = COPY [[INT6]](s32)
+ ;
; VI-LABEL: name: test_fdiv_s32_denorms_off_arcp
; VI: liveins: $vgpr0, $vgpr1
; VI-NEXT: {{ $}}
@@ -378,6 +397,7 @@ body: |
; VI-NEXT: [[INT5:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
; VI-NEXT: [[INT6:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32)
; VI-NEXT: $vgpr0 = COPY [[INT6]](s32)
+ ;
; GFX9-LABEL: name: test_fdiv_s32_denorms_off_arcp
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
@@ -399,6 +419,7 @@ body: |
; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32)
; GFX9-NEXT: $vgpr0 = COPY [[INT6]](s32)
+ ;
; GFX9-UNSAFE-LABEL: name: test_fdiv_s32_denorms_off_arcp
; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1
; GFX9-UNSAFE-NEXT: {{ $}}
@@ -407,6 +428,7 @@ body: |
; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32)
; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[COPY]], [[INT]]
; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[FMUL]](s32)
+ ;
; GFX10-LABEL: name: test_fdiv_s32_denorms_off_arcp
; GFX10: liveins: $vgpr0, $vgpr1
; GFX10-NEXT: {{ $}}
@@ -473,6 +495,7 @@ body: |
; SI-NEXT: [[INT5:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s64), [[FMA3]](s64), [[FMUL]](s64), [[XOR]](s1)
; SI-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY1]](s64), [[COPY]](s64)
; SI-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64)
+ ;
; VI-LABEL: name: test_fdiv_s64
; VI: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
; VI-NEXT: {{ $}}
@@ -492,6 +515,7 @@ body: |
; VI-NEXT: [[INT5:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s64), [[FMA3]](s64), [[FMUL]](s64), [[INT4]](s1)
; VI-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY1]](s64), [[COPY]](s64)
; VI-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64)
+ ;
; GFX9-LABEL: name: test_fdiv_s64
; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
; GFX9-NEXT: {{ $}}
@@ -511,6 +535,7 @@ body: |
; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s64), [[FMA3]](s64), [[FMUL]](s64), [[INT4]](s1)
; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY1]](s64), [[COPY]](s64)
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64)
+ ;
; GFX9-UNSAFE-LABEL: name: test_fdiv_s64
; GFX9-UNSAFE: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
; GFX9-UNSAFE-NEXT: {{ $}}
@@ -527,6 +552,7 @@ body: |
; GFX9-UNSAFE-NEXT: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL]], [[COPY]]
; GFX9-UNSAFE-NEXT: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FMA4]], [[FMA3]], [[FMUL]]
; GFX9-UNSAFE-NEXT: $vgpr0_vgpr1 = COPY [[FMA5]](s64)
+ ;
; GFX10-LABEL: name: test_fdiv_s64
; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
; GFX10-NEXT: {{ $}}
@@ -603,6 +629,7 @@ body: |
; SI-NEXT: [[INT13:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT12]](s32), [[UV3]](s32), [[UV1]](s32)
; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32)
; SI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+ ;
; VI-LABEL: name: test_fdiv_v2s32
; VI: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
; VI-NEXT: {{ $}}
@@ -641,6 +668,7 @@ body: |
; VI-NEXT: [[INT13:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT12]](s32), [[UV3]](s32), [[UV1]](s32)
; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32)
; VI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+ ;
; GFX9-LABEL: name: test_fdiv_v2s32
; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
; GFX9-NEXT: {{ $}}
@@ -679,6 +707,7 @@ body: |
; GFX9-NEXT: [[INT13:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT12]](s32), [[UV3]](s32), [[UV1]](s32)
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32)
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+ ;
; GFX9-UNSAFE-LABEL: name: test_fdiv_v2s32
; GFX9-UNSAFE: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
; GFX9-UNSAFE-NEXT: {{ $}}
@@ -692,6 +721,7 @@ body: |
; GFX9-UNSAFE-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[INT1]]
; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FMUL]](s32), [[FMUL1]](s32)
; GFX9-UNSAFE-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+ ;
; GFX10-LABEL: name: test_fdiv_v2s32
; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
; GFX10-NEXT: {{ $}}
@@ -776,6 +806,7 @@ body: |
; SI-NEXT: [[INT13:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT12]](s32), [[UV3]](s32), [[UV1]](s32)
; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32)
; SI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+ ;
; VI-LABEL: name: test_fdiv_v2s32_flags
; VI: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
; VI-NEXT: {{ $}}
@@ -810,6 +841,7 @@ body: |
; VI-NEXT: [[INT13:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT12]](s32), [[UV3]](s32), [[UV1]](s32)
; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32)
; VI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+ ;
; GFX9-LABEL: name: test_fdiv_v2s32_flags
; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
; GFX9-NEXT: {{ $}}
@@ -844,6 +876,7 @@ body: |
; GFX9-NEXT: [[INT13:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT12]](s32), [[UV3]](s32), [[UV1]](s32)
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32)
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+ ;
; GFX9-UNSAFE-LABEL: name: test_fdiv_v2s32_flags
; GFX9-UNSAFE: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
; GFX9-UNSAFE-NEXT: {{ $}}
@@ -857,6 +890,7 @@ body: |
; GFX9-UNSAFE-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = nnan G_FMUL [[UV1]], [[INT1]]
; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FMUL]](s32), [[FMUL1]](s32)
; GFX9-UNSAFE-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+ ;
; GFX10-LABEL: name: test_fdiv_v2s32_flags
; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
; GFX10-NEXT: {{ $}}
@@ -949,6 +983,7 @@ body: |
; SI-NEXT: [[INT20:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT19]](s32), [[UV5]](s32), [[UV2]](s32)
; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32), [[INT20]](s32)
; SI-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
+ ;
; VI-LABEL: name: test_fdiv_v3s32
; VI: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5
; VI-NEXT: {{ $}}
@@ -995,6 +1030,7 @@ body: |
; VI-NEXT: [[INT20:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT19]](s32), [[UV5]](s32), [[UV2]](s32)
; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32), [[INT20]](s32)
; VI-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
+ ;
; GFX9-LABEL: name: test_fdiv_v3s32
; GFX9: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5
; GFX9-NEXT: {{ $}}
@@ -1041,6 +1077,7 @@ body: |
; GFX9-NEXT: [[INT20:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT19]](s32), [[UV5]](s32), [[UV2]](s32)
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32), [[INT20]](s32)
; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
+ ;
; GFX9-UNSAFE-LABEL: name: test_fdiv_v3s32
; GFX9-UNSAFE: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5
; GFX9-UNSAFE-NEXT: {{ $}}
@@ -1056,6 +1093,7 @@ body: |
; GFX9-UNSAFE-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV2]], [[INT2]]
; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FMUL]](s32), [[FMUL1]](s32), [[FMUL2]](s32)
; GFX9-UNSAFE-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
+ ;
; GFX10-LABEL: name: test_fdiv_v3s32
; GFX10: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5
; GFX10-NEXT: {{ $}}
@@ -1162,6 +1200,7 @@ body: |
; SI-NEXT: [[INT13:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT12]](s64), [[UV3]](s64), [[UV1]](s64)
; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[INT6]](s64), [[INT13]](s64)
; SI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
+ ;
; VI-LABEL: name: test_fdiv_v2s64
; VI: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7
; VI-NEXT: {{ $}}
@@ -1196,6 +1235,7 @@ body: |
; VI-NEXT: [[INT13:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT12]](s64), [[UV3]](s64), [[UV1]](s64)
; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[INT6]](s64), [[INT13]](s64)
; VI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
+ ;
; GFX9-LABEL: name: test_fdiv_v2s64
; GFX9: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7
; GFX9-NEXT: {{ $}}
@@ -1230,6 +1270,7 @@ body: |
; GFX9-NEXT: [[INT13:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT12]](s64), [[UV3]](s64), [[UV1]](s64)
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[INT6]](s64), [[INT13]](s64)
; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
+ ;
; GFX9-UNSAFE-LABEL: name: test_fdiv_v2s64
; GFX9-UNSAFE: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7
; GFX9-UNSAFE-NEXT: {{ $}}
@@ -1258,6 +1299,7 @@ body: |
; GFX9-UNSAFE-NEXT: [[FMA11:%[0-9]+]]:_(s64) = G_FMA [[FMA10]], [[FMA9]], [[FMUL1]]
; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[FMA5]](s64), [[FMA11]](s64)
; GFX9-UNSAFE-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
+ ;
; GFX10-LABEL: name: test_fdiv_v2s64
; GFX10: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7
; GFX10-NEXT: {{ $}}
@@ -1355,6 +1397,7 @@ body: |
; SI-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]]
; SI-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
; SI-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
+ ;
; VI-LABEL: name: test_fdiv_v2s16
; VI: liveins: $vgpr0, $vgpr1
; VI-NEXT: {{ $}}
@@ -1371,15 +1414,36 @@ body: |
; VI-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
; VI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
; VI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16)
+ ; VI-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]]
; VI-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
; VI-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
- ; VI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
+ ; VI-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FMUL]]
+ ; VI-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[FPEXT]]
+ ; VI-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FADD]], [[INT]]
+ ; VI-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[FMUL]]
+ ; VI-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FADD1]]
+ ; VI-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FPEXT]]
+ ; VI-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[FADD2]], [[INT]]
+ ; VI-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -8388608
+ ; VI-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[FMUL4]], [[C1]]
+ ; VI-NEXT: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[AND]], [[FADD1]]
+ ; VI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD3]](s32)
; VI-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC2]](s16), [[TRUNC]](s16)
; VI-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
; VI-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16)
+ ; VI-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT3]]
; VI-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32)
- ; VI-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]]
- ; VI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32)
+ ; VI-NEXT: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]]
+ ; VI-NEXT: [[FMUL6:%[0-9]+]]:_(s32) = G_FMUL [[FNEG1]], [[FMUL5]]
+ ; VI-NEXT: [[FADD4:%[0-9]+]]:_(s32) = G_FADD [[FMUL6]], [[FPEXT2]]
+ ; VI-NEXT: [[FMUL7:%[0-9]+]]:_(s32) = G_FMUL [[FADD4]], [[INT2]]
+ ; VI-NEXT: [[FADD5:%[0-9]+]]:_(s32) = G_FADD [[FMUL7]], [[FMUL5]]
+ ; VI-NEXT: [[FMUL8:%[0-9]+]]:_(s32) = G_FMUL [[FNEG1]], [[FADD5]]
+ ; VI-NEXT: [[FADD6:%[0-9]+]]:_(s32) = G_FADD [[FMUL8]], [[FPEXT2]]
+ ; VI-NEXT: [[FMUL9:%[0-9]+]]:_(s32) = G_FMUL [[FADD6]], [[INT2]]
+ ; VI-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[FMUL9]], [[C1]]
+ ; VI-NEXT: [[FADD7:%[0-9]+]]:_(s32) = G_FADD [[AND1]], [[FADD5]]
+ ; VI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD7]](s32)
; VI-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC3]](s16), [[TRUNC1]](s16)
; VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[INT1]](s16)
; VI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[INT3]](s16)
@@ -1387,6 +1451,7 @@ body: |
; VI-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]]
; VI-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
; VI-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
+ ;
; GFX9-LABEL: name: test_fdiv_v2s16
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
@@ -1403,18 +1468,40 @@ body: |
; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
; GFX9-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
; GFX9-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16)
+ ; GFX9-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]]
; GFX9-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
- ; GFX9-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
+ ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FMUL]]
+ ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[FPEXT]]
+ ; GFX9-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FADD]], [[INT]]
+ ; GFX9-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[FMUL]]
+ ; GFX9-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FADD1]]
+ ; GFX9-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FPEXT]]
+ ; GFX9-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[FADD2]], [[INT]]
+ ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -8388608
+ ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[FMUL4]], [[C1]]
+ ; GFX9-NEXT: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[AND]], [[FADD1]]
+ ; GFX9-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD3]](s32)
; GFX9-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC2]](s16), [[TRUNC]](s16)
; GFX9-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
; GFX9-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16)
+ ; GFX9-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT3]]
; GFX9-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32)
- ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]]
- ; GFX9-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32)
+ ; GFX9-NEXT: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]]
+ ; GFX9-NEXT: [[FMUL6:%[0-9]+]]:_(s32) = G_FMUL [[FNEG1]], [[FMUL5]]
+ ; GFX9-NEXT: [[FADD4:%[0-9]+]]:_(s32) = G_FADD [[FMUL6]], [[FPEXT2]]
+ ; GFX9-NEXT: [[FMUL7:%[0-9]+]]:_(s32) = G_FMUL [[FADD4]], [[INT2]]
+ ; GFX9-NEXT: [[FADD5:%[0-9]+]]:_(s32) = G_FADD [[FMUL7]], [[FMUL5]]
+ ; GFX9-NEXT: [[FMUL8:%[0-9]+]]:_(s32) = G_FMUL [[FNEG1]], [[FADD5]]
+ ; GFX9-NEXT: [[FADD6:%[0-9]+]]:_(s32) = G_FADD [[FMUL8]], [[FPEXT2]]
+ ; GFX9-NEXT: [[FMUL9:%[0-9]+]]:_(s32) = G_FMUL [[FADD6]], [[INT2]]
+ ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[FMUL9]], [[C1]]
+ ; GFX9-NEXT: [[FADD7:%[0-9]+]]:_(s32) = G_FADD [[AND1]], [[FADD5]]
+ ; GFX9-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD7]](s32)
; GFX9-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC3]](s16), [[TRUNC1]](s16)
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT1]](s16), [[INT3]](s16)
; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
+ ;
; GFX9-UNSAFE-LABEL: name: test_fdiv_v2s16
; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1
; GFX9-UNSAFE-NEXT: {{ $}}
@@ -1435,34 +1522,6 @@ body: |
; GFX9-UNSAFE-NEXT: [[FMUL1:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC1]], [[INT1]]
; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FMUL]](s16), [[FMUL1]](s16)
; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
- ; GFX10-LABEL: name: test_fdiv_v2s16
- ; GFX10: liveins: $vgpr0, $vgpr1
- ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr0
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(<2 x s16>) = COPY $vgpr1
- ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
- ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[COPY1]](<2 x s16>)
- ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
- ; GFX10-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
- ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
- ; GFX10-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
- ; GFX10-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16)
- ; GFX10-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
- ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
- ; GFX10-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
- ; GFX10-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC2]](s16), [[TRUNC]](s16)
- ; GFX10-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
- ; GFX10-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16)
- ; GFX10-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32)
- ; GFX10-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]]
- ; GFX10-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32)
- ; GFX10-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC3]](s16), [[TRUNC1]](s16)
- ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT1]](s16), [[INT3]](s16)
- ; GFX10-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
%0:_(<2 x s16>) = COPY $vgpr0
%1:_(<2 x s16>) = COPY $vgpr1
%2:_(<2 x s16>) = G_FDIV %0, %1
@@ -1546,6 +1605,7 @@ body: |
; SI-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC2]](s16)
; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32)
; SI-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>)
+ ;
; VI-LABEL: name: test_fdiv_v3s16
; VI: liveins: $vgpr0, $vgpr1
; VI-NEXT: {{ $}}
@@ -1568,27 +1628,59 @@ body: |
; VI-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32)
; VI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
; VI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16)
+ ; VI-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]]
; VI-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
; VI-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
- ; VI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
+ ; VI-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FMUL]]
+ ; VI-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[FPEXT]]
+ ; VI-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FADD]], [[INT]]
+ ; VI-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[FMUL]]
+ ; VI-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FADD1]]
+ ; VI-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FPEXT]]
+ ; VI-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[FADD2]], [[INT]]
+ ; VI-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -8388608
+ ; VI-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[FMUL4]], [[C1]]
+ ; VI-NEXT: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[AND]], [[FADD1]]
+ ; VI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD3]](s32)
; VI-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC3]](s16), [[TRUNC]](s16)
; VI-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
; VI-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC4]](s16)
+ ; VI-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT3]]
; VI-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32)
- ; VI-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]]
- ; VI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32)
+ ; VI-NEXT: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]]
+ ; VI-NEXT: [[FMUL6:%[0-9]+]]:_(s32) = G_FMUL [[FNEG1]], [[FMUL5]]
+ ; VI-NEXT: [[FADD4:%[0-9]+]]:_(s32) = G_FADD [[FMUL6]], [[FPEXT2]]
+ ; VI-NEXT: [[FMUL7:%[0-9]+]]:_(s32) = G_FMUL [[FADD4]], [[INT2]]
+ ; VI-NEXT: [[FADD5:%[0-9]+]]:_(s32) = G_FADD [[FMUL7]], [[FMUL5]]
+ ; VI-NEXT: [[FMUL8:%[0-9]+]]:_(s32) = G_FMUL [[FNEG1]], [[FADD5]]
+ ; VI-NEXT: [[FADD6:%[0-9]+]]:_(s32) = G_FADD [[FMUL8]], [[FPEXT2]]
+ ; VI-NEXT: [[FMUL9:%[0-9]+]]:_(s32) = G_FMUL [[FADD6]], [[INT2]]
+ ; VI-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[FMUL9]], [[C1]]
+ ; VI-NEXT: [[FADD7:%[0-9]+]]:_(s32) = G_FADD [[AND1]], [[FADD5]]
+ ; VI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD7]](s32)
; VI-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC4]](s16), [[TRUNC1]](s16)
; VI-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16)
; VI-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC5]](s16)
+ ; VI-NEXT: [[FNEG2:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT5]]
; VI-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT5]](s32)
- ; VI-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT4]]
- ; VI-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL2]](s32)
+ ; VI-NEXT: [[FMUL10:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT4]]
+ ; VI-NEXT: [[FMUL11:%[0-9]+]]:_(s32) = G_FMUL [[FNEG2]], [[FMUL10]]
+ ; VI-NEXT: [[FADD8:%[0-9]+]]:_(s32) = G_FADD [[FMUL11]], [[FPEXT4]]
+ ; VI-NEXT: [[FMUL12:%[0-9]+]]:_(s32) = G_FMUL [[FADD8]], [[INT4]]
+ ; VI-NEXT: [[FADD9:%[0-9]+]]:_(s32) = G_FADD [[FMUL12]], [[FMUL10]]
+ ; VI-NEXT: [[FMUL13:%[0-9]+]]:_(s32) = G_FMUL [[FNEG2]], [[FADD9]]
+ ; VI-NEXT: [[FADD10:%[0-9]+]]:_(s32) = G_FADD [[FMUL13]], [[FPEXT4]]
+ ; VI-NEXT: [[FMUL14:%[0-9]+]]:_(s32) = G_FMUL [[FADD10]], [[INT4]]
+ ; VI-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[FMUL14]], [[C1]]
+ ; VI-NEXT: [[FADD11:%[0-9]+]]:_(s32) = G_FADD [[AND2]], [[FADD9]]
+ ; VI-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD11]](s32)
; VI-NEXT: [[INT5:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC2]](s16), [[TRUNC5]](s16), [[TRUNC2]](s16)
; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16)
; VI-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[INT3]](s16)
; VI-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[INT5]](s16)
; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32)
; VI-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>)
+ ;
; GFX9-LABEL: name: test_fdiv_v3s16
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
@@ -1611,27 +1703,59 @@ body: |
; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32)
; GFX9-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
; GFX9-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16)
+ ; GFX9-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]]
; GFX9-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
- ; GFX9-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
+ ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FMUL]]
+ ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[FPEXT]]
+ ; GFX9-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FADD]], [[INT]]
+ ; GFX9-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[FMUL]]
+ ; GFX9-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FADD1]]
+ ; GFX9-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FPEXT]]
+ ; GFX9-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[FADD2]], [[INT]]
+ ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -8388608
+ ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[FMUL4]], [[C1]]
+ ; GFX9-NEXT: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[AND]], [[FADD1]]
+ ; GFX9-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD3]](s32)
; GFX9-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC3]](s16), [[TRUNC]](s16)
; GFX9-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
; GFX9-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC4]](s16)
+ ; GFX9-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT3]]
; GFX9-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32)
- ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]]
- ; GFX9-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32)
+ ; GFX9-NEXT: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]]
+ ; GFX9-NEXT: [[FMUL6:%[0-9]+]]:_(s32) = G_FMUL [[FNEG1]], [[FMUL5]]
+ ; GFX9-NEXT: [[FADD4:%[0-9]+]]:_(s32) = G_FADD [[FMUL6]], [[FPEXT2]]
+ ; GFX9-NEXT: [[FMUL7:%[0-9]+]]:_(s32) = G_FMUL [[FADD4]], [[INT2]]
+ ; GFX9-NEXT: [[FADD5:%[0-9]+]]:_(s32) = G_FADD [[FMUL7]], [[FMUL5]]
+ ; GFX9-NEXT: [[FMUL8:%[0-9]+]]:_(s32) = G_FMUL [[FNEG1]], [[FADD5]]
+ ; GFX9-NEXT: [[FADD6:%[0-9]+]]:_(s32) = G_FADD [[FMUL8]], [[FPEXT2]]
+ ; GFX9-NEXT: [[FMUL9:%[0-9]+]]:_(s32) = G_FMUL [[FADD6]], [[INT2]]
+ ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[FMUL9]], [[C1]]
+ ; GFX9-NEXT: [[FADD7:%[0-9]+]]:_(s32) = G_FADD [[AND1]], [[FADD5]]
+ ; GFX9-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD7]](s32)
; GFX9-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC4]](s16), [[TRUNC1]](s16)
; GFX9-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16)
; GFX9-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC5]](s16)
+ ; GFX9-NEXT: [[FNEG2:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT5]]
; GFX9-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT5]](s32)
- ; GFX9-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT4]]
- ; GFX9-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL2]](s32)
+ ; GFX9-NEXT: [[FMUL10:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT4]]
+ ; GFX9-NEXT: [[FMUL11:%[0-9]+]]:_(s32) = G_FMUL [[FNEG2]], [[FMUL10]]
+ ; GFX9-NEXT: [[FADD8:%[0-9]+]]:_(s32) = G_FADD [[FMUL11]], [[FPEXT4]]
+ ; GFX9-NEXT: [[FMUL12:%[0-9]+]]:_(s32) = G_FMUL [[FADD8]], [[INT4]]
+ ; GFX9-NEXT: [[FADD9:%[0-9]+]]:_(s32) = G_FADD [[FMUL12]], [[FMUL10]]
+ ; GFX9-NEXT: [[FMUL13:%[0-9]+]]:_(s32) = G_FMUL [[FNEG2]], [[FADD9]]
+ ; GFX9-NEXT: [[FADD10:%[0-9]+]]:_(s32) = G_FADD [[FMUL13]], [[FPEXT4]]
+ ; GFX9-NEXT: [[FMUL14:%[0-9]+]]:_(s32) = G_FMUL [[FADD10]], [[INT4]]
+ ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[FMUL14]], [[C1]]
+ ; GFX9-NEXT: [[FADD11:%[0-9]+]]:_(s32) = G_FADD [[AND2]], [[FADD9]]
+ ; GFX9-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD11]](s32)
; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC2]](s16), [[TRUNC5]](s16), [[TRUNC2]](s16)
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16)
; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[INT3]](s16)
; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[INT5]](s16)
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32)
; GFX9-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>)
+ ;
; GFX9-UNSAFE-LABEL: name: test_fdiv_v3s16
; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1
; GFX9-UNSAFE-NEXT: {{ $}}
@@ -1663,49 +1787,6 @@ body: |
; GFX9-UNSAFE-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[FMUL2]](s16)
; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32)
; GFX9-UNSAFE-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>)
- ; GFX10-LABEL: name: test_fdiv_v3s16
- ; GFX10: liveins: $vgpr0, $vgpr1
- ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
- ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF]](<4 x s16>)
- ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
- ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
- ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
- ; GFX10-NEXT: [[DEF1:%[0-9]+]]:_(<4 x s16>) = G_IMPLICIT_DEF
- ; GFX10-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[DEF1]](<4 x s16>)
- ; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>)
- ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
- ; GFX10-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
- ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
- ; GFX10-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
- ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32)
- ; GFX10-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
- ; GFX10-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16)
- ; GFX10-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
- ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
- ; GFX10-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
- ; GFX10-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC3]](s16), [[TRUNC]](s16)
- ; GFX10-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
- ; GFX10-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC4]](s16)
- ; GFX10-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32)
- ; GFX10-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]]
- ; GFX10-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32)
- ; GFX10-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC4]](s16), [[TRUNC1]](s16)
- ; GFX10-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16)
- ; GFX10-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC5]](s16)
- ; GFX10-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT5]](s32)
- ; GFX10-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT4]]
- ; GFX10-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL2]](s32)
- ; GFX10-NEXT: [[INT5:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC2]](s16), [[TRUNC5]](s16), [[TRUNC2]](s16)
- ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16)
- ; GFX10-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[INT3]](s16)
- ; GFX10-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[INT5]](s16)
- ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32)
- ; GFX10-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>)
%0:_(<3 x s16>) = G_IMPLICIT_DEF
%1:_(<3 x s16>) = G_IMPLICIT_DEF
%2:_(<3 x s16>) = G_FDIV %0, %1
@@ -1816,6 +1897,7 @@ body: |
; SI-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
; SI-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>)
; SI-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+ ;
; VI-LABEL: name: test_fdiv_v4s16
; VI: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
; VI-NEXT: {{ $}}
@@ -1842,27 +1924,68 @@ body: |
; VI-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32)
; VI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
; VI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC4]](s16)
+ ; VI-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]]
; VI-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
; VI-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
- ; VI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
+ ; VI-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FMUL]]
+ ; VI-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[FPEXT]]
+ ; VI-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FADD]], [[INT]]
+ ; VI-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[FMUL]]
+ ; VI-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FADD1]]
+ ; VI-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FPEXT]]
+ ; VI-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[FADD2]], [[INT]]
+ ; VI-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -8388608
+ ; VI-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[FMUL4]], [[C1]]
+ ; VI-NEXT: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[AND]], [[FADD1]]
+ ; VI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD3]](s32)
; VI-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC4]](s16), [[TRUNC]](s16)
; VI-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
; VI-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC5]](s16)
+ ; VI-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT3]]
; VI-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32)
- ; VI-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]]
- ; VI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32)
+ ; VI-NEXT: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]]
+ ; VI-NEXT: [[FMUL6:%[0-9]+]]:_(s32) = G_FMUL [[FNEG1]], [[FMUL5]]
+ ; VI-NEXT: [[FADD4:%[0-9]+]]:_(s32) = G_FADD [[FMUL6]], [[FPEXT2]]
+ ; VI-NEXT: [[FMUL7:%[0-9]+]]:_(s32) = G_FMUL [[FADD4]], [[INT2]]
+ ; VI-NEXT: [[FADD5:%[0-9]+]]:_(s32) = G_FADD [[FMUL7]], [[FMUL5]]
+ ; VI-NEXT: [[FMUL8:%[0-9]+]]:_(s32) = G_FMUL [[FNEG1]], [[FADD5]]
+ ; VI-NEXT: [[FADD6:%[0-9]+]]:_(s32) = G_FADD [[FMUL8]], [[FPEXT2]]
+ ; VI-NEXT: [[FMUL9:%[0-9]+]]:_(s32) = G_FMUL [[FADD6]], [[INT2]]
+ ; VI-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[FMUL9]], [[C1]]
+ ; VI-NEXT: [[FADD7:%[0-9]+]]:_(s32) = G_FADD [[AND1]], [[FADD5]]
+ ; VI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD7]](s32)
; VI-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC5]](s16), [[TRUNC1]](s16)
; VI-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16)
; VI-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC6]](s16)
+ ; VI-NEXT: [[FNEG2:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT5]]
; VI-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT5]](s32)
- ; VI-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT4]]
- ; VI-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL2]](s32)
+ ; VI-NEXT: [[FMUL10:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT4]]
+ ; VI-NEXT: [[FMUL11:%[0-9]+]]:_(s32) = G_FMUL [[FNEG2]], [[FMUL10]]
+ ; VI-NEXT: [[FADD8:%[0-9]+]]:_(s32) = G_FADD [[FMUL11]], [[FPEXT4]]
+ ; VI-NEXT: [[FMUL12:%[0-9]+]]:_(s32) = G_FMUL [[FADD8]], [[INT4]]
+ ; VI-NEXT: [[FADD9:%[0-9]+]]:_(s32) = G_FADD [[FMUL12]], [[FMUL10]]
+ ; VI-NEXT: [[FMUL13:%[0-9]+]]:_(s32) = G_FMUL [[FNEG2]], [[FADD9]]
+ ; VI-NEXT: [[FADD10:%[0-9]+]]:_(s32) = G_FADD [[FMUL13]], [[FPEXT4]]
+ ; VI-NEXT: [[FMUL14:%[0-9]+]]:_(s32) = G_FMUL [[FADD10]], [[INT4]]
+ ; VI-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[FMUL14]], [[C1]]
+ ; VI-NEXT: [[FADD11:%[0-9]+]]:_(s32) = G_FADD [[AND2]], [[FADD9]]
+ ; VI-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD11]](s32)
; VI-NEXT: [[INT5:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC2]](s16), [[TRUNC6]](s16), [[TRUNC2]](s16)
; VI-NEXT: [[FPEXT6:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16)
; VI-NEXT: [[FPEXT7:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC7]](s16)
+ ; VI-NEXT: [[FNEG3:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT7]]
; VI-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT7]](s32)
- ; VI-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT6]], [[INT6]]
- ; VI-NEXT: [[FPTRUNC3:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL3]](s32)
+ ; VI-NEXT: [[FMUL15:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT6]], [[INT6]]
+ ; VI-NEXT: [[FMUL16:%[0-9]+]]:_(s32) = G_FMUL [[FNEG3]], [[FMUL15]]
+ ; VI-NEXT: [[FADD12:%[0-9]+]]:_(s32) = G_FADD [[FMUL16]], [[FPEXT6]]
+ ; VI-NEXT: [[FMUL17:%[0-9]+]]:_(s32) = G_FMUL [[FADD12]], [[INT6]]
+ ; VI-NEXT: [[FADD13:%[0-9]+]]:_(s32) = G_FADD [[FMUL17]], [[FMUL15]]
+ ; VI-NEXT: [[FMUL18:%[0-9]+]]:_(s32) = G_FMUL [[FNEG3]], [[FADD13]]
+ ; VI-NEXT: [[FADD14:%[0-9]+]]:_(s32) = G_FADD [[FMUL18]], [[FPEXT6]]
+ ; VI-NEXT: [[FMUL19:%[0-9]+]]:_(s32) = G_FMUL [[FADD14]], [[INT6]]
+ ; VI-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[FMUL19]], [[C1]]
+ ; VI-NEXT: [[FADD15:%[0-9]+]]:_(s32) = G_FADD [[AND3]], [[FADD13]]
+ ; VI-NEXT: [[FPTRUNC3:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD15]](s32)
; VI-NEXT: [[INT7:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC3]](s16), [[TRUNC7]](s16), [[TRUNC3]](s16)
; VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[INT1]](s16)
; VI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[INT3]](s16)
@@ -1876,6 +1999,7 @@ body: |
; VI-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
; VI-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>)
; VI-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+ ;
; GFX9-LABEL: name: test_fdiv_v4s16
; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
; GFX9-NEXT: {{ $}}
@@ -1902,32 +2026,74 @@ body: |
; GFX9-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32)
; GFX9-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
; GFX9-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC4]](s16)
+ ; GFX9-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]]
; GFX9-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
- ; GFX9-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
+ ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FMUL]]
+ ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[FMUL1]], [[FPEXT]]
+ ; GFX9-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FADD]], [[INT]]
+ ; GFX9-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[FMUL2]], [[FMUL]]
+ ; GFX9-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FNEG]], [[FADD1]]
+ ; GFX9-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[FMUL3]], [[FPEXT]]
+ ; GFX9-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[FADD2]], [[INT]]
+ ; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -8388608
+ ; GFX9-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[FMUL4]], [[C1]]
+ ; GFX9-NEXT: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[AND]], [[FADD1]]
+ ; GFX9-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD3]](s32)
; GFX9-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC4]](s16), [[TRUNC]](s16)
; GFX9-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
; GFX9-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC5]](s16)
+ ; GFX9-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT3]]
; GFX9-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32)
- ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]]
- ; GFX9-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32)
+ ; GFX9-NEXT: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]]
+ ; GFX9-NEXT: [[FMUL6:%[0-9]+]]:_(s32) = G_FMUL [[FNEG1]], [[FMUL5]]
+ ; GFX9-NEXT: [[FADD4:%[0-9]+]]:_(s32) = G_FADD [[FMUL6]], [[FPEXT2]]
+ ; GFX9-NEXT: [[FMUL7:%[0-9]+]]:_(s32) = G_FMUL [[FADD4]], [[INT2]]
+ ; GFX9-NEXT: [[FADD5:%[0-9]+]]:_(s32) = G_FADD [[FMUL7]], [[FMUL5]]
+ ; GFX9-NEXT: [[FMUL8:%[0-9]+]]:_(s32) = G_FMUL [[FNEG1]], [[FADD5]]
+ ; GFX9-NEXT: [[FADD6:%[0-9]+]]:_(s32) = G_FADD [[FMUL8]], [[FPEXT2]]
+ ; GFX9-NEXT: [[FMUL9:%[0-9]+]]:_(s32) = G_FMUL [[FADD6]], [[INT2]]
+ ; GFX9-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[FMUL9]], [[C1]]
+ ; GFX9-NEXT: [[FADD7:%[0-9]+]]:_(s32) = G_FADD [[AND1]], [[FADD5]]
+ ; GFX9-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD7]](s32)
; GFX9-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC5]](s16), [[TRUNC1]](s16)
; GFX9-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16)
; GFX9-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC6]](s16)
+ ; GFX9-NEXT: [[FNEG2:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT5]]
; GFX9-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT5]](s32)
- ; GFX9-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT4]]
- ; GFX9-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL2]](s32)
+ ; GFX9-NEXT: [[FMUL10:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT4]]
+ ; GFX9-NEXT: [[FMUL11:%[0-9]+]]:_(s32) = G_FMUL [[FNEG2]], [[FMUL10]]
+ ; GFX9-NEXT: [[FADD8:%[0-9]+]]:_(s32) = G_FADD [[FMUL11]], [[FPEXT4]]
+ ; GFX9-NEXT: [[FMUL12:%[0-9]+]]:_(s32) = G_FMUL [[FADD8]], [[INT4]]
+ ; GFX9-NEXT: [[FADD9:%[0-9]+]]:_(s32) = G_FADD [[FMUL12]], [[FMUL10]]
+ ; GFX9-NEXT: [[FMUL13:%[0-9]+]]:_(s32) = G_FMUL [[FNEG2]], [[FADD9]]
+ ; GFX9-NEXT: [[FADD10:%[0-9]+]]:_(s32) = G_FADD [[FMUL13]], [[FPEXT4]]
+ ; GFX9-NEXT: [[FMUL14:%[0-9]+]]:_(s32) = G_FMUL [[FADD10]], [[INT4]]
+ ; GFX9-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[FMUL14]], [[C1]]
+ ; GFX9-NEXT: [[FADD11:%[0-9]+]]:_(s32) = G_FADD [[AND2]], [[FADD9]]
+ ; GFX9-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD11]](s32)
; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC2]](s16), [[TRUNC6]](s16), [[TRUNC2]](s16)
; GFX9-NEXT: [[FPEXT6:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16)
; GFX9-NEXT: [[FPEXT7:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC7]](s16)
+ ; GFX9-NEXT: [[FNEG3:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT7]]
; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT7]](s32)
- ; GFX9-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT6]], [[INT6]]
- ; GFX9-NEXT: [[FPTRUNC3:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL3]](s32)
+ ; GFX9-NEXT: [[FMUL15:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT6]], [[INT6]]
+ ; GFX9-NEXT: [[FMUL16:%[0-9]+]]:_(s32) = G_FMUL [[FNEG3]], [[FMUL15]]
+ ; GFX9-NEXT: [[FADD12:%[0-9]+]]:_(s32) = G_FADD [[FMUL16]], [[FPEXT6]]
+ ; GFX9-NEXT: [[FMUL17:%[0-9]+]]:_(s32) = G_FMUL [[FADD12]], [[INT6]]
+ ; GFX9-NEXT: [[FADD13:%[0-9]+]]:_(s32) = G_FADD [[FMUL17]], [[FMUL15]]
+ ; GFX9-NEXT: [[FMUL18:%[0-9]+]]:_(s32) = G_FMUL [[FNEG3]], [[FADD13]]
+ ; GFX9-NEXT: [[FADD14:%[0-9]+]]:_(s32) = G_FADD [[FMUL18]], [[FPEXT6]]
+ ; GFX9-NEXT: [[FMUL19:%[0-9]+]]:_(s32) = G_FMUL [[FADD14]], [[INT6]]
+ ; GFX9-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[FMUL19]], [[C1]]
+ ; GFX9-NEXT: [[FADD15:%[0-9]+]]:_(s32) = G_FADD [[AND3]], [[FADD13]]
+ ; GFX9-NEXT: [[FPTRUNC3:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD15]](s32)
; GFX9-NEXT: [[INT7:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC3]](s16), [[TRUNC7]](s16), [[TRUNC3]](s16)
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT1]](s16), [[INT3]](s16)
; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT5]](s16), [[INT7]](s16)
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+ ;
; GFX9-UNSAFE-LABEL: name: test_fdiv_v4s16
; GFX9-UNSAFE: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
; GFX9-UNSAFE-NEXT: {{ $}}
@@ -1964,58 +2130,6 @@ body: |
; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FMUL2]](s16), [[FMUL3]](s16)
; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
; GFX9-UNSAFE-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
- ; GFX10-LABEL: name: test_fdiv_v4s16
- ; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
- ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr0_vgpr1
- ; GFX10-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s16>) = COPY $vgpr2_vgpr3
- ; GFX10-NEXT: [[UV:%[0-9]+]]:_(<2 x s16>), [[UV1:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY]](<4 x s16>)
- ; GFX10-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[UV]](<2 x s16>)
- ; GFX10-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s32)
- ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; GFX10-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
- ; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
- ; GFX10-NEXT: [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[UV1]](<2 x s16>)
- ; GFX10-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST1]](s32)
- ; GFX10-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST1]], [[C]](s32)
- ; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
- ; GFX10-NEXT: [[UV2:%[0-9]+]]:_(<2 x s16>), [[UV3:%[0-9]+]]:_(<2 x s16>) = G_UNMERGE_VALUES [[COPY1]](<4 x s16>)
- ; GFX10-NEXT: [[BITCAST2:%[0-9]+]]:_(s32) = G_BITCAST [[UV2]](<2 x s16>)
- ; GFX10-NEXT: [[TRUNC4:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST2]](s32)
- ; GFX10-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST2]], [[C]](s32)
- ; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR2]](s32)
- ; GFX10-NEXT: [[BITCAST3:%[0-9]+]]:_(s32) = G_BITCAST [[UV3]](<2 x s16>)
- ; GFX10-NEXT: [[TRUNC6:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32)
- ; GFX10-NEXT: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST3]], [[C]](s32)
- ; GFX10-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32)
- ; GFX10-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
- ; GFX10-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC4]](s16)
- ; GFX10-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
- ; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
- ; GFX10-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
- ; GFX10-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC4]](s16), [[TRUNC]](s16)
- ; GFX10-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
- ; GFX10-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC5]](s16)
- ; GFX10-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32)
- ; GFX10-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]]
- ; GFX10-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32)
- ; GFX10-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC5]](s16), [[TRUNC1]](s16)
- ; GFX10-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16)
- ; GFX10-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC6]](s16)
- ; GFX10-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT5]](s32)
- ; GFX10-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT4]]
- ; GFX10-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL2]](s32)
- ; GFX10-NEXT: [[INT5:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC2]](s16), [[TRUNC6]](s16), [[TRUNC2]](s16)
- ; GFX10-NEXT: [[FPEXT6:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16)
- ; GFX10-NEXT: [[FPEXT7:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC7]](s16)
- ; GFX10-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT7]](s32)
- ; GFX10-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT6]], [[INT6]]
- ; GFX10-NEXT: [[FPTRUNC3:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL3]](s32)
- ; GFX10-NEXT: [[INT7:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC3]](s16), [[TRUNC7]](s16), [[TRUNC3]](s16)
- ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT1]](s16), [[INT3]](s16)
- ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT5]](s16), [[INT7]](s16)
- ; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
- ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
%0:_(<4 x s16>) = COPY $vgpr0_vgpr1
%1:_(<4 x s16>) = COPY $vgpr2_vgpr3
%2:_(<4 x s16>) = G_FDIV %0, %1
@@ -2052,6 +2166,7 @@ body: |
; SI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT6]](s32)
; SI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16)
; SI-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ;
; VI-LABEL: name: test_fdiv_s16_constant_one_rcp
; VI: liveins: $vgpr0
; VI-NEXT: {{ $}}
@@ -2060,6 +2175,7 @@ body: |
; VI-NEXT: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC]](s16)
; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
; VI-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ;
; GFX9-LABEL: name: test_fdiv_s16_constant_one_rcp
; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
@@ -2068,6 +2184,7 @@ body: |
; GFX9-NEXT: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC]](s16)
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ;
; GFX9-UNSAFE-LABEL: name: test_fdiv_s16_constant_one_rcp
; GFX9-UNSAFE: liveins: $vgpr0
; GFX9-UNSAFE-NEXT: {{ $}}
@@ -2076,6 +2193,7 @@ body: |
; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC]](s16)
; GFX9-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ;
; GFX10-LABEL: name: test_fdiv_s16_constant_one_rcp
; GFX10: liveins: $vgpr0
; GFX10-NEXT: {{ $}}
@@ -2122,6 +2240,7 @@ body: |
; SI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT6]](s32)
; SI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16)
; SI-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ;
; VI-LABEL: name: test_fdiv_s16_constant_negative_one_rcp
; VI: liveins: $vgpr0
; VI-NEXT: {{ $}}
@@ -2131,6 +2250,7 @@ body: |
; VI-NEXT: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FNEG]](s16)
; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
; VI-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ;
; GFX9-LABEL: name: test_fdiv_s16_constant_negative_one_rcp
; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
@@ -2140,6 +2260,7 @@ body: |
; GFX9-NEXT: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FNEG]](s16)
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ;
; GFX9-UNSAFE-LABEL: name: test_fdiv_s16_constant_negative_one_rcp
; GFX9-UNSAFE: liveins: $vgpr0
; GFX9-UNSAFE-NEXT: {{ $}}
@@ -2149,6 +2270,7 @@ body: |
; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FNEG]](s16)
; GFX9-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ;
; GFX10-LABEL: name: test_fdiv_s16_constant_negative_one_rcp
; GFX10: liveins: $vgpr0
; GFX10-NEXT: {{ $}}
@@ -2190,6 +2312,7 @@ body: |
; SI-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
; SI-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY]](s32), [[C]](s32)
; SI-NEXT: $vgpr0 = COPY [[INT6]](s32)
+ ;
; VI-LABEL: name: test_fdiv_s32_constant_one_rcp
; VI: liveins: $vgpr0
; VI-NEXT: {{ $}}
@@ -2208,6 +2331,7 @@ body: |
; VI-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
; VI-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY]](s32), [[C]](s32)
; VI-NEXT: $vgpr0 = COPY [[INT6]](s32)
+ ;
; GFX9-LABEL: name: test_fdiv_s32_constant_one_rcp
; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
@@ -2226,12 +2350,14 @@ body: |
; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY]](s32), [[C]](s32)
; GFX9-NEXT: $vgpr0 = COPY [[INT6]](s32)
+ ;
; GFX9-UNSAFE-LABEL: name: test_fdiv_s32_constant_one_rcp
; GFX9-UNSAFE: liveins: $vgpr0
; GFX9-UNSAFE-NEXT: {{ $}}
; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY]](s32)
; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[INT]](s32)
+ ;
; GFX10-LABEL: name: test_fdiv_s32_constant_one_rcp
; GFX10: liveins: $vgpr0
; GFX10-NEXT: {{ $}}
@@ -2281,6 +2407,7 @@ body: |
; SI-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
; SI-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY]](s32), [[C]](s32)
; SI-NEXT: $vgpr0 = COPY [[INT6]](s32)
+ ;
; VI-LABEL: name: test_fdiv_s32_constant_negative_one_rcp
; VI: liveins: $vgpr0
; VI-NEXT: {{ $}}
@@ -2300,6 +2427,7 @@ body: |
; VI-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
; VI-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY]](s32), [[C]](s32)
; VI-NEXT: $vgpr0 = COPY [[INT6]](s32)
+ ;
; GFX9-LABEL: name: test_fdiv_s32_constant_negative_one_rcp
; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
@@ -2319,6 +2447,7 @@ body: |
; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY]](s32), [[C]](s32)
; GFX9-NEXT: $vgpr0 = COPY [[INT6]](s32)
+ ;
; GFX9-UNSAFE-LABEL: name: test_fdiv_s32_constant_negative_one_rcp
; GFX9-UNSAFE: liveins: $vgpr0
; GFX9-UNSAFE-NEXT: {{ $}}
@@ -2326,6 +2455,7 @@ body: |
; GFX9-UNSAFE-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[COPY]]
; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FNEG]](s32)
; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[INT]](s32)
+ ;
; GFX10-LABEL: name: test_fdiv_s32_constant_negative_one_rcp
; GFX10: liveins: $vgpr0
; GFX10-NEXT: {{ $}}
@@ -2389,6 +2519,7 @@ body: |
; SI-NEXT: [[INT5:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s64), [[FMA3]](s64), [[FMUL]](s64), [[XOR]](s1)
; SI-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY]](s64), [[C]](s64)
; SI-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64)
+ ;
; VI-LABEL: name: test_fdiv_s64_constant_one_rcp
; VI: liveins: $vgpr0_vgpr1
; VI-NEXT: {{ $}}
@@ -2407,6 +2538,7 @@ body: |
; VI-NEXT: [[INT5:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s64), [[FMA3]](s64), [[FMUL]](s64), [[INT4]](s1)
; VI-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY]](s64), [[C]](s64)
; VI-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64)
+ ;
; GFX9-LABEL: name: test_fdiv_s64_constant_one_rcp
; GFX9: liveins: $vgpr0_vgpr1
; GFX9-NEXT: {{ $}}
@@ -2425,6 +2557,7 @@ body: |
; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s64), [[FMA3]](s64), [[FMUL]](s64), [[INT4]](s1)
; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY]](s64), [[C]](s64)
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64)
+ ;
; GFX9-UNSAFE-LABEL: name: test_fdiv_s64_constant_one_rcp
; GFX9-UNSAFE: liveins: $vgpr0_vgpr1
; GFX9-UNSAFE-NEXT: {{ $}}
@@ -2440,6 +2573,7 @@ body: |
; GFX9-UNSAFE-NEXT: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL]], [[C]]
; GFX9-UNSAFE-NEXT: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FMA4]], [[FMA3]], [[FMUL]]
; GFX9-UNSAFE-NEXT: $vgpr0_vgpr1 = COPY [[FMA5]](s64)
+ ;
; GFX10-LABEL: name: test_fdiv_s64_constant_one_rcp
; GFX10: liveins: $vgpr0_vgpr1
; GFX10-NEXT: {{ $}}
@@ -2503,6 +2637,7 @@ body: |
; SI-NEXT: [[INT5:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s64), [[FMA3]](s64), [[FMUL]](s64), [[XOR]](s1)
; SI-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY]](s64), [[C]](s64)
; SI-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64)
+ ;
; VI-LABEL: name: test_fdiv_s64_constant_negative_one_rcp
; VI: liveins: $vgpr0_vgpr1
; VI-NEXT: {{ $}}
@@ -2522,6 +2657,7 @@ body: |
; VI-NEXT: [[INT5:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s64), [[FMA3]](s64), [[FMUL]](s64), [[INT4]](s1)
; VI-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY]](s64), [[C]](s64)
; VI-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64)
+ ;
; GFX9-LABEL: name: test_fdiv_s64_constant_negative_one_rcp
; GFX9: liveins: $vgpr0_vgpr1
; GFX9-NEXT: {{ $}}
@@ -2541,6 +2677,7 @@ body: |
; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s64), [[FMA3]](s64), [[FMUL]](s64), [[INT4]](s1)
; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY]](s64), [[C]](s64)
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64)
+ ;
; GFX9-UNSAFE-LABEL: name: test_fdiv_s64_constant_negative_one_rcp
; GFX9-UNSAFE: liveins: $vgpr0_vgpr1
; GFX9-UNSAFE-NEXT: {{ $}}
@@ -2557,6 +2694,7 @@ body: |
; GFX9-UNSAFE-NEXT: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL]], [[C]]
; GFX9-UNSAFE-NEXT: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FMA4]], [[FMA3]], [[FMUL]]
; GFX9-UNSAFE-NEXT: $vgpr0_vgpr1 = COPY [[FMA5]](s64)
+ ;
; GFX10-LABEL: name: test_fdiv_s64_constant_negative_one_rcp
; GFX10: liveins: $vgpr0_vgpr1
; GFX10-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
index 7c89efd0a713c1..0c6805e3eba598 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -60,15 +60,21 @@ define amdgpu_kernel void @v_fdiv_f16(
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_load_ushort v2, v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
+; GFX8-NEXT: v_mov_b32_e32 v6, s5
; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v5
; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v2
-; GFX8-NEXT: v_rcp_f32_e32 v0, v0
-; GFX8-NEXT: v_mul_f32_e32 v0, v1, v0
-; GFX8-NEXT: v_cvt_f16_f32_e32 v6, v0
+; GFX8-NEXT: v_rcp_f32_e32 v3, v0
+; GFX8-NEXT: v_mul_f32_e32 v7, v1, v3
+; GFX8-NEXT: v_mad_f32 v8, -v0, v7, v1
+; GFX8-NEXT: v_mac_f32_e32 v7, v8, v3
+; GFX8-NEXT: v_mad_f32 v0, -v0, v7, v1
+; GFX8-NEXT: v_mul_f32_e32 v0, v0, v3
+; GFX8-NEXT: v_and_b32_e32 v0, 0xff800000, v0
+; GFX8-NEXT: v_add_f32_e32 v0, v0, v7
+; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v4
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
-; GFX8-NEXT: v_div_fixup_f16 v2, v6, v2, v5
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v6, vcc
+; GFX8-NEXT: v_div_fixup_f16 v2, v3, v2, v5
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -82,9 +88,17 @@ define amdgpu_kernel void @v_fdiv_f16(
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v1
; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2
; GFX9-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-NEXT: v_mad_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
+; GFX9-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX9-NEXT: v_mad_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1]
+; GFX9-NEXT: v_mac_f32_e32 v4, v5, v3
+; GFX9-NEXT: v_mad_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1]
+; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3
+; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX9-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, v1
; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
@@ -100,9 +114,17 @@ define amdgpu_kernel void @v_fdiv_f16(
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_ushort v2, v0, s[0:1] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v1
; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX10-NEXT: v_rcp_f32_e32 v3, v3
-; GFX10-NEXT: v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
+; GFX10-NEXT: v_rcp_f32_e32 v4, v3
+; GFX10-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX10-NEXT: v_mad_f32 v7, -v3, v6, v5
+; GFX10-NEXT: v_mac_f32_e32 v6, v7, v4
+; GFX10-NEXT: v_mad_f32 v3, -v3, v6, v5
+; GFX10-NEXT: v_mul_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-NEXT: v_div_fixup_f16 v1, v3, v2, v1
; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
@@ -120,11 +142,23 @@ define amdgpu_kernel void @v_fdiv_f16(
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v1
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX11-NEXT: v_fma_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fmac_f32_e32 v4, v5, v3
+; GFX11-NEXT: v_fma_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v3, v5, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_div_fixup_f16 v1, v3, v2, v1
; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
index 301299daaa61f4..2eb35977b8160b 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -1444,12 +1444,19 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind {
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e64 v0, v0, 1
; VI-NEXT: v_cvt_f32_u32_e32 v0, v0
-; VI-NEXT: s_movk_i32 s4, 0x7000
+; VI-NEXT: s_mov_b32 s4, 0x46000000
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-NEXT: v_cvt_f32_f16_e32 v1, v0
-; VI-NEXT: v_rcp_f32_e32 v1, v1
-; VI-NEXT: v_mul_f32_e32 v1, 0x46000000, v1
+; VI-NEXT: v_rcp_f32_e32 v2, v1
+; VI-NEXT: v_mul_f32_e32 v3, 0x46000000, v2
+; VI-NEXT: v_mad_f32 v4, -v1, v3, s4
+; VI-NEXT: v_mac_f32_e32 v3, v4, v2
+; VI-NEXT: v_mad_f32 v1, -v1, v3, s4
+; VI-NEXT: v_mul_f32_e32 v1, v1, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xff800000, v1
+; VI-NEXT: v_add_f32_e32 v1, v1, v3
; VI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; VI-NEXT: s_movk_i32 s4, 0x7000
; VI-NEXT: v_div_fixup_f16 v0, v1, v0, s4
; VI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1457,12 +1464,18 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, 1
-; GFX10-NEXT: s_mov_b32 s4, 0x46000000
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX10-NEXT: v_rcp_f32_e32 v1, v1
-; GFX10-NEXT: v_fma_mixlo_f16 v1, v1, s4, 0
+; GFX10-NEXT: v_rcp_f32_e32 v2, v1
+; GFX10-NEXT: v_mul_f32_e32 v3, 0x46000000, v2
+; GFX10-NEXT: v_mad_f32 v4, -v1, v3, 0x46000000
+; GFX10-NEXT: v_mac_f32_e32 v3, v4, v2
+; GFX10-NEXT: v_mad_f32 v1, -v1, v3, 0x46000000
+; GFX10-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX10-NEXT: v_and_b32_e32 v1, 0xff800000, v1
+; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX10-NEXT: v_div_fixup_f16 v0, v1, v0, 0x7000
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -1478,8 +1491,18 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind {
; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX11-NEXT: v_rcp_f32_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v1, v1, s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v2, 0x46000000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fma_mix_f32 v3, -v0, v2, s0 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fma_mix_f32 v3, -v0, v2, s0 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_mul_f32_e32 v1, v3, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff800000, v1
+; GFX11-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX11-NEXT: v_div_fixup_f16 v0, v1, v0, 0x7000
; GFX11-NEXT: s_setpc_b64 s[30:31]
%shl = shl nuw i32 1, %cnt
@@ -1551,8 +1574,14 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind {
; VI-NEXT: v_lshlrev_b16_e64 v0, v0, 1
; VI-NEXT: v_cvt_f16_u16_e32 v0, v0
; VI-NEXT: v_cvt_f32_f16_e32 v1, v0
-; VI-NEXT: v_rcp_f32_e32 v1, v1
-; VI-NEXT: v_add_f32_e32 v1, v1, v1
+; VI-NEXT: v_rcp_f32_e32 v2, v1
+; VI-NEXT: v_add_f32_e32 v3, v2, v2
+; VI-NEXT: v_mad_f32 v4, -v1, v3, 2.0
+; VI-NEXT: v_mac_f32_e32 v3, v4, v2
+; VI-NEXT: v_mad_f32 v1, -v1, v3, 2.0
+; VI-NEXT: v_mul_f32_e32 v1, v1, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xff800000, v1
+; VI-NEXT: v_add_f32_e32 v1, v1, v3
; VI-NEXT: v_cvt_f16_f32_e32 v1, v1
; VI-NEXT: v_div_fixup_f16 v0, v1, v0, 2.0
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -1563,8 +1592,14 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind {
; GFX10-NEXT: v_lshlrev_b16 v0, v0, 1
; GFX10-NEXT: v_cvt_f16_u16_e32 v0, v0
; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX10-NEXT: v_rcp_f32_e32 v1, v1
-; GFX10-NEXT: v_add_f32_e32 v1, v1, v1
+; GFX10-NEXT: v_rcp_f32_e32 v2, v1
+; GFX10-NEXT: v_add_f32_e32 v3, v2, v2
+; GFX10-NEXT: v_mad_f32 v4, -v1, v3, 2.0
+; GFX10-NEXT: v_mac_f32_e32 v3, v4, v2
+; GFX10-NEXT: v_mad_f32 v1, -v1, v3, 2.0
+; GFX10-NEXT: v_mul_f32_e32 v1, v1, v2
+; GFX10-NEXT: v_and_b32_e32 v1, 0xff800000, v1
+; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX10-NEXT: v_div_fixup_f16 v0, v1, v0, 2.0
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -1573,13 +1608,23 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b16 v0, v0, 1
+; GFX11-NEXT: s_mov_b32 s0, 2.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f16_u16_e32 v0, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_f32_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_add_f32_e32 v1, v1, v1
+; GFX11-NEXT: v_add_f32_e32 v2, v1, v1
+; GFX11-NEXT: v_fma_mix_f32 v3, -v0, v2, s0 op_sel_hi:[1,0,0]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v1
+; GFX11-NEXT: v_fma_mix_f32 v3, -v0, v2, s0 op_sel_hi:[1,0,0]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v3, v1
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff800000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_f32_e32 v1, v1, v2
; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_div_fixup_f16 v0, v1, v0, 2.0
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 7c5d73ab66b47a..b3432c457d9a45 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -109,8 +109,14 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; VI-NEXT: v_cvt_f32_f16_e32 v3, v4
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v5, v2
-; VI-NEXT: v_rcp_f32_e32 v5, v5
-; VI-NEXT: v_mul_f32_e32 v3, v3, v5
+; VI-NEXT: v_rcp_f32_e32 v6, v5
+; VI-NEXT: v_mul_f32_e32 v7, v3, v6
+; VI-NEXT: v_mad_f32 v8, -v5, v7, v3
+; VI-NEXT: v_mac_f32_e32 v7, v8, v6
+; VI-NEXT: v_mad_f32 v3, -v5, v7, v3
+; VI-NEXT: v_mul_f32_e32 v3, v3, v6
+; VI-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; VI-NEXT: v_add_f32_e32 v3, v3, v7
; VI-NEXT: v_cvt_f16_f32_e32 v3, v3
; VI-NEXT: v_div_fixup_f16 v3, v3, v2, v4
; VI-NEXT: v_trunc_f16_e32 v3, v3
@@ -126,10 +132,19 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] offset:8
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-NEXT: v_mad_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
+; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GFX9-NEXT: v_rcp_f32_e32 v4, v4
+; GFX9-NEXT: v_mul_f32_e32 v3, v3, v4
+; GFX9-NEXT: v_mad_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX9-NEXT: v_mac_f32_e32 v3, v5, v4
+; GFX9-NEXT: v_mad_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX9-NEXT: v_mul_f32_e32 v4, v5, v4
+; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v4
+; GFX9-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v1
; GFX9-NEXT: v_trunc_f16_e32 v3, v3
; GFX9-NEXT: v_fma_f16 v1, -v3, v2, v1
@@ -146,10 +161,19 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX10-NEXT: global_load_ushort v2, v0, s[0:1] offset:8
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX10-NEXT: v_rcp_f32_e32 v3, v3
-; GFX10-NEXT: v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
+; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GFX10-NEXT: v_rcp_f32_e32 v5, v4
+; GFX10-NEXT: v_mul_f32_e32 v6, v3, v5
+; GFX10-NEXT: v_mad_f32 v7, -v4, v6, v3
+; GFX10-NEXT: v_mac_f32_e32 v6, v7, v5
+; GFX10-NEXT: v_mad_f32 v3, -v4, v6, v3
+; GFX10-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX10-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1
; GFX10-NEXT: v_trunc_f16_e32 v3, v3
; GFX10-NEXT: v_fma_f16 v1, -v3, v2, v1
@@ -166,15 +190,28 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] offset:8
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_rcp_f32_e32 v3, v3
+; GFX11-NEXT: v_rcp_f32_e32 v4, v4
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_div_fixup_f16 v3, v3, v2, v1
+; GFX11-NEXT: v_mul_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fmac_f32_e32 v3, v5, v4
+; GFX11-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v4, v5, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff800000, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_div_fixup_f16 v3, v3, v2, v1
; GFX11-NEXT: v_trunc_f16_e32 v3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_fma_f16 v1, -v3, v2, v1
; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
@@ -191,16 +228,29 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX1150-NEXT: global_load_u16 v2, v0, s[0:1] offset:8
+; GFX1150-NEXT: s_waitcnt vmcnt(1)
+; GFX1150-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX1150-NEXT: s_waitcnt vmcnt(0)
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX1150-NEXT: v_cvt_f32_f16_e32 v4, v2
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX1150-NEXT: v_rcp_f32_e32 v3, v3
-; GFX1150-NEXT: v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
+; GFX1150-NEXT: v_rcp_f32_e32 v4, v4
+; GFX1150-NEXT: v_mul_f32_e32 v3, v3, v4
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX1150-NEXT: v_fmac_f32_e32 v3, v5, v4
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX1150-NEXT: v_mul_f32_e32 v4, v5, v4
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_and_b32_e32 v4, 0xff800000, v4
+; GFX1150-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX1150-NEXT: v_div_fixup_f16 v3, v3, v2, v1
-; GFX1150-NEXT: v_trunc_f16_e32 v3, v3
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_trunc_f16_e32 v3, v3
; GFX1150-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1150-NEXT: v_fmac_f16_e32 v1, v3, v2
; GFX1150-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX1150-NEXT: s_nop 0
@@ -1974,8 +2024,14 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; VI-NEXT: v_cvt_f32_f16_e32 v7, v6
-; VI-NEXT: v_rcp_f32_e32 v7, v7
-; VI-NEXT: v_mul_f32_e32 v5, v5, v7
+; VI-NEXT: v_rcp_f32_e32 v8, v7
+; VI-NEXT: v_mul_f32_e32 v9, v5, v8
+; VI-NEXT: v_mad_f32 v10, -v7, v9, v5
+; VI-NEXT: v_mac_f32_e32 v9, v10, v8
+; VI-NEXT: v_mad_f32 v5, -v7, v9, v5
+; VI-NEXT: v_mul_f32_e32 v5, v5, v8
+; VI-NEXT: v_and_b32_e32 v5, 0xff800000, v5
+; VI-NEXT: v_add_f32_e32 v5, v5, v9
; VI-NEXT: v_cvt_f16_f32_e32 v5, v5
; VI-NEXT: v_div_fixup_f16 v5, v5, v6, v3
; VI-NEXT: v_trunc_f16_e32 v5, v5
@@ -1983,8 +2039,14 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cvt_f32_f16_e32 v6, v2
; VI-NEXT: v_cvt_f32_f16_e32 v5, v4
; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_rcp_f32_e32 v6, v6
-; VI-NEXT: v_mul_f32_e32 v5, v5, v6
+; VI-NEXT: v_rcp_f32_e32 v7, v6
+; VI-NEXT: v_mul_f32_e32 v8, v5, v7
+; VI-NEXT: v_mad_f32 v9, -v6, v8, v5
+; VI-NEXT: v_mac_f32_e32 v8, v9, v7
+; VI-NEXT: v_mad_f32 v5, -v6, v8, v5
+; VI-NEXT: v_mul_f32_e32 v5, v5, v7
+; VI-NEXT: v_and_b32_e32 v5, 0xff800000, v5
+; VI-NEXT: v_add_f32_e32 v5, v5, v8
; VI-NEXT: v_cvt_f16_f32_e32 v5, v5
; VI-NEXT: v_div_fixup_f16 v5, v5, v2, v4
; VI-NEXT: v_trunc_f16_e32 v5, v5
@@ -2001,21 +2063,38 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:16
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-NEXT: v_mad_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
+; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v6
+; GFX9-NEXT: v_rcp_f32_e32 v4, v4
+; GFX9-NEXT: v_rcp_f32_e32 v7, v7
+; GFX9-NEXT: v_mul_f32_e32 v3, v3, v4
+; GFX9-NEXT: v_mad_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX9-NEXT: v_mac_f32_e32 v3, v5, v4
+; GFX9-NEXT: v_mad_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX9-NEXT: v_mul_f32_e32 v4, v5, v4
+; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v4
+; GFX9-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v4
+; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX9-NEXT: v_mul_f32_e32 v5, v5, v7
; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v1
+; GFX9-NEXT: v_mad_mix_f32 v8, -v2, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
; GFX9-NEXT: v_trunc_f16_e32 v3, v3
+; GFX9-NEXT: v_mac_f32_e32 v5, v8, v7
; GFX9-NEXT: v_fma_f16 v3, -v3, v2, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v2
-; GFX9-NEXT: v_rcp_f32_e32 v4, v4
-; GFX9-NEXT: v_mad_mixlo_f16 v4, v1, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_div_fixup_f16 v4, v4, v2, v1
-; GFX9-NEXT: v_trunc_f16_e32 v4, v4
-; GFX9-NEXT: v_fma_f16 v1, -v4, v2, v1
+; GFX9-NEXT: v_mad_mix_f32 v1, -v2, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX9-NEXT: v_mul_f32_e32 v1, v1, v7
+; GFX9-NEXT: v_and_b32_e32 v1, 0xff800000, v1
+; GFX9-NEXT: v_add_f32_e32 v1, v1, v5
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: v_div_fixup_f16 v1, v1, v6, v4
+; GFX9-NEXT: v_trunc_f16_e32 v1, v1
+; GFX9-NEXT: v_fma_f16 v1, -v1, v6, v4
; GFX9-NEXT: v_pack_b32_f16 v1, v3, v1
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
@@ -2030,18 +2109,35 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: global_load_dword v2, v0, s[0:1] offset:16
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX10-NEXT: v_rcp_f32_e32 v3, v3
-; GFX10-NEXT: v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
+; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GFX10-NEXT: v_rcp_f32_e32 v5, v4
+; GFX10-NEXT: v_mul_f32_e32 v6, v3, v5
+; GFX10-NEXT: v_mad_f32 v7, -v4, v6, v3
+; GFX10-NEXT: v_mac_f32_e32 v6, v7, v5
+; GFX10-NEXT: v_mad_f32 v3, -v4, v6, v3
+; GFX10-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX10-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1
; GFX10-NEXT: v_trunc_f16_e32 v3, v3
; GFX10-NEXT: v_fma_f16 v3, -v3, v2, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2
-; GFX10-NEXT: v_rcp_f32_e32 v4, v4
-; GFX10-NEXT: v_fma_mixlo_f16 v4, v1, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2
+; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v1
+; GFX10-NEXT: v_rcp_f32_e32 v6, v5
+; GFX10-NEXT: v_mul_f32_e32 v7, v4, v6
+; GFX10-NEXT: v_mad_f32 v8, -v5, v7, v4
+; GFX10-NEXT: v_mac_f32_e32 v7, v8, v6
+; GFX10-NEXT: v_mad_f32 v4, -v5, v7, v4
+; GFX10-NEXT: v_mul_f32_e32 v4, v4, v6
+; GFX10-NEXT: v_and_b32_e32 v4, 0xff800000, v4
+; GFX10-NEXT: v_add_f32_e32 v4, v4, v7
+; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX10-NEXT: v_div_fixup_f16 v4, v4, v2, v1
; GFX10-NEXT: v_trunc_f16_e32 v4, v4
; GFX10-NEXT: v_fma_f16 v1, -v4, v2, v1
@@ -2059,28 +2155,52 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:16
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_rcp_f32_e32 v4, v4
+; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_rcp_f32_e32 v3, v3
+; GFX11-NEXT: v_rcp_f32_e32 v7, v7
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_div_fixup_f16 v3, v3, v2, v1
+; GFX11-NEXT: v_mul_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fmac_f32_e32 v3, v5, v4
+; GFX11-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v4, v5, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 0xff800000, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v4
+; GFX11-NEXT: v_div_fixup_f16 v3, v3, v2, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mul_f32_e32 v5, v5, v7
; GFX11-NEXT: v_trunc_f16_e32 v3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_fma_mix_f32 v8, -v2, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
; GFX11-NEXT: v_fma_f16 v3, -v3, v2, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fmac_f32_e32 v5, v8, v7
+; GFX11-NEXT: v_fma_mix_f32 v1, -v2, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2
-; GFX11-NEXT: v_rcp_f32_e32 v4, v4
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v4, v1, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v7
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff800000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_f32_e32 v1, v1, v5
+; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_div_fixup_f16 v4, v4, v2, v1
-; GFX11-NEXT: v_trunc_f16_e32 v4, v4
+; GFX11-NEXT: v_div_fixup_f16 v1, v1, v6, v4
+; GFX11-NEXT: v_trunc_f16_e32 v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_fma_f16 v1, -v4, v2, v1
+; GFX11-NEXT: v_fma_f16 v1, -v1, v6, v4
; GFX11-NEXT: v_pack_b32_f16 v1, v3, v1
; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
@@ -2098,31 +2218,55 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX1150-NEXT: global_load_b32 v2, v0, s[0:1] offset:16
; GFX1150-NEXT: s_waitcnt vmcnt(1)
-; GFX1150-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX1150-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX1150-NEXT: s_waitcnt vmcnt(0)
-; GFX1150-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1150-NEXT: v_cvt_f32_f16_e32 v4, v3
-; GFX1150-NEXT: v_rcp_f32_e32 v4, v4
-; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mixlo_f16 v4, v1, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX1150-NEXT: v_div_fixup_f16 v4, v4, v3, v5
+; GFX1150-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-NEXT: v_rcp_f32_e32 v6, v6
+; GFX1150-NEXT: v_mul_f32_e32 v4, v4, v6
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-NEXT: v_fmac_f32_e32 v4, v7, v6
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-NEXT: v_mul_f32_e32 v6, v7, v6
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_and_b32_e32 v6, 0xff800000, v6
+; GFX1150-NEXT: v_add_f32_e32 v4, v6, v4
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX1150-NEXT: v_div_fixup_f16 v4, v4, v5, v3
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-NEXT: v_trunc_f16_e32 v4, v4
; GFX1150-NEXT: v_xor_b32_e32 v4, 0x8000, v4
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fmac_f16_e32 v5, v4, v3
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX1150-NEXT: v_rcp_f32_e32 v3, v3
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1150-NEXT: v_fmac_f16_e32 v3, v4, v5
+; GFX1150-NEXT: v_cvt_f32_f16_e32 v5, v2
+; GFX1150-NEXT: v_cvt_f32_f16_e32 v4, v1
+; GFX1150-NEXT: v_rcp_f32_e32 v5, v5
; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
-; GFX1150-NEXT: v_div_fixup_f16 v3, v3, v2, v1
+; GFX1150-NEXT: v_mul_f32_e32 v4, v4, v5
+; GFX1150-NEXT: v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1]
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_trunc_f16_e32 v3, v3
-; GFX1150-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-NEXT: v_fmac_f32_e32 v4, v6, v5
+; GFX1150-NEXT: v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1]
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fmac_f16_e32 v1, v3, v2
-; GFX1150-NEXT: v_pack_b32_f16 v1, v1, v5
+; GFX1150-NEXT: v_mul_f32_e32 v5, v6, v5
+; GFX1150-NEXT: v_and_b32_e32 v5, 0xff800000, v5
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_add_f32_e32 v4, v5, v4
+; GFX1150-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_div_fixup_f16 v4, v4, v2, v1
+; GFX1150-NEXT: v_trunc_f16_e32 v4, v4
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_xor_b32_e32 v4, 0x8000, v4
+; GFX1150-NEXT: v_fmac_f16_e32 v1, v4, v2
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT: v_pack_b32_f16 v1, v1, v3
; GFX1150-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX1150-NEXT: s_nop 0
; GFX1150-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2364,8 +2508,14 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3
; VI-NEXT: v_cvt_f32_f16_e32 v7, v6
-; VI-NEXT: v_rcp_f32_e32 v9, v9
-; VI-NEXT: v_mul_f32_e32 v7, v7, v9
+; VI-NEXT: v_rcp_f32_e32 v10, v9
+; VI-NEXT: v_mul_f32_e32 v11, v7, v10
+; VI-NEXT: v_mad_f32 v12, -v9, v11, v7
+; VI-NEXT: v_mac_f32_e32 v11, v12, v10
+; VI-NEXT: v_mad_f32 v7, -v9, v11, v7
+; VI-NEXT: v_mul_f32_e32 v7, v7, v10
+; VI-NEXT: v_and_b32_e32 v7, 0xff800000, v7
+; VI-NEXT: v_add_f32_e32 v7, v7, v11
; VI-NEXT: v_cvt_f16_f32_e32 v7, v7
; VI-NEXT: v_div_fixup_f16 v7, v7, v8, v6
; VI-NEXT: v_trunc_f16_e32 v7, v7
@@ -2373,8 +2523,14 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cvt_f32_f16_e32 v8, v5
; VI-NEXT: v_cvt_f32_f16_e32 v7, v3
; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; VI-NEXT: v_rcp_f32_e32 v8, v8
-; VI-NEXT: v_mul_f32_e32 v7, v7, v8
+; VI-NEXT: v_rcp_f32_e32 v9, v8
+; VI-NEXT: v_mul_f32_e32 v10, v7, v9
+; VI-NEXT: v_mad_f32 v11, -v8, v10, v7
+; VI-NEXT: v_mac_f32_e32 v10, v11, v9
+; VI-NEXT: v_mad_f32 v7, -v8, v10, v7
+; VI-NEXT: v_mul_f32_e32 v7, v7, v9
+; VI-NEXT: v_and_b32_e32 v7, 0xff800000, v7
+; VI-NEXT: v_add_f32_e32 v7, v7, v10
; VI-NEXT: v_cvt_f16_f32_e32 v7, v7
; VI-NEXT: v_div_fixup_f16 v7, v7, v5, v3
; VI-NEXT: v_trunc_f16_e32 v7, v7
@@ -2384,8 +2540,14 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; VI-NEXT: v_or_b32_e32 v3, v3, v6
; VI-NEXT: v_cvt_f32_f16_e32 v6, v5
-; VI-NEXT: v_rcp_f32_e32 v8, v8
-; VI-NEXT: v_mul_f32_e32 v6, v6, v8
+; VI-NEXT: v_rcp_f32_e32 v9, v8
+; VI-NEXT: v_mul_f32_e32 v10, v6, v9
+; VI-NEXT: v_mad_f32 v11, -v8, v10, v6
+; VI-NEXT: v_mac_f32_e32 v10, v11, v9
+; VI-NEXT: v_mad_f32 v6, -v8, v10, v6
+; VI-NEXT: v_mul_f32_e32 v6, v6, v9
+; VI-NEXT: v_and_b32_e32 v6, 0xff800000, v6
+; VI-NEXT: v_add_f32_e32 v6, v6, v10
; VI-NEXT: v_cvt_f16_f32_e32 v6, v6
; VI-NEXT: v_div_fixup_f16 v6, v6, v7, v5
; VI-NEXT: v_trunc_f16_e32 v6, v6
@@ -2393,8 +2555,14 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cvt_f32_f16_e32 v7, v4
; VI-NEXT: v_cvt_f32_f16_e32 v6, v2
; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; VI-NEXT: v_rcp_f32_e32 v7, v7
-; VI-NEXT: v_mul_f32_e32 v6, v6, v7
+; VI-NEXT: v_rcp_f32_e32 v8, v7
+; VI-NEXT: v_mul_f32_e32 v9, v6, v8
+; VI-NEXT: v_mad_f32 v10, -v7, v9, v6
+; VI-NEXT: v_mac_f32_e32 v9, v10, v8
+; VI-NEXT: v_mad_f32 v6, -v7, v9, v6
+; VI-NEXT: v_mul_f32_e32 v6, v6, v8
+; VI-NEXT: v_and_b32_e32 v6, 0xff800000, v6
+; VI-NEXT: v_add_f32_e32 v6, v6, v9
; VI-NEXT: v_cvt_f16_f32_e32 v6, v6
; VI-NEXT: v_div_fixup_f16 v6, v6, v4, v2
; VI-NEXT: v_trunc_f16_e32 v6, v6
@@ -2411,36 +2579,69 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7]
; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:32
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v3
-; GFX9-NEXT: v_rcp_f32_e32 v5, v5
-; GFX9-NEXT: v_mad_mixlo_f16 v5, v1, v5, 0 op_sel_hi:[1,0,0]
+; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v8, 16, v3
+; GFX9-NEXT: v_cvt_f32_f16_e32 v9, v8
+; GFX9-NEXT: v_rcp_f32_e32 v6, v6
+; GFX9-NEXT: v_rcp_f32_e32 v9, v9
+; GFX9-NEXT: v_mul_f32_e32 v5, v5, v6
+; GFX9-NEXT: v_mad_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
+; GFX9-NEXT: v_mac_f32_e32 v5, v7, v6
+; GFX9-NEXT: v_mad_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
+; GFX9-NEXT: v_mul_f32_e32 v6, v7, v6
+; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v6
+; GFX9-NEXT: v_add_f32_e32 v5, v6, v5
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v6
+; GFX9-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX9-NEXT: v_mul_f32_e32 v7, v7, v9
; GFX9-NEXT: v_div_fixup_f16 v5, v5, v3, v1
+; GFX9-NEXT: v_mad_mix_f32 v10, -v3, v7, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
; GFX9-NEXT: v_trunc_f16_e32 v5, v5
+; GFX9-NEXT: v_mac_f32_e32 v7, v10, v9
; GFX9-NEXT: v_fma_f16 v5, -v5, v3, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v3
-; GFX9-NEXT: v_rcp_f32_e32 v6, v6
-; GFX9-NEXT: v_mad_mixlo_f16 v6, v1, v6, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_div_fixup_f16 v6, v6, v3, v1
-; GFX9-NEXT: v_trunc_f16_e32 v6, v6
-; GFX9-NEXT: v_fma_f16 v1, -v6, v3, v1
-; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX9-NEXT: v_mad_mix_f32 v1, -v3, v7, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX9-NEXT: v_mul_f32_e32 v1, v1, v9
+; GFX9-NEXT: v_and_b32_e32 v1, 0xff800000, v1
+; GFX9-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX9-NEXT: v_div_fixup_f16 v1, v1, v8, v6
+; GFX9-NEXT: v_trunc_f16_e32 v1, v1
+; GFX9-NEXT: v_fma_f16 v1, -v1, v8, v6
; GFX9-NEXT: v_pack_b32_f16 v1, v5, v1
-; GFX9-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-NEXT: v_mad_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0]
+; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v2
+; GFX9-NEXT: v_cvt_f32_f16_e32 v8, v7
+; GFX9-NEXT: v_rcp_f32_e32 v5, v5
+; GFX9-NEXT: v_rcp_f32_e32 v8, v8
+; GFX9-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX9-NEXT: v_mad_mix_f32 v6, -v2, v3, v0 op_sel_hi:[1,0,1]
+; GFX9-NEXT: v_mac_f32_e32 v3, v6, v5
+; GFX9-NEXT: v_mad_mix_f32 v6, -v2, v3, v0 op_sel_hi:[1,0,1]
+; GFX9-NEXT: v_mul_f32_e32 v5, v6, v5
+; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v5
+; GFX9-NEXT: v_add_f32_e32 v3, v5, v3
+; GFX9-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX9-NEXT: v_mul_f32_e32 v6, v6, v8
; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v0
+; GFX9-NEXT: v_mad_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
; GFX9-NEXT: v_trunc_f16_e32 v3, v3
+; GFX9-NEXT: v_mac_f32_e32 v6, v9, v8
; GFX9-NEXT: v_fma_f16 v3, -v3, v2, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX9-NEXT: v_rcp_f32_e32 v5, v5
-; GFX9-NEXT: v_mad_mixlo_f16 v5, v0, v5, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_div_fixup_f16 v5, v5, v2, v0
-; GFX9-NEXT: v_trunc_f16_e32 v5, v5
-; GFX9-NEXT: v_fma_f16 v0, -v5, v2, v0
+; GFX9-NEXT: v_mad_mix_f32 v0, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX9-NEXT: v_mul_f32_e32 v0, v0, v8
+; GFX9-NEXT: v_and_b32_e32 v0, 0xff800000, v0
+; GFX9-NEXT: v_add_f32_e32 v0, v0, v6
+; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT: v_div_fixup_f16 v0, v0, v7, v5
+; GFX9-NEXT: v_trunc_f16_e32 v0, v0
+; GFX9-NEXT: v_fma_f16 v0, -v0, v7, v5
; GFX9-NEXT: v_pack_b32_f16 v0, v3, v0
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
@@ -2455,33 +2656,66 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:32
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v3
-; GFX10-NEXT: v_rcp_f32_e32 v5, v5
-; GFX10-NEXT: v_fma_mixlo_f16 v5, v1, v5, 0 op_sel_hi:[1,0,0]
+; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX10-NEXT: v_rcp_f32_e32 v7, v6
+; GFX10-NEXT: v_mul_f32_e32 v8, v5, v7
+; GFX10-NEXT: v_mad_f32 v9, -v6, v8, v5
+; GFX10-NEXT: v_mac_f32_e32 v8, v9, v7
+; GFX10-NEXT: v_mad_f32 v5, -v6, v8, v5
+; GFX10-NEXT: v_mul_f32_e32 v5, v5, v7
+; GFX10-NEXT: v_and_b32_e32 v5, 0xff800000, v5
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v8
+; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX10-NEXT: v_div_fixup_f16 v5, v5, v3, v1
; GFX10-NEXT: v_trunc_f16_e32 v5, v5
; GFX10-NEXT: v_fma_f16 v5, -v5, v3, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v3
-; GFX10-NEXT: v_rcp_f32_e32 v6, v6
-; GFX10-NEXT: v_fma_mixlo_f16 v6, v1, v6, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v3
+; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v1
+; GFX10-NEXT: v_rcp_f32_e32 v8, v7
+; GFX10-NEXT: v_mul_f32_e32 v9, v6, v8
+; GFX10-NEXT: v_mad_f32 v10, -v7, v9, v6
+; GFX10-NEXT: v_mac_f32_e32 v9, v10, v8
+; GFX10-NEXT: v_mad_f32 v6, -v7, v9, v6
+; GFX10-NEXT: v_mul_f32_e32 v6, v6, v8
+; GFX10-NEXT: v_and_b32_e32 v6, 0xff800000, v6
+; GFX10-NEXT: v_add_f32_e32 v6, v6, v9
+; GFX10-NEXT: v_cvt_f16_f32_e32 v6, v6
; GFX10-NEXT: v_div_fixup_f16 v6, v6, v3, v1
; GFX10-NEXT: v_trunc_f16_e32 v6, v6
; GFX10-NEXT: v_fma_f16 v1, -v6, v3, v1
-; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX10-NEXT: v_pack_b32_f16 v1, v5, v1
-; GFX10-NEXT: v_rcp_f32_e32 v3, v3
-; GFX10-NEXT: v_fma_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0]
+; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2
+; GFX10-NEXT: v_rcp_f32_e32 v6, v5
+; GFX10-NEXT: v_mul_f32_e32 v7, v3, v6
+; GFX10-NEXT: v_mad_f32 v8, -v5, v7, v3
+; GFX10-NEXT: v_mac_f32_e32 v7, v8, v6
+; GFX10-NEXT: v_mad_f32 v3, -v5, v7, v3
+; GFX10-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX10-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v0
; GFX10-NEXT: v_trunc_f16_e32 v3, v3
; GFX10-NEXT: v_fma_f16 v3, -v3, v2, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX10-NEXT: v_rcp_f32_e32 v5, v5
-; GFX10-NEXT: v_fma_mixlo_f16 v5, v0, v5, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v2
+; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX10-NEXT: v_rcp_f32_e32 v7, v6
+; GFX10-NEXT: v_mul_f32_e32 v8, v5, v7
+; GFX10-NEXT: v_mad_f32 v9, -v6, v8, v5
+; GFX10-NEXT: v_mac_f32_e32 v8, v9, v7
+; GFX10-NEXT: v_mad_f32 v5, -v6, v8, v5
+; GFX10-NEXT: v_mul_f32_e32 v5, v5, v7
+; GFX10-NEXT: v_and_b32_e32 v5, 0xff800000, v5
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v8
+; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX10-NEXT: v_div_fixup_f16 v5, v5, v2, v0
; GFX10-NEXT: v_trunc_f16_e32 v5, v5
; GFX10-NEXT: v_fma_f16 v0, -v5, v2, v0
@@ -2499,50 +2733,97 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7]
; GFX11-NEXT: global_load_b64 v[2:3], v4, s[0:1] offset:32
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v3
+; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_rcp_f32_e32 v6, v6
+; GFX11-NEXT: v_cvt_f32_f16_e32 v9, v8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_rcp_f32_e32 v5, v5
+; GFX11-NEXT: v_rcp_f32_e32 v9, v9
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v5, v1, v5, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_div_fixup_f16 v5, v5, v3, v1
+; GFX11-NEXT: v_mul_f32_e32 v5, v5, v6
+; GFX11-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fmac_f32_e32 v5, v7, v6
+; GFX11-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v6, v7, v6
+; GFX11-NEXT: v_and_b32_e32 v6, 0xff800000, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_f32_e32 v5, v6, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v6
+; GFX11-NEXT: v_div_fixup_f16 v5, v5, v3, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mul_f32_e32 v7, v7, v9
; GFX11-NEXT: v_trunc_f16_e32 v5, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_fma_mix_f32 v10, -v3, v7, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
; GFX11-NEXT: v_fma_f16 v5, -v5, v3, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v3
-; GFX11-NEXT: v_rcp_f32_e32 v6, v6
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v6, v1, v6, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fmac_f32_e32 v7, v10, v9
+; GFX11-NEXT: v_fma_mix_f32 v1, -v3, v7, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v9
+; GFX11-NEXT: v_and_b32_e32 v1, 0xff800000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_div_fixup_f16 v6, v6, v3, v1
-; GFX11-NEXT: v_trunc_f16_e32 v6, v6
+; GFX11-NEXT: v_div_fixup_f16 v1, v1, v8, v6
+; GFX11-NEXT: v_trunc_f16_e32 v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_fma_f16 v1, -v6, v3, v1
-; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX11-NEXT: v_fma_f16 v1, -v1, v8, v6
+; GFX11-NEXT: v_cvt_f32_f16_e32 v8, v7
; GFX11-NEXT: v_pack_b32_f16 v1, v5, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_rcp_f32_e32 v3, v3
+; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_rcp_f32_e32 v8, v8
+; GFX11-NEXT: v_rcp_f32_e32 v5, v5
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_div_fixup_f16 v3, v3, v2, v0
+; GFX11-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fma_mix_f32 v6, -v2, v3, v0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v3, v6, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fma_mix_f32 v6, -v2, v3, v0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_mul_f32_e32 v5, v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xff800000, v5
+; GFX11-NEXT: v_add_f32_e32 v3, v5, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_div_fixup_f16 v3, v3, v2, v0
+; GFX11-NEXT: v_mul_f32_e32 v6, v6, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_trunc_f16_e32 v3, v3
+; GFX11-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_fma_f16 v3, -v3, v2, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_fmac_f32_e32 v6, v9, v8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX11-NEXT: v_rcp_f32_e32 v5, v5
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v5, v0, v5, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_fma_mix_f32 v0, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_div_fixup_f16 v5, v5, v2, v0
-; GFX11-NEXT: v_trunc_f16_e32 v5, v5
+; GFX11-NEXT: v_and_b32_e32 v0, 0xff800000, v0
+; GFX11-NEXT: v_add_f32_e32 v0, v0, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT: v_div_fixup_f16 v0, v0, v7, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_fma_f16 v0, -v5, v2, v0
+; GFX11-NEXT: v_trunc_f16_e32 v0, v0
+; GFX11-NEXT: v_fma_f16 v0, -v0, v7, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pack_b32_f16 v0, v3, v0
; GFX11-NEXT: global_store_b64 v4, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
@@ -2560,55 +2841,102 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: global_load_b64 v[0:1], v4, s[6:7]
; GFX1150-NEXT: global_load_b64 v[2:3], v4, s[0:1] offset:32
; GFX1150-NEXT: s_waitcnt vmcnt(1)
-; GFX1150-NEXT: v_lshrrev_b32_e32 v7, 16, v0
+; GFX1150-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GFX1150-NEXT: s_waitcnt vmcnt(0)
-; GFX1150-NEXT: v_lshrrev_b32_e32 v5, 16, v2
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1150-NEXT: v_cvt_f32_f16_e32 v6, v5
-; GFX1150-NEXT: v_rcp_f32_e32 v6, v6
-; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mixlo_f16 v6, v0, v6, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX1150-NEXT: v_div_fixup_f16 v6, v6, v5, v7
+; GFX1150-NEXT: v_cvt_f32_f16_e32 v8, v7
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-NEXT: v_rcp_f32_e32 v8, v8
+; GFX1150-NEXT: v_mul_f32_e32 v6, v6, v8
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-NEXT: v_fmac_f32_e32 v6, v9, v8
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-NEXT: v_mul_f32_e32 v8, v9, v8
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_and_b32_e32 v8, 0xff800000, v8
+; GFX1150-NEXT: v_add_f32_e32 v6, v8, v6
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX1150-NEXT: v_div_fixup_f16 v6, v6, v7, v5
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-NEXT: v_trunc_f16_e32 v6, v6
; GFX1150-NEXT: v_xor_b32_e32 v6, 0x8000, v6
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1150-NEXT: v_fmac_f16_e32 v7, v6, v5
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX1150-NEXT: v_lshrrev_b32_e32 v6, 16, v1
-; GFX1150-NEXT: v_rcp_f32_e32 v5, v5
+; GFX1150-NEXT: v_fmac_f16_e32 v5, v6, v7
+; GFX1150-NEXT: v_cvt_f32_f16_e32 v7, v2
+; GFX1150-NEXT: v_cvt_f32_f16_e32 v6, v0
+; GFX1150-NEXT: v_rcp_f32_e32 v7, v7
; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mixlo_f16 v5, v0, v5, 0 op_sel_hi:[1,0,0]
-; GFX1150-NEXT: v_div_fixup_f16 v5, v5, v2, v0
+; GFX1150-NEXT: v_mul_f32_e32 v6, v6, v7
+; GFX1150-NEXT: v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1]
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_trunc_f16_e32 v5, v5
-; GFX1150-NEXT: v_xor_b32_e32 v5, 0x8000, v5
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1150-NEXT: v_fma_f16 v0, v5, v2, v0
-; GFX1150-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX1150-NEXT: v_pack_b32_f16 v0, v0, v7
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_fmac_f32_e32 v6, v8, v7
+; GFX1150-NEXT: v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1]
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_mul_f32_e32 v7, v8, v7
+; GFX1150-NEXT: v_and_b32_e32 v7, 0xff800000, v7
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_add_f32_e32 v6, v7, v6
+; GFX1150-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_div_fixup_f16 v6, v6, v2, v0
+; GFX1150-NEXT: v_trunc_f16_e32 v6, v6
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_xor_b32_e32 v6, 0x8000, v6
+; GFX1150-NEXT: v_fma_f16 v0, v6, v2, v0
+; GFX1150-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GFX1150-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1150-NEXT: v_pack_b32_f16 v0, v0, v5
+; GFX1150-NEXT: v_cvt_f32_f16_e32 v7, v6
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1150-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX1150-NEXT: v_rcp_f32_e32 v5, v5
+; GFX1150-NEXT: v_rcp_f32_e32 v7, v7
; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mixlo_f16 v5, v1, v5, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX1150-NEXT: v_div_fixup_f16 v5, v5, v2, v6
+; GFX1150-NEXT: v_mul_f32_e32 v5, v5, v7
+; GFX1150-NEXT: v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_fmac_f32_e32 v5, v8, v7
+; GFX1150-NEXT: v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_mul_f32_e32 v7, v8, v7
+; GFX1150-NEXT: v_and_b32_e32 v7, 0xff800000, v7
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_add_f32_e32 v5, v7, v5
+; GFX1150-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_div_fixup_f16 v5, v5, v6, v2
; GFX1150-NEXT: v_trunc_f16_e32 v5, v5
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-NEXT: v_xor_b32_e32 v5, 0x8000, v5
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fmac_f16_e32 v6, v5, v2
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GFX1150-NEXT: v_rcp_f32_e32 v2, v2
-; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mixlo_f16 v2, v1, v2, 0 op_sel_hi:[1,0,0]
-; GFX1150-NEXT: v_div_fixup_f16 v2, v2, v3, v1
+; GFX1150-NEXT: v_fmac_f16_e32 v2, v5, v6
+; GFX1150-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX1150-NEXT: v_cvt_f32_f16_e32 v5, v1
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-NEXT: v_rcp_f32_e32 v6, v6
+; GFX1150-NEXT: v_mul_f32_e32 v5, v5, v6
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
+; GFX1150-NEXT: v_fmac_f32_e32 v5, v7, v6
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
+; GFX1150-NEXT: v_mul_f32_e32 v6, v7, v6
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_trunc_f16_e32 v2, v2
-; GFX1150-NEXT: v_xor_b32_e32 v2, 0x8000, v2
+; GFX1150-NEXT: v_and_b32_e32 v6, 0xff800000, v6
+; GFX1150-NEXT: v_add_f32_e32 v5, v6, v5
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX1150-NEXT: v_div_fixup_f16 v5, v5, v3, v1
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_trunc_f16_e32 v5, v5
+; GFX1150-NEXT: v_xor_b32_e32 v5, 0x8000, v5
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fmac_f16_e32 v1, v2, v3
-; GFX1150-NEXT: v_pack_b32_f16 v1, v1, v6
+; GFX1150-NEXT: v_fmac_f16_e32 v1, v5, v3
+; GFX1150-NEXT: v_pack_b32_f16 v1, v1, v2
; GFX1150-NEXT: global_store_b64 v4, v[0:1], s[4:5]
; GFX1150-NEXT: s_nop 0
; GFX1150-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
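
For readers skimming the updated GFX11/GFX1150 check lines above, the per-lane arithmetic they encode is roughly the following. This is a loose, hedged C sketch rather than the compiler's code: the hardware v_rcp_f32 and v_div_fixup_f16 steps are approximated by plain C operations, the f16 lanes are carried in float for brevity, and all names are illustrative.

  #include <math.h>
  #include <stdint.h>
  #include <string.h>

  /* One f16 lane of the division, with both operands already widened to
     f32. 1.0f/b stands in for v_rcp_f32; the final round to f16 and the
     v_div_fixup_f16 special-case patching are omitted. */
  static float fdiv16_like(float a, float b) {
    float r = 1.0f / b;          /* v_rcp_f32                    */
    float q = a * r;             /* v_mul_f32                    */
    float e = fmaf(-b, q, a);    /* v_fma_mix_f32 with -src0     */
    q = fmaf(e, r, q);           /* v_fmac_f32 refinement        */
    e = fmaf(-b, q, a);          /* second error term            */
    float t = e * r;             /* v_mul_f32                    */
    uint32_t bits;
    memcpy(&bits, &t, sizeof bits);
    bits &= 0xff800000u;         /* v_and_b32: keep sign and exponent only */
    memcpy(&t, &bits, sizeof t);
    return t + q;                /* v_add_f32, then v_cvt_f16_f32 + v_div_fixup_f16 */
  }

  /* frem(a, b) = a - trunc(a/b) * b, as exercised by frem_v4f16. */
  static float frem16_like(float a, float b) {
    float q = truncf(fdiv16_like(a, b));  /* v_trunc_f16 */
    return fmaf(-q, b, a);                /* v_fma_f16   */
  }

The GFX1150 paths fold the negation of the truncated quotient into a v_xor_b32 with 0x8000 followed by v_fmac_f16, which computes the same a - trunc(a/b) * b.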