[llvm] [AMDGPU] Adopt new lowering sequence for `fdiv16` (PR #109295)
Shilei Tian via llvm-commits
llvm-commits at lists.llvm.org
Fri Sep 20 20:17:33 PDT 2024
https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/109295
>From 2b7d74aa5dd83730bc1057e295456d4267cc9185 Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Thu, 19 Sep 2024 10:57:27 -0400
Subject: [PATCH] [AMDGPU] Adopt new lowering sequence for `fdiv16`
The current lowering of fdiv16 can generate incorrectly rounded results in some
cases.
Fixes SWDEV-47760.
---
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 31 +-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 41 +-
.../CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll | 1656 ++++++++++-------
llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll | 82 +-
.../AMDGPU/GlobalISel/legalize-fdiv.mir | 522 ++++--
llvm/test/CodeGen/AMDGPU/fdiv.f16.ll | 54 +-
.../AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll | 67 +-
llvm/test/CodeGen/AMDGPU/frem.ll | 670 +++++--
8 files changed, 2152 insertions(+), 971 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index e657f668cc656a..a1a42d25ee2f4e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4900,16 +4900,35 @@ bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
LLT S16 = LLT::scalar(16);
LLT S32 = LLT::scalar(32);
+ // a32.u = opx(V_CVT_F32_F16, a.u);
+ // b32.u = opx(V_CVT_F32_F16, b.u);
+ // r32.u = opx(V_RCP_F32, b32.u);
+ // q32.u = opx(V_MUL_F32, a32.u, r32.u);
+ // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u);
+ // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u);
+ // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u);
+ // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
+ // tmp.u = opx(V_FREXP_MANT_F32, tmp.u);
+ // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
+ // q16.u = opx(V_CVT_F16_F32, q32.u);
+ // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u);
+
auto LHSExt = B.buildFPExt(S32, LHS, Flags);
auto RHSExt = B.buildFPExt(S32, RHS, Flags);
-
- auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
+ auto NegRHSExt = B.buildFNeg(S32, RHSExt);
+ auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
.addUse(RHSExt.getReg(0))
.setMIFlags(Flags);
-
- auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
- auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
-
+ auto Quot = B.buildFMul(S32, LHSExt, Rcp);
+ auto Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt);
+ Quot = B.buildFMA(S32, Err, Rcp, Quot);
+ Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt);
+ auto Tmp = B.buildFMul(S32, Err, Rcp);
+ Tmp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {S32})
+ .addUse(Tmp.getReg(0))
+ .setMIFlags(Flags);
+ Quot = B.buildFAdd(S32, Tmp, Quot);
+ auto RDst = B.buildFPTrunc(S16, Quot, Flags);
B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
.addUse(RDst.getReg(0))
.addUse(RHS)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index a9754ba357893f..b1858ac53a7a1d 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -10606,19 +10606,38 @@ SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
return FastLowered;
SDLoc SL(Op);
- SDValue Src0 = Op.getOperand(0);
- SDValue Src1 = Op.getOperand(1);
-
- SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
- SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
-
- SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
- SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ // a32.u = opx(V_CVT_F32_F16, a.u);
+ // b32.u = opx(V_CVT_F32_F16, b.u);
+ // r32.u = opx(V_RCP_F32, b32.u);
+ // q32.u = opx(V_MUL_F32, a32.u, r32.u);
+ // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u);
+ // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u);
+ // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u);
+ // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
+ // tmp.u = opx(V_FREXP_MANT_F32, tmp.u);
+ // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
+ // q16.u = opx(V_CVT_F16_F32, q32.u);
+ // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u);
+
+ SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
+ SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
+ SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
+ SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt);
+ SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp);
+ SDValue Err = DAG.getNode(ISD::FMA, SL, MVT::f32, NegRHSExt, Quot, LHSExt);
+ Quot = DAG.getNode(ISD::FMA, SL, MVT::f32, Err, Rcp, Quot);
+ Err = DAG.getNode(ISD::FMA, SL, MVT::f32, NegRHSExt, Quot, LHSExt);
+ SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp);
+ Tmp = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32,
+ DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, SL, MVT::i32), Tmp);
+ Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot);
SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
- SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
-
- return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
+ SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
+ return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS);
}
// Faster 2.5 ULP division that does not support denormals.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
index 89cd18ad9be70b..d8e40f7efd1aed 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
@@ -57,43 +57,37 @@ define half @v_fdiv_f16(half %a, half %b) {
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: v_fdiv_f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX8-NEXT: v_rcp_f32_e32 v2, v2
-; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2
-; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-IEEE-LABEL: v_fdiv_f16:
-; GFX9-IEEE: ; %bb.0:
-; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v2, v2
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v3, v2
-; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v2, v1, v0
-; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-FLUSH-LABEL: v_fdiv_f16:
-; GFX9-FLUSH: ; %bb.0:
-; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v1, v0
-; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
+; GFX89-LABEL: v_fdiv_f16:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX89-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX89-NEXT: v_rcp_f32_e32 v4, v2
+; GFX89-NEXT: v_mul_f32_e32 v5, v3, v4
+; GFX89-NEXT: v_fma_f32 v6, -v2, v5, v3
+; GFX89-NEXT: v_fma_f32 v5, v6, v4, v5
+; GFX89-NEXT: v_fma_f32 v2, -v2, v5, v3
+; GFX89-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX89-NEXT: v_frexp_mant_f32_e32 v2, v2
+; GFX89-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX89-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX89-NEXT: v_div_fixup_f16 v0, v2, v1, v0
+; GFX89-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fdiv_f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX10-NEXT: v_rcp_f32_e32 v2, v2
-; GFX10-NEXT: v_fma_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
+; GFX10-NEXT: v_mul_f32_e32 v3, v3, v2
+; GFX10-NEXT: v_fma_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v2
+; GFX10-NEXT: v_fma_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_mul_f32_e32 v2, v4, v2
+; GFX10-NEXT: v_frexp_mant_f32_e32 v2, v2
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v3
+; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-NEXT: v_div_fixup_f16 v0, v2, v1, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -101,9 +95,17 @@ define half @v_fdiv_f16(half %a, half %b) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_mul_f32_e32 v3, v3, v2
+; GFX11-NEXT: v_fma_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v2
+; GFX11-NEXT: v_fma_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_mul_f32_e32 v2, v4, v2
+; GFX11-NEXT: v_frexp_mant_f32_e32 v2, v2
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v3
+; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX11-NEXT: v_div_fixup_f16 v0, v2, v1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv half %a, %b
@@ -188,43 +190,37 @@ define half @v_fdiv_f16_ulp25(half %a, half %b) {
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: v_fdiv_f16_ulp25:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX8-NEXT: v_rcp_f32_e32 v2, v2
-; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2
-; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-IEEE-LABEL: v_fdiv_f16_ulp25:
-; GFX9-IEEE: ; %bb.0:
-; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v2, v2
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v3, v2
-; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v2, v1, v0
-; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-FLUSH-LABEL: v_fdiv_f16_ulp25:
-; GFX9-FLUSH: ; %bb.0:
-; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v1, v0
-; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
+; GFX89-LABEL: v_fdiv_f16_ulp25:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX89-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX89-NEXT: v_rcp_f32_e32 v4, v2
+; GFX89-NEXT: v_mul_f32_e32 v5, v3, v4
+; GFX89-NEXT: v_fma_f32 v6, -v2, v5, v3
+; GFX89-NEXT: v_fma_f32 v5, v6, v4, v5
+; GFX89-NEXT: v_fma_f32 v2, -v2, v5, v3
+; GFX89-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX89-NEXT: v_frexp_mant_f32_e32 v2, v2
+; GFX89-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX89-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX89-NEXT: v_div_fixup_f16 v0, v2, v1, v0
+; GFX89-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fdiv_f16_ulp25:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX10-NEXT: v_rcp_f32_e32 v2, v2
-; GFX10-NEXT: v_fma_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
+; GFX10-NEXT: v_mul_f32_e32 v3, v3, v2
+; GFX10-NEXT: v_fma_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v2
+; GFX10-NEXT: v_fma_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_mul_f32_e32 v2, v4, v2
+; GFX10-NEXT: v_frexp_mant_f32_e32 v2, v2
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v3
+; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-NEXT: v_div_fixup_f16 v0, v2, v1, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -232,9 +228,17 @@ define half @v_fdiv_f16_ulp25(half %a, half %b) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_mul_f32_e32 v3, v3, v2
+; GFX11-NEXT: v_fma_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v2
+; GFX11-NEXT: v_fma_mix_f32 v4, -v1, v3, v0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_mul_f32_e32 v2, v4, v2
+; GFX11-NEXT: v_frexp_mant_f32_e32 v2, v2
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v3
+; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX11-NEXT: v_div_fixup_f16 v0, v2, v1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv half %a, %b
@@ -673,59 +677,67 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) {
; GFX8-LABEL: v_fdiv_v2f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1
-; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX8-NEXT: v_cvt_f32_f16_e32 v5, v4
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-NEXT: v_cvt_f32_f16_e32 v6, v0
-; GFX8-NEXT: v_rcp_f32_e32 v3, v3
-; GFX8-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX8-NEXT: v_rcp_f32_e32 v5, v5
-; GFX8-NEXT: v_mul_f32_e32 v3, v6, v3
-; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX8-NEXT: v_mul_f32_e32 v5, v7, v5
-; GFX8-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX8-NEXT: v_div_fixup_f16 v0, v3, v1, v0
-; GFX8-NEXT: v_div_fixup_f16 v1, v5, v4, v2
+; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX8-NEXT: v_cvt_f32_f16_e32 v4, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX8-NEXT: v_cvt_f32_f16_e32 v8, v6
+; GFX8-NEXT: v_rcp_f32_e32 v5, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_cvt_f32_f16_e32 v7, v3
+; GFX8-NEXT: v_mul_f32_e32 v9, v4, v5
+; GFX8-NEXT: v_fma_f32 v10, -v2, v9, v4
+; GFX8-NEXT: v_fma_f32 v9, v10, v5, v9
+; GFX8-NEXT: v_fma_f32 v2, -v2, v9, v4
+; GFX8-NEXT: v_rcp_f32_e32 v4, v8
+; GFX8-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX8-NEXT: v_frexp_mant_f32_e32 v2, v2
+; GFX8-NEXT: v_add_f32_e32 v2, v2, v9
+; GFX8-NEXT: v_mul_f32_e32 v5, v7, v4
+; GFX8-NEXT: v_fma_f32 v9, -v8, v5, v7
+; GFX8-NEXT: v_fma_f32 v5, v9, v4, v5
+; GFX8-NEXT: v_fma_f32 v7, -v8, v5, v7
+; GFX8-NEXT: v_mul_f32_e32 v4, v7, v4
+; GFX8-NEXT: v_frexp_mant_f32_e32 v4, v4
+; GFX8-NEXT: v_add_f32_e32 v4, v4, v5
+; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX8-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0
+; GFX8-NEXT: v_div_fixup_f16 v1, v4, v6, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-IEEE-LABEL: v_fdiv_v2f16:
-; GFX9-IEEE: ; %bb.0:
-; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v4, 16, v1
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v5, v4
-; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v6, v0
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v6, v3
-; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v7, v5
-; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v3, v1, v0
-; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v5, v4, v2
-; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-FLUSH-LABEL: v_fdiv_v2f16:
-; GFX9-FLUSH: ; %bb.0:
-; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v3
-; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v5, 16, v0
-; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
-; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v4, v4
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, v0
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v0, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v3, v5
-; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_fdiv_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v8, v6
+; GFX9-NEXT: v_rcp_f32_e32 v5, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v3
+; GFX9-NEXT: v_mul_f32_e32 v9, v4, v5
+; GFX9-NEXT: v_fma_f32 v10, -v2, v9, v4
+; GFX9-NEXT: v_fma_f32 v9, v10, v5, v9
+; GFX9-NEXT: v_fma_f32 v2, -v2, v9, v4
+; GFX9-NEXT: v_rcp_f32_e32 v4, v8
+; GFX9-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX9-NEXT: v_frexp_mant_f32_e32 v2, v2
+; GFX9-NEXT: v_add_f32_e32 v2, v2, v9
+; GFX9-NEXT: v_mul_f32_e32 v5, v7, v4
+; GFX9-NEXT: v_fma_f32 v9, -v8, v5, v7
+; GFX9-NEXT: v_fma_f32 v5, v9, v4, v5
+; GFX9-NEXT: v_fma_f32 v7, -v8, v5, v7
+; GFX9-NEXT: v_mul_f32_e32 v4, v7, v4
+; GFX9-NEXT: v_frexp_mant_f32_e32 v4, v4
+; GFX9-NEXT: v_add_f32_e32 v4, v4, v5
+; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX9-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX9-NEXT: v_div_fixup_f16 v0, v2, v1, v0
+; GFX9-NEXT: v_div_fixup_f16 v1, v4, v6, v3
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fdiv_v2f16:
; GFX10: ; %bb.0:
@@ -733,11 +745,27 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) {
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v0
; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2
; GFX10-NEXT: v_rcp_f32_e32 v3, v3
+; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v5
; GFX10-NEXT: v_rcp_f32_e32 v4, v4
-; GFX10-NEXT: v_fma_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_fma_mixlo_f16 v4, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX10-NEXT: v_mul_f32_e32 v6, v6, v3
+; GFX10-NEXT: v_mul_f32_e32 v7, v7, v4
+; GFX10-NEXT: v_fma_mix_f32 v8, -v1, v6, v0 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_fma_mix_f32 v9, -v1, v7, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_fmac_f32_e32 v6, v8, v3
+; GFX10-NEXT: v_fmac_f32_e32 v7, v9, v4
+; GFX10-NEXT: v_fma_mix_f32 v8, -v1, v6, v0 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_fma_mix_f32 v9, -v1, v7, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_mul_f32_e32 v3, v8, v3
+; GFX10-NEXT: v_mul_f32_e32 v4, v9, v4
+; GFX10-NEXT: v_frexp_mant_f32_e32 v3, v3
+; GFX10-NEXT: v_frexp_mant_f32_e32 v4, v4
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX10-NEXT: v_add_f32_e32 v4, v4, v7
+; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX10-NEXT: v_div_fixup_f16 v0, v3, v1, v0
; GFX10-NEXT: v_div_fixup_f16 v1, v4, v2, v5
; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
@@ -749,12 +777,24 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) {
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
+; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v5
; GFX11-NEXT: v_rcp_f32_e32 v4, v4
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_mixlo_f16 v4, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_dual_mul_f32 v6, v6, v3 :: v_dual_mul_f32 v7, v7, v4
+; GFX11-NEXT: v_fma_mix_f32 v8, -v1, v6, v0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fma_mix_f32 v9, -v1, v7, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_dual_fmac_f32 v6, v8, v3 :: v_dual_fmac_f32 v7, v9, v4
+; GFX11-NEXT: v_fma_mix_f32 v8, -v1, v6, v0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fma_mix_f32 v9, -v1, v7, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_dual_mul_f32 v3, v8, v3 :: v_dual_mul_f32 v4, v9, v4
+; GFX11-NEXT: v_frexp_mant_f32_e32 v3, v3
+; GFX11-NEXT: v_frexp_mant_f32_e32 v4, v4
+; GFX11-NEXT: v_dual_add_f32 v3, v3, v6 :: v_dual_add_f32 v4, v4, v7
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX11-NEXT: v_div_fixup_f16 v0, v3, v1, v0
; GFX11-NEXT: v_div_fixup_f16 v1, v4, v2, v5
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
@@ -900,59 +940,67 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX8-LABEL: v_fdiv_v2f16_ulp25:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1
-; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX8-NEXT: v_cvt_f32_f16_e32 v5, v4
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-NEXT: v_cvt_f32_f16_e32 v6, v0
-; GFX8-NEXT: v_rcp_f32_e32 v3, v3
-; GFX8-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX8-NEXT: v_rcp_f32_e32 v5, v5
-; GFX8-NEXT: v_mul_f32_e32 v3, v6, v3
-; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX8-NEXT: v_mul_f32_e32 v5, v7, v5
-; GFX8-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX8-NEXT: v_div_fixup_f16 v0, v3, v1, v0
-; GFX8-NEXT: v_div_fixup_f16 v1, v5, v4, v2
+; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX8-NEXT: v_cvt_f32_f16_e32 v4, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX8-NEXT: v_cvt_f32_f16_e32 v8, v6
+; GFX8-NEXT: v_rcp_f32_e32 v5, v2
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: v_cvt_f32_f16_e32 v7, v3
+; GFX8-NEXT: v_mul_f32_e32 v9, v4, v5
+; GFX8-NEXT: v_fma_f32 v10, -v2, v9, v4
+; GFX8-NEXT: v_fma_f32 v9, v10, v5, v9
+; GFX8-NEXT: v_fma_f32 v2, -v2, v9, v4
+; GFX8-NEXT: v_rcp_f32_e32 v4, v8
+; GFX8-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX8-NEXT: v_frexp_mant_f32_e32 v2, v2
+; GFX8-NEXT: v_add_f32_e32 v2, v2, v9
+; GFX8-NEXT: v_mul_f32_e32 v5, v7, v4
+; GFX8-NEXT: v_fma_f32 v9, -v8, v5, v7
+; GFX8-NEXT: v_fma_f32 v5, v9, v4, v5
+; GFX8-NEXT: v_fma_f32 v7, -v8, v5, v7
+; GFX8-NEXT: v_mul_f32_e32 v4, v7, v4
+; GFX8-NEXT: v_frexp_mant_f32_e32 v4, v4
+; GFX8-NEXT: v_add_f32_e32 v4, v4, v5
+; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX8-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0
+; GFX8-NEXT: v_div_fixup_f16 v1, v4, v6, v3
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-IEEE-LABEL: v_fdiv_v2f16_ulp25:
-; GFX9-IEEE: ; %bb.0:
-; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v4, 16, v1
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v5, v4
-; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v6, v0
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v6, v3
-; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v7, v5
-; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v3, v1, v0
-; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v5, v4, v2
-; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-FLUSH-LABEL: v_fdiv_v2f16_ulp25:
-; GFX9-FLUSH: ; %bb.0:
-; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v3
-; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v5, 16, v0
-; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
-; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v4, v4
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, v0
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v0, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v3, v5
-; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_fdiv_v2f16_ulp25:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v8, v6
+; GFX9-NEXT: v_rcp_f32_e32 v5, v2
+; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v3
+; GFX9-NEXT: v_mul_f32_e32 v9, v4, v5
+; GFX9-NEXT: v_fma_f32 v10, -v2, v9, v4
+; GFX9-NEXT: v_fma_f32 v9, v10, v5, v9
+; GFX9-NEXT: v_fma_f32 v2, -v2, v9, v4
+; GFX9-NEXT: v_rcp_f32_e32 v4, v8
+; GFX9-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX9-NEXT: v_frexp_mant_f32_e32 v2, v2
+; GFX9-NEXT: v_add_f32_e32 v2, v2, v9
+; GFX9-NEXT: v_mul_f32_e32 v5, v7, v4
+; GFX9-NEXT: v_fma_f32 v9, -v8, v5, v7
+; GFX9-NEXT: v_fma_f32 v5, v9, v4, v5
+; GFX9-NEXT: v_fma_f32 v7, -v8, v5, v7
+; GFX9-NEXT: v_mul_f32_e32 v4, v7, v4
+; GFX9-NEXT: v_frexp_mant_f32_e32 v4, v4
+; GFX9-NEXT: v_add_f32_e32 v4, v4, v5
+; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX9-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX9-NEXT: v_div_fixup_f16 v0, v2, v1, v0
+; GFX9-NEXT: v_div_fixup_f16 v1, v4, v6, v3
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_fdiv_v2f16_ulp25:
; GFX10: ; %bb.0:
@@ -960,11 +1008,27 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v0
; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2
; GFX10-NEXT: v_rcp_f32_e32 v3, v3
+; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v5
; GFX10-NEXT: v_rcp_f32_e32 v4, v4
-; GFX10-NEXT: v_fma_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_fma_mixlo_f16 v4, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX10-NEXT: v_mul_f32_e32 v6, v6, v3
+; GFX10-NEXT: v_mul_f32_e32 v7, v7, v4
+; GFX10-NEXT: v_fma_mix_f32 v8, -v1, v6, v0 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_fma_mix_f32 v9, -v1, v7, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_fmac_f32_e32 v6, v8, v3
+; GFX10-NEXT: v_fmac_f32_e32 v7, v9, v4
+; GFX10-NEXT: v_fma_mix_f32 v8, -v1, v6, v0 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_fma_mix_f32 v9, -v1, v7, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_mul_f32_e32 v3, v8, v3
+; GFX10-NEXT: v_mul_f32_e32 v4, v9, v4
+; GFX10-NEXT: v_frexp_mant_f32_e32 v3, v3
+; GFX10-NEXT: v_frexp_mant_f32_e32 v4, v4
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX10-NEXT: v_add_f32_e32 v4, v4, v7
+; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX10-NEXT: v_div_fixup_f16 v0, v3, v1, v0
; GFX10-NEXT: v_div_fixup_f16 v1, v4, v2, v5
; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
@@ -976,12 +1040,24 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
+; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v5
; GFX11-NEXT: v_rcp_f32_e32 v4, v4
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_mixlo_f16 v4, v0, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_dual_mul_f32 v6, v6, v3 :: v_dual_mul_f32 v7, v7, v4
+; GFX11-NEXT: v_fma_mix_f32 v8, -v1, v6, v0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fma_mix_f32 v9, -v1, v7, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_dual_fmac_f32 v6, v8, v3 :: v_dual_fmac_f32 v7, v9, v4
+; GFX11-NEXT: v_fma_mix_f32 v8, -v1, v6, v0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fma_mix_f32 v9, -v1, v7, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_dual_mul_f32 v3, v8, v3 :: v_dual_mul_f32 v4, v9, v4
+; GFX11-NEXT: v_frexp_mant_f32_e32 v3, v3
+; GFX11-NEXT: v_frexp_mant_f32_e32 v4, v4
+; GFX11-NEXT: v_dual_add_f32 v3, v3, v6 :: v_dual_add_f32 v4, v4, v7
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX11-NEXT: v_div_fixup_f16 v0, v3, v1, v0
; GFX11-NEXT: v_div_fixup_f16 v1, v4, v2, v5
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
@@ -1064,14 +1140,26 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX8-LABEL: v_rcp_v2f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2
; GFX8-NEXT: v_cvt_f32_f16_e32 v4, 1.0
-; GFX8-NEXT: v_rcp_f32_e32 v1, v1
-; GFX8-NEXT: v_rcp_f32_e32 v3, v3
-; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1
-; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX8-NEXT: v_rcp_f32_e32 v5, v1
+; GFX8-NEXT: v_rcp_f32_e32 v6, v3
+; GFX8-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX8-NEXT: v_fma_f32 v8, -v1, v7, v4
+; GFX8-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX8-NEXT: v_fma_f32 v7, v8, v5, v7
+; GFX8-NEXT: v_fma_f32 v8, -v3, v9, v4
+; GFX8-NEXT: v_fma_f32 v1, -v1, v7, v4
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX8-NEXT: v_fma_f32 v5, v8, v6, v9
+; GFX8-NEXT: v_fma_f32 v3, -v3, v5, v4
+; GFX8-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-NEXT: v_frexp_mant_f32_e32 v1, v1
+; GFX8-NEXT: v_frexp_mant_f32_e32 v3, v3
+; GFX8-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX8-NEXT: v_add_f32_e32 v3, v3, v5
; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -1080,49 +1168,61 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-IEEE-LABEL: v_rcp_v2f16:
-; GFX9-IEEE: ; %bb.0:
-; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v1, v1
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v4, v1
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3
-; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
-; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
-; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-FLUSH-LABEL: v_rcp_v2f16:
-; GFX9-FLUSH: ; %bb.0:
-; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
-; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, 1.0, v1, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, 1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, v2, 1.0
-; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_rcp_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX9-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX9-NEXT: v_rcp_f32_e32 v5, v1
+; GFX9-NEXT: v_rcp_f32_e32 v6, v3
+; GFX9-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX9-NEXT: v_fma_f32 v8, -v1, v7, v4
+; GFX9-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX9-NEXT: v_fma_f32 v7, v8, v5, v7
+; GFX9-NEXT: v_fma_f32 v8, -v3, v9, v4
+; GFX9-NEXT: v_fma_f32 v1, -v1, v7, v4
+; GFX9-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX9-NEXT: v_fma_f32 v5, v8, v6, v9
+; GFX9-NEXT: v_fma_f32 v3, -v3, v5, v4
+; GFX9-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-NEXT: v_frexp_mant_f32_e32 v1, v1
+; GFX9-NEXT: v_frexp_mant_f32_e32 v3, v3
+; GFX9-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX9-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX9-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
+; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_rcp_v2f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-NEXT: v_rcp_f32_e32 v2, v2
; GFX10-NEXT: v_rcp_f32_e32 v3, v3
-; GFX10-NEXT: v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0]
+; GFX10-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX10-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX10-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_fmac_f32_e32 v5, v6, v2
+; GFX10-NEXT: v_fmac_f32_e32 v4, v7, v3
+; GFX10-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_mul_f32_e32 v2, v6, v2
+; GFX10-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX10-NEXT: v_frexp_mant_f32_e32 v2, v2
+; GFX10-NEXT: v_frexp_mant_f32_e32 v3, v3
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
; GFX10-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
@@ -1133,14 +1233,28 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2
+; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_frexp_mant_f32_e32 v3, v3
+; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_add_f32 v3, v3, v4
+; GFX11-NEXT: v_frexp_mant_f32_e32 v2, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v5
; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
+; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x
@@ -1221,14 +1335,26 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX8-LABEL: v_neg_rcp_v2f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2
; GFX8-NEXT: v_cvt_f32_f16_e32 v4, -1.0
-; GFX8-NEXT: v_rcp_f32_e32 v1, v1
-; GFX8-NEXT: v_rcp_f32_e32 v3, v3
-; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1
-; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX8-NEXT: v_rcp_f32_e32 v5, v1
+; GFX8-NEXT: v_rcp_f32_e32 v6, v3
+; GFX8-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX8-NEXT: v_fma_f32 v8, -v1, v7, v4
+; GFX8-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX8-NEXT: v_fma_f32 v7, v8, v5, v7
+; GFX8-NEXT: v_fma_f32 v8, -v3, v9, v4
+; GFX8-NEXT: v_fma_f32 v1, -v1, v7, v4
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX8-NEXT: v_fma_f32 v5, v8, v6, v9
+; GFX8-NEXT: v_fma_f32 v3, -v3, v5, v4
+; GFX8-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-NEXT: v_frexp_mant_f32_e32 v1, v1
+; GFX8-NEXT: v_frexp_mant_f32_e32 v3, v3
+; GFX8-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX8-NEXT: v_add_f32_e32 v3, v3, v5
; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
@@ -1237,49 +1363,61 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-IEEE-LABEL: v_neg_rcp_v2f16:
-; GFX9-IEEE: ; %bb.0:
-; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v1, v1
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v4, v1
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3
-; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
-; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v3, v2, -1.0
-; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-FLUSH-LABEL: v_neg_rcp_v2f16:
-; GFX9-FLUSH: ; %bb.0:
-; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
-; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, -1.0, v1, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, -1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, v2, -1.0
-; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_neg_rcp_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX9-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX9-NEXT: v_rcp_f32_e32 v5, v1
+; GFX9-NEXT: v_rcp_f32_e32 v6, v3
+; GFX9-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX9-NEXT: v_fma_f32 v8, -v1, v7, v4
+; GFX9-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX9-NEXT: v_fma_f32 v7, v8, v5, v7
+; GFX9-NEXT: v_fma_f32 v8, -v3, v9, v4
+; GFX9-NEXT: v_fma_f32 v1, -v1, v7, v4
+; GFX9-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX9-NEXT: v_fma_f32 v5, v8, v6, v9
+; GFX9-NEXT: v_fma_f32 v3, -v3, v5, v4
+; GFX9-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-NEXT: v_frexp_mant_f32_e32 v1, v1
+; GFX9-NEXT: v_frexp_mant_f32_e32 v3, v3
+; GFX9-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX9-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX9-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
+; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, -1.0
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_neg_rcp_v2f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-NEXT: v_rcp_f32_e32 v2, v2
; GFX10-NEXT: v_rcp_f32_e32 v3, v3
-; GFX10-NEXT: v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0]
+; GFX10-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX10-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX10-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_fmac_f32_e32 v5, v6, v2
+; GFX10-NEXT: v_fmac_f32_e32 v4, v7, v3
+; GFX10-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_mul_f32_e32 v2, v6, v2
+; GFX10-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX10-NEXT: v_frexp_mant_f32_e32 v2, v2
+; GFX10-NEXT: v_frexp_mant_f32_e32 v3, v3
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
; GFX10-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
@@ -1290,14 +1428,28 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2
+; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_frexp_mant_f32_e32 v3, v3
+; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_add_f32 v3, v3, v4
+; GFX11-NEXT: v_frexp_mant_f32_e32 v2, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v5
; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv <2 x half> <half -1.0, half -1.0>, %x
@@ -1389,14 +1541,26 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2
; GFX8-NEXT: v_cvt_f32_f16_e32 v4, 1.0
-; GFX8-NEXT: v_rcp_f32_e32 v1, v1
-; GFX8-NEXT: v_rcp_f32_e32 v3, v3
-; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1
-; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX8-NEXT: v_rcp_f32_e32 v5, v1
+; GFX8-NEXT: v_rcp_f32_e32 v6, v3
+; GFX8-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX8-NEXT: v_fma_f32 v8, -v1, v7, v4
+; GFX8-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX8-NEXT: v_fma_f32 v7, v8, v5, v7
+; GFX8-NEXT: v_fma_f32 v8, -v3, v9, v4
+; GFX8-NEXT: v_fma_f32 v1, -v1, v7, v4
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX8-NEXT: v_fma_f32 v5, v8, v6, v9
+; GFX8-NEXT: v_fma_f32 v3, -v3, v5, v4
+; GFX8-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-NEXT: v_frexp_mant_f32_e32 v1, v1
+; GFX8-NEXT: v_frexp_mant_f32_e32 v3, v3
+; GFX8-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX8-NEXT: v_add_f32_e32 v3, v3, v5
; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -1405,72 +1569,97 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-IEEE-LABEL: v_rcp_v2f16_fabs:
-; GFX9-IEEE: ; %bb.0:
-; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v1, v1
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v4, v1
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3
-; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
-; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
-; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-FLUSH-LABEL: v_rcp_v2f16_fabs:
-; GFX9-FLUSH: ; %bb.0:
-; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
-; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, 1.0, v1, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, 1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, v2, 1.0
-; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_rcp_v2f16_fabs:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX9-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX9-NEXT: v_rcp_f32_e32 v5, v1
+; GFX9-NEXT: v_rcp_f32_e32 v6, v3
+; GFX9-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX9-NEXT: v_fma_f32 v8, -v1, v7, v4
+; GFX9-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX9-NEXT: v_fma_f32 v7, v8, v5, v7
+; GFX9-NEXT: v_fma_f32 v8, -v3, v9, v4
+; GFX9-NEXT: v_fma_f32 v1, -v1, v7, v4
+; GFX9-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX9-NEXT: v_fma_f32 v5, v8, v6, v9
+; GFX9-NEXT: v_fma_f32 v3, -v3, v5, v4
+; GFX9-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-NEXT: v_frexp_mant_f32_e32 v1, v1
+; GFX9-NEXT: v_frexp_mant_f32_e32 v3, v3
+; GFX9-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX9-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX9-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
+; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_rcp_v2f16_fabs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX10-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v5, 1.0
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX10-NEXT: v_rcp_f32_e32 v2, v2
+; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2
; GFX10-NEXT: v_rcp_f32_e32 v3, v3
-; GFX10-NEXT: v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX10-NEXT: v_rcp_f32_e32 v4, v4
+; GFX10-NEXT: v_mul_f32_e32 v6, v5, v3
+; GFX10-NEXT: v_mul_f32_e32 v5, v5, v4
+; GFX10-NEXT: v_fma_mix_f32 v7, -v1, v6, 1.0 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_fma_mix_f32 v8, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_fmac_f32_e32 v6, v7, v3
+; GFX10-NEXT: v_fmac_f32_e32 v5, v8, v4
+; GFX10-NEXT: v_fma_mix_f32 v7, -v1, v6, 1.0 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_fma_mix_f32 v0, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX10-NEXT: v_mul_f32_e32 v0, v0, v4
+; GFX10-NEXT: v_frexp_mant_f32_e32 v3, v3
+; GFX10-NEXT: v_frexp_mant_f32_e32 v0, v0
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v5
+; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX10-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
-; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-NEXT: v_div_fixup_f16 v0, v0, v2, 1.0
+; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_rcp_v2f16_fabs:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v5, 1.0
+; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX11-NEXT: v_mul_f32_e32 v6, v5, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, 1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v3
+; GFX11-NEXT: v_rcp_f32_e32 v4, v4
+; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, 1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f32_e32 v5, v5, v4
+; GFX11-NEXT: v_fma_mix_f32 v8, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v5, v8, v4
+; GFX11-NEXT: v_fma_mix_f32 v0, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_dual_mul_f32 v3, v7, v3 :: v_dual_mul_f32 v0, v0, v4
+; GFX11-NEXT: v_frexp_mant_f32_e32 v3, v3
+; GFX11-NEXT: v_frexp_mant_f32_e32 v0, v0
+; GFX11-NEXT: v_dual_add_f32 v3, v3, v6 :: v_dual_add_f32 v0, v0, v5
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
-; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT: v_div_fixup_f16 v0, v0, v2, 1.0
+; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%x.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x)
%fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x.fabs
@@ -1562,14 +1751,26 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2
; GFX8-NEXT: v_cvt_f32_f16_e32 v4, -1.0
-; GFX8-NEXT: v_rcp_f32_e32 v1, v1
-; GFX8-NEXT: v_rcp_f32_e32 v3, v3
-; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1
-; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX8-NEXT: v_rcp_f32_e32 v5, v1
+; GFX8-NEXT: v_rcp_f32_e32 v6, v3
+; GFX8-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX8-NEXT: v_fma_f32 v8, -v1, v7, v4
+; GFX8-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX8-NEXT: v_fma_f32 v7, v8, v5, v7
+; GFX8-NEXT: v_fma_f32 v8, -v3, v9, v4
+; GFX8-NEXT: v_fma_f32 v1, -v1, v7, v4
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX8-NEXT: v_fma_f32 v5, v8, v6, v9
+; GFX8-NEXT: v_fma_f32 v3, -v3, v5, v4
+; GFX8-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-NEXT: v_frexp_mant_f32_e32 v1, v1
+; GFX8-NEXT: v_frexp_mant_f32_e32 v3, v3
+; GFX8-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX8-NEXT: v_add_f32_e32 v3, v3, v5
; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
@@ -1578,72 +1779,97 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-IEEE-LABEL: v_neg_rcp_v2f16_fabs:
-; GFX9-IEEE: ; %bb.0:
-; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v1, v1
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v4, v1
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3
-; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
-; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v3, v2, -1.0
-; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-FLUSH-LABEL: v_neg_rcp_v2f16_fabs:
-; GFX9-FLUSH: ; %bb.0:
-; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
-; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, -1.0, v1, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, -1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, v2, -1.0
-; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_neg_rcp_v2f16_fabs:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX9-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX9-NEXT: v_rcp_f32_e32 v5, v1
+; GFX9-NEXT: v_rcp_f32_e32 v6, v3
+; GFX9-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX9-NEXT: v_fma_f32 v8, -v1, v7, v4
+; GFX9-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX9-NEXT: v_fma_f32 v7, v8, v5, v7
+; GFX9-NEXT: v_fma_f32 v8, -v3, v9, v4
+; GFX9-NEXT: v_fma_f32 v1, -v1, v7, v4
+; GFX9-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX9-NEXT: v_fma_f32 v5, v8, v6, v9
+; GFX9-NEXT: v_fma_f32 v3, -v3, v5, v4
+; GFX9-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-NEXT: v_frexp_mant_f32_e32 v1, v1
+; GFX9-NEXT: v_frexp_mant_f32_e32 v3, v3
+; GFX9-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX9-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX9-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
+; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, -1.0
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_neg_rcp_v2f16_fabs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX10-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v5, -1.0
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX10-NEXT: v_rcp_f32_e32 v2, v2
+; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2
; GFX10-NEXT: v_rcp_f32_e32 v3, v3
-; GFX10-NEXT: v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX10-NEXT: v_rcp_f32_e32 v4, v4
+; GFX10-NEXT: v_mul_f32_e32 v6, v5, v3
+; GFX10-NEXT: v_mul_f32_e32 v5, v5, v4
+; GFX10-NEXT: v_fma_mix_f32 v7, -v1, v6, -1.0 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_fma_mix_f32 v8, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_fmac_f32_e32 v6, v7, v3
+; GFX10-NEXT: v_fmac_f32_e32 v5, v8, v4
+; GFX10-NEXT: v_fma_mix_f32 v7, -v1, v6, -1.0 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_fma_mix_f32 v0, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX10-NEXT: v_mul_f32_e32 v0, v0, v4
+; GFX10-NEXT: v_frexp_mant_f32_e32 v3, v3
+; GFX10-NEXT: v_frexp_mant_f32_e32 v0, v0
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v5
+; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX10-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
-; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-NEXT: v_div_fixup_f16 v0, v0, v2, -1.0
+; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_neg_rcp_v2f16_fabs:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v5, -1.0
+; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX11-NEXT: v_mul_f32_e32 v6, v5, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v3
+; GFX11-NEXT: v_rcp_f32_e32 v4, v4
+; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f32_e32 v5, v5, v4
+; GFX11-NEXT: v_fma_mix_f32 v8, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v5, v8, v4
+; GFX11-NEXT: v_fma_mix_f32 v0, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_dual_mul_f32 v3, v7, v3 :: v_dual_mul_f32 v0, v0, v4
+; GFX11-NEXT: v_frexp_mant_f32_e32 v3, v3
+; GFX11-NEXT: v_frexp_mant_f32_e32 v0, v0
+; GFX11-NEXT: v_dual_add_f32 v3, v3, v6 :: v_dual_add_f32 v0, v0, v5
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
-; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT: v_div_fixup_f16 v0, v0, v2, -1.0
+; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%x.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x)
%fdiv = fdiv <2 x half> <half -1.0, half -1.0>, %x.fabs
@@ -1884,14 +2110,26 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX8-LABEL: v_rcp_v2f16_ulp25:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2
; GFX8-NEXT: v_cvt_f32_f16_e32 v4, 1.0
-; GFX8-NEXT: v_rcp_f32_e32 v1, v1
-; GFX8-NEXT: v_rcp_f32_e32 v3, v3
-; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1
-; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX8-NEXT: v_rcp_f32_e32 v5, v1
+; GFX8-NEXT: v_rcp_f32_e32 v6, v3
+; GFX8-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX8-NEXT: v_fma_f32 v8, -v1, v7, v4
+; GFX8-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX8-NEXT: v_fma_f32 v7, v8, v5, v7
+; GFX8-NEXT: v_fma_f32 v8, -v3, v9, v4
+; GFX8-NEXT: v_fma_f32 v1, -v1, v7, v4
+; GFX8-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX8-NEXT: v_fma_f32 v5, v8, v6, v9
+; GFX8-NEXT: v_fma_f32 v3, -v3, v5, v4
+; GFX8-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-NEXT: v_frexp_mant_f32_e32 v1, v1
+; GFX8-NEXT: v_frexp_mant_f32_e32 v3, v3
+; GFX8-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX8-NEXT: v_add_f32_e32 v3, v3, v5
; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -1900,49 +2138,61 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-IEEE-LABEL: v_rcp_v2f16_ulp25:
-; GFX9-IEEE: ; %bb.0:
-; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v1, v1
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v4, v1
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3
-; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
-; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
-; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-FLUSH-LABEL: v_rcp_v2f16_ulp25:
-; GFX9-FLUSH: ; %bb.0:
-; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
-; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, 1.0, v1, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, 1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, v2, 1.0
-; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_rcp_v2f16_ulp25:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX9-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX9-NEXT: v_rcp_f32_e32 v5, v1
+; GFX9-NEXT: v_rcp_f32_e32 v6, v3
+; GFX9-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX9-NEXT: v_fma_f32 v8, -v1, v7, v4
+; GFX9-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX9-NEXT: v_fma_f32 v7, v8, v5, v7
+; GFX9-NEXT: v_fma_f32 v8, -v3, v9, v4
+; GFX9-NEXT: v_fma_f32 v1, -v1, v7, v4
+; GFX9-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX9-NEXT: v_fma_f32 v5, v8, v6, v9
+; GFX9-NEXT: v_fma_f32 v3, -v3, v5, v4
+; GFX9-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-NEXT: v_frexp_mant_f32_e32 v1, v1
+; GFX9-NEXT: v_frexp_mant_f32_e32 v3, v3
+; GFX9-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX9-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX9-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
+; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_rcp_v2f16_ulp25:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-NEXT: v_rcp_f32_e32 v2, v2
; GFX10-NEXT: v_rcp_f32_e32 v3, v3
-; GFX10-NEXT: v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0]
+; GFX10-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX10-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX10-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_fmac_f32_e32 v5, v6, v2
+; GFX10-NEXT: v_fmac_f32_e32 v4, v7, v3
+; GFX10-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_mul_f32_e32 v2, v6, v2
+; GFX10-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX10-NEXT: v_frexp_mant_f32_e32 v2, v2
+; GFX10-NEXT: v_frexp_mant_f32_e32 v3, v3
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
; GFX10-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
@@ -1953,14 +2203,28 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2
+; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_frexp_mant_f32_e32 v3, v3
+; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_add_f32 v3, v3, v4
+; GFX11-NEXT: v_frexp_mant_f32_e32 v2, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v5
; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
+; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x
@@ -2251,45 +2515,37 @@ define amdgpu_ps i16 @s_fdiv_f16(i16 inreg %a.arg, i16 inreg %b.arg) {
; GFX6-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
; GFX6-FLUSH-NEXT: ; return to shader part epilog
;
-; GFX8-LABEL: s_fdiv_f16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: v_cvt_f32_f16_e32 v0, s1
-; GFX8-NEXT: v_cvt_f32_f16_e32 v1, s0
-; GFX8-NEXT: v_rcp_f32_e32 v0, v0
-; GFX8-NEXT: v_mul_f32_e32 v0, v1, v0
-; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX8-NEXT: v_mov_b32_e32 v1, s1
-; GFX8-NEXT: v_div_fixup_f16 v0, v0, v1, s0
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: ; return to shader part epilog
-;
-; GFX9-IEEE-LABEL: s_fdiv_f16:
-; GFX9-IEEE: ; %bb.0:
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s1
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s0
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v0, v0
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v0, v1, v0
-; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-IEEE-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v0, v1, s0
-; GFX9-IEEE-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-IEEE-NEXT: ; return to shader part epilog
-;
-; GFX9-FLUSH-LABEL: s_fdiv_f16:
-; GFX9-FLUSH: ; %bb.0:
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s1
-; GFX9-FLUSH-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v0, v0
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v1, s0
-; GFX9-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-FLUSH-NEXT: ; return to shader part epilog
+; GFX89-LABEL: s_fdiv_f16:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: v_cvt_f32_f16_e32 v0, s1
+; GFX89-NEXT: v_cvt_f32_f16_e32 v1, s0
+; GFX89-NEXT: v_rcp_f32_e32 v2, v0
+; GFX89-NEXT: v_mul_f32_e32 v3, v1, v2
+; GFX89-NEXT: v_fma_f32 v4, -v0, v3, v1
+; GFX89-NEXT: v_fma_f32 v3, v4, v2, v3
+; GFX89-NEXT: v_fma_f32 v0, -v0, v3, v1
+; GFX89-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX89-NEXT: v_frexp_mant_f32_e32 v0, v0
+; GFX89-NEXT: v_add_f32_e32 v0, v0, v3
+; GFX89-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX89-NEXT: v_mov_b32_e32 v1, s1
+; GFX89-NEXT: v_div_fixup_f16 v0, v0, v1, s0
+; GFX89-NEXT: v_readfirstlane_b32 s0, v0
+; GFX89-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_fdiv_f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_cvt_f32_f16_e32 v0, s1
+; GFX10-NEXT: v_cvt_f32_f16_e32 v1, s0
; GFX10-NEXT: v_rcp_f32_e32 v0, v0
-; GFX10-NEXT: v_fma_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
+; GFX10-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX10-NEXT: v_fma_mix_f32 v2, -s1, v1, s0 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_fmac_f32_e32 v1, v2, v0
+; GFX10-NEXT: v_fma_mix_f32 v2, -s1, v1, s0 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_mul_f32_e32 v0, v2, v0
+; GFX10-NEXT: v_frexp_mant_f32_e32 v0, v0
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX10-NEXT: v_div_fixup_f16 v0, v0, s1, s0
; GFX10-NEXT: v_readfirstlane_b32 s0, v0
; GFX10-NEXT: ; return to shader part epilog
@@ -2297,9 +2553,17 @@ define amdgpu_ps i16 @s_fdiv_f16(i16 inreg %a.arg, i16 inreg %b.arg) {
; GFX11-LABEL: s_fdiv_f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s1
+; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s0
; GFX11-NEXT: v_rcp_f32_e32 v0, v0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v0
+; GFX11-NEXT: v_fma_mix_f32 v2, -s1, v1, s0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v1, v2, v0
+; GFX11-NEXT: v_fma_mix_f32 v2, -s1, v1, s0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_mul_f32_e32 v0, v2, v0
+; GFX11-NEXT: v_frexp_mant_f32_e32 v0, v0
+; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-NEXT: v_div_fixup_f16 v0, v0, s1, s0
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
@@ -2502,16 +2766,28 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) {
; GFX8-LABEL: s_fdiv_v2f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_cvt_f32_f16_e32 v0, s1
+; GFX8-NEXT: v_cvt_f32_f16_e32 v1, s0
; GFX8-NEXT: s_lshr_b32 s3, s1, 16
-; GFX8-NEXT: v_cvt_f32_f16_e32 v1, s3
+; GFX8-NEXT: v_cvt_f32_f16_e32 v4, s3
+; GFX8-NEXT: v_rcp_f32_e32 v2, v0
; GFX8-NEXT: s_lshr_b32 s2, s0, 16
-; GFX8-NEXT: v_cvt_f32_f16_e32 v2, s0
-; GFX8-NEXT: v_rcp_f32_e32 v0, v0
; GFX8-NEXT: v_cvt_f32_f16_e32 v3, s2
-; GFX8-NEXT: v_rcp_f32_e32 v1, v1
-; GFX8-NEXT: v_mul_f32_e32 v0, v2, v0
-; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT: v_mul_f32_e32 v5, v1, v2
+; GFX8-NEXT: v_fma_f32 v6, -v0, v5, v1
+; GFX8-NEXT: v_fma_f32 v5, v6, v2, v5
+; GFX8-NEXT: v_fma_f32 v0, -v0, v5, v1
+; GFX8-NEXT: v_rcp_f32_e32 v1, v4
+; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX8-NEXT: v_frexp_mant_f32_e32 v0, v0
+; GFX8-NEXT: v_add_f32_e32 v0, v0, v5
+; GFX8-NEXT: v_mul_f32_e32 v2, v3, v1
+; GFX8-NEXT: v_fma_f32 v5, -v4, v2, v3
+; GFX8-NEXT: v_fma_f32 v2, v5, v1, v2
+; GFX8-NEXT: v_fma_f32 v3, -v4, v2, v3
; GFX8-NEXT: v_mul_f32_e32 v1, v3, v1
+; GFX8-NEXT: v_frexp_mant_f32_e32 v1, v1
+; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT: v_add_f32_e32 v1, v1, v2
; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: v_div_fixup_f16 v0, v0, v2, s0
@@ -2522,45 +2798,39 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) {
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: ; return to shader part epilog
;
-; GFX9-IEEE-LABEL: s_fdiv_v2f16:
-; GFX9-IEEE: ; %bb.0:
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s1
-; GFX9-IEEE-NEXT: s_lshr_b32 s3, s1, 16
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s3
-; GFX9-IEEE-NEXT: s_lshr_b32 s2, s0, 16
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, s0
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v0, v0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, s2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v1, v1
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v0, v2, v0
-; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v3, v1
-; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX9-IEEE-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v0, v2, s0
-; GFX9-IEEE-NEXT: v_mov_b32_e32 v2, s3
-; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v1, v2, s2
-; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX9-IEEE-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-IEEE-NEXT: ; return to shader part epilog
-;
-; GFX9-FLUSH-LABEL: s_fdiv_v2f16:
-; GFX9-FLUSH: ; %bb.0:
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s1
-; GFX9-FLUSH-NEXT: s_lshr_b32 s2, s1, 16
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s2
-; GFX9-FLUSH-NEXT: s_lshr_b32 s3, s0, 16
-; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v0, v0
-; GFX9-FLUSH-NEXT: v_mov_b32_e32 v2, s1
-; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v0, v2, s0
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v1, s0, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_mov_b32_e32 v2, s3
-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v1, s2, v2
-; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX9-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-FLUSH-NEXT: ; return to shader part epilog
+; GFX9-LABEL: s_fdiv_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_cvt_f32_f16_e32 v0, s1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v1, s0
+; GFX9-NEXT: s_lshr_b32 s3, s1, 16
+; GFX9-NEXT: v_cvt_f32_f16_e32 v4, s3
+; GFX9-NEXT: v_rcp_f32_e32 v2, v0
+; GFX9-NEXT: s_lshr_b32 s2, s0, 16
+; GFX9-NEXT: v_cvt_f32_f16_e32 v3, s2
+; GFX9-NEXT: v_mul_f32_e32 v5, v1, v2
+; GFX9-NEXT: v_fma_f32 v6, -v0, v5, v1
+; GFX9-NEXT: v_fma_f32 v5, v6, v2, v5
+; GFX9-NEXT: v_fma_f32 v0, -v0, v5, v1
+; GFX9-NEXT: v_rcp_f32_e32 v1, v4
+; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX9-NEXT: v_frexp_mant_f32_e32 v0, v0
+; GFX9-NEXT: v_add_f32_e32 v0, v0, v5
+; GFX9-NEXT: v_mul_f32_e32 v2, v3, v1
+; GFX9-NEXT: v_fma_f32 v5, -v4, v2, v3
+; GFX9-NEXT: v_fma_f32 v2, v5, v1, v2
+; GFX9-NEXT: v_fma_f32 v3, -v4, v2, v3
+; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1
+; GFX9-NEXT: v_frexp_mant_f32_e32 v1, v1
+; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: v_div_fixup_f16 v0, v0, v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-NEXT: v_div_fixup_f16 v1, v1, v2, s2
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_fdiv_v2f16:
; GFX10: ; %bb.0:
@@ -2568,10 +2838,26 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) {
; GFX10-NEXT: v_cvt_f32_f16_e32 v0, s1
; GFX10-NEXT: v_cvt_f32_f16_e32 v1, s2
; GFX10-NEXT: s_lshr_b32 s3, s0, 16
+; GFX10-NEXT: v_cvt_f32_f16_e32 v2, s0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v3, s3
; GFX10-NEXT: v_rcp_f32_e32 v0, v0
; GFX10-NEXT: v_rcp_f32_e32 v1, v1
-; GFX10-NEXT: v_fma_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_fma_mixlo_f16 v1, s0, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX10-NEXT: v_mul_f32_e32 v2, v2, v0
+; GFX10-NEXT: v_mul_f32_e32 v3, v3, v1
+; GFX10-NEXT: v_fma_mix_f32 v4, -s1, v2, s0 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_fma_mix_f32 v5, -s1, v3, s0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_fmac_f32_e32 v2, v4, v0
+; GFX10-NEXT: v_fmac_f32_e32 v3, v5, v1
+; GFX10-NEXT: v_fma_mix_f32 v4, -s1, v2, s0 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_fma_mix_f32 v5, -s1, v3, s0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_mul_f32_e32 v0, v4, v0
+; GFX10-NEXT: v_mul_f32_e32 v1, v5, v1
+; GFX10-NEXT: v_frexp_mant_f32_e32 v0, v0
+; GFX10-NEXT: v_frexp_mant_f32_e32 v1, v1
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX10-NEXT: v_div_fixup_f16 v0, v0, s1, s0
; GFX10-NEXT: v_div_fixup_f16 v1, v1, s2, s3
; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
@@ -2584,11 +2870,23 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) {
; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s1
; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s2
; GFX11-NEXT: s_lshr_b32 s3, s0, 16
+; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v3, s3
; GFX11-NEXT: v_rcp_f32_e32 v0, v0
; GFX11-NEXT: v_rcp_f32_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v0, s0, v0, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_mixlo_f16 v1, s0, v1, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_dual_mul_f32 v2, v2, v0 :: v_dual_mul_f32 v3, v3, v1
+; GFX11-NEXT: v_fma_mix_f32 v4, -s1, v2, s0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fma_mix_f32 v5, -s1, v3, s0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_dual_fmac_f32 v2, v4, v0 :: v_dual_fmac_f32 v3, v5, v1
+; GFX11-NEXT: v_fma_mix_f32 v4, -s1, v2, s0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fma_mix_f32 v5, -s1, v3, s0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_dual_mul_f32 v0, v4, v0 :: v_dual_mul_f32 v1, v5, v1
+; GFX11-NEXT: v_frexp_mant_f32_e32 v0, v0
+; GFX11-NEXT: v_frexp_mant_f32_e32 v1, v1
+; GFX11-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3
+; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX11-NEXT: v_div_fixup_f16 v0, v0, s1, s0
; GFX11-NEXT: v_div_fixup_f16 v1, v1, s2, s3
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
@@ -2904,67 +3202,89 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX8-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX8-NEXT: v_rcp_f32_e32 v2, v2
-; GFX8-NEXT: v_rcp_f32_e32 v3, v3
-; GFX8-NEXT: v_mul_f32_e32 v2, v4, v2
-; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX8-NEXT: v_rcp_f32_e32 v5, v2
+; GFX8-NEXT: v_rcp_f32_e32 v6, v3
+; GFX8-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX8-NEXT: v_fma_f32 v8, -v2, v7, v4
+; GFX8-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX8-NEXT: v_fma_f32 v7, v8, v5, v7
+; GFX8-NEXT: v_fma_f32 v8, -v3, v9, v4
+; GFX8-NEXT: v_fma_f32 v8, v8, v6, v9
+; GFX8-NEXT: v_fma_f32 v3, -v3, v8, v4
+; GFX8-NEXT: v_fma_f32 v2, -v2, v7, v4
+; GFX8-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX8-NEXT: v_frexp_mant_f32_e32 v3, v3
+; GFX8-NEXT: v_frexp_mant_f32_e32 v2, v2
+; GFX8-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX8-NEXT: v_add_f32_e32 v2, v2, v7
; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX8-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX8-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX8-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: v_readfirstlane_b32 s0, v0
; GFX8-NEXT: ; return to shader part epilog
;
-; GFX9-IEEE-LABEL: s_rsq_v2f16:
-; GFX9-IEEE: ; %bb.0:
-; GFX9-IEEE-NEXT: v_sqrt_f16_e32 v0, s0
-; GFX9-IEEE-NEXT: s_lshr_b32 s0, s0, 16
-; GFX9-IEEE-NEXT: v_sqrt_f16_e32 v1, s0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v2, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v4, v2
-; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3
-; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
-; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
-; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX9-IEEE-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-IEEE-NEXT: ; return to shader part epilog
-;
-; GFX9-FLUSH-LABEL: s_rsq_v2f16:
-; GFX9-FLUSH: ; %bb.0:
-; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v0, s0
-; GFX9-FLUSH-NEXT: s_lshr_b32 s0, s0, 16
-; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v1, s0
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
-; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, -1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
-; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX9-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
-; GFX9-FLUSH-NEXT: ; return to shader part epilog
+; GFX9-LABEL: s_rsq_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_sqrt_f16_e32 v0, s0
+; GFX9-NEXT: s_lshr_b32 s0, s0, 16
+; GFX9-NEXT: v_sqrt_f16_e32 v1, s0
+; GFX9-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX9-NEXT: v_rcp_f32_e32 v5, v2
+; GFX9-NEXT: v_rcp_f32_e32 v6, v3
+; GFX9-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX9-NEXT: v_fma_f32 v8, -v2, v7, v4
+; GFX9-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX9-NEXT: v_fma_f32 v7, v8, v5, v7
+; GFX9-NEXT: v_fma_f32 v8, -v3, v9, v4
+; GFX9-NEXT: v_fma_f32 v8, v8, v6, v9
+; GFX9-NEXT: v_fma_f32 v2, -v2, v7, v4
+; GFX9-NEXT: v_fma_f32 v3, -v3, v8, v4
+; GFX9-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX9-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-NEXT: v_frexp_mant_f32_e32 v2, v2
+; GFX9-NEXT: v_frexp_mant_f32_e32 v3, v3
+; GFX9-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX9-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX9-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX9-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_rsq_v2f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_lshr_b32 s1, s0, 16
; GFX10-NEXT: v_sqrt_f16_e32 v0, s0
; GFX10-NEXT: v_sqrt_f16_e32 v1, s1
+; GFX10-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-NEXT: v_rcp_f32_e32 v2, v2
; GFX10-NEXT: v_rcp_f32_e32 v3, v3
-; GFX10-NEXT: v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0]
+; GFX10-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX10-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX10-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_fmac_f32_e32 v5, v6, v2
+; GFX10-NEXT: v_fmac_f32_e32 v4, v7, v3
+; GFX10-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_mul_f32_e32 v2, v6, v2
+; GFX10-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX10-NEXT: v_frexp_mant_f32_e32 v2, v2
+; GFX10-NEXT: v_frexp_mant_f32_e32 v3, v3
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
; GFX10-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
@@ -2976,16 +3296,30 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX11-NEXT: s_lshr_b32 s1, s0, 16
; GFX11-NEXT: v_sqrt_f16_e32 v0, s0
; GFX11-NEXT: v_sqrt_f16_e32 v1, s1
+; GFX11-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2
+; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_frexp_mant_f32_e32 v3, v3
+; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_add_f32 v3, v3, v4
+; GFX11-NEXT: v_frexp_mant_f32_e32 v2, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v5
; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
@@ -3884,10 +4218,22 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX8-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX8-NEXT: v_rcp_f32_e32 v2, v2
-; GFX8-NEXT: v_rcp_f32_e32 v3, v3
-; GFX8-NEXT: v_mul_f32_e32 v2, v4, v2
-; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX8-NEXT: v_rcp_f32_e32 v5, v2
+; GFX8-NEXT: v_rcp_f32_e32 v6, v3
+; GFX8-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX8-NEXT: v_mul_f32_e32 v8, v4, v6
+; GFX8-NEXT: v_fma_f32 v10, -v3, v8, v4
+; GFX8-NEXT: v_fma_f32 v9, -v2, v7, v4
+; GFX8-NEXT: v_fma_f32 v8, v10, v6, v8
+; GFX8-NEXT: v_fma_f32 v7, v9, v5, v7
+; GFX8-NEXT: v_fma_f32 v3, -v3, v8, v4
+; GFX8-NEXT: v_fma_f32 v2, -v2, v7, v4
+; GFX8-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX8-NEXT: v_frexp_mant_f32_e32 v3, v3
+; GFX8-NEXT: v_frexp_mant_f32_e32 v2, v2
+; GFX8-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX8-NEXT: v_add_f32_e32 v2, v2, v7
; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX8-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0
@@ -3896,52 +4242,63 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-IEEE-LABEL: v_rsq_v2f16:
-; GFX9-IEEE: ; %bb.0:
-; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-IEEE-NEXT: v_sqrt_f16_e32 v1, v0
-; GFX9-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v2, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v4, v2
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3
-; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
-; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0
-; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-FLUSH-LABEL: v_rsq_v2f16:
-; GFX9-FLUSH: ; %bb.0:
-; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0
-; GFX9-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
-; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0
-; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_rsq_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_sqrt_f16_e32 v1, v0
+; GFX9-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX9-NEXT: v_rcp_f32_e32 v5, v2
+; GFX9-NEXT: v_rcp_f32_e32 v6, v3
+; GFX9-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX9-NEXT: v_mul_f32_e32 v8, v4, v6
+; GFX9-NEXT: v_fma_f32 v9, -v2, v7, v4
+; GFX9-NEXT: v_fma_f32 v10, -v3, v8, v4
+; GFX9-NEXT: v_fma_f32 v7, v9, v5, v7
+; GFX9-NEXT: v_fma_f32 v8, v10, v6, v8
+; GFX9-NEXT: v_fma_f32 v2, -v2, v7, v4
+; GFX9-NEXT: v_fma_f32 v3, -v3, v8, v4
+; GFX9-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX9-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-NEXT: v_frexp_mant_f32_e32 v2, v2
+; GFX9-NEXT: v_frexp_mant_f32_e32 v3, v3
+; GFX9-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX9-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX9-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
+; GFX9-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0
+; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_rsq_v2f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_sqrt_f16_e32 v1, v0
; GFX10-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX10-NEXT: v_rcp_f32_e32 v2, v2
; GFX10-NEXT: v_rcp_f32_e32 v3, v3
-; GFX10-NEXT: v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0]
+; GFX10-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX10-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX10-NEXT: v_fma_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_fmac_f32_e32 v5, v6, v2
+; GFX10-NEXT: v_fmac_f32_e32 v4, v7, v3
+; GFX10-NEXT: v_fma_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_mul_f32_e32 v2, v6, v2
+; GFX10-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX10-NEXT: v_frexp_mant_f32_e32 v2, v2
+; GFX10-NEXT: v_frexp_mant_f32_e32 v3, v3
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
; GFX10-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0
; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0
@@ -3952,6 +4309,7 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
@@ -3959,10 +4317,23 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, 1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, 1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2
+; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_frexp_mant_f32_e32 v3, v3
+; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_add_f32 v3, v3, v4
+; GFX11-NEXT: v_frexp_mant_f32_e32 v2, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v5
; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
+; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
@@ -4062,10 +4433,22 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX8-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX8-NEXT: v_rcp_f32_e32 v2, v2
-; GFX8-NEXT: v_rcp_f32_e32 v3, v3
-; GFX8-NEXT: v_mul_f32_e32 v2, v4, v2
-; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX8-NEXT: v_rcp_f32_e32 v5, v2
+; GFX8-NEXT: v_rcp_f32_e32 v6, v3
+; GFX8-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX8-NEXT: v_mul_f32_e32 v8, v4, v6
+; GFX8-NEXT: v_fma_f32 v10, -v3, v8, v4
+; GFX8-NEXT: v_fma_f32 v9, -v2, v7, v4
+; GFX8-NEXT: v_fma_f32 v8, v10, v6, v8
+; GFX8-NEXT: v_fma_f32 v7, v9, v5, v7
+; GFX8-NEXT: v_fma_f32 v3, -v3, v8, v4
+; GFX8-NEXT: v_fma_f32 v2, -v2, v7, v4
+; GFX8-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX8-NEXT: v_frexp_mant_f32_e32 v3, v3
+; GFX8-NEXT: v_frexp_mant_f32_e32 v2, v2
+; GFX8-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX8-NEXT: v_add_f32_e32 v2, v2, v7
; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX8-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0
@@ -4074,52 +4457,63 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-IEEE-LABEL: v_neg_rsq_v2f16:
-; GFX9-IEEE: ; %bb.0:
-; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-IEEE-NEXT: v_sqrt_f16_e32 v1, v0
-; GFX9-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v2, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v4, v2
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3
-; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
-; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0
-; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-FLUSH-LABEL: v_neg_rsq_v2f16:
-; GFX9-FLUSH: ; %bb.0:
-; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0
-; GFX9-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
-; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
-; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0
-; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v1, v0
-; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
+; GFX9-LABEL: v_neg_rsq_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_sqrt_f16_e32 v1, v0
+; GFX9-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX9-NEXT: v_rcp_f32_e32 v5, v2
+; GFX9-NEXT: v_rcp_f32_e32 v6, v3
+; GFX9-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX9-NEXT: v_mul_f32_e32 v8, v4, v6
+; GFX9-NEXT: v_fma_f32 v9, -v2, v7, v4
+; GFX9-NEXT: v_fma_f32 v10, -v3, v8, v4
+; GFX9-NEXT: v_fma_f32 v7, v9, v5, v7
+; GFX9-NEXT: v_fma_f32 v8, v10, v6, v8
+; GFX9-NEXT: v_fma_f32 v2, -v2, v7, v4
+; GFX9-NEXT: v_fma_f32 v3, -v3, v8, v4
+; GFX9-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX9-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-NEXT: v_frexp_mant_f32_e32 v2, v2
+; GFX9-NEXT: v_frexp_mant_f32_e32 v3, v3
+; GFX9-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX9-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX9-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
+; GFX9-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0
+; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_neg_rsq_v2f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_sqrt_f16_e32 v1, v0
; GFX10-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX10-NEXT: v_rcp_f32_e32 v2, v2
; GFX10-NEXT: v_rcp_f32_e32 v3, v3
-; GFX10-NEXT: v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0]
+; GFX10-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX10-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX10-NEXT: v_fma_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_fmac_f32_e32 v5, v6, v2
+; GFX10-NEXT: v_fmac_f32_e32 v4, v7, v3
+; GFX10-NEXT: v_fma_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_mul_f32_e32 v2, v6, v2
+; GFX10-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX10-NEXT: v_frexp_mant_f32_e32 v2, v2
+; GFX10-NEXT: v_frexp_mant_f32_e32 v3, v3
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
; GFX10-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0
; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0
@@ -4130,6 +4524,7 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
@@ -4137,10 +4532,23 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2
+; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_frexp_mant_f32_e32 v3, v3
+; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_add_f32 v3, v3, v4
+; GFX11-NEXT: v_frexp_mant_f32_e32 v2, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v5
; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
@@ -4160,3 +4568,5 @@ declare <2 x half> @llvm.sqrt.v2f16(<2 x half>)
; GFX11-IEEE: {{.*}}
; GFX8-FLUSH: {{.*}}
; GFX8-IEEE: {{.*}}
+; GFX9-FLUSH: {{.*}}
+; GFX9-IEEE: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
index e051cc28469fae..194eb8cd17f43c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
@@ -46,8 +46,14 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
; VI-NEXT: v_cvt_f32_f16_e32 v2, s0
; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_rcp_f32_e32 v2, v2
-; VI-NEXT: v_mul_f32_e32 v0, v0, v2
+; VI-NEXT: v_rcp_f32_e32 v3, v2
+; VI-NEXT: v_mul_f32_e32 v4, v0, v3
+; VI-NEXT: v_fma_f32 v5, -v2, v4, v0
+; VI-NEXT: v_fma_f32 v4, v5, v3, v4
+; VI-NEXT: v_fma_f32 v0, -v2, v4, v0
+; VI-NEXT: v_mul_f32_e32 v0, v0, v3
+; VI-NEXT: v_frexp_mant_f32_e32 v0, v0
+; VI-NEXT: v_add_f32_e32 v0, v0, v4
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s2
; VI-NEXT: v_trunc_f16_e32 v0, v0
@@ -554,19 +560,31 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
; VI-NEXT: v_cvt_f32_f16_e32 v2, s0
; VI-NEXT: s_lshr_b32 s3, s0, 16
-; VI-NEXT: v_cvt_f32_f16_e32 v3, s3
; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_rcp_f32_e32 v2, v2
; VI-NEXT: s_lshr_b32 s1, s2, 16
-; VI-NEXT: v_rcp_f32_e32 v3, v3
-; VI-NEXT: v_mul_f32_e32 v0, v0, v2
+; VI-NEXT: v_rcp_f32_e32 v3, v2
+; VI-NEXT: v_mul_f32_e32 v4, v0, v3
+; VI-NEXT: v_fma_f32 v5, -v2, v4, v0
+; VI-NEXT: v_fma_f32 v4, v5, v3, v4
+; VI-NEXT: v_fma_f32 v0, -v2, v4, v0
+; VI-NEXT: v_mul_f32_e32 v0, v0, v3
+; VI-NEXT: v_frexp_mant_f32_e32 v0, v0
+; VI-NEXT: v_add_f32_e32 v0, v0, v4
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; VI-NEXT: v_cvt_f32_f16_e32 v3, s3
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s2
; VI-NEXT: v_trunc_f16_e32 v0, v0
; VI-NEXT: v_fma_f16 v0, -v0, v1, s2
; VI-NEXT: v_cvt_f32_f16_e32 v1, s1
-; VI-NEXT: v_mul_f32_e32 v1, v1, v3
+; VI-NEXT: v_rcp_f32_e32 v4, v3
+; VI-NEXT: v_mul_f32_e32 v5, v1, v4
+; VI-NEXT: v_fma_f32 v6, -v3, v5, v1
+; VI-NEXT: v_fma_f32 v5, v6, v4, v5
+; VI-NEXT: v_fma_f32 v1, -v3, v5, v1
+; VI-NEXT: v_mul_f32_e32 v1, v1, v4
+; VI-NEXT: v_frexp_mant_f32_e32 v1, v1
+; VI-NEXT: v_add_f32_e32 v1, v1, v5
; VI-NEXT: v_cvt_f16_f32_e32 v1, v1
; VI-NEXT: v_div_fixup_f16 v1, v1, v2, s1
; VI-NEXT: v_trunc_f16_e32 v1, v1
@@ -691,41 +709,65 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
; VI-NEXT: v_cvt_f32_f16_e32 v2, s0
; VI-NEXT: s_lshr_b32 s8, s0, 16
-; VI-NEXT: v_cvt_f32_f16_e32 v3, s8
; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: v_rcp_f32_e32 v2, v2
; VI-NEXT: s_lshr_b32 s6, s2, 16
-; VI-NEXT: v_rcp_f32_e32 v3, v3
-; VI-NEXT: v_cvt_f32_f16_e32 v4, s1
-; VI-NEXT: v_mul_f32_e32 v0, v0, v2
+; VI-NEXT: v_rcp_f32_e32 v3, v2
+; VI-NEXT: s_lshr_b32 s9, s1, 16
+; VI-NEXT: s_lshr_b32 s7, s3, 16
+; VI-NEXT: v_mul_f32_e32 v4, v0, v3
+; VI-NEXT: v_fma_f32 v5, -v2, v4, v0
+; VI-NEXT: v_fma_f32 v4, v5, v3, v4
+; VI-NEXT: v_fma_f32 v0, -v2, v4, v0
+; VI-NEXT: v_mul_f32_e32 v0, v0, v3
+; VI-NEXT: v_frexp_mant_f32_e32 v0, v0
+; VI-NEXT: v_add_f32_e32 v0, v0, v4
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; VI-NEXT: v_cvt_f32_f16_e32 v3, s8
; VI-NEXT: v_mov_b32_e32 v2, s8
-; VI-NEXT: v_rcp_f32_e32 v4, v4
-; VI-NEXT: s_lshr_b32 s9, s1, 16
; VI-NEXT: v_div_fixup_f16 v0, v0, v1, s2
; VI-NEXT: v_trunc_f16_e32 v0, v0
; VI-NEXT: v_fma_f16 v0, -v0, v1, s2
; VI-NEXT: v_cvt_f32_f16_e32 v1, s6
-; VI-NEXT: v_cvt_f32_f16_e32 v5, s9
-; VI-NEXT: s_lshr_b32 s7, s3, 16
-; VI-NEXT: v_mul_f32_e32 v1, v1, v3
+; VI-NEXT: v_rcp_f32_e32 v4, v3
+; VI-NEXT: v_mul_f32_e32 v5, v1, v4
+; VI-NEXT: v_fma_f32 v6, -v3, v5, v1
+; VI-NEXT: v_fma_f32 v5, v6, v4, v5
+; VI-NEXT: v_fma_f32 v1, -v3, v5, v1
+; VI-NEXT: v_mul_f32_e32 v1, v1, v4
+; VI-NEXT: v_frexp_mant_f32_e32 v1, v1
+; VI-NEXT: v_add_f32_e32 v1, v1, v5
; VI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; VI-NEXT: v_cvt_f32_f16_e32 v4, s1
; VI-NEXT: v_mov_b32_e32 v3, s1
-; VI-NEXT: v_rcp_f32_e32 v5, v5
; VI-NEXT: v_div_fixup_f16 v1, v1, v2, s6
; VI-NEXT: v_trunc_f16_e32 v1, v1
; VI-NEXT: v_fma_f16 v1, -v1, v2, s6
; VI-NEXT: v_cvt_f32_f16_e32 v2, s3
+; VI-NEXT: v_rcp_f32_e32 v5, v4
; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: v_mul_f32_e32 v2, v2, v4
+; VI-NEXT: v_mul_f32_e32 v6, v2, v5
+; VI-NEXT: v_fma_f32 v7, -v4, v6, v2
+; VI-NEXT: v_fma_f32 v6, v7, v5, v6
+; VI-NEXT: v_fma_f32 v2, -v4, v6, v2
+; VI-NEXT: v_mul_f32_e32 v2, v2, v5
+; VI-NEXT: v_frexp_mant_f32_e32 v2, v2
+; VI-NEXT: v_add_f32_e32 v2, v2, v6
; VI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; VI-NEXT: v_cvt_f32_f16_e32 v5, s9
; VI-NEXT: v_mov_b32_e32 v4, s9
; VI-NEXT: v_div_fixup_f16 v2, v2, v3, s3
; VI-NEXT: v_trunc_f16_e32 v2, v2
; VI-NEXT: v_fma_f16 v2, -v2, v3, s3
; VI-NEXT: v_cvt_f32_f16_e32 v3, s7
-; VI-NEXT: v_mul_f32_e32 v3, v3, v5
+; VI-NEXT: v_rcp_f32_e32 v6, v5
+; VI-NEXT: v_mul_f32_e32 v7, v3, v6
+; VI-NEXT: v_fma_f32 v8, -v5, v7, v3
+; VI-NEXT: v_fma_f32 v7, v8, v6, v7
+; VI-NEXT: v_fma_f32 v3, -v5, v7, v3
+; VI-NEXT: v_mul_f32_e32 v3, v3, v6
+; VI-NEXT: v_frexp_mant_f32_e32 v3, v3
+; VI-NEXT: v_add_f32_e32 v3, v3, v7
; VI-NEXT: v_cvt_f16_f32_e32 v3, v3
; VI-NEXT: v_div_fixup_f16 v3, v3, v4, s7
; VI-NEXT: v_trunc_f16_e32 v3, v3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir
index e774c2c83dfd8e..b5956c73ae0171 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir
@@ -44,6 +44,7 @@ body: |
; SI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT6]](s32)
; SI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16)
; SI-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ;
; VI-LABEL: name: test_fdiv_s16
; VI: liveins: $vgpr0, $vgpr1
; VI-NEXT: {{ $}}
@@ -53,12 +54,20 @@ body: |
; VI-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
; VI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
; VI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
+ ; VI-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]]
; VI-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
; VI-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
- ; VI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
- ; VI-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC1]](s16), [[TRUNC]](s16)
- ; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16)
+ ; VI-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMUL]], [[FPEXT]]
+ ; VI-NEXT: [[FMA1:%[0-9]+]]:_(s32) = G_FMA [[FMA]], [[INT]], [[FMUL]]
+ ; VI-NEXT: [[FMA2:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMA1]], [[FPEXT]]
+ ; VI-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FMA2]], [[INT]]
+ ; VI-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.frexp.mant), [[FMUL1]](s32)
+ ; VI-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[INT1]], [[FMA1]]
+ ; VI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD]](s32)
+ ; VI-NEXT: [[INT2:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC1]](s16), [[TRUNC]](s16)
+ ; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT2]](s16)
; VI-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ;
; GFX9-LABEL: name: test_fdiv_s16
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
@@ -68,12 +77,20 @@ body: |
; GFX9-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
; GFX9-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
; GFX9-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]]
; GFX9-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
- ; GFX9-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
- ; GFX9-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC1]](s16), [[TRUNC]](s16)
- ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMUL]], [[FPEXT]]
+ ; GFX9-NEXT: [[FMA1:%[0-9]+]]:_(s32) = G_FMA [[FMA]], [[INT]], [[FMUL]]
+ ; GFX9-NEXT: [[FMA2:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMA1]], [[FPEXT]]
+ ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FMA2]], [[INT]]
+ ; GFX9-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.frexp.mant), [[FMUL1]](s32)
+ ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[INT1]], [[FMA1]]
+ ; GFX9-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD]](s32)
+ ; GFX9-NEXT: [[INT2:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC1]](s16), [[TRUNC]](s16)
+ ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT2]](s16)
; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ;
; GFX9-UNSAFE-LABEL: name: test_fdiv_s16
; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1
; GFX9-UNSAFE-NEXT: {{ $}}
@@ -85,6 +102,7 @@ body: |
; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC]], [[INT]]
; GFX9-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FMUL]](s16)
; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ;
; GFX10-LABEL: name: test_fdiv_s16
; GFX10: liveins: $vgpr0, $vgpr1
; GFX10-NEXT: {{ $}}
@@ -94,11 +112,18 @@ body: |
; GFX10-NEXT: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32)
; GFX10-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
; GFX10-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
+ ; GFX10-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]]
; GFX10-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
- ; GFX10-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
- ; GFX10-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC1]](s16), [[TRUNC]](s16)
- ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMUL]], [[FPEXT]]
+ ; GFX10-NEXT: [[FMA1:%[0-9]+]]:_(s32) = G_FMA [[FMA]], [[INT]], [[FMUL]]
+ ; GFX10-NEXT: [[FMA2:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMA1]], [[FPEXT]]
+ ; GFX10-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FMA2]], [[INT]]
+ ; GFX10-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.frexp.mant), [[FMUL1]](s32)
+ ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[INT1]], [[FMA1]]
+ ; GFX10-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD]](s32)
+ ; GFX10-NEXT: [[INT2:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC1]](s16), [[TRUNC]](s16)
+ ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT2]](s16)
; GFX10-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
@@ -141,6 +166,7 @@ body: |
; SI-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
; SI-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32)
; SI-NEXT: $vgpr0 = COPY [[INT6]](s32)
+ ;
; VI-LABEL: name: test_fdiv_s32_denorms_on
; VI: liveins: $vgpr0, $vgpr1
; VI-NEXT: {{ $}}
@@ -160,6 +186,7 @@ body: |
; VI-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
; VI-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32)
; VI-NEXT: $vgpr0 = COPY [[INT6]](s32)
+ ;
; GFX9-LABEL: name: test_fdiv_s32_denorms_on
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
@@ -179,6 +206,7 @@ body: |
; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32)
; GFX9-NEXT: $vgpr0 = COPY [[INT6]](s32)
+ ;
; GFX9-UNSAFE-LABEL: name: test_fdiv_s32_denorms_on
; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1
; GFX9-UNSAFE-NEXT: {{ $}}
@@ -187,6 +215,7 @@ body: |
; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32)
; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[INT]]
; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[FMUL]](s32)
+ ;
; GFX10-LABEL: name: test_fdiv_s32_denorms_on
; GFX10: liveins: $vgpr0, $vgpr1
; GFX10-NEXT: {{ $}}
@@ -246,6 +275,7 @@ body: |
; SI-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
; SI-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32)
; SI-NEXT: $vgpr0 = COPY [[INT6]](s32)
+ ;
; VI-LABEL: name: test_fdiv_s32_denorms_off
; VI: liveins: $vgpr0, $vgpr1
; VI-NEXT: {{ $}}
@@ -267,6 +297,7 @@ body: |
; VI-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
; VI-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32)
; VI-NEXT: $vgpr0 = COPY [[INT6]](s32)
+ ;
; GFX9-LABEL: name: test_fdiv_s32_denorms_off
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
@@ -288,6 +319,7 @@ body: |
; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32)
; GFX9-NEXT: $vgpr0 = COPY [[INT6]](s32)
+ ;
; GFX9-UNSAFE-LABEL: name: test_fdiv_s32_denorms_off
; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1
; GFX9-UNSAFE-NEXT: {{ $}}
@@ -296,6 +328,7 @@ body: |
; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32)
; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[COPY]], [[INT]]
; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[FMUL]](s32)
+ ;
; GFX10-LABEL: name: test_fdiv_s32_denorms_off
; GFX10: liveins: $vgpr0, $vgpr1
; GFX10-NEXT: {{ $}}
@@ -357,6 +390,7 @@ body: |
; SI-NEXT: [[INT5:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
; SI-NEXT: [[INT6:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32)
; SI-NEXT: $vgpr0 = COPY [[INT6]](s32)
+ ;
; VI-LABEL: name: test_fdiv_s32_denorms_off_arcp
; VI: liveins: $vgpr0, $vgpr1
; VI-NEXT: {{ $}}
@@ -378,6 +412,7 @@ body: |
; VI-NEXT: [[INT5:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
; VI-NEXT: [[INT6:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32)
; VI-NEXT: $vgpr0 = COPY [[INT6]](s32)
+ ;
; GFX9-LABEL: name: test_fdiv_s32_denorms_off_arcp
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
@@ -399,6 +434,7 @@ body: |
; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32)
; GFX9-NEXT: $vgpr0 = COPY [[INT6]](s32)
+ ;
; GFX9-UNSAFE-LABEL: name: test_fdiv_s32_denorms_off_arcp
; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1
; GFX9-UNSAFE-NEXT: {{ $}}
@@ -407,6 +443,7 @@ body: |
; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32)
; GFX9-UNSAFE-NEXT: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[COPY]], [[INT]]
; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[FMUL]](s32)
+ ;
; GFX10-LABEL: name: test_fdiv_s32_denorms_off_arcp
; GFX10: liveins: $vgpr0, $vgpr1
; GFX10-NEXT: {{ $}}
@@ -473,6 +510,7 @@ body: |
; SI-NEXT: [[INT5:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s64), [[FMA3]](s64), [[FMUL]](s64), [[XOR]](s1)
; SI-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY1]](s64), [[COPY]](s64)
; SI-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64)
+ ;
; VI-LABEL: name: test_fdiv_s64
; VI: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
; VI-NEXT: {{ $}}
@@ -492,6 +530,7 @@ body: |
; VI-NEXT: [[INT5:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s64), [[FMA3]](s64), [[FMUL]](s64), [[INT4]](s1)
; VI-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY1]](s64), [[COPY]](s64)
; VI-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64)
+ ;
; GFX9-LABEL: name: test_fdiv_s64
; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
; GFX9-NEXT: {{ $}}
@@ -511,6 +550,7 @@ body: |
; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s64), [[FMA3]](s64), [[FMUL]](s64), [[INT4]](s1)
; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY1]](s64), [[COPY]](s64)
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64)
+ ;
; GFX9-UNSAFE-LABEL: name: test_fdiv_s64
; GFX9-UNSAFE: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
; GFX9-UNSAFE-NEXT: {{ $}}
@@ -527,6 +567,7 @@ body: |
; GFX9-UNSAFE-NEXT: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL]], [[COPY]]
; GFX9-UNSAFE-NEXT: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FMA4]], [[FMA3]], [[FMUL]]
; GFX9-UNSAFE-NEXT: $vgpr0_vgpr1 = COPY [[FMA5]](s64)
+ ;
; GFX10-LABEL: name: test_fdiv_s64
; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
; GFX10-NEXT: {{ $}}
@@ -603,6 +644,7 @@ body: |
; SI-NEXT: [[INT13:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT12]](s32), [[UV3]](s32), [[UV1]](s32)
; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32)
; SI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+ ;
; VI-LABEL: name: test_fdiv_v2s32
; VI: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
; VI-NEXT: {{ $}}
@@ -641,6 +683,7 @@ body: |
; VI-NEXT: [[INT13:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT12]](s32), [[UV3]](s32), [[UV1]](s32)
; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32)
; VI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+ ;
; GFX9-LABEL: name: test_fdiv_v2s32
; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
; GFX9-NEXT: {{ $}}
@@ -679,6 +722,7 @@ body: |
; GFX9-NEXT: [[INT13:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT12]](s32), [[UV3]](s32), [[UV1]](s32)
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32)
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+ ;
; GFX9-UNSAFE-LABEL: name: test_fdiv_v2s32
; GFX9-UNSAFE: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
; GFX9-UNSAFE-NEXT: {{ $}}
@@ -692,6 +736,7 @@ body: |
; GFX9-UNSAFE-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[UV1]], [[INT1]]
; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FMUL]](s32), [[FMUL1]](s32)
; GFX9-UNSAFE-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+ ;
; GFX10-LABEL: name: test_fdiv_v2s32
; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
; GFX10-NEXT: {{ $}}
@@ -776,6 +821,7 @@ body: |
; SI-NEXT: [[INT13:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT12]](s32), [[UV3]](s32), [[UV1]](s32)
; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32)
; SI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+ ;
; VI-LABEL: name: test_fdiv_v2s32_flags
; VI: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
; VI-NEXT: {{ $}}
@@ -810,6 +856,7 @@ body: |
; VI-NEXT: [[INT13:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT12]](s32), [[UV3]](s32), [[UV1]](s32)
; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32)
; VI-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+ ;
; GFX9-LABEL: name: test_fdiv_v2s32_flags
; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
; GFX9-NEXT: {{ $}}
@@ -844,6 +891,7 @@ body: |
; GFX9-NEXT: [[INT13:%[0-9]+]]:_(s32) = nnan G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT12]](s32), [[UV3]](s32), [[UV1]](s32)
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32)
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+ ;
; GFX9-UNSAFE-LABEL: name: test_fdiv_v2s32_flags
; GFX9-UNSAFE: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
; GFX9-UNSAFE-NEXT: {{ $}}
@@ -857,6 +905,7 @@ body: |
; GFX9-UNSAFE-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = nnan G_FMUL [[UV1]], [[INT1]]
; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[FMUL]](s32), [[FMUL1]](s32)
; GFX9-UNSAFE-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+ ;
; GFX10-LABEL: name: test_fdiv_v2s32_flags
; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
; GFX10-NEXT: {{ $}}
@@ -949,6 +998,7 @@ body: |
; SI-NEXT: [[INT20:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT19]](s32), [[UV5]](s32), [[UV2]](s32)
; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32), [[INT20]](s32)
; SI-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
+ ;
; VI-LABEL: name: test_fdiv_v3s32
; VI: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5
; VI-NEXT: {{ $}}
@@ -995,6 +1045,7 @@ body: |
; VI-NEXT: [[INT20:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT19]](s32), [[UV5]](s32), [[UV2]](s32)
; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32), [[INT20]](s32)
; VI-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
+ ;
; GFX9-LABEL: name: test_fdiv_v3s32
; GFX9: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5
; GFX9-NEXT: {{ $}}
@@ -1041,6 +1092,7 @@ body: |
; GFX9-NEXT: [[INT20:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT19]](s32), [[UV5]](s32), [[UV2]](s32)
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[INT6]](s32), [[INT13]](s32), [[INT20]](s32)
; GFX9-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
+ ;
; GFX9-UNSAFE-LABEL: name: test_fdiv_v3s32
; GFX9-UNSAFE: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5
; GFX9-UNSAFE-NEXT: {{ $}}
@@ -1056,6 +1108,7 @@ body: |
; GFX9-UNSAFE-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[UV2]], [[INT2]]
; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[FMUL]](s32), [[FMUL1]](s32), [[FMUL2]](s32)
; GFX9-UNSAFE-NEXT: $vgpr0_vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<3 x s32>)
+ ;
; GFX10-LABEL: name: test_fdiv_v3s32
; GFX10: liveins: $vgpr0_vgpr1_vgpr2, $vgpr3_vgpr4_vgpr5
; GFX10-NEXT: {{ $}}
@@ -1162,6 +1215,7 @@ body: |
; SI-NEXT: [[INT13:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT12]](s64), [[UV3]](s64), [[UV1]](s64)
; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[INT6]](s64), [[INT13]](s64)
; SI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
+ ;
; VI-LABEL: name: test_fdiv_v2s64
; VI: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7
; VI-NEXT: {{ $}}
@@ -1196,6 +1250,7 @@ body: |
; VI-NEXT: [[INT13:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT12]](s64), [[UV3]](s64), [[UV1]](s64)
; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[INT6]](s64), [[INT13]](s64)
; VI-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
+ ;
; GFX9-LABEL: name: test_fdiv_v2s64
; GFX9: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7
; GFX9-NEXT: {{ $}}
@@ -1230,6 +1285,7 @@ body: |
; GFX9-NEXT: [[INT13:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT12]](s64), [[UV3]](s64), [[UV1]](s64)
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[INT6]](s64), [[INT13]](s64)
; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
+ ;
; GFX9-UNSAFE-LABEL: name: test_fdiv_v2s64
; GFX9-UNSAFE: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7
; GFX9-UNSAFE-NEXT: {{ $}}
@@ -1258,6 +1314,7 @@ body: |
; GFX9-UNSAFE-NEXT: [[FMA11:%[0-9]+]]:_(s64) = G_FMA [[FMA10]], [[FMA9]], [[FMUL1]]
; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[FMA5]](s64), [[FMA11]](s64)
; GFX9-UNSAFE-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
+ ;
; GFX10-LABEL: name: test_fdiv_v2s64
; GFX10: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4_vgpr5_vgpr6_vgpr7
; GFX10-NEXT: {{ $}}
@@ -1355,6 +1412,7 @@ body: |
; SI-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]]
; SI-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
; SI-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
+ ;
; VI-LABEL: name: test_fdiv_v2s16
; VI: liveins: $vgpr0, $vgpr1
; VI-NEXT: {{ $}}
@@ -1371,22 +1429,37 @@ body: |
; VI-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
; VI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
; VI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16)
+ ; VI-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]]
; VI-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
; VI-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
- ; VI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
- ; VI-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC2]](s16), [[TRUNC]](s16)
+ ; VI-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMUL]], [[FPEXT]]
+ ; VI-NEXT: [[FMA1:%[0-9]+]]:_(s32) = G_FMA [[FMA]], [[INT]], [[FMUL]]
+ ; VI-NEXT: [[FMA2:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMA1]], [[FPEXT]]
+ ; VI-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FMA2]], [[INT]]
+ ; VI-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.frexp.mant), [[FMUL1]](s32)
+ ; VI-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[INT1]], [[FMA1]]
+ ; VI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD]](s32)
+ ; VI-NEXT: [[INT2:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC2]](s16), [[TRUNC]](s16)
; VI-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
; VI-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16)
- ; VI-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32)
- ; VI-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]]
- ; VI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32)
- ; VI-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC3]](s16), [[TRUNC1]](s16)
- ; VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[INT1]](s16)
- ; VI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[INT3]](s16)
+ ; VI-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT3]]
+ ; VI-NEXT: [[INT3:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32)
+ ; VI-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT3]]
+ ; VI-NEXT: [[FMA3:%[0-9]+]]:_(s32) = G_FMA [[FNEG1]], [[FMUL2]], [[FPEXT2]]
+ ; VI-NEXT: [[FMA4:%[0-9]+]]:_(s32) = G_FMA [[FMA3]], [[INT3]], [[FMUL2]]
+ ; VI-NEXT: [[FMA5:%[0-9]+]]:_(s32) = G_FMA [[FNEG1]], [[FMA4]], [[FPEXT2]]
+ ; VI-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FMA5]], [[INT3]]
+ ; VI-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.frexp.mant), [[FMUL3]](s32)
+ ; VI-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[INT4]], [[FMA4]]
+ ; VI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD1]](s32)
+ ; VI-NEXT: [[INT5:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC3]](s16), [[TRUNC1]](s16)
+ ; VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[INT2]](s16)
+ ; VI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[INT5]](s16)
; VI-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32)
; VI-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]]
; VI-NEXT: [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
; VI-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
+ ;
; GFX9-LABEL: name: test_fdiv_v2s16
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
@@ -1403,18 +1476,33 @@ body: |
; GFX9-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
; GFX9-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
; GFX9-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16)
+ ; GFX9-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]]
; GFX9-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
- ; GFX9-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
- ; GFX9-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC2]](s16), [[TRUNC]](s16)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMUL]], [[FPEXT]]
+ ; GFX9-NEXT: [[FMA1:%[0-9]+]]:_(s32) = G_FMA [[FMA]], [[INT]], [[FMUL]]
+ ; GFX9-NEXT: [[FMA2:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMA1]], [[FPEXT]]
+ ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FMA2]], [[INT]]
+ ; GFX9-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.frexp.mant), [[FMUL1]](s32)
+ ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[INT1]], [[FMA1]]
+ ; GFX9-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD]](s32)
+ ; GFX9-NEXT: [[INT2:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC2]](s16), [[TRUNC]](s16)
; GFX9-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
; GFX9-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16)
- ; GFX9-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32)
- ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]]
- ; GFX9-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32)
- ; GFX9-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC3]](s16), [[TRUNC1]](s16)
- ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT1]](s16), [[INT3]](s16)
+ ; GFX9-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT3]]
+ ; GFX9-NEXT: [[INT3:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32)
+ ; GFX9-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT3]]
+ ; GFX9-NEXT: [[FMA3:%[0-9]+]]:_(s32) = G_FMA [[FNEG1]], [[FMUL2]], [[FPEXT2]]
+ ; GFX9-NEXT: [[FMA4:%[0-9]+]]:_(s32) = G_FMA [[FMA3]], [[INT3]], [[FMUL2]]
+ ; GFX9-NEXT: [[FMA5:%[0-9]+]]:_(s32) = G_FMA [[FNEG1]], [[FMA4]], [[FPEXT2]]
+ ; GFX9-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FMA5]], [[INT3]]
+ ; GFX9-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.frexp.mant), [[FMUL3]](s32)
+ ; GFX9-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[INT4]], [[FMA4]]
+ ; GFX9-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD1]](s32)
+ ; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC3]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT2]](s16), [[INT5]](s16)
; GFX9-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
+ ;
; GFX9-UNSAFE-LABEL: name: test_fdiv_v2s16
; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1
; GFX9-UNSAFE-NEXT: {{ $}}
@@ -1435,6 +1523,7 @@ body: |
; GFX9-UNSAFE-NEXT: [[FMUL1:%[0-9]+]]:_(s16) = G_FMUL [[TRUNC1]], [[INT1]]
; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FMUL]](s16), [[FMUL1]](s16)
; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
+ ;
; GFX10-LABEL: name: test_fdiv_v2s16
; GFX10: liveins: $vgpr0, $vgpr1
; GFX10-NEXT: {{ $}}
@@ -1451,17 +1540,31 @@ body: |
; GFX10-NEXT: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR1]](s32)
; GFX10-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
; GFX10-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16)
+ ; GFX10-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]]
; GFX10-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
- ; GFX10-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
- ; GFX10-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC2]](s16), [[TRUNC]](s16)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMUL]], [[FPEXT]]
+ ; GFX10-NEXT: [[FMA1:%[0-9]+]]:_(s32) = G_FMA [[FMA]], [[INT]], [[FMUL]]
+ ; GFX10-NEXT: [[FMA2:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMA1]], [[FPEXT]]
+ ; GFX10-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FMA2]], [[INT]]
+ ; GFX10-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.frexp.mant), [[FMUL1]](s32)
+ ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[INT1]], [[FMA1]]
+ ; GFX10-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD]](s32)
+ ; GFX10-NEXT: [[INT2:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC2]](s16), [[TRUNC]](s16)
; GFX10-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
; GFX10-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16)
- ; GFX10-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32)
- ; GFX10-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]]
- ; GFX10-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32)
- ; GFX10-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC3]](s16), [[TRUNC1]](s16)
- ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT1]](s16), [[INT3]](s16)
+ ; GFX10-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT3]]
+ ; GFX10-NEXT: [[INT3:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32)
+ ; GFX10-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT3]]
+ ; GFX10-NEXT: [[FMA3:%[0-9]+]]:_(s32) = G_FMA [[FNEG1]], [[FMUL2]], [[FPEXT2]]
+ ; GFX10-NEXT: [[FMA4:%[0-9]+]]:_(s32) = G_FMA [[FMA3]], [[INT3]], [[FMUL2]]
+ ; GFX10-NEXT: [[FMA5:%[0-9]+]]:_(s32) = G_FMA [[FNEG1]], [[FMA4]], [[FPEXT2]]
+ ; GFX10-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FMA5]], [[INT3]]
+ ; GFX10-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.frexp.mant), [[FMUL3]](s32)
+ ; GFX10-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[INT4]], [[FMA4]]
+ ; GFX10-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD1]](s32)
+ ; GFX10-NEXT: [[INT5:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC3]](s16), [[TRUNC1]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT2]](s16), [[INT5]](s16)
; GFX10-NEXT: $vgpr0 = COPY [[BUILD_VECTOR]](<2 x s16>)
%0:_(<2 x s16>) = COPY $vgpr0
%1:_(<2 x s16>) = COPY $vgpr1
@@ -1546,6 +1649,7 @@ body: |
; SI-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC2]](s16)
; SI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32)
; SI-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>)
+ ;
; VI-LABEL: name: test_fdiv_v3s16
; VI: liveins: $vgpr0, $vgpr1
; VI-NEXT: {{ $}}
@@ -1568,27 +1672,49 @@ body: |
; VI-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32)
; VI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
; VI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16)
+ ; VI-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]]
; VI-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
; VI-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
- ; VI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
- ; VI-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC3]](s16), [[TRUNC]](s16)
+ ; VI-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMUL]], [[FPEXT]]
+ ; VI-NEXT: [[FMA1:%[0-9]+]]:_(s32) = G_FMA [[FMA]], [[INT]], [[FMUL]]
+ ; VI-NEXT: [[FMA2:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMA1]], [[FPEXT]]
+ ; VI-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FMA2]], [[INT]]
+ ; VI-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.frexp.mant), [[FMUL1]](s32)
+ ; VI-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[INT1]], [[FMA1]]
+ ; VI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD]](s32)
+ ; VI-NEXT: [[INT2:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC3]](s16), [[TRUNC]](s16)
; VI-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
; VI-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC4]](s16)
- ; VI-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32)
- ; VI-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]]
- ; VI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32)
- ; VI-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC4]](s16), [[TRUNC1]](s16)
+ ; VI-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT3]]
+ ; VI-NEXT: [[INT3:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32)
+ ; VI-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT3]]
+ ; VI-NEXT: [[FMA3:%[0-9]+]]:_(s32) = G_FMA [[FNEG1]], [[FMUL2]], [[FPEXT2]]
+ ; VI-NEXT: [[FMA4:%[0-9]+]]:_(s32) = G_FMA [[FMA3]], [[INT3]], [[FMUL2]]
+ ; VI-NEXT: [[FMA5:%[0-9]+]]:_(s32) = G_FMA [[FNEG1]], [[FMA4]], [[FPEXT2]]
+ ; VI-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FMA5]], [[INT3]]
+ ; VI-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.frexp.mant), [[FMUL3]](s32)
+ ; VI-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[INT4]], [[FMA4]]
+ ; VI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD1]](s32)
+ ; VI-NEXT: [[INT5:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC4]](s16), [[TRUNC1]](s16)
; VI-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16)
; VI-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC5]](s16)
- ; VI-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT5]](s32)
- ; VI-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT4]]
- ; VI-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL2]](s32)
- ; VI-NEXT: [[INT5:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC2]](s16), [[TRUNC5]](s16), [[TRUNC2]](s16)
- ; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16)
- ; VI-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[INT3]](s16)
- ; VI-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[INT5]](s16)
+ ; VI-NEXT: [[FNEG2:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT5]]
+ ; VI-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT5]](s32)
+ ; VI-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT6]]
+ ; VI-NEXT: [[FMA6:%[0-9]+]]:_(s32) = G_FMA [[FNEG2]], [[FMUL4]], [[FPEXT4]]
+ ; VI-NEXT: [[FMA7:%[0-9]+]]:_(s32) = G_FMA [[FMA6]], [[INT6]], [[FMUL4]]
+ ; VI-NEXT: [[FMA8:%[0-9]+]]:_(s32) = G_FMA [[FNEG2]], [[FMA7]], [[FPEXT4]]
+ ; VI-NEXT: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[FMA8]], [[INT6]]
+ ; VI-NEXT: [[INT7:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.frexp.mant), [[FMUL5]](s32)
+ ; VI-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[INT7]], [[FMA7]]
+ ; VI-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD2]](s32)
+ ; VI-NEXT: [[INT8:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC2]](s16), [[TRUNC5]](s16), [[TRUNC2]](s16)
+ ; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT2]](s16)
+ ; VI-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[INT5]](s16)
+ ; VI-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[INT8]](s16)
; VI-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32)
; VI-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>)
+ ;
; GFX9-LABEL: name: test_fdiv_v3s16
; GFX9: liveins: $vgpr0, $vgpr1
; GFX9-NEXT: {{ $}}
@@ -1611,27 +1737,49 @@ body: |
; GFX9-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32)
; GFX9-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
; GFX9-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16)
+ ; GFX9-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]]
; GFX9-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
- ; GFX9-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
- ; GFX9-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC3]](s16), [[TRUNC]](s16)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMUL]], [[FPEXT]]
+ ; GFX9-NEXT: [[FMA1:%[0-9]+]]:_(s32) = G_FMA [[FMA]], [[INT]], [[FMUL]]
+ ; GFX9-NEXT: [[FMA2:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMA1]], [[FPEXT]]
+ ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FMA2]], [[INT]]
+ ; GFX9-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.frexp.mant), [[FMUL1]](s32)
+ ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[INT1]], [[FMA1]]
+ ; GFX9-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD]](s32)
+ ; GFX9-NEXT: [[INT2:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC3]](s16), [[TRUNC]](s16)
; GFX9-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
; GFX9-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC4]](s16)
- ; GFX9-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32)
- ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]]
- ; GFX9-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32)
- ; GFX9-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC4]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT3]]
+ ; GFX9-NEXT: [[INT3:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32)
+ ; GFX9-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT3]]
+ ; GFX9-NEXT: [[FMA3:%[0-9]+]]:_(s32) = G_FMA [[FNEG1]], [[FMUL2]], [[FPEXT2]]
+ ; GFX9-NEXT: [[FMA4:%[0-9]+]]:_(s32) = G_FMA [[FMA3]], [[INT3]], [[FMUL2]]
+ ; GFX9-NEXT: [[FMA5:%[0-9]+]]:_(s32) = G_FMA [[FNEG1]], [[FMA4]], [[FPEXT2]]
+ ; GFX9-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FMA5]], [[INT3]]
+ ; GFX9-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.frexp.mant), [[FMUL3]](s32)
+ ; GFX9-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[INT4]], [[FMA4]]
+ ; GFX9-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD1]](s32)
+ ; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC4]](s16), [[TRUNC1]](s16)
; GFX9-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16)
; GFX9-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC5]](s16)
- ; GFX9-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT5]](s32)
- ; GFX9-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT4]]
- ; GFX9-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL2]](s32)
- ; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC2]](s16), [[TRUNC5]](s16), [[TRUNC2]](s16)
- ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16)
- ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[INT3]](s16)
- ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[INT5]](s16)
+ ; GFX9-NEXT: [[FNEG2:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT5]]
+ ; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT5]](s32)
+ ; GFX9-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT6]]
+ ; GFX9-NEXT: [[FMA6:%[0-9]+]]:_(s32) = G_FMA [[FNEG2]], [[FMUL4]], [[FPEXT4]]
+ ; GFX9-NEXT: [[FMA7:%[0-9]+]]:_(s32) = G_FMA [[FMA6]], [[INT6]], [[FMUL4]]
+ ; GFX9-NEXT: [[FMA8:%[0-9]+]]:_(s32) = G_FMA [[FNEG2]], [[FMA7]], [[FPEXT4]]
+ ; GFX9-NEXT: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[FMA8]], [[INT6]]
+ ; GFX9-NEXT: [[INT7:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.frexp.mant), [[FMUL5]](s32)
+ ; GFX9-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[INT7]], [[FMA7]]
+ ; GFX9-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD2]](s32)
+ ; GFX9-NEXT: [[INT8:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC2]](s16), [[TRUNC5]](s16), [[TRUNC2]](s16)
+ ; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT2]](s16)
+ ; GFX9-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[INT5]](s16)
+ ; GFX9-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[INT8]](s16)
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32)
; GFX9-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>)
+ ;
; GFX9-UNSAFE-LABEL: name: test_fdiv_v3s16
; GFX9-UNSAFE: liveins: $vgpr0, $vgpr1
; GFX9-UNSAFE-NEXT: {{ $}}
@@ -1663,6 +1811,7 @@ body: |
; GFX9-UNSAFE-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[FMUL2]](s16)
; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32)
; GFX9-UNSAFE-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>)
+ ;
; GFX10-LABEL: name: test_fdiv_v3s16
; GFX10: liveins: $vgpr0, $vgpr1
; GFX10-NEXT: {{ $}}
@@ -1685,25 +1834,46 @@ body: |
; GFX10-NEXT: [[TRUNC5:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST3]](s32)
; GFX10-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
; GFX10-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16)
+ ; GFX10-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]]
; GFX10-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
- ; GFX10-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
- ; GFX10-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC3]](s16), [[TRUNC]](s16)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMUL]], [[FPEXT]]
+ ; GFX10-NEXT: [[FMA1:%[0-9]+]]:_(s32) = G_FMA [[FMA]], [[INT]], [[FMUL]]
+ ; GFX10-NEXT: [[FMA2:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMA1]], [[FPEXT]]
+ ; GFX10-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FMA2]], [[INT]]
+ ; GFX10-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.frexp.mant), [[FMUL1]](s32)
+ ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[INT1]], [[FMA1]]
+ ; GFX10-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD]](s32)
+ ; GFX10-NEXT: [[INT2:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC3]](s16), [[TRUNC]](s16)
; GFX10-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
; GFX10-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC4]](s16)
- ; GFX10-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32)
- ; GFX10-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]]
- ; GFX10-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32)
- ; GFX10-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC4]](s16), [[TRUNC1]](s16)
+ ; GFX10-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT3]]
+ ; GFX10-NEXT: [[INT3:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32)
+ ; GFX10-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT3]]
+ ; GFX10-NEXT: [[FMA3:%[0-9]+]]:_(s32) = G_FMA [[FNEG1]], [[FMUL2]], [[FPEXT2]]
+ ; GFX10-NEXT: [[FMA4:%[0-9]+]]:_(s32) = G_FMA [[FMA3]], [[INT3]], [[FMUL2]]
+ ; GFX10-NEXT: [[FMA5:%[0-9]+]]:_(s32) = G_FMA [[FNEG1]], [[FMA4]], [[FPEXT2]]
+ ; GFX10-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FMA5]], [[INT3]]
+ ; GFX10-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.frexp.mant), [[FMUL3]](s32)
+ ; GFX10-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[INT4]], [[FMA4]]
+ ; GFX10-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD1]](s32)
+ ; GFX10-NEXT: [[INT5:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC4]](s16), [[TRUNC1]](s16)
; GFX10-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16)
; GFX10-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC5]](s16)
- ; GFX10-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT5]](s32)
- ; GFX10-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT4]]
- ; GFX10-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL2]](s32)
- ; GFX10-NEXT: [[INT5:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC2]](s16), [[TRUNC5]](s16), [[TRUNC2]](s16)
- ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16)
- ; GFX10-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[INT3]](s16)
- ; GFX10-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[INT5]](s16)
+ ; GFX10-NEXT: [[FNEG2:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT5]]
+ ; GFX10-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT5]](s32)
+ ; GFX10-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT6]]
+ ; GFX10-NEXT: [[FMA6:%[0-9]+]]:_(s32) = G_FMA [[FNEG2]], [[FMUL4]], [[FPEXT4]]
+ ; GFX10-NEXT: [[FMA7:%[0-9]+]]:_(s32) = G_FMA [[FMA6]], [[INT6]], [[FMUL4]]
+ ; GFX10-NEXT: [[FMA8:%[0-9]+]]:_(s32) = G_FMA [[FNEG2]], [[FMA7]], [[FPEXT4]]
+ ; GFX10-NEXT: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[FMA8]], [[INT6]]
+ ; GFX10-NEXT: [[INT7:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.frexp.mant), [[FMUL5]](s32)
+ ; GFX10-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[INT7]], [[FMA7]]
+ ; GFX10-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD2]](s32)
+ ; GFX10-NEXT: [[INT8:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC2]](s16), [[TRUNC5]](s16), [[TRUNC2]](s16)
+ ; GFX10-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT2]](s16)
+ ; GFX10-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[INT5]](s16)
+ ; GFX10-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[INT8]](s16)
; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s32>) = G_BUILD_VECTOR [[ANYEXT]](s32), [[ANYEXT1]](s32), [[ANYEXT2]](s32)
; GFX10-NEXT: S_NOP 0, implicit [[BUILD_VECTOR]](<3 x s32>)
%0:_(<3 x s16>) = G_IMPLICIT_DEF
@@ -1816,6 +1986,7 @@ body: |
; SI-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
; SI-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>)
; SI-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+ ;
; VI-LABEL: name: test_fdiv_v4s16
; VI: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
; VI-NEXT: {{ $}}
@@ -1842,40 +2013,69 @@ body: |
; VI-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32)
; VI-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
; VI-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC4]](s16)
+ ; VI-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]]
; VI-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
; VI-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
- ; VI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
- ; VI-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC4]](s16), [[TRUNC]](s16)
+ ; VI-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMUL]], [[FPEXT]]
+ ; VI-NEXT: [[FMA1:%[0-9]+]]:_(s32) = G_FMA [[FMA]], [[INT]], [[FMUL]]
+ ; VI-NEXT: [[FMA2:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMA1]], [[FPEXT]]
+ ; VI-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FMA2]], [[INT]]
+ ; VI-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.frexp.mant), [[FMUL1]](s32)
+ ; VI-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[INT1]], [[FMA1]]
+ ; VI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD]](s32)
+ ; VI-NEXT: [[INT2:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC4]](s16), [[TRUNC]](s16)
; VI-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
; VI-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC5]](s16)
- ; VI-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32)
- ; VI-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]]
- ; VI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32)
- ; VI-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC5]](s16), [[TRUNC1]](s16)
+ ; VI-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT3]]
+ ; VI-NEXT: [[INT3:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32)
+ ; VI-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT3]]
+ ; VI-NEXT: [[FMA3:%[0-9]+]]:_(s32) = G_FMA [[FNEG1]], [[FMUL2]], [[FPEXT2]]
+ ; VI-NEXT: [[FMA4:%[0-9]+]]:_(s32) = G_FMA [[FMA3]], [[INT3]], [[FMUL2]]
+ ; VI-NEXT: [[FMA5:%[0-9]+]]:_(s32) = G_FMA [[FNEG1]], [[FMA4]], [[FPEXT2]]
+ ; VI-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FMA5]], [[INT3]]
+ ; VI-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.frexp.mant), [[FMUL3]](s32)
+ ; VI-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[INT4]], [[FMA4]]
+ ; VI-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD1]](s32)
+ ; VI-NEXT: [[INT5:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC5]](s16), [[TRUNC1]](s16)
; VI-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16)
; VI-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC6]](s16)
- ; VI-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT5]](s32)
- ; VI-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT4]]
- ; VI-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL2]](s32)
- ; VI-NEXT: [[INT5:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC2]](s16), [[TRUNC6]](s16), [[TRUNC2]](s16)
+ ; VI-NEXT: [[FNEG2:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT5]]
+ ; VI-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT5]](s32)
+ ; VI-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT6]]
+ ; VI-NEXT: [[FMA6:%[0-9]+]]:_(s32) = G_FMA [[FNEG2]], [[FMUL4]], [[FPEXT4]]
+ ; VI-NEXT: [[FMA7:%[0-9]+]]:_(s32) = G_FMA [[FMA6]], [[INT6]], [[FMUL4]]
+ ; VI-NEXT: [[FMA8:%[0-9]+]]:_(s32) = G_FMA [[FNEG2]], [[FMA7]], [[FPEXT4]]
+ ; VI-NEXT: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[FMA8]], [[INT6]]
+ ; VI-NEXT: [[INT7:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.frexp.mant), [[FMUL5]](s32)
+ ; VI-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[INT7]], [[FMA7]]
+ ; VI-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD2]](s32)
+ ; VI-NEXT: [[INT8:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC2]](s16), [[TRUNC6]](s16), [[TRUNC2]](s16)
; VI-NEXT: [[FPEXT6:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16)
; VI-NEXT: [[FPEXT7:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC7]](s16)
- ; VI-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT7]](s32)
- ; VI-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT6]], [[INT6]]
- ; VI-NEXT: [[FPTRUNC3:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL3]](s32)
- ; VI-NEXT: [[INT7:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC3]](s16), [[TRUNC7]](s16), [[TRUNC3]](s16)
- ; VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[INT1]](s16)
- ; VI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[INT3]](s16)
+ ; VI-NEXT: [[FNEG3:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT7]]
+ ; VI-NEXT: [[INT9:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT7]](s32)
+ ; VI-NEXT: [[FMUL6:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT6]], [[INT9]]
+ ; VI-NEXT: [[FMA9:%[0-9]+]]:_(s32) = G_FMA [[FNEG3]], [[FMUL6]], [[FPEXT6]]
+ ; VI-NEXT: [[FMA10:%[0-9]+]]:_(s32) = G_FMA [[FMA9]], [[INT9]], [[FMUL6]]
+ ; VI-NEXT: [[FMA11:%[0-9]+]]:_(s32) = G_FMA [[FNEG3]], [[FMA10]], [[FPEXT6]]
+ ; VI-NEXT: [[FMUL7:%[0-9]+]]:_(s32) = G_FMUL [[FMA11]], [[INT9]]
+ ; VI-NEXT: [[INT10:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.frexp.mant), [[FMUL7]](s32)
+ ; VI-NEXT: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[INT10]], [[FMA10]]
+ ; VI-NEXT: [[FPTRUNC3:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD3]](s32)
+ ; VI-NEXT: [[INT11:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC3]](s16), [[TRUNC7]](s16), [[TRUNC3]](s16)
+ ; VI-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[INT2]](s16)
+ ; VI-NEXT: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[INT5]](s16)
; VI-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C]](s32)
; VI-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL]]
; VI-NEXT: [[BITCAST4:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
- ; VI-NEXT: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[INT5]](s16)
- ; VI-NEXT: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[INT7]](s16)
+ ; VI-NEXT: [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[INT8]](s16)
+ ; VI-NEXT: [[ZEXT3:%[0-9]+]]:_(s32) = G_ZEXT [[INT11]](s16)
; VI-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[ZEXT3]], [[C]](s32)
; VI-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[SHL1]]
; VI-NEXT: [[BITCAST5:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR1]](s32)
; VI-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BITCAST4]](<2 x s16>), [[BITCAST5]](<2 x s16>)
; VI-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+ ;
; GFX9-LABEL: name: test_fdiv_v4s16
; GFX9: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
; GFX9-NEXT: {{ $}}
@@ -1902,32 +2102,61 @@ body: |
; GFX9-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32)
; GFX9-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
; GFX9-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC4]](s16)
+ ; GFX9-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]]
; GFX9-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
; GFX9-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
- ; GFX9-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
- ; GFX9-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC4]](s16), [[TRUNC]](s16)
+ ; GFX9-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMUL]], [[FPEXT]]
+ ; GFX9-NEXT: [[FMA1:%[0-9]+]]:_(s32) = G_FMA [[FMA]], [[INT]], [[FMUL]]
+ ; GFX9-NEXT: [[FMA2:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMA1]], [[FPEXT]]
+ ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FMA2]], [[INT]]
+ ; GFX9-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.frexp.mant), [[FMUL1]](s32)
+ ; GFX9-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[INT1]], [[FMA1]]
+ ; GFX9-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD]](s32)
+ ; GFX9-NEXT: [[INT2:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC4]](s16), [[TRUNC]](s16)
; GFX9-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
; GFX9-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC5]](s16)
- ; GFX9-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32)
- ; GFX9-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]]
- ; GFX9-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32)
- ; GFX9-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC5]](s16), [[TRUNC1]](s16)
+ ; GFX9-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT3]]
+ ; GFX9-NEXT: [[INT3:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32)
+ ; GFX9-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT3]]
+ ; GFX9-NEXT: [[FMA3:%[0-9]+]]:_(s32) = G_FMA [[FNEG1]], [[FMUL2]], [[FPEXT2]]
+ ; GFX9-NEXT: [[FMA4:%[0-9]+]]:_(s32) = G_FMA [[FMA3]], [[INT3]], [[FMUL2]]
+ ; GFX9-NEXT: [[FMA5:%[0-9]+]]:_(s32) = G_FMA [[FNEG1]], [[FMA4]], [[FPEXT2]]
+ ; GFX9-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FMA5]], [[INT3]]
+ ; GFX9-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.frexp.mant), [[FMUL3]](s32)
+ ; GFX9-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[INT4]], [[FMA4]]
+ ; GFX9-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD1]](s32)
+ ; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC5]](s16), [[TRUNC1]](s16)
; GFX9-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16)
; GFX9-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC6]](s16)
- ; GFX9-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT5]](s32)
- ; GFX9-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT4]]
- ; GFX9-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL2]](s32)
- ; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC2]](s16), [[TRUNC6]](s16), [[TRUNC2]](s16)
+ ; GFX9-NEXT: [[FNEG2:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT5]]
+ ; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT5]](s32)
+ ; GFX9-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT6]]
+ ; GFX9-NEXT: [[FMA6:%[0-9]+]]:_(s32) = G_FMA [[FNEG2]], [[FMUL4]], [[FPEXT4]]
+ ; GFX9-NEXT: [[FMA7:%[0-9]+]]:_(s32) = G_FMA [[FMA6]], [[INT6]], [[FMUL4]]
+ ; GFX9-NEXT: [[FMA8:%[0-9]+]]:_(s32) = G_FMA [[FNEG2]], [[FMA7]], [[FPEXT4]]
+ ; GFX9-NEXT: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[FMA8]], [[INT6]]
+ ; GFX9-NEXT: [[INT7:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.frexp.mant), [[FMUL5]](s32)
+ ; GFX9-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[INT7]], [[FMA7]]
+ ; GFX9-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD2]](s32)
+ ; GFX9-NEXT: [[INT8:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC2]](s16), [[TRUNC6]](s16), [[TRUNC2]](s16)
; GFX9-NEXT: [[FPEXT6:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16)
; GFX9-NEXT: [[FPEXT7:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC7]](s16)
- ; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT7]](s32)
- ; GFX9-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT6]], [[INT6]]
- ; GFX9-NEXT: [[FPTRUNC3:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL3]](s32)
- ; GFX9-NEXT: [[INT7:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC3]](s16), [[TRUNC7]](s16), [[TRUNC3]](s16)
- ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT1]](s16), [[INT3]](s16)
- ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT5]](s16), [[INT7]](s16)
+ ; GFX9-NEXT: [[FNEG3:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT7]]
+ ; GFX9-NEXT: [[INT9:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT7]](s32)
+ ; GFX9-NEXT: [[FMUL6:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT6]], [[INT9]]
+ ; GFX9-NEXT: [[FMA9:%[0-9]+]]:_(s32) = G_FMA [[FNEG3]], [[FMUL6]], [[FPEXT6]]
+ ; GFX9-NEXT: [[FMA10:%[0-9]+]]:_(s32) = G_FMA [[FMA9]], [[INT9]], [[FMUL6]]
+ ; GFX9-NEXT: [[FMA11:%[0-9]+]]:_(s32) = G_FMA [[FNEG3]], [[FMA10]], [[FPEXT6]]
+ ; GFX9-NEXT: [[FMUL7:%[0-9]+]]:_(s32) = G_FMUL [[FMA11]], [[INT9]]
+ ; GFX9-NEXT: [[INT10:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.frexp.mant), [[FMUL7]](s32)
+ ; GFX9-NEXT: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[INT10]], [[FMA10]]
+ ; GFX9-NEXT: [[FPTRUNC3:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD3]](s32)
+ ; GFX9-NEXT: [[INT11:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC3]](s16), [[TRUNC7]](s16), [[TRUNC3]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT2]](s16), [[INT5]](s16)
+ ; GFX9-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT8]](s16), [[INT11]](s16)
; GFX9-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+ ;
; GFX9-UNSAFE-LABEL: name: test_fdiv_v4s16
; GFX9-UNSAFE: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
; GFX9-UNSAFE-NEXT: {{ $}}
@@ -1964,6 +2193,7 @@ body: |
; GFX9-UNSAFE-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[FMUL2]](s16), [[FMUL3]](s16)
; GFX9-UNSAFE-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
; GFX9-UNSAFE-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+ ;
; GFX10-LABEL: name: test_fdiv_v4s16
; GFX10: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
; GFX10-NEXT: {{ $}}
@@ -1990,30 +2220,58 @@ body: |
; GFX10-NEXT: [[TRUNC7:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR3]](s32)
; GFX10-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
; GFX10-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC4]](s16)
+ ; GFX10-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT1]]
; GFX10-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
; GFX10-NEXT: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
- ; GFX10-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
- ; GFX10-NEXT: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC4]](s16), [[TRUNC]](s16)
+ ; GFX10-NEXT: [[FMA:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMUL]], [[FPEXT]]
+ ; GFX10-NEXT: [[FMA1:%[0-9]+]]:_(s32) = G_FMA [[FMA]], [[INT]], [[FMUL]]
+ ; GFX10-NEXT: [[FMA2:%[0-9]+]]:_(s32) = G_FMA [[FNEG]], [[FMA1]], [[FPEXT]]
+ ; GFX10-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FMA2]], [[INT]]
+ ; GFX10-NEXT: [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.frexp.mant), [[FMUL1]](s32)
+ ; GFX10-NEXT: [[FADD:%[0-9]+]]:_(s32) = G_FADD [[INT1]], [[FMA1]]
+ ; GFX10-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD]](s32)
+ ; GFX10-NEXT: [[INT2:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC4]](s16), [[TRUNC]](s16)
; GFX10-NEXT: [[FPEXT2:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC1]](s16)
; GFX10-NEXT: [[FPEXT3:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC5]](s16)
- ; GFX10-NEXT: [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32)
- ; GFX10-NEXT: [[FMUL1:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT2]]
- ; GFX10-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL1]](s32)
- ; GFX10-NEXT: [[INT3:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC5]](s16), [[TRUNC1]](s16)
+ ; GFX10-NEXT: [[FNEG1:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT3]]
+ ; GFX10-NEXT: [[INT3:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT3]](s32)
+ ; GFX10-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT2]], [[INT3]]
+ ; GFX10-NEXT: [[FMA3:%[0-9]+]]:_(s32) = G_FMA [[FNEG1]], [[FMUL2]], [[FPEXT2]]
+ ; GFX10-NEXT: [[FMA4:%[0-9]+]]:_(s32) = G_FMA [[FMA3]], [[INT3]], [[FMUL2]]
+ ; GFX10-NEXT: [[FMA5:%[0-9]+]]:_(s32) = G_FMA [[FNEG1]], [[FMA4]], [[FPEXT2]]
+ ; GFX10-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FMA5]], [[INT3]]
+ ; GFX10-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.frexp.mant), [[FMUL3]](s32)
+ ; GFX10-NEXT: [[FADD1:%[0-9]+]]:_(s32) = G_FADD [[INT4]], [[FMA4]]
+ ; GFX10-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD1]](s32)
+ ; GFX10-NEXT: [[INT5:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC1]](s16), [[TRUNC5]](s16), [[TRUNC1]](s16)
; GFX10-NEXT: [[FPEXT4:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC2]](s16)
; GFX10-NEXT: [[FPEXT5:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC6]](s16)
- ; GFX10-NEXT: [[INT4:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT5]](s32)
- ; GFX10-NEXT: [[FMUL2:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT4]]
- ; GFX10-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL2]](s32)
- ; GFX10-NEXT: [[INT5:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC2]](s16), [[TRUNC6]](s16), [[TRUNC2]](s16)
+ ; GFX10-NEXT: [[FNEG2:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT5]]
+ ; GFX10-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT5]](s32)
+ ; GFX10-NEXT: [[FMUL4:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT4]], [[INT6]]
+ ; GFX10-NEXT: [[FMA6:%[0-9]+]]:_(s32) = G_FMA [[FNEG2]], [[FMUL4]], [[FPEXT4]]
+ ; GFX10-NEXT: [[FMA7:%[0-9]+]]:_(s32) = G_FMA [[FMA6]], [[INT6]], [[FMUL4]]
+ ; GFX10-NEXT: [[FMA8:%[0-9]+]]:_(s32) = G_FMA [[FNEG2]], [[FMA7]], [[FPEXT4]]
+ ; GFX10-NEXT: [[FMUL5:%[0-9]+]]:_(s32) = G_FMUL [[FMA8]], [[INT6]]
+ ; GFX10-NEXT: [[INT7:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.frexp.mant), [[FMUL5]](s32)
+ ; GFX10-NEXT: [[FADD2:%[0-9]+]]:_(s32) = G_FADD [[INT7]], [[FMA7]]
+ ; GFX10-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD2]](s32)
+ ; GFX10-NEXT: [[INT8:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC2]](s16), [[TRUNC6]](s16), [[TRUNC2]](s16)
; GFX10-NEXT: [[FPEXT6:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC3]](s16)
; GFX10-NEXT: [[FPEXT7:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC7]](s16)
- ; GFX10-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT7]](s32)
- ; GFX10-NEXT: [[FMUL3:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT6]], [[INT6]]
- ; GFX10-NEXT: [[FPTRUNC3:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL3]](s32)
- ; GFX10-NEXT: [[INT7:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC3]](s16), [[TRUNC7]](s16), [[TRUNC3]](s16)
- ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT1]](s16), [[INT3]](s16)
- ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT5]](s16), [[INT7]](s16)
+ ; GFX10-NEXT: [[FNEG3:%[0-9]+]]:_(s32) = G_FNEG [[FPEXT7]]
+ ; GFX10-NEXT: [[INT9:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT7]](s32)
+ ; GFX10-NEXT: [[FMUL6:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT6]], [[INT9]]
+ ; GFX10-NEXT: [[FMA9:%[0-9]+]]:_(s32) = G_FMA [[FNEG3]], [[FMUL6]], [[FPEXT6]]
+ ; GFX10-NEXT: [[FMA10:%[0-9]+]]:_(s32) = G_FMA [[FMA9]], [[INT9]], [[FMUL6]]
+ ; GFX10-NEXT: [[FMA11:%[0-9]+]]:_(s32) = G_FMA [[FNEG3]], [[FMA10]], [[FPEXT6]]
+ ; GFX10-NEXT: [[FMUL7:%[0-9]+]]:_(s32) = G_FMUL [[FMA11]], [[INT9]]
+ ; GFX10-NEXT: [[INT10:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.frexp.mant), [[FMUL7]](s32)
+ ; GFX10-NEXT: [[FADD3:%[0-9]+]]:_(s32) = G_FADD [[INT10]], [[FMA10]]
+ ; GFX10-NEXT: [[FPTRUNC3:%[0-9]+]]:_(s16) = G_FPTRUNC [[FADD3]](s32)
+ ; GFX10-NEXT: [[INT11:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC3]](s16), [[TRUNC7]](s16), [[TRUNC3]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT2]](s16), [[INT5]](s16)
+ ; GFX10-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[INT8]](s16), [[INT11]](s16)
; GFX10-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s16>) = G_CONCAT_VECTORS [[BUILD_VECTOR]](<2 x s16>), [[BUILD_VECTOR1]](<2 x s16>)
; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
%0:_(<4 x s16>) = COPY $vgpr0_vgpr1
@@ -2052,6 +2310,7 @@ body: |
; SI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT6]](s32)
; SI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16)
; SI-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ;
; VI-LABEL: name: test_fdiv_s16_constant_one_rcp
; VI: liveins: $vgpr0
; VI-NEXT: {{ $}}
@@ -2060,6 +2319,7 @@ body: |
; VI-NEXT: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC]](s16)
; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
; VI-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ;
; GFX9-LABEL: name: test_fdiv_s16_constant_one_rcp
; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
@@ -2068,6 +2328,7 @@ body: |
; GFX9-NEXT: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC]](s16)
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ;
; GFX9-UNSAFE-LABEL: name: test_fdiv_s16_constant_one_rcp
; GFX9-UNSAFE: liveins: $vgpr0
; GFX9-UNSAFE-NEXT: {{ $}}
@@ -2076,6 +2337,7 @@ body: |
; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC]](s16)
; GFX9-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ;
; GFX10-LABEL: name: test_fdiv_s16_constant_one_rcp
; GFX10: liveins: $vgpr0
; GFX10-NEXT: {{ $}}
@@ -2122,6 +2384,7 @@ body: |
; SI-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[INT6]](s32)
; SI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16)
; SI-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ;
; VI-LABEL: name: test_fdiv_s16_constant_negative_one_rcp
; VI: liveins: $vgpr0
; VI-NEXT: {{ $}}
@@ -2131,6 +2394,7 @@ body: |
; VI-NEXT: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FNEG]](s16)
; VI-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
; VI-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ;
; GFX9-LABEL: name: test_fdiv_s16_constant_negative_one_rcp
; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
@@ -2140,6 +2404,7 @@ body: |
; GFX9-NEXT: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FNEG]](s16)
; GFX9-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
; GFX9-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ;
; GFX9-UNSAFE-LABEL: name: test_fdiv_s16_constant_negative_one_rcp
; GFX9-UNSAFE: liveins: $vgpr0
; GFX9-UNSAFE-NEXT: {{ $}}
@@ -2149,6 +2414,7 @@ body: |
; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FNEG]](s16)
; GFX9-UNSAFE-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[ANYEXT]](s32)
+ ;
; GFX10-LABEL: name: test_fdiv_s16_constant_negative_one_rcp
; GFX10: liveins: $vgpr0
; GFX10-NEXT: {{ $}}
@@ -2190,6 +2456,7 @@ body: |
; SI-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
; SI-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY]](s32), [[C]](s32)
; SI-NEXT: $vgpr0 = COPY [[INT6]](s32)
+ ;
; VI-LABEL: name: test_fdiv_s32_constant_one_rcp
; VI: liveins: $vgpr0
; VI-NEXT: {{ $}}
@@ -2208,6 +2475,7 @@ body: |
; VI-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
; VI-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY]](s32), [[C]](s32)
; VI-NEXT: $vgpr0 = COPY [[INT6]](s32)
+ ;
; GFX9-LABEL: name: test_fdiv_s32_constant_one_rcp
; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
@@ -2226,12 +2494,14 @@ body: |
; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY]](s32), [[C]](s32)
; GFX9-NEXT: $vgpr0 = COPY [[INT6]](s32)
+ ;
; GFX9-UNSAFE-LABEL: name: test_fdiv_s32_constant_one_rcp
; GFX9-UNSAFE: liveins: $vgpr0
; GFX9-UNSAFE-NEXT: {{ $}}
; GFX9-UNSAFE-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY]](s32)
; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[INT]](s32)
+ ;
; GFX10-LABEL: name: test_fdiv_s32_constant_one_rcp
; GFX10: liveins: $vgpr0
; GFX10-NEXT: {{ $}}
@@ -2281,6 +2551,7 @@ body: |
; SI-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
; SI-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY]](s32), [[C]](s32)
; SI-NEXT: $vgpr0 = COPY [[INT6]](s32)
+ ;
; VI-LABEL: name: test_fdiv_s32_constant_negative_one_rcp
; VI: liveins: $vgpr0
; VI-NEXT: {{ $}}
@@ -2300,6 +2571,7 @@ body: |
; VI-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
; VI-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY]](s32), [[C]](s32)
; VI-NEXT: $vgpr0 = COPY [[INT6]](s32)
+ ;
; GFX9-LABEL: name: test_fdiv_s32_constant_negative_one_rcp
; GFX9: liveins: $vgpr0
; GFX9-NEXT: {{ $}}
@@ -2319,6 +2591,7 @@ body: |
; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY]](s32), [[C]](s32)
; GFX9-NEXT: $vgpr0 = COPY [[INT6]](s32)
+ ;
; GFX9-UNSAFE-LABEL: name: test_fdiv_s32_constant_negative_one_rcp
; GFX9-UNSAFE: liveins: $vgpr0
; GFX9-UNSAFE-NEXT: {{ $}}
@@ -2326,6 +2599,7 @@ body: |
; GFX9-UNSAFE-NEXT: [[FNEG:%[0-9]+]]:_(s32) = G_FNEG [[COPY]]
; GFX9-UNSAFE-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FNEG]](s32)
; GFX9-UNSAFE-NEXT: $vgpr0 = COPY [[INT]](s32)
+ ;
; GFX10-LABEL: name: test_fdiv_s32_constant_negative_one_rcp
; GFX10: liveins: $vgpr0
; GFX10-NEXT: {{ $}}
@@ -2389,6 +2663,7 @@ body: |
; SI-NEXT: [[INT5:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s64), [[FMA3]](s64), [[FMUL]](s64), [[XOR]](s1)
; SI-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY]](s64), [[C]](s64)
; SI-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64)
+ ;
; VI-LABEL: name: test_fdiv_s64_constant_one_rcp
; VI: liveins: $vgpr0_vgpr1
; VI-NEXT: {{ $}}
@@ -2407,6 +2682,7 @@ body: |
; VI-NEXT: [[INT5:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s64), [[FMA3]](s64), [[FMUL]](s64), [[INT4]](s1)
; VI-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY]](s64), [[C]](s64)
; VI-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64)
+ ;
; GFX9-LABEL: name: test_fdiv_s64_constant_one_rcp
; GFX9: liveins: $vgpr0_vgpr1
; GFX9-NEXT: {{ $}}
@@ -2425,6 +2701,7 @@ body: |
; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s64), [[FMA3]](s64), [[FMUL]](s64), [[INT4]](s1)
; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY]](s64), [[C]](s64)
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64)
+ ;
; GFX9-UNSAFE-LABEL: name: test_fdiv_s64_constant_one_rcp
; GFX9-UNSAFE: liveins: $vgpr0_vgpr1
; GFX9-UNSAFE-NEXT: {{ $}}
@@ -2440,6 +2717,7 @@ body: |
; GFX9-UNSAFE-NEXT: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL]], [[C]]
; GFX9-UNSAFE-NEXT: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FMA4]], [[FMA3]], [[FMUL]]
; GFX9-UNSAFE-NEXT: $vgpr0_vgpr1 = COPY [[FMA5]](s64)
+ ;
; GFX10-LABEL: name: test_fdiv_s64_constant_one_rcp
; GFX10: liveins: $vgpr0_vgpr1
; GFX10-NEXT: {{ $}}
@@ -2503,6 +2781,7 @@ body: |
; SI-NEXT: [[INT5:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s64), [[FMA3]](s64), [[FMUL]](s64), [[XOR]](s1)
; SI-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY]](s64), [[C]](s64)
; SI-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64)
+ ;
; VI-LABEL: name: test_fdiv_s64_constant_negative_one_rcp
; VI: liveins: $vgpr0_vgpr1
; VI-NEXT: {{ $}}
@@ -2522,6 +2801,7 @@ body: |
; VI-NEXT: [[INT5:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s64), [[FMA3]](s64), [[FMUL]](s64), [[INT4]](s1)
; VI-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY]](s64), [[C]](s64)
; VI-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64)
+ ;
; GFX9-LABEL: name: test_fdiv_s64_constant_negative_one_rcp
; GFX9: liveins: $vgpr0_vgpr1
; GFX9-NEXT: {{ $}}
@@ -2541,6 +2821,7 @@ body: |
; GFX9-NEXT: [[INT5:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s64), [[FMA3]](s64), [[FMUL]](s64), [[INT4]](s1)
; GFX9-NEXT: [[INT6:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s64), [[COPY]](s64), [[C]](s64)
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[INT6]](s64)
+ ;
; GFX9-UNSAFE-LABEL: name: test_fdiv_s64_constant_negative_one_rcp
; GFX9-UNSAFE: liveins: $vgpr0_vgpr1
; GFX9-UNSAFE-NEXT: {{ $}}
@@ -2557,6 +2838,7 @@ body: |
; GFX9-UNSAFE-NEXT: [[FMA4:%[0-9]+]]:_(s64) = G_FMA [[FNEG]], [[FMUL]], [[C]]
; GFX9-UNSAFE-NEXT: [[FMA5:%[0-9]+]]:_(s64) = G_FMA [[FMA4]], [[FMA3]], [[FMUL]]
; GFX9-UNSAFE-NEXT: $vgpr0_vgpr1 = COPY [[FMA5]](s64)
+ ;
; GFX10-LABEL: name: test_fdiv_s64_constant_negative_one_rcp
; GFX10: liveins: $vgpr0_vgpr1
; GFX10-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
index 7c89efd0a713c1..b8421c81421206 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -60,15 +60,21 @@ define amdgpu_kernel void @v_fdiv_f16(
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: flat_load_ushort v2, v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, s5
+; GFX8-NEXT: v_mov_b32_e32 v6, s5
; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v5
; GFX8-NEXT: v_cvt_f32_f16_e32 v0, v2
-; GFX8-NEXT: v_rcp_f32_e32 v0, v0
-; GFX8-NEXT: v_mul_f32_e32 v0, v1, v0
-; GFX8-NEXT: v_cvt_f16_f32_e32 v6, v0
+; GFX8-NEXT: v_rcp_f32_e32 v3, v0
+; GFX8-NEXT: v_mul_f32_e32 v7, v1, v3
+; GFX8-NEXT: v_fma_f32 v8, -v0, v7, v1
+; GFX8-NEXT: v_fma_f32 v7, v8, v3, v7
+; GFX8-NEXT: v_fma_f32 v0, -v0, v7, v1
+; GFX8-NEXT: v_mul_f32_e32 v0, v0, v3
+; GFX8-NEXT: v_frexp_mant_f32_e32 v0, v0
+; GFX8-NEXT: v_add_f32_e32 v0, v0, v7
+; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v4
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
-; GFX8-NEXT: v_div_fixup_f16 v2, v6, v2, v5
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v6, vcc
+; GFX8-NEXT: v_div_fixup_f16 v2, v3, v2, v5
; GFX8-NEXT: flat_store_short v[0:1], v2
; GFX8-NEXT: s_endpgm
;
@@ -82,9 +88,17 @@ define amdgpu_kernel void @v_fdiv_f16(
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v1
; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-NEXT: v_mad_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
+; GFX9-NEXT: v_rcp_f32_e32 v5, v3
+; GFX9-NEXT: v_mul_f32_e32 v6, v4, v5
+; GFX9-NEXT: v_fma_f32 v7, -v3, v6, v4
+; GFX9-NEXT: v_fma_f32 v6, v7, v5, v6
+; GFX9-NEXT: v_fma_f32 v3, -v3, v6, v4
+; GFX9-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX9-NEXT: v_frexp_mant_f32_e32 v3, v3
+; GFX9-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, v1
; GFX9-NEXT: global_store_short v0, v1, s[4:5]
; GFX9-NEXT: s_endpgm
@@ -100,9 +114,17 @@ define amdgpu_kernel void @v_fdiv_f16(
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_load_ushort v2, v0, s[0:1] glc dlc
; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v1
; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2
; GFX10-NEXT: v_rcp_f32_e32 v3, v3
-; GFX10-NEXT: v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
+; GFX10-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX10-NEXT: v_fma_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_fmac_f32_e32 v4, v5, v3
+; GFX10-NEXT: v_fma_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_mul_f32_e32 v3, v5, v3
+; GFX10-NEXT: v_frexp_mant_f32_e32 v3, v3
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-NEXT: v_div_fixup_f16 v1, v3, v2, v1
; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
@@ -120,11 +142,23 @@ define amdgpu_kernel void @v_fdiv_f16(
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] glc dlc
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v1
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX11-NEXT: v_fma_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fmac_f32_e32 v4, v5, v3
+; GFX11-NEXT: v_fma_mix_f32 v5, -v2, v4, v1 op_sel_hi:[1,0,1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v3, v5, v3
+; GFX11-NEXT: v_frexp_mant_f32_e32 v3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_div_fixup_f16 v1, v3, v2, v1
; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
index 301299daaa61f4..9f4744ae989028 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -1444,12 +1444,19 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind {
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_lshlrev_b32_e64 v0, v0, 1
; VI-NEXT: v_cvt_f32_u32_e32 v0, v0
-; VI-NEXT: s_movk_i32 s4, 0x7000
+; VI-NEXT: s_mov_b32 s4, 0x46000000
; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
; VI-NEXT: v_cvt_f32_f16_e32 v1, v0
-; VI-NEXT: v_rcp_f32_e32 v1, v1
-; VI-NEXT: v_mul_f32_e32 v1, 0x46000000, v1
+; VI-NEXT: v_rcp_f32_e32 v2, v1
+; VI-NEXT: v_mul_f32_e32 v3, 0x46000000, v2
+; VI-NEXT: v_fma_f32 v4, -v1, v3, s4
+; VI-NEXT: v_fma_f32 v3, v4, v2, v3
+; VI-NEXT: v_fma_f32 v1, -v1, v3, s4
+; VI-NEXT: v_mul_f32_e32 v1, v1, v2
+; VI-NEXT: v_frexp_mant_f32_e32 v1, v1
+; VI-NEXT: v_add_f32_e32 v1, v1, v3
; VI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; VI-NEXT: s_movk_i32 s4, 0x7000
; VI-NEXT: v_div_fixup_f16 v0, v1, v0, s4
; VI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1462,7 +1469,14 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind {
; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX10-NEXT: v_rcp_f32_e32 v1, v1
-; GFX10-NEXT: v_fma_mixlo_f16 v1, v1, s4, 0
+; GFX10-NEXT: v_mul_f32_e32 v2, 0x46000000, v1
+; GFX10-NEXT: v_fma_mix_f32 v3, -v0, v2, s4 op_sel_hi:[1,0,0]
+; GFX10-NEXT: v_fmac_f32_e32 v2, v3, v1
+; GFX10-NEXT: v_fma_mix_f32 v3, -v0, v2, s4 op_sel_hi:[1,0,0]
+; GFX10-NEXT: v_mul_f32_e32 v1, v3, v1
+; GFX10-NEXT: v_frexp_mant_f32_e32 v1, v1
+; GFX10-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX10-NEXT: v_div_fixup_f16 v0, v1, v0, 0x7000
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -1478,8 +1492,18 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind {
; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX11-NEXT: v_rcp_f32_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v1, v1, s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v2, 0x46000000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fma_mix_f32 v3, -v0, v2, s0 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fma_mix_f32 v3, -v0, v2, s0 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_mul_f32_e32 v1, v3, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_frexp_mant_f32_e32 v1, v1
+; GFX11-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX11-NEXT: v_div_fixup_f16 v0, v1, v0, 0x7000
; GFX11-NEXT: s_setpc_b64 s[30:31]
%shl = shl nuw i32 1, %cnt
@@ -1551,8 +1575,14 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind {
; VI-NEXT: v_lshlrev_b16_e64 v0, v0, 1
; VI-NEXT: v_cvt_f16_u16_e32 v0, v0
; VI-NEXT: v_cvt_f32_f16_e32 v1, v0
-; VI-NEXT: v_rcp_f32_e32 v1, v1
-; VI-NEXT: v_add_f32_e32 v1, v1, v1
+; VI-NEXT: v_rcp_f32_e32 v2, v1
+; VI-NEXT: v_add_f32_e32 v3, v2, v2
+; VI-NEXT: v_fma_f32 v4, -v1, v3, 2.0
+; VI-NEXT: v_fma_f32 v3, v4, v2, v3
+; VI-NEXT: v_fma_f32 v1, -v1, v3, 2.0
+; VI-NEXT: v_mul_f32_e32 v1, v1, v2
+; VI-NEXT: v_frexp_mant_f32_e32 v1, v1
+; VI-NEXT: v_add_f32_e32 v1, v1, v3
; VI-NEXT: v_cvt_f16_f32_e32 v1, v1
; VI-NEXT: v_div_fixup_f16 v0, v1, v0, 2.0
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -1561,10 +1591,17 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b16 v0, v0, 1
+; GFX10-NEXT: s_mov_b32 s4, 2.0
; GFX10-NEXT: v_cvt_f16_u16_e32 v0, v0
; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX10-NEXT: v_rcp_f32_e32 v1, v1
-; GFX10-NEXT: v_add_f32_e32 v1, v1, v1
+; GFX10-NEXT: v_add_f32_e32 v2, v1, v1
+; GFX10-NEXT: v_fma_mix_f32 v3, -v0, v2, s4 op_sel_hi:[1,0,0]
+; GFX10-NEXT: v_fmac_f32_e32 v2, v3, v1
+; GFX10-NEXT: v_fma_mix_f32 v3, -v0, v2, s4 op_sel_hi:[1,0,0]
+; GFX10-NEXT: v_mul_f32_e32 v1, v3, v1
+; GFX10-NEXT: v_frexp_mant_f32_e32 v1, v1
+; GFX10-NEXT: v_add_f32_e32 v1, v1, v2
; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX10-NEXT: v_div_fixup_f16 v0, v1, v0, 2.0
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -1573,13 +1610,23 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b16 v0, v0, 1
+; GFX11-NEXT: s_mov_b32 s0, 2.0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f16_u16_e32 v0, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rcp_f32_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_add_f32_e32 v1, v1, v1
+; GFX11-NEXT: v_add_f32_e32 v2, v1, v1
+; GFX11-NEXT: v_fma_mix_f32 v3, -v0, v2, s0 op_sel_hi:[1,0,0]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fmac_f32_e32 v2, v3, v1
+; GFX11-NEXT: v_fma_mix_f32 v3, -v0, v2, s0 op_sel_hi:[1,0,0]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v3, v1
+; GFX11-NEXT: v_frexp_mant_f32_e32 v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_f32_e32 v1, v1, v2
; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_div_fixup_f16 v0, v1, v0, 2.0
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 7c5d73ab66b47a..729493fe24c4f7 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -109,8 +109,14 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; VI-NEXT: v_cvt_f32_f16_e32 v3, v4
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v5, v2
-; VI-NEXT: v_rcp_f32_e32 v5, v5
-; VI-NEXT: v_mul_f32_e32 v3, v3, v5
+; VI-NEXT: v_rcp_f32_e32 v6, v5
+; VI-NEXT: v_mul_f32_e32 v7, v3, v6
+; VI-NEXT: v_fma_f32 v8, -v5, v7, v3
+; VI-NEXT: v_fma_f32 v7, v8, v6, v7
+; VI-NEXT: v_fma_f32 v3, -v5, v7, v3
+; VI-NEXT: v_mul_f32_e32 v3, v3, v6
+; VI-NEXT: v_frexp_mant_f32_e32 v3, v3
+; VI-NEXT: v_add_f32_e32 v3, v3, v7
; VI-NEXT: v_cvt_f16_f32_e32 v3, v3
; VI-NEXT: v_div_fixup_f16 v3, v3, v2, v4
; VI-NEXT: v_trunc_f16_e32 v3, v3
@@ -126,10 +132,19 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] offset:8
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-NEXT: v_mad_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
+; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GFX9-NEXT: v_rcp_f32_e32 v5, v4
+; GFX9-NEXT: v_mul_f32_e32 v6, v3, v5
+; GFX9-NEXT: v_fma_f32 v7, -v4, v6, v3
+; GFX9-NEXT: v_fma_f32 v6, v7, v5, v6
+; GFX9-NEXT: v_fma_f32 v3, -v4, v6, v3
+; GFX9-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX9-NEXT: v_frexp_mant_f32_e32 v3, v3
+; GFX9-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v1
; GFX9-NEXT: v_trunc_f16_e32 v3, v3
; GFX9-NEXT: v_fma_f16 v1, -v3, v2, v1
@@ -146,10 +161,19 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_ushort v1, v0, s[6:7]
; GFX10-NEXT: global_load_ushort v2, v0, s[0:1] offset:8
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX10-NEXT: v_rcp_f32_e32 v3, v3
-; GFX10-NEXT: v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
+; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GFX10-NEXT: v_rcp_f32_e32 v4, v4
+; GFX10-NEXT: v_mul_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_fmac_f32_e32 v3, v5, v4
+; GFX10-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_mul_f32_e32 v4, v5, v4
+; GFX10-NEXT: v_frexp_mant_f32_e32 v4, v4
+; GFX10-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1
; GFX10-NEXT: v_trunc_f16_e32 v3, v3
; GFX10-NEXT: v_fma_f16 v1, -v3, v2, v1
@@ -166,15 +190,28 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX11-NEXT: global_load_u16 v2, v0, s[0:1] offset:8
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_rcp_f32_e32 v3, v3
+; GFX11-NEXT: v_rcp_f32_e32 v4, v4
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_div_fixup_f16 v3, v3, v2, v1
+; GFX11-NEXT: v_mul_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fmac_f32_e32 v3, v5, v4
+; GFX11-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v4, v5, v4
+; GFX11-NEXT: v_frexp_mant_f32_e32 v4, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_div_fixup_f16 v3, v3, v2, v1
; GFX11-NEXT: v_trunc_f16_e32 v3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_fma_f16 v1, -v3, v2, v1
; GFX11-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
@@ -191,16 +228,29 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; GFX1150-NEXT: s_clause 0x1
; GFX1150-NEXT: global_load_u16 v1, v0, s[6:7]
; GFX1150-NEXT: global_load_u16 v2, v0, s[0:1] offset:8
+; GFX1150-NEXT: s_waitcnt vmcnt(1)
+; GFX1150-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX1150-NEXT: s_waitcnt vmcnt(0)
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX1150-NEXT: v_cvt_f32_f16_e32 v4, v2
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
-; GFX1150-NEXT: v_rcp_f32_e32 v3, v3
-; GFX1150-NEXT: v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
+; GFX1150-NEXT: v_rcp_f32_e32 v4, v4
+; GFX1150-NEXT: v_mul_f32_e32 v3, v3, v4
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX1150-NEXT: v_fmac_f32_e32 v3, v5, v4
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX1150-NEXT: v_mul_f32_e32 v4, v5, v4
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_frexp_mant_f32_e32 v4, v4
+; GFX1150-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX1150-NEXT: v_div_fixup_f16 v3, v3, v2, v1
-; GFX1150-NEXT: v_trunc_f16_e32 v3, v3
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_trunc_f16_e32 v3, v3
; GFX1150-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1150-NEXT: v_fmac_f16_e32 v1, v3, v2
; GFX1150-NEXT: global_store_b16 v0, v1, s[4:5]
; GFX1150-NEXT: s_nop 0
@@ -1974,8 +2024,14 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v2
; VI-NEXT: v_cvt_f32_f16_e32 v7, v6
-; VI-NEXT: v_rcp_f32_e32 v7, v7
-; VI-NEXT: v_mul_f32_e32 v5, v5, v7
+; VI-NEXT: v_rcp_f32_e32 v8, v7
+; VI-NEXT: v_mul_f32_e32 v9, v5, v8
+; VI-NEXT: v_fma_f32 v10, -v7, v9, v5
+; VI-NEXT: v_fma_f32 v9, v10, v8, v9
+; VI-NEXT: v_fma_f32 v5, -v7, v9, v5
+; VI-NEXT: v_mul_f32_e32 v5, v5, v8
+; VI-NEXT: v_frexp_mant_f32_e32 v5, v5
+; VI-NEXT: v_add_f32_e32 v5, v5, v9
; VI-NEXT: v_cvt_f16_f32_e32 v5, v5
; VI-NEXT: v_div_fixup_f16 v5, v5, v6, v3
; VI-NEXT: v_trunc_f16_e32 v5, v5
@@ -1983,8 +2039,14 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cvt_f32_f16_e32 v6, v2
; VI-NEXT: v_cvt_f32_f16_e32 v5, v4
; VI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; VI-NEXT: v_rcp_f32_e32 v6, v6
-; VI-NEXT: v_mul_f32_e32 v5, v5, v6
+; VI-NEXT: v_rcp_f32_e32 v7, v6
+; VI-NEXT: v_mul_f32_e32 v8, v5, v7
+; VI-NEXT: v_fma_f32 v9, -v6, v8, v5
+; VI-NEXT: v_fma_f32 v8, v9, v7, v8
+; VI-NEXT: v_fma_f32 v5, -v6, v8, v5
+; VI-NEXT: v_mul_f32_e32 v5, v5, v7
+; VI-NEXT: v_frexp_mant_f32_e32 v5, v5
+; VI-NEXT: v_add_f32_e32 v5, v5, v8
; VI-NEXT: v_cvt_f16_f32_e32 v5, v5
; VI-NEXT: v_div_fixup_f16 v5, v5, v2, v4
; VI-NEXT: v_trunc_f16_e32 v5, v5
@@ -2001,18 +2063,35 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[6:7]
; GFX9-NEXT: global_load_dword v2, v0, s[0:1] offset:16
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-NEXT: v_mad_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
+; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GFX9-NEXT: v_rcp_f32_e32 v5, v4
+; GFX9-NEXT: v_mul_f32_e32 v6, v3, v5
+; GFX9-NEXT: v_fma_f32 v7, -v4, v6, v3
+; GFX9-NEXT: v_fma_f32 v6, v7, v5, v6
+; GFX9-NEXT: v_fma_f32 v3, -v4, v6, v3
+; GFX9-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX9-NEXT: v_frexp_mant_f32_e32 v3, v3
+; GFX9-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v1
; GFX9-NEXT: v_trunc_f16_e32 v3, v3
; GFX9-NEXT: v_fma_f16 v3, -v3, v2, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v2
-; GFX9-NEXT: v_rcp_f32_e32 v4, v4
-; GFX9-NEXT: v_mad_mixlo_f16 v4, v1, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v2
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v4, v1
+; GFX9-NEXT: v_rcp_f32_e32 v6, v5
+; GFX9-NEXT: v_mul_f32_e32 v7, v4, v6
+; GFX9-NEXT: v_fma_f32 v8, -v5, v7, v4
+; GFX9-NEXT: v_fma_f32 v7, v8, v6, v7
+; GFX9-NEXT: v_fma_f32 v4, -v5, v7, v4
+; GFX9-NEXT: v_mul_f32_e32 v4, v4, v6
+; GFX9-NEXT: v_frexp_mant_f32_e32 v4, v4
+; GFX9-NEXT: v_add_f32_e32 v4, v4, v7
+; GFX9-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX9-NEXT: v_div_fixup_f16 v4, v4, v2, v1
; GFX9-NEXT: v_trunc_f16_e32 v4, v4
; GFX9-NEXT: v_fma_f16 v1, -v4, v2, v1
@@ -2030,21 +2109,38 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dword v1, v0, s[6:7]
; GFX10-NEXT: global_load_dword v2, v0, s[0:1] offset:16
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX10-NEXT: v_rcp_f32_e32 v3, v3
-; GFX10-NEXT: v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
+; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX10-NEXT: v_rcp_f32_e32 v4, v4
+; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v6
+; GFX10-NEXT: v_rcp_f32_e32 v7, v7
+; GFX10-NEXT: v_mul_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_fmac_f32_e32 v3, v5, v4
+; GFX10-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_mul_f32_e32 v4, v5, v4
+; GFX10-NEXT: v_frexp_mant_f32_e32 v4, v4
+; GFX10-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v4
; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1
+; GFX10-NEXT: v_mul_f32_e32 v5, v5, v7
; GFX10-NEXT: v_trunc_f16_e32 v3, v3
+; GFX10-NEXT: v_fma_mix_f32 v8, -v2, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
; GFX10-NEXT: v_fma_f16 v3, -v3, v2, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2
-; GFX10-NEXT: v_rcp_f32_e32 v4, v4
-; GFX10-NEXT: v_fma_mixlo_f16 v4, v1, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX10-NEXT: v_div_fixup_f16 v4, v4, v2, v1
-; GFX10-NEXT: v_trunc_f16_e32 v4, v4
-; GFX10-NEXT: v_fma_f16 v1, -v4, v2, v1
+; GFX10-NEXT: v_fmac_f32_e32 v5, v8, v7
+; GFX10-NEXT: v_fma_mix_f32 v1, -v2, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_mul_f32_e32 v1, v1, v7
+; GFX10-NEXT: v_frexp_mant_f32_e32 v1, v1
+; GFX10-NEXT: v_add_f32_e32 v1, v1, v5
+; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX10-NEXT: v_div_fixup_f16 v1, v1, v6, v4
+; GFX10-NEXT: v_trunc_f16_e32 v1, v1
+; GFX10-NEXT: v_fma_f16 v1, -v1, v6, v4
; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
@@ -2059,28 +2155,52 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX11-NEXT: global_load_b32 v2, v0, s[0:1] offset:16
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_rcp_f32_e32 v4, v4
+; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_rcp_f32_e32 v3, v3
+; GFX11-NEXT: v_rcp_f32_e32 v7, v7
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_div_fixup_f16 v3, v3, v2, v1
+; GFX11-NEXT: v_mul_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fmac_f32_e32 v3, v5, v4
+; GFX11-NEXT: v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v4, v5, v4
+; GFX11-NEXT: v_frexp_mant_f32_e32 v4, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_f32_e32 v3, v4, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v4
+; GFX11-NEXT: v_div_fixup_f16 v3, v3, v2, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mul_f32_e32 v5, v5, v7
; GFX11-NEXT: v_trunc_f16_e32 v3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_fma_mix_f32 v8, -v2, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
; GFX11-NEXT: v_fma_f16 v3, -v3, v2, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fmac_f32_e32 v5, v8, v7
+; GFX11-NEXT: v_fma_mix_f32 v1, -v2, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2
-; GFX11-NEXT: v_rcp_f32_e32 v4, v4
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v4, v1, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v7
+; GFX11-NEXT: v_frexp_mant_f32_e32 v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_f32_e32 v1, v1, v5
+; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_div_fixup_f16 v4, v4, v2, v1
-; GFX11-NEXT: v_trunc_f16_e32 v4, v4
+; GFX11-NEXT: v_div_fixup_f16 v1, v1, v6, v4
+; GFX11-NEXT: v_trunc_f16_e32 v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_fma_f16 v1, -v4, v2, v1
+; GFX11-NEXT: v_fma_f16 v1, -v1, v6, v4
; GFX11-NEXT: v_pack_b32_f16 v1, v3, v1
; GFX11-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX11-NEXT: s_nop 0
@@ -2098,31 +2218,55 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: global_load_b32 v1, v0, s[6:7]
; GFX1150-NEXT: global_load_b32 v2, v0, s[0:1] offset:16
; GFX1150-NEXT: s_waitcnt vmcnt(1)
-; GFX1150-NEXT: v_lshrrev_b32_e32 v5, 16, v1
+; GFX1150-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX1150-NEXT: s_waitcnt vmcnt(0)
-; GFX1150-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_lshrrev_b32_e32 v5, 16, v2
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1150-NEXT: v_cvt_f32_f16_e32 v4, v3
-; GFX1150-NEXT: v_rcp_f32_e32 v4, v4
-; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mixlo_f16 v4, v1, v4, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX1150-NEXT: v_div_fixup_f16 v4, v4, v3, v5
+; GFX1150-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-NEXT: v_rcp_f32_e32 v6, v6
+; GFX1150-NEXT: v_mul_f32_e32 v4, v4, v6
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-NEXT: v_fmac_f32_e32 v4, v7, v6
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-NEXT: v_mul_f32_e32 v6, v7, v6
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_frexp_mant_f32_e32 v6, v6
+; GFX1150-NEXT: v_add_f32_e32 v4, v6, v4
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX1150-NEXT: v_div_fixup_f16 v4, v4, v5, v3
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-NEXT: v_trunc_f16_e32 v4, v4
; GFX1150-NEXT: v_xor_b32_e32 v4, 0x8000, v4
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fmac_f16_e32 v5, v4, v3
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX1150-NEXT: v_rcp_f32_e32 v3, v3
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1150-NEXT: v_fmac_f16_e32 v3, v4, v5
+; GFX1150-NEXT: v_cvt_f32_f16_e32 v5, v2
+; GFX1150-NEXT: v_cvt_f32_f16_e32 v4, v1
+; GFX1150-NEXT: v_rcp_f32_e32 v5, v5
; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mixlo_f16 v3, v1, v3, 0 op_sel_hi:[1,0,0]
-; GFX1150-NEXT: v_div_fixup_f16 v3, v3, v2, v1
+; GFX1150-NEXT: v_mul_f32_e32 v4, v4, v5
+; GFX1150-NEXT: v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1]
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_trunc_f16_e32 v3, v3
-; GFX1150-NEXT: v_xor_b32_e32 v3, 0x8000, v3
+; GFX1150-NEXT: v_fmac_f32_e32 v4, v6, v5
+; GFX1150-NEXT: v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1]
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fmac_f16_e32 v1, v3, v2
-; GFX1150-NEXT: v_pack_b32_f16 v1, v1, v5
+; GFX1150-NEXT: v_mul_f32_e32 v5, v6, v5
+; GFX1150-NEXT: v_frexp_mant_f32_e32 v5, v5
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_add_f32_e32 v4, v5, v4
+; GFX1150-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_div_fixup_f16 v4, v4, v2, v1
+; GFX1150-NEXT: v_trunc_f16_e32 v4, v4
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_xor_b32_e32 v4, 0x8000, v4
+; GFX1150-NEXT: v_fmac_f16_e32 v1, v4, v2
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT: v_pack_b32_f16 v1, v1, v3
; GFX1150-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX1150-NEXT: s_nop 0
; GFX1150-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -2364,8 +2508,14 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b32_e32 v6, 16, v3
; VI-NEXT: v_cvt_f32_f16_e32 v7, v6
-; VI-NEXT: v_rcp_f32_e32 v9, v9
-; VI-NEXT: v_mul_f32_e32 v7, v7, v9
+; VI-NEXT: v_rcp_f32_e32 v10, v9
+; VI-NEXT: v_mul_f32_e32 v11, v7, v10
+; VI-NEXT: v_fma_f32 v12, -v9, v11, v7
+; VI-NEXT: v_fma_f32 v11, v12, v10, v11
+; VI-NEXT: v_fma_f32 v7, -v9, v11, v7
+; VI-NEXT: v_mul_f32_e32 v7, v7, v10
+; VI-NEXT: v_frexp_mant_f32_e32 v7, v7
+; VI-NEXT: v_add_f32_e32 v7, v7, v11
; VI-NEXT: v_cvt_f16_f32_e32 v7, v7
; VI-NEXT: v_div_fixup_f16 v7, v7, v8, v6
; VI-NEXT: v_trunc_f16_e32 v7, v7
@@ -2373,8 +2523,14 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cvt_f32_f16_e32 v8, v5
; VI-NEXT: v_cvt_f32_f16_e32 v7, v3
; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; VI-NEXT: v_rcp_f32_e32 v8, v8
-; VI-NEXT: v_mul_f32_e32 v7, v7, v8
+; VI-NEXT: v_rcp_f32_e32 v9, v8
+; VI-NEXT: v_mul_f32_e32 v10, v7, v9
+; VI-NEXT: v_fma_f32 v11, -v8, v10, v7
+; VI-NEXT: v_fma_f32 v10, v11, v9, v10
+; VI-NEXT: v_fma_f32 v7, -v8, v10, v7
+; VI-NEXT: v_mul_f32_e32 v7, v7, v9
+; VI-NEXT: v_frexp_mant_f32_e32 v7, v7
+; VI-NEXT: v_add_f32_e32 v7, v7, v10
; VI-NEXT: v_cvt_f16_f32_e32 v7, v7
; VI-NEXT: v_div_fixup_f16 v7, v7, v5, v3
; VI-NEXT: v_trunc_f16_e32 v7, v7
@@ -2384,8 +2540,14 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v2
; VI-NEXT: v_or_b32_e32 v3, v3, v6
; VI-NEXT: v_cvt_f32_f16_e32 v6, v5
-; VI-NEXT: v_rcp_f32_e32 v8, v8
-; VI-NEXT: v_mul_f32_e32 v6, v6, v8
+; VI-NEXT: v_rcp_f32_e32 v9, v8
+; VI-NEXT: v_mul_f32_e32 v10, v6, v9
+; VI-NEXT: v_fma_f32 v11, -v8, v10, v6
+; VI-NEXT: v_fma_f32 v10, v11, v9, v10
+; VI-NEXT: v_fma_f32 v6, -v8, v10, v6
+; VI-NEXT: v_mul_f32_e32 v6, v6, v9
+; VI-NEXT: v_frexp_mant_f32_e32 v6, v6
+; VI-NEXT: v_add_f32_e32 v6, v6, v10
; VI-NEXT: v_cvt_f16_f32_e32 v6, v6
; VI-NEXT: v_div_fixup_f16 v6, v6, v7, v5
; VI-NEXT: v_trunc_f16_e32 v6, v6
@@ -2393,8 +2555,14 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-NEXT: v_cvt_f32_f16_e32 v7, v4
; VI-NEXT: v_cvt_f32_f16_e32 v6, v2
; VI-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; VI-NEXT: v_rcp_f32_e32 v7, v7
-; VI-NEXT: v_mul_f32_e32 v6, v6, v7
+; VI-NEXT: v_rcp_f32_e32 v8, v7
+; VI-NEXT: v_mul_f32_e32 v9, v6, v8
+; VI-NEXT: v_fma_f32 v10, -v7, v9, v6
+; VI-NEXT: v_fma_f32 v9, v10, v8, v9
+; VI-NEXT: v_fma_f32 v6, -v7, v9, v6
+; VI-NEXT: v_mul_f32_e32 v6, v6, v8
+; VI-NEXT: v_frexp_mant_f32_e32 v6, v6
+; VI-NEXT: v_add_f32_e32 v6, v6, v9
; VI-NEXT: v_cvt_f16_f32_e32 v6, v6
; VI-NEXT: v_div_fixup_f16 v6, v6, v4, v2
; VI-NEXT: v_trunc_f16_e32 v6, v6
@@ -2411,33 +2579,66 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7]
; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:32
+; GFX9-NEXT: s_waitcnt vmcnt(1)
+; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v3
-; GFX9-NEXT: v_rcp_f32_e32 v5, v5
-; GFX9-NEXT: v_mad_mixlo_f16 v5, v1, v5, 0 op_sel_hi:[1,0,0]
+; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX9-NEXT: v_rcp_f32_e32 v7, v6
+; GFX9-NEXT: v_mul_f32_e32 v8, v5, v7
+; GFX9-NEXT: v_fma_f32 v9, -v6, v8, v5
+; GFX9-NEXT: v_fma_f32 v8, v9, v7, v8
+; GFX9-NEXT: v_fma_f32 v5, -v6, v8, v5
+; GFX9-NEXT: v_mul_f32_e32 v5, v5, v7
+; GFX9-NEXT: v_frexp_mant_f32_e32 v5, v5
+; GFX9-NEXT: v_add_f32_e32 v5, v5, v8
+; GFX9-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX9-NEXT: v_div_fixup_f16 v5, v5, v3, v1
; GFX9-NEXT: v_trunc_f16_e32 v5, v5
; GFX9-NEXT: v_fma_f16 v5, -v5, v3, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v3
-; GFX9-NEXT: v_rcp_f32_e32 v6, v6
-; GFX9-NEXT: v_mad_mixlo_f16 v6, v1, v6, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v3
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v1
+; GFX9-NEXT: v_rcp_f32_e32 v8, v7
+; GFX9-NEXT: v_mul_f32_e32 v9, v6, v8
+; GFX9-NEXT: v_fma_f32 v10, -v7, v9, v6
+; GFX9-NEXT: v_fma_f32 v9, v10, v8, v9
+; GFX9-NEXT: v_fma_f32 v6, -v7, v9, v6
+; GFX9-NEXT: v_mul_f32_e32 v6, v6, v8
+; GFX9-NEXT: v_frexp_mant_f32_e32 v6, v6
+; GFX9-NEXT: v_add_f32_e32 v6, v6, v9
+; GFX9-NEXT: v_cvt_f16_f32_e32 v6, v6
; GFX9-NEXT: v_div_fixup_f16 v6, v6, v3, v1
; GFX9-NEXT: v_trunc_f16_e32 v6, v6
; GFX9-NEXT: v_fma_f16 v1, -v6, v3, v1
-; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2
; GFX9-NEXT: v_pack_b32_f16 v1, v5, v1
-; GFX9-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-NEXT: v_mad_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0]
+; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v2
+; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX9-NEXT: v_rcp_f32_e32 v6, v5
+; GFX9-NEXT: v_mul_f32_e32 v7, v3, v6
+; GFX9-NEXT: v_fma_f32 v8, -v5, v7, v3
+; GFX9-NEXT: v_fma_f32 v7, v8, v6, v7
+; GFX9-NEXT: v_fma_f32 v3, -v5, v7, v3
+; GFX9-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-NEXT: v_frexp_mant_f32_e32 v3, v3
+; GFX9-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-NEXT: v_div_fixup_f16 v3, v3, v2, v0
; GFX9-NEXT: v_trunc_f16_e32 v3, v3
; GFX9-NEXT: v_fma_f16 v3, -v3, v2, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX9-NEXT: v_rcp_f32_e32 v5, v5
-; GFX9-NEXT: v_mad_mixlo_f16 v5, v0, v5, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX9-NEXT: v_cvt_f32_f16_e32 v6, v2
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX9-NEXT: v_rcp_f32_e32 v7, v6
+; GFX9-NEXT: v_mul_f32_e32 v8, v5, v7
+; GFX9-NEXT: v_fma_f32 v9, -v6, v8, v5
+; GFX9-NEXT: v_fma_f32 v8, v9, v7, v8
+; GFX9-NEXT: v_fma_f32 v5, -v6, v8, v5
+; GFX9-NEXT: v_mul_f32_e32 v5, v5, v7
+; GFX9-NEXT: v_frexp_mant_f32_e32 v5, v5
+; GFX9-NEXT: v_add_f32_e32 v5, v5, v8
+; GFX9-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX9-NEXT: v_div_fixup_f16 v5, v5, v2, v0
; GFX9-NEXT: v_trunc_f16_e32 v5, v5
; GFX9-NEXT: v_fma_f16 v0, -v5, v2, v0
@@ -2455,36 +2656,69 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:32
+; GFX10-NEXT: s_waitcnt vmcnt(1)
+; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v3
-; GFX10-NEXT: v_rcp_f32_e32 v5, v5
-; GFX10-NEXT: v_fma_mixlo_f16 v5, v1, v5, 0 op_sel_hi:[1,0,0]
+; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v3
+; GFX10-NEXT: v_rcp_f32_e32 v6, v6
+; GFX10-NEXT: v_cvt_f32_f16_e32 v9, v8
+; GFX10-NEXT: v_rcp_f32_e32 v9, v9
+; GFX10-NEXT: v_mul_f32_e32 v5, v5, v6
+; GFX10-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_fmac_f32_e32 v5, v7, v6
+; GFX10-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_mul_f32_e32 v6, v7, v6
+; GFX10-NEXT: v_frexp_mant_f32_e32 v6, v6
+; GFX10-NEXT: v_add_f32_e32 v5, v6, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v6
; GFX10-NEXT: v_div_fixup_f16 v5, v5, v3, v1
+; GFX10-NEXT: v_mul_f32_e32 v7, v7, v9
; GFX10-NEXT: v_trunc_f16_e32 v5, v5
+; GFX10-NEXT: v_fma_mix_f32 v10, -v3, v7, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
; GFX10-NEXT: v_fma_f16 v5, -v5, v3, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v3
-; GFX10-NEXT: v_rcp_f32_e32 v6, v6
-; GFX10-NEXT: v_fma_mixlo_f16 v6, v1, v6, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX10-NEXT: v_div_fixup_f16 v6, v6, v3, v1
-; GFX10-NEXT: v_trunc_f16_e32 v6, v6
-; GFX10-NEXT: v_fma_f16 v1, -v6, v3, v1
-; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX10-NEXT: v_fmac_f32_e32 v7, v10, v9
+; GFX10-NEXT: v_fma_mix_f32 v1, -v3, v7, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX10-NEXT: v_mul_f32_e32 v1, v1, v9
+; GFX10-NEXT: v_frexp_mant_f32_e32 v1, v1
+; GFX10-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX10-NEXT: v_div_fixup_f16 v1, v1, v8, v6
+; GFX10-NEXT: v_trunc_f16_e32 v1, v1
+; GFX10-NEXT: v_fma_f16 v1, -v1, v8, v6
+; GFX10-NEXT: v_cvt_f32_f16_e32 v8, v7
; GFX10-NEXT: v_pack_b32_f16 v1, v5, v1
-; GFX10-NEXT: v_rcp_f32_e32 v3, v3
-; GFX10-NEXT: v_fma_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0]
+; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2
+; GFX10-NEXT: v_rcp_f32_e32 v8, v8
+; GFX10-NEXT: v_rcp_f32_e32 v5, v5
+; GFX10-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX10-NEXT: v_fma_mix_f32 v6, -v2, v3, v0 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_fmac_f32_e32 v3, v6, v5
+; GFX10-NEXT: v_fma_mix_f32 v6, -v2, v3, v0 op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_mul_f32_e32 v5, v6, v5
+; GFX10-NEXT: v_frexp_mant_f32_e32 v5, v5
+; GFX10-NEXT: v_add_f32_e32 v3, v5, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v5
; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v0
+; GFX10-NEXT: v_mul_f32_e32 v6, v6, v8
; GFX10-NEXT: v_trunc_f16_e32 v3, v3
+; GFX10-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
; GFX10-NEXT: v_fma_f16 v3, -v3, v2, v0
-; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX10-NEXT: v_rcp_f32_e32 v5, v5
-; GFX10-NEXT: v_fma_mixlo_f16 v5, v0, v5, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_div_fixup_f16 v5, v5, v2, v0
-; GFX10-NEXT: v_trunc_f16_e32 v5, v5
-; GFX10-NEXT: v_fma_f16 v0, -v5, v2, v0
+; GFX10-NEXT: v_fmac_f32_e32 v6, v9, v8
+; GFX10-NEXT: v_fma_mix_f32 v0, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX10-NEXT: v_mul_f32_e32 v0, v0, v8
+; GFX10-NEXT: v_frexp_mant_f32_e32 v0, v0
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v6
+; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT: v_div_fixup_f16 v0, v0, v7, v5
+; GFX10-NEXT: v_trunc_f16_e32 v0, v0
+; GFX10-NEXT: v_fma_f16 v0, -v0, v7, v5
; GFX10-NEXT: v_pack_b32_f16 v0, v3, v0
; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
@@ -2499,50 +2733,97 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[0:1], v4, s[6:7]
; GFX11-NEXT: global_load_b64 v[2:3], v4, s[0:1] offset:32
+; GFX11-NEXT: s_waitcnt vmcnt(1)
+; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v3
+; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_rcp_f32_e32 v6, v6
+; GFX11-NEXT: v_cvt_f32_f16_e32 v9, v8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_rcp_f32_e32 v5, v5
+; GFX11-NEXT: v_rcp_f32_e32 v9, v9
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v5, v1, v5, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_div_fixup_f16 v5, v5, v3, v1
+; GFX11-NEXT: v_mul_f32_e32 v5, v5, v6
+; GFX11-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fmac_f32_e32 v5, v7, v6
+; GFX11-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v6, v7, v6
+; GFX11-NEXT: v_frexp_mant_f32_e32 v6, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_f32_e32 v5, v6, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v6
+; GFX11-NEXT: v_div_fixup_f16 v5, v5, v3, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_mul_f32_e32 v7, v7, v9
; GFX11-NEXT: v_trunc_f16_e32 v5, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_fma_mix_f32 v10, -v3, v7, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
; GFX11-NEXT: v_fma_f16 v5, -v5, v3, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v3
-; GFX11-NEXT: v_rcp_f32_e32 v6, v6
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v6, v1, v6, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fmac_f32_e32 v7, v10, v9
+; GFX11-NEXT: v_fma_mix_f32 v1, -v3, v7, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v1, v1, v9
+; GFX11-NEXT: v_frexp_mant_f32_e32 v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_div_fixup_f16 v6, v6, v3, v1
-; GFX11-NEXT: v_trunc_f16_e32 v6, v6
+; GFX11-NEXT: v_div_fixup_f16 v1, v1, v8, v6
+; GFX11-NEXT: v_trunc_f16_e32 v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_fma_f16 v1, -v6, v3, v1
-; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX11-NEXT: v_fma_f16 v1, -v1, v8, v6
+; GFX11-NEXT: v_cvt_f32_f16_e32 v8, v7
; GFX11-NEXT: v_pack_b32_f16 v1, v5, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_rcp_f32_e32 v3, v3
+; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_rcp_f32_e32 v8, v8
+; GFX11-NEXT: v_rcp_f32_e32 v5, v5
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v3, v0, v3, 0 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_div_fixup_f16 v3, v3, v2, v0
+; GFX11-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fma_mix_f32 v6, -v2, v3, v0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v3, v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_fma_mix_f32 v6, -v2, v3, v0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_mul_f32_e32 v5, v6, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_frexp_mant_f32_e32 v5, v5
+; GFX11-NEXT: v_add_f32_e32 v3, v5, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_div_fixup_f16 v3, v3, v2, v0
+; GFX11-NEXT: v_mul_f32_e32 v6, v6, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_trunc_f16_e32 v3, v3
+; GFX11-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_fma_f16 v3, -v3, v2, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_fmac_f32_e32 v6, v9, v8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX11-NEXT: v_rcp_f32_e32 v5, v5
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mixlo_f16 v5, v0, v5, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_fma_mix_f32 v0, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_div_fixup_f16 v5, v5, v2, v0
-; GFX11-NEXT: v_trunc_f16_e32 v5, v5
+; GFX11-NEXT: v_frexp_mant_f32_e32 v0, v0
+; GFX11-NEXT: v_add_f32_e32 v0, v0, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT: v_div_fixup_f16 v0, v0, v7, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_fma_f16 v0, -v5, v2, v0
+; GFX11-NEXT: v_trunc_f16_e32 v0, v0
+; GFX11-NEXT: v_fma_f16 v0, -v0, v7, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pack_b32_f16 v0, v3, v0
; GFX11-NEXT: global_store_b64 v4, v[0:1], s[4:5]
; GFX11-NEXT: s_nop 0
@@ -2560,55 +2841,102 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX1150-NEXT: global_load_b64 v[0:1], v4, s[6:7]
; GFX1150-NEXT: global_load_b64 v[2:3], v4, s[0:1] offset:32
; GFX1150-NEXT: s_waitcnt vmcnt(1)
-; GFX1150-NEXT: v_lshrrev_b32_e32 v7, 16, v0
+; GFX1150-NEXT: v_lshrrev_b32_e32 v5, 16, v0
; GFX1150-NEXT: s_waitcnt vmcnt(0)
-; GFX1150-NEXT: v_lshrrev_b32_e32 v5, 16, v2
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_lshrrev_b32_e32 v7, 16, v2
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1150-NEXT: v_cvt_f32_f16_e32 v6, v5
-; GFX1150-NEXT: v_rcp_f32_e32 v6, v6
-; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mixlo_f16 v6, v0, v6, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX1150-NEXT: v_div_fixup_f16 v6, v6, v5, v7
+; GFX1150-NEXT: v_cvt_f32_f16_e32 v8, v7
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-NEXT: v_rcp_f32_e32 v8, v8
+; GFX1150-NEXT: v_mul_f32_e32 v6, v6, v8
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-NEXT: v_fmac_f32_e32 v6, v9, v8
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-NEXT: v_mul_f32_e32 v8, v9, v8
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_frexp_mant_f32_e32 v8, v8
+; GFX1150-NEXT: v_add_f32_e32 v6, v8, v6
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX1150-NEXT: v_div_fixup_f16 v6, v6, v7, v5
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-NEXT: v_trunc_f16_e32 v6, v6
; GFX1150-NEXT: v_xor_b32_e32 v6, 0x8000, v6
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
-; GFX1150-NEXT: v_fmac_f16_e32 v7, v6, v5
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX1150-NEXT: v_lshrrev_b32_e32 v6, 16, v1
-; GFX1150-NEXT: v_rcp_f32_e32 v5, v5
+; GFX1150-NEXT: v_fmac_f16_e32 v5, v6, v7
+; GFX1150-NEXT: v_cvt_f32_f16_e32 v7, v2
+; GFX1150-NEXT: v_cvt_f32_f16_e32 v6, v0
+; GFX1150-NEXT: v_rcp_f32_e32 v7, v7
; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mixlo_f16 v5, v0, v5, 0 op_sel_hi:[1,0,0]
-; GFX1150-NEXT: v_div_fixup_f16 v5, v5, v2, v0
+; GFX1150-NEXT: v_mul_f32_e32 v6, v6, v7
+; GFX1150-NEXT: v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1]
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_trunc_f16_e32 v5, v5
-; GFX1150-NEXT: v_xor_b32_e32 v5, 0x8000, v5
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1150-NEXT: v_fma_f16 v0, v5, v2, v0
-; GFX1150-NEXT: v_lshrrev_b32_e32 v2, 16, v3
-; GFX1150-NEXT: v_pack_b32_f16 v0, v0, v7
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_fmac_f32_e32 v6, v8, v7
+; GFX1150-NEXT: v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1]
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_mul_f32_e32 v7, v8, v7
+; GFX1150-NEXT: v_frexp_mant_f32_e32 v7, v7
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_add_f32_e32 v6, v7, v6
+; GFX1150-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_div_fixup_f16 v6, v6, v2, v0
+; GFX1150-NEXT: v_trunc_f16_e32 v6, v6
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_xor_b32_e32 v6, 0x8000, v6
+; GFX1150-NEXT: v_fma_f16 v0, v6, v2, v0
+; GFX1150-NEXT: v_lshrrev_b32_e32 v6, 16, v3
+; GFX1150-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1150-NEXT: v_pack_b32_f16 v0, v0, v5
+; GFX1150-NEXT: v_cvt_f32_f16_e32 v7, v6
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1150-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX1150-NEXT: v_rcp_f32_e32 v5, v5
+; GFX1150-NEXT: v_rcp_f32_e32 v7, v7
; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mixlo_f16 v5, v1, v5, 0 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX1150-NEXT: v_div_fixup_f16 v5, v5, v2, v6
+; GFX1150-NEXT: v_mul_f32_e32 v5, v5, v7
+; GFX1150-NEXT: v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_fmac_f32_e32 v5, v8, v7
+; GFX1150-NEXT: v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_mul_f32_e32 v7, v8, v7
+; GFX1150-NEXT: v_frexp_mant_f32_e32 v7, v7
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_add_f32_e32 v5, v7, v5
+; GFX1150-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_div_fixup_f16 v5, v5, v6, v2
; GFX1150-NEXT: v_trunc_f16_e32 v5, v5
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1150-NEXT: v_xor_b32_e32 v5, 0x8000, v5
-; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fmac_f16_e32 v6, v5, v2
-; GFX1150-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GFX1150-NEXT: v_rcp_f32_e32 v2, v2
-; GFX1150-NEXT: s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fma_mixlo_f16 v2, v1, v2, 0 op_sel_hi:[1,0,0]
-; GFX1150-NEXT: v_div_fixup_f16 v2, v2, v3, v1
+; GFX1150-NEXT: v_fmac_f16_e32 v2, v5, v6
+; GFX1150-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX1150-NEXT: v_cvt_f32_f16_e32 v5, v1
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1150-NEXT: v_rcp_f32_e32 v6, v6
+; GFX1150-NEXT: v_mul_f32_e32 v5, v5, v6
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
+; GFX1150-NEXT: v_fmac_f32_e32 v5, v7, v6
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_trunc_f16_e32 v2, v2
-; GFX1150-NEXT: v_xor_b32_e32 v2, 0x8000, v2
+; GFX1150-NEXT: v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
+; GFX1150-NEXT: v_mul_f32_e32 v6, v7, v6
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_frexp_mant_f32_e32 v6, v6
+; GFX1150-NEXT: v_add_f32_e32 v5, v6, v5
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX1150-NEXT: v_div_fixup_f16 v5, v5, v3, v1
+; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1150-NEXT: v_trunc_f16_e32 v5, v5
+; GFX1150-NEXT: v_xor_b32_e32 v5, 0x8000, v5
; GFX1150-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-NEXT: v_fmac_f16_e32 v1, v2, v3
-; GFX1150-NEXT: v_pack_b32_f16 v1, v1, v6
+; GFX1150-NEXT: v_fmac_f16_e32 v1, v5, v3
+; GFX1150-NEXT: v_pack_b32_f16 v1, v1, v2
; GFX1150-NEXT: global_store_b64 v4, v[0:1], s[4:5]
; GFX1150-NEXT: s_nop 0
; GFX1150-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
More information about the llvm-commits
mailing list