[llvm] 59c311c - AMDGPU: Add more tests for f16 fdiv lowering
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 5 13:53:07 PDT 2023
Author: Matt Arsenault
Date: 2023-07-05T16:53:01-04:00
New Revision: 59c311c5d4a04a6a4f8c4abf140a63af1079e34c
URL: https://github.com/llvm/llvm-project/commit/59c311c5d4a04a6a4f8c4abf140a63af1079e34c
DIFF: https://github.com/llvm/llvm-project/commit/59c311c5d4a04a6a4f8c4abf140a63af1079e34c.diff
LOG: AMDGPU: Add more tests for f16 fdiv lowering
Probably should merge the DAG and gisel tests.
Added:
Modified:
llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
llvm/test/CodeGen/AMDGPU/fdiv.ll
Removed:
################################################################################
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
index f4830eeaa1b600..b5819f8214155d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
@@ -209,6 +209,85 @@ define half @v_fdiv_f16_ulp25(half %a, half %b) {
ret half %fdiv
}
+define half @v_neg_rcp_f16(half %x) { ; Negated-reciprocal test: scalar half, numerator is the constant -1.0 (see the fdiv below).
+; GFX6-IEEE-LABEL: v_neg_rcp_f16:
+; GFX6-IEEE: ; %bb.0:
+; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-FLUSH-LABEL: v_neg_rcp_f16:
+; GFX6-FLUSH: ; %bb.0:
+; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v_neg_rcp_f16:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX89-NEXT: v_cvt_f32_f16_e32 v2, -1.0
+; GFX89-NEXT: v_rcp_f32_e32 v1, v1
+; GFX89-NEXT: v_mul_f32_e32 v1, v2, v1
+; GFX89-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX89-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
+; GFX89-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_neg_rcp_f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v2, -1.0
+; GFX10-NEXT: v_rcp_f32_e32 v1, v1
+; GFX10-NEXT: v_mul_f32_e32 v1, v2, v1
+; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX10-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_neg_rcp_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v2, -1.0
+; GFX11-NEXT: v_rcp_f32_e32 v1, v1
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f32_e32 v1, v2, v1
+; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX11-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %fdiv = fdiv half -1.0, %x ; Plain fdiv: no fast-math flags and no !fpmath metadata on this instruction.
+ ret half %fdiv
+}
+
define half @v_rcp_f16(half %x) {
; GFX6-IEEE-LABEL: v_rcp_f16:
; GFX6-IEEE: ; %bb.0:
@@ -1087,11 +1166,11 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
ret <2 x half> %fdiv
}
-define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
-; GFX6-IEEE-LABEL: v_rcp_v2f16_arcp:
+define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
+; GFX6-IEEE-LABEL: v_neg_rcp_v2f16:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, -1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
@@ -1120,10 +1199,10 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-FLUSH-LABEL: v_rcp_v2f16_arcp:
+; GFX6-FLUSH-LABEL: v_neg_rcp_v2f16:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
@@ -1137,7 +1216,7 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
@@ -1158,49 +1237,49 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: v_rcp_v2f16_arcp:
+; GFX8-LABEL: v_neg_rcp_v2f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX8-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX8-NEXT: v_rcp_f32_e32 v1, v1
; GFX8-NEXT: v_rcp_f32_e32 v3, v3
; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1
; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
-; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
+; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
+; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, -1.0
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_rcp_v2f16_arcp:
+; GFX9-LABEL: v_neg_rcp_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX9-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX9-NEXT: v_rcp_f32_e32 v1, v1
; GFX9-NEXT: v_rcp_f32_e32 v3, v3
; GFX9-NEXT: v_mul_f32_e32 v1, v4, v1
; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX9-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
-; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
+; GFX9-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
+; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, -1.0
; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: v_rcp_v2f16_arcp:
+; GFX10-LABEL: v_neg_rcp_v2f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX10-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-NEXT: v_rcp_f32_e32 v2, v2
; GFX10-NEXT: v_rcp_f32_e32 v3, v3
@@ -1208,17 +1287,17 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
-; GFX10-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
+; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX10-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_rcp_v2f16_arcp:
+; GFX11-LABEL: v_neg_rcp_v2f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
@@ -1227,107 +1306,229 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
; GFX11-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
-; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
+; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
- %fdiv = fdiv arcp <2 x half> <half 1.0, half 1.0>, %x
+ %fdiv = fdiv <2 x half> <half -1.0, half -1.0>, %x
ret <2 x half> %fdiv
}
-define <2 x half> @v_rcp_v2f16_arcp_afn(<2 x half> %x) {
-; GFX6-LABEL: v_rcp_v2f16_arcp_afn:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, 1.0
-; GFX6-NEXT: v_rcp_f32_e32 v0, v0
-; GFX6-NEXT: v_rcp_f32_e32 v1, v1
-; GFX6-NEXT: v_mul_f32_e32 v0, v2, v0
-; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: s_setpc_b64 s[30:31]
+define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
+; GFX6-IEEE-LABEL: v_rcp_v2f16_fabs:
+; GFX6-IEEE: ; %bb.0:
+; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-IEEE-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX6-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v3, v4, v6
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v2, v1
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v5, v5, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v5, v1
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v6, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v3, v6
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v6
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v5, v1
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: v_rcp_v2f16_arcp_afn:
+; GFX6-FLUSH-LABEL: v_rcp_v2f16_fabs:
+; GFX6-FLUSH: ; %bb.0:
+; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-FLUSH-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX6-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v2, v1
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v5, v5, v4
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v4, v5, v4
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v6, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v1, v6, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v2, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v6, v3
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v6
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v5, v4
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_rcp_v2f16_fabs:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_rcp_f16_e32 v1, v0
-; GFX8-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX8-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX8-NEXT: v_rcp_f32_e32 v1, v1
+; GFX8-NEXT: v_rcp_f32_e32 v3, v3
+; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1
+; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
+; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_rcp_v2f16_arcp_afn:
+; GFX9-LABEL: v_rcp_v2f16_fabs:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rcp_f16_e32 v1, v0
-; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX9-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX9-NEXT: v_rcp_f32_e32 v1, v1
+; GFX9-NEXT: v_rcp_f32_e32 v3, v3
+; GFX9-NEXT: v_mul_f32_e32 v1, v4, v1
+; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX9-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
+; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: v_rcp_v2f16_arcp_afn:
+; GFX10-LABEL: v_rcp_v2f16_fabs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_rcp_f16_e32 v1, v0
-; GFX10-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX10-NEXT: v_rcp_f32_e32 v2, v2
+; GFX10-NEXT: v_rcp_f32_e32 v3, v3
+; GFX10-NEXT: v_mul_f32_e32 v2, v4, v2
+; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX10-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
+; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_rcp_v2f16_arcp_afn:
+; GFX11-LABEL: v_rcp_v2f16_fabs:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_rcp_f16_e32 v0, v0
-; GFX11-NEXT: v_rcp_f16_e32 v1, v1
+; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX11-NEXT: v_rcp_f32_e32 v3, v3
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_rcp_f32_e32 v2, v2
+; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f32_e32 v2, v4, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
- %fdiv = fdiv arcp afn <2 x half> <half 1.0, half 1.0>, %x
+ %x.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x)
+ %fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x.fabs
ret <2 x half> %fdiv
}
-define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
-; GFX6-IEEE-LABEL: v_rcp_v2f16_ulp25:
+define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
+; GFX6-IEEE-LABEL: v_neg_rcp_v2f16_fabs:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
+; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-IEEE-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX6-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v3, v4, v6
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v2, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v5, v5, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v5, v1
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v6, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v3, v6
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v6
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v5, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-FLUSH-LABEL: v_rcp_v2f16_ulp25:
+; GFX6-FLUSH-LABEL: v_neg_rcp_v2f16_fabs:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
+; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-FLUSH-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX6-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
@@ -1337,342 +1538,2334 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v2, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v5, v5, v4
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v4, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v6, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v1, v6, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v2, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v6, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v6
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v5, v4
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: v_rcp_v2f16_ulp25:
+; GFX8-LABEL: v_neg_rcp_v2f16_fabs:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_rcp_f16_e32 v1, v0
-; GFX8-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_rcp_v2f16_ulp25:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rcp_f16_e32 v1, v0
-; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX8-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX8-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX8-NEXT: v_rcp_f32_e32 v1, v1
+; GFX8-NEXT: v_rcp_f32_e32 v3, v3
+; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1
+; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
+; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, -1.0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_neg_rcp_v2f16_fabs:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX9-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX9-NEXT: v_rcp_f32_e32 v1, v1
+; GFX9-NEXT: v_rcp_f32_e32 v3, v3
+; GFX9-NEXT: v_mul_f32_e32 v1, v4, v1
+; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX9-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
+; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, -1.0
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: v_rcp_v2f16_ulp25:
+; GFX10-LABEL: v_neg_rcp_v2f16_fabs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_rcp_f16_e32 v1, v0
-; GFX10-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX10-NEXT: v_rcp_f32_e32 v2, v2
+; GFX10-NEXT: v_rcp_f32_e32 v3, v3
+; GFX10-NEXT: v_mul_f32_e32 v2, v4, v2
+; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX10-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_rcp_v2f16_ulp25:
+; GFX11-LABEL: v_neg_rcp_v2f16_fabs:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX11-NEXT: v_rcp_f16_e32 v0, v0
-; GFX11-NEXT: v_rcp_f16_e32 v1, v1
+; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX11-NEXT: v_rcp_f32_e32 v3, v3
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_rcp_f32_e32 v2, v2
+; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f32_e32 v2, v4, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
- %fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x, !fpmath !0
+ %x.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x)
+ %fdiv = fdiv <2 x half> <half -1.0, half -1.0>, %x.fabs
ret <2 x half> %fdiv
}
-define <2 x half> @v_fdiv_v2f16_afn_ulp25(<2 x half> %a, <2 x half> %b) {
-; GFX6-LABEL: v_fdiv_v2f16_afn_ulp25:
+define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) { ; Reciprocal test: <2 x half> vector, numerator is the splat constant 1.0, fdiv carries only the arcp flag.
+; GFX6-IEEE-LABEL: v_rcp_v2f16_arcp:
+; GFX6-IEEE: ; %bb.0:
+; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-FLUSH-LABEL: v_rcp_v2f16_arcp:
+; GFX6-FLUSH: ; %bb.0:
+; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_rcp_v2f16_arcp:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX8-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX8-NEXT: v_rcp_f32_e32 v1, v1
+; GFX8-NEXT: v_rcp_f32_e32 v3, v3
+; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1
+; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
+; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_rcp_v2f16_arcp:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX9-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX9-NEXT: v_rcp_f32_e32 v1, v1
+; GFX9-NEXT: v_rcp_f32_e32 v3, v3
+; GFX9-NEXT: v_mul_f32_e32 v1, v4, v1
+; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX9-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
+; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_rcp_v2f16_arcp:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX10-NEXT: v_rcp_f32_e32 v2, v2
+; GFX10-NEXT: v_rcp_f32_e32 v3, v3
+; GFX10-NEXT: v_mul_f32_e32 v2, v4, v2
+; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX10-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
+; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_rcp_v2f16_arcp:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX11-NEXT: v_rcp_f32_e32 v2, v2
+; GFX11-NEXT: v_rcp_f32_e32 v3, v3
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f32_e32 v2, v4, v2
+; GFX11-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %fdiv = fdiv arcp <2 x half> <half 1.0, half 1.0>, %x ; arcp is the only fast-math flag here; afn is not set and there is no !fpmath metadata.
+ ret <2 x half> %fdiv
+}
+
+define <2 x half> @v_rcp_v2f16_arcp_afn(<2 x half> %x) {
+; GFX6-LABEL: v_rcp_v2f16_arcp_afn:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT: v_rcp_f32_e32 v2, v2
-; GFX6-NEXT: v_rcp_f32_e32 v3, v3
-; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, 1.0
+; GFX6-NEXT: v_rcp_f32_e32 v0, v0
+; GFX6-NEXT: v_rcp_f32_e32 v1, v1
+; GFX6-NEXT: v_mul_f32_e32 v0, v2, v0
+; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: v_fdiv_v2f16_afn_ulp25:
+; GFX8-LABEL: v_rcp_v2f16_arcp_afn:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_rcp_f16_e32 v2, v1
-; GFX8-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT: v_rcp_f16_e32 v1, v0
+; GFX8-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fdiv_v2f16_afn_ulp25:
+; GFX9-LABEL: v_rcp_v2f16_arcp_afn:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rcp_f16_e32 v2, v1
-; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX9-NEXT: v_rcp_f16_e32 v1, v0
+; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_rcp_v2f16_arcp_afn:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_rcp_f16_e32 v1, v0
+; GFX10-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_rcp_v2f16_arcp_afn:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT: v_rcp_f16_e32 v0, v0
+; GFX11-NEXT: v_rcp_f16_e32 v1, v1
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %fdiv = fdiv arcp afn <2 x half> <half 1.0, half 1.0>, %x
+ ret <2 x half> %fdiv
+}
+
+define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
+; GFX6-IEEE-LABEL: v_rcp_v2f16_ulp25:
+; GFX6-IEEE: ; %bb.0:
+; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-FLUSH-LABEL: v_rcp_v2f16_ulp25:
+; GFX6-FLUSH: ; %bb.0:
+; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_rcp_v2f16_ulp25:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_rcp_f16_e32 v1, v0
+; GFX8-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_rcp_v2f16_ulp25:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_rcp_f16_e32 v1, v0
+; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_rcp_v2f16_ulp25:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_rcp_f16_e32 v1, v0
+; GFX10-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_rcp_v2f16_ulp25:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT: v_rcp_f16_e32 v0, v0
+; GFX11-NEXT: v_rcp_f16_e32 v1, v1
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x, !fpmath !0
+ ret <2 x half> %fdiv
+}
+
+define <2 x half> @v_fdiv_v2f16_afn_ulp25(<2 x half> %a, <2 x half> %b) {
+; GFX6-LABEL: v_fdiv_v2f16_afn_ulp25:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT: v_rcp_f32_e32 v2, v2
+; GFX6-NEXT: v_rcp_f32_e32 v3, v3
+; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fdiv_v2f16_afn_ulp25:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_rcp_f16_e32 v2, v1
+; GFX8-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_mul_f16_e32 v2, v0, v2
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fdiv_v2f16_afn_ulp25:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_rcp_f16_e32 v2, v1
+; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2
+; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fdiv_v2f16_afn_ulp25:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_rcp_f16_e32 v2, v1
+; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fdiv_v2f16_afn_ulp25:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-NEXT: v_rcp_f16_e32 v1, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT: v_rcp_f16_e32 v2, v2
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX11-NEXT: v_mul_f16_e32 v1, v3, v2
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %fdiv = fdiv afn <2 x half> %a, %b, !fpmath !0
+ ret <2 x half> %fdiv
+}
+
+define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) {
+; GFX6-IEEE-LABEL: v_fdiv_v2f16_arcp_ulp25:
+; GFX6-IEEE: ; %bb.0:
+; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4
+; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v7, v5, v5
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v7, v6, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v8, -v4, v7, v6
+; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v6
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v3, v1
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-FLUSH-LABEL: v_fdiv_v2f16_arcp_ulp25:
+; GFX6-FLUSH: ; %bb.0:
+; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v5, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v4, v5, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v7, v5, v5
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v8, -v4, v7, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, v8, v5, v7
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v6
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v4, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v6
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fdiv_v2f16_arcp_ulp25:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX8-NEXT: v_cvt_f32_f16_e32 v5, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX8-NEXT: v_rcp_f32_e32 v2, v2
+; GFX8-NEXT: v_cvt_f32_f16_e32 v7, v6
+; GFX8-NEXT: v_rcp_f32_e32 v5, v5
+; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2
+; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX8-NEXT: v_mul_f32_e32 v3, v7, v5
+; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0
+; GFX8-NEXT: v_div_fixup_f16 v1, v3, v4, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fdiv_v2f16_arcp_ulp25:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX9-NEXT: v_rcp_f32_e32 v2, v2
+; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v6
+; GFX9-NEXT: v_rcp_f32_e32 v5, v5
+; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2
+; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX9-NEXT: v_mul_f32_e32 v3, v7, v5
+; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX9-NEXT: v_div_fixup_f16 v0, v2, v1, v0
+; GFX9-NEXT: v_div_fixup_f16 v1, v3, v4, v6
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fdiv_v2f16_arcp_ulp25:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GFX10-NEXT: v_rcp_f32_e32 v3, v3
+; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v5
+; GFX10-NEXT: v_rcp_f32_e32 v4, v4
+; GFX10-NEXT: v_mul_f32_e32 v3, v6, v3
+; GFX10-NEXT: v_mul_f32_e32 v4, v7, v4
+; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX10-NEXT: v_div_fixup_f16 v0, v3, v1, v0
+; GFX10-NEXT: v_div_fixup_f16 v1, v4, v2, v5
+; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fdiv_v2f16_arcp_ulp25:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GFX11-NEXT: v_rcp_f32_e32 v3, v3
+; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v5
+; GFX11-NEXT: v_rcp_f32_e32 v4, v4
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_dual_mul_f32 v3, v6, v3 :: v_dual_mul_f32 v4, v7, v4
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX11-NEXT: v_div_fixup_f16 v0, v3, v1, v0
+; GFX11-NEXT: v_div_fixup_f16 v1, v4, v2, v5
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %fdiv = fdiv arcp <2 x half> %a, %b, !fpmath !0
+ ret <2 x half> %fdiv
+}
+
+define <2 x half> @v_fdiv_v2f16_arcp_afn_ulp25(<2 x half> %a, <2 x half> %b) {
+; GFX6-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT: v_rcp_f32_e32 v2, v2
+; GFX6-NEXT: v_rcp_f32_e32 v3, v3
+; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_rcp_f16_e32 v2, v1
+; GFX8-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_mul_f16_e32 v2, v0, v2
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_rcp_f16_e32 v2, v1
+; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2
+; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_rcp_f16_e32 v2, v1
+; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2
+; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-NEXT: v_rcp_f16_e32 v1, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; GFX11-NEXT: v_rcp_f16_e32 v2, v2
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX11-NEXT: v_mul_f16_e32 v1, v3, v2
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %fdiv = fdiv afn arcp <2 x half> %a, %b, !fpmath !0
+ ret <2 x half> %fdiv
+}
+
+define amdgpu_ps i16 @s_fdiv_f16(i16 inreg %a.arg, i16 inreg %b.arg) {
+; GFX6-IEEE-LABEL: s_fdiv_f16:
+; GFX6-IEEE: ; %bb.0:
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-IEEE-NEXT: ; return to shader part epilog
+;
+; GFX6-FLUSH-LABEL: s_fdiv_f16:
+; GFX6-FLUSH: ; %bb.0:
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-FLUSH-NEXT: ; return to shader part epilog
+;
+; GFX89-LABEL: s_fdiv_f16:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: v_cvt_f32_f16_e32 v0, s1
+; GFX89-NEXT: v_cvt_f32_f16_e32 v1, s0
+; GFX89-NEXT: v_rcp_f32_e32 v0, v0
+; GFX89-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX89-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX89-NEXT: v_mov_b32_e32 v1, s1
+; GFX89-NEXT: v_div_fixup_f16 v0, v0, v1, s0
+; GFX89-NEXT: v_readfirstlane_b32 s0, v0
+; GFX89-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: s_fdiv_f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cvt_f32_f16_e32 v0, s1
+; GFX10-NEXT: v_cvt_f32_f16_e32 v1, s0
+; GFX10-NEXT: v_rcp_f32_e32 v0, v0
+; GFX10-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT: v_div_fixup_f16 v0, v0, s1, s0
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: s_fdiv_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s1
+; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s0
+; GFX11-NEXT: v_rcp_f32_e32 v0, v0
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT: v_div_fixup_f16 v0, v0, s1, s0
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: ; return to shader part epilog
+ %a = bitcast i16 %a.arg to half
+ %b = bitcast i16 %b.arg to half
+ %fdiv = fdiv half %a, %b
+ %result = bitcast half %fdiv to i16
+ ret i16 %result
+}
+
+define amdgpu_ps i16 @s_fdiv_f16_arcp(i16 inreg %a.arg, i16 inreg %b.arg) {
+; GFX6-IEEE-LABEL: s_fdiv_f16_arcp:
+; GFX6-IEEE: ; %bb.0:
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-IEEE-NEXT: ; return to shader part epilog
+;
+; GFX6-FLUSH-LABEL: s_fdiv_f16_arcp:
+; GFX6-FLUSH: ; %bb.0:
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-FLUSH-NEXT: ; return to shader part epilog
+;
+; GFX89-LABEL: s_fdiv_f16_arcp:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: v_cvt_f32_f16_e32 v0, s1
+; GFX89-NEXT: v_cvt_f32_f16_e32 v1, s0
+; GFX89-NEXT: v_rcp_f32_e32 v0, v0
+; GFX89-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX89-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX89-NEXT: v_mov_b32_e32 v1, s1
+; GFX89-NEXT: v_div_fixup_f16 v0, v0, v1, s0
+; GFX89-NEXT: v_readfirstlane_b32 s0, v0
+; GFX89-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: s_fdiv_f16_arcp:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cvt_f32_f16_e32 v0, s1
+; GFX10-NEXT: v_cvt_f32_f16_e32 v1, s0
+; GFX10-NEXT: v_rcp_f32_e32 v0, v0
+; GFX10-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT: v_div_fixup_f16 v0, v0, s1, s0
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: s_fdiv_f16_arcp:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s1
+; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s0
+; GFX11-NEXT: v_rcp_f32_e32 v0, v0
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT: v_div_fixup_f16 v0, v0, s1, s0
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: ; return to shader part epilog
+ %a = bitcast i16 %a.arg to half
+ %b = bitcast i16 %b.arg to half
+ %fdiv = fdiv arcp half %a, %b
+ %result = bitcast half %fdiv to i16
+ ret i16 %result
+}
+
+define amdgpu_ps i16 @s_fdiv_f16_afn(i16 inreg %a.arg, i16 inreg %b.arg) {
+; GFX6-LABEL: s_fdiv_f16_afn:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, s0
+; GFX6-NEXT: v_rcp_f32_e32 v0, v0
+; GFX6-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX89-LABEL: s_fdiv_f16_afn:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: v_rcp_f16_e32 v0, s1
+; GFX89-NEXT: v_mul_f16_e32 v0, s0, v0
+; GFX89-NEXT: v_readfirstlane_b32 s0, v0
+; GFX89-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: s_fdiv_f16_afn:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_rcp_f16_e32 v0, s1
+; GFX10-NEXT: v_mul_f16_e32 v0, s0, v0
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: s_fdiv_f16_afn:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_rcp_f16_e32 v0, s1
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f16_e32 v0, s0, v0
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: ; return to shader part epilog
+ %a = bitcast i16 %a.arg to half
+ %b = bitcast i16 %b.arg to half
+ %fdiv = fdiv afn half %a, %b
+ %result = bitcast half %fdiv to i16
+ ret i16 %result
+}
+
+define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) {
+; GFX6-IEEE-LABEL: s_fdiv_v2f16:
+; GFX6-IEEE: ; %bb.0:
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s1
+; GFX6-IEEE-NEXT: s_lshr_b32 s0, s0, 16
+; GFX6-IEEE-NEXT: s_lshr_b32 s1, s1, 16
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v4, s0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v6, s1
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[0:1], v6, v6, v4
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, vcc, v4, v6, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v3, v5, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v2, v5, v5
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v1, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v5, v1
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v7, v2, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v3, v5, v1
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v1, v6, v4
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-IEEE-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-IEEE-NEXT: ; return to shader part epilog
+;
+; GFX6-FLUSH-LABEL: s_fdiv_v2f16:
+; GFX6-FLUSH: ; %bb.0:
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s1
+; GFX6-FLUSH-NEXT: s_lshr_b32 s0, s0, 16
+; GFX6-FLUSH-NEXT: s_lshr_b32 s1, s1, 16
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[2:3], v1, v1, v0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, s0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, s1
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[0:1], v4, v4, v3
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v3, v4, v3
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v6, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v1, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v2, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v6, v5
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v6
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v4, v3
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-FLUSH-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_fdiv_v2f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: v_cvt_f32_f16_e32 v0, s1
+; GFX8-NEXT: s_lshr_b32 s2, s1, 16
+; GFX8-NEXT: v_cvt_f32_f16_e32 v2, s2
+; GFX8-NEXT: v_cvt_f32_f16_e32 v1, s0
+; GFX8-NEXT: v_rcp_f32_e32 v0, v0
+; GFX8-NEXT: s_lshr_b32 s3, s0, 16
+; GFX8-NEXT: v_cvt_f32_f16_e32 v3, s3
+; GFX8-NEXT: v_rcp_f32_e32 v2, v2
+; GFX8-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX8-NEXT: v_mul_f32_e32 v1, v3, v2
+; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, s1
+; GFX8-NEXT: v_div_fixup_f16 v0, v0, v2, s0
+; GFX8-NEXT: v_mov_b32_e32 v2, s2
+; GFX8-NEXT: v_div_fixup_f16 v1, v1, v2, s3
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_fdiv_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_cvt_f32_f16_e32 v0, s1
+; GFX9-NEXT: s_lshr_b32 s2, s1, 16
+; GFX9-NEXT: v_cvt_f32_f16_e32 v2, s2
+; GFX9-NEXT: v_cvt_f32_f16_e32 v1, s0
+; GFX9-NEXT: v_rcp_f32_e32 v0, v0
+; GFX9-NEXT: s_lshr_b32 s3, s0, 16
+; GFX9-NEXT: v_cvt_f32_f16_e32 v3, s3
+; GFX9-NEXT: v_rcp_f32_e32 v2, v2
+; GFX9-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT: v_mul_f32_e32 v1, v3, v2
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: v_div_fixup_f16 v0, v0, v2, s0
+; GFX9-NEXT: v_mov_b32_e32 v2, s2
+; GFX9-NEXT: v_div_fixup_f16 v1, v1, v2, s3
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: s_fdiv_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_lshr_b32 s2, s1, 16
+; GFX10-NEXT: v_cvt_f32_f16_e32 v0, s1
+; GFX10-NEXT: v_cvt_f32_f16_e32 v1, s2
+; GFX10-NEXT: s_lshr_b32 s3, s0, 16
+; GFX10-NEXT: v_cvt_f32_f16_e32 v2, s0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v3, s3
+; GFX10-NEXT: v_rcp_f32_e32 v0, v0
+; GFX10-NEXT: v_rcp_f32_e32 v1, v1
+; GFX10-NEXT: v_mul_f32_e32 v0, v2, v0
+; GFX10-NEXT: v_mul_f32_e32 v1, v3, v1
+; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX10-NEXT: v_div_fixup_f16 v0, v0, s1, s0
+; GFX10-NEXT: v_div_fixup_f16 v1, v1, s2, s3
+; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: s_fdiv_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_lshr_b32 s2, s1, 16
+; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s1
+; GFX11-NEXT: v_cvt_f32_f16_e32 v1, s2
+; GFX11-NEXT: s_lshr_b32 s3, s0, 16
+; GFX11-NEXT: v_cvt_f32_f16_e32 v2, s0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v3, s3
+; GFX11-NEXT: v_rcp_f32_e32 v0, v0
+; GFX11-NEXT: v_rcp_f32_e32 v1, v1
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_dual_mul_f32 v0, v2, v0 :: v_dual_mul_f32 v1, v3, v1
+; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX11-NEXT: v_div_fixup_f16 v0, v0, s1, s0
+; GFX11-NEXT: v_div_fixup_f16 v1, v1, s2, s3
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: ; return to shader part epilog
+ %a = bitcast i32 %a.arg to <2 x half>
+ %b = bitcast i32 %b.arg to <2 x half>
+ %fdiv = fdiv <2 x half> %a, %b
+ %result = bitcast <2 x half> %fdiv to i32
+ ret i32 %result
+}
+
+; Scalar reciprocal: the i16 `inreg` argument is reinterpreted as half,
+; divided into 1.0, and the quotient is bitcast back to i16.
+; In the checks below the GFX6 variants perform the division in f32
+; (div_scale / rcp / fma refinement / div_fmas / div_fixup), while the later
+; targets use v_rcp_f32 + v_mul_f32 followed by v_div_fixup_f16. Every variant
+; finishes by copying the result into s0 with v_readfirstlane_b32.
+; NOTE(review): the check lines look tool-generated (presumably by
+; update_llc_test_checks.py) - regenerate them rather than editing by hand.
+define amdgpu_ps i16 @s_rcp_f16(i16 inreg %a.arg) {
+; GFX6-IEEE-LABEL: s_rcp_f16:
+; GFX6-IEEE: ; %bb.0:
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, 1.0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-IEEE-NEXT: ; return to shader part epilog
+;
+; GFX6-FLUSH-LABEL: s_rcp_f16:
+; GFX6-FLUSH: ; %bb.0:
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, 1.0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-FLUSH-NEXT: ; return to shader part epilog
+;
+; GFX89-LABEL: s_rcp_f16:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX89-NEXT: v_cvt_f32_f16_e32 v1, 1.0
+; GFX89-NEXT: v_rcp_f32_e32 v0, v0
+; GFX89-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX89-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX89-NEXT: v_div_fixup_f16 v0, v0, s0, 1.0
+; GFX89-NEXT: v_readfirstlane_b32 s0, v0
+; GFX89-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: s_rcp_f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v1, 1.0
+; GFX10-NEXT: v_rcp_f32_e32 v0, v0
+; GFX10-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT: v_div_fixup_f16 v0, v0, s0, 1.0
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: s_rcp_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v1, 1.0
+; GFX11-NEXT: v_rcp_f32_e32 v0, v0
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT: v_div_fixup_f16 v0, v0, s0, 1.0
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: ; return to shader part epilog
+ %a = bitcast i16 %a.arg to half
+ %fdiv = fdiv half 1.0, %a
+ %result = bitcast half %fdiv to i16
+ ret i16 %result
+}
+
+; Scalar negated reciprocal: the i16 `inreg` argument is reinterpreted as half
+; and divided into -1.0; the quotient is bitcast back to i16.
+; In the checks below the -1.0 numerator appears as an inline operand of
+; v_cvt_f32_f16_e32 and, on the targets that use v_div_fixup_f16, as that
+; instruction's last operand. No variant shows a separate negate instruction.
+define amdgpu_ps i16 @s_neg_rcp_f16(i16 inreg %a.arg) {
+; GFX6-IEEE-LABEL: s_neg_rcp_f16:
+; GFX6-IEEE: ; %bb.0:
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, -1.0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-IEEE-NEXT: ; return to shader part epilog
+;
+; GFX6-FLUSH-LABEL: s_neg_rcp_f16:
+; GFX6-FLUSH: ; %bb.0:
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, -1.0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-FLUSH-NEXT: ; return to shader part epilog
+;
+; GFX89-LABEL: s_neg_rcp_f16:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX89-NEXT: v_cvt_f32_f16_e32 v1, -1.0
+; GFX89-NEXT: v_rcp_f32_e32 v0, v0
+; GFX89-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX89-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX89-NEXT: v_div_fixup_f16 v0, v0, s0, -1.0
+; GFX89-NEXT: v_readfirstlane_b32 s0, v0
+; GFX89-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: s_neg_rcp_f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v1, -1.0
+; GFX10-NEXT: v_rcp_f32_e32 v0, v0
+; GFX10-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX10-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX10-NEXT: v_div_fixup_f16 v0, v0, s0, -1.0
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: s_neg_rcp_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v1, -1.0
+; GFX11-NEXT: v_rcp_f32_e32 v0, v0
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT: v_div_fixup_f16 v0, v0, s0, -1.0
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: ; return to shader part epilog
+ %a = bitcast i16 %a.arg to half
+ %fdiv = fdiv half -1.0, %a
+ %result = bitcast half %fdiv to i16
+ ret i16 %result
+}
+
+; Scalar reciprocal square root written as two operations: llvm.sqrt.f16 on
+; the (bitcast) `inreg` argument, then 1.0 divided by that square root.
+; None of the checked outputs below contains a dedicated rsq instruction: the
+; GFX6 variants emit v_sqrt_f32 followed by the f32 division expansion, and the
+; later targets emit v_sqrt_f16 followed by the v_rcp_f32-based sequence ending
+; in v_div_fixup_f16.
+define amdgpu_ps i16 @s_rsq_f16(i16 inreg %a.arg) {
+; GFX6-IEEE-LABEL: s_rsq_f16:
+; GFX6-IEEE: ; %bb.0:
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
+; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-IEEE-NEXT: ; return to shader part epilog
+;
+; GFX6-FLUSH-LABEL: s_rsq_f16:
+; GFX6-FLUSH: ; %bb.0:
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
+; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, v1
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-FLUSH-NEXT: ; return to shader part epilog
+;
+; GFX89-LABEL: s_rsq_f16:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: v_sqrt_f16_e32 v0, s0
+; GFX89-NEXT: v_cvt_f32_f16_e32 v2, 1.0
+; GFX89-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX89-NEXT: v_rcp_f32_e32 v1, v1
+; GFX89-NEXT: v_mul_f32_e32 v1, v2, v1
+; GFX89-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX89-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
+; GFX89-NEXT: v_readfirstlane_b32 s0, v0
+; GFX89-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: s_rsq_f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_sqrt_f16_e32 v0, s0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v2, 1.0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX10-NEXT: v_rcp_f32_e32 v1, v1
+; GFX10-NEXT: v_mul_f32_e32 v1, v2, v1
+; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX10-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: s_rsq_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: v_sqrt_f16_e32 v0, s0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v2, 1.0
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX11-NEXT: v_rcp_f32_e32 v1, v1
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f32_e32 v1, v2, v1
+; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX11-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: ; return to shader part epilog
+ %a = bitcast i16 %a.arg to half
+ %sqrt = call half @llvm.sqrt.f16(half %a)
+ %fdiv = fdiv half 1.0, %sqrt
+ %result = bitcast half %fdiv to i16
+ ret i16 %result
+}
+
+; Two-lane variant: the i32 `inreg` argument is reinterpreted as <2 x half>,
+; passed through llvm.sqrt.v2f16, and divided into a <-1.0, -1.0> splat; the
+; quotient is bitcast back to i32.
+; In the checks below each lane gets its own sqrt and its own division
+; sequence. The halves are recombined with v_lshlrev_b32 + v_or_b32 on the
+; GFX6 and GFX8 variants and with v_pack_b32_f16 on GFX9 and later.
+; NOTE(review): the numerator is -1.0 in both lanes although the function name
+; carries no `neg` (compare the scalar s_rsq_f16 / s_neg_rcp_f16 naming) -
+; confirm whether the name or the constant is what was intended.
+define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
+; GFX6-IEEE-LABEL: s_rsq_v2f16:
+; GFX6-IEEE: ; %bb.0:
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX6-IEEE-NEXT: s_lshr_b32 s0, s0, 16
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, -1.0
+; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v1, v1
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[0:1], v0, v0, v2
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v0, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v6, s[0:1], v1, v1, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v9, -v3, v5, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v9, v5, v5
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v9, v4, v5
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v8, v6
+; GFX6-IEEE-NEXT: v_fma_f32 v10, -v3, v9, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v9, v10, v5, v9
+; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v9
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v3, -v6, v8, 1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[0:1], v2, v1, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v3, v8, v8
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v7, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v6, v4, v7
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v3, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v6, v4, v7
+; GFX6-IEEE-NEXT: s_mov_b64 vcc, s[0:1]
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v5, v3, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-IEEE-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-IEEE-NEXT: ; return to shader part epilog
+;
+; GFX6-FLUSH-LABEL: s_rsq_v2f16:
+; GFX6-FLUSH: ; %bb.0:
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX6-FLUSH-NEXT: s_lshr_b32 s0, s0, 16
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, -1.0
+; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[0:1], v0, v0, v2
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v4
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX6-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-FLUSH-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_rsq_v2f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_lshr_b32 s1, s0, 16
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_sqrt_f16_e32 v0, s0
+; GFX8-NEXT: v_sqrt_f16_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX8-NEXT: v_rcp_f32_e32 v1, v1
+; GFX8-NEXT: v_rcp_f32_e32 v3, v3
+; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1
+; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
+; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, -1.0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: v_readfirstlane_b32 s0, v0
+; GFX8-NEXT: ; return to shader part epilog
+;
+; GFX9-LABEL: s_rsq_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_sqrt_f16_e32 v0, s0
+; GFX9-NEXT: s_lshr_b32 s0, s0, 16
+; GFX9-NEXT: v_sqrt_f16_e32 v1, s0
+; GFX9-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX9-NEXT: v_rcp_f32_e32 v2, v2
+; GFX9-NEXT: v_rcp_f32_e32 v3, v3
+; GFX9-NEXT: v_mul_f32_e32 v2, v4, v2
+; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX9-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX9-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT: v_readfirstlane_b32 s0, v0
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: s_rsq_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_lshr_b32 s1, s0, 16
+; GFX10-NEXT: v_sqrt_f16_e32 v0, s0
+; GFX10-NEXT: v_sqrt_f16_e32 v1, s1
+; GFX10-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX10-NEXT: v_rcp_f32_e32 v2, v2
+; GFX10-NEXT: v_rcp_f32_e32 v3, v3
+; GFX10-NEXT: v_mul_f32_e32 v2, v4, v2
+; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX10-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-NEXT: v_readfirstlane_b32 s0, v0
+; GFX10-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: s_rsq_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_lshr_b32 s1, s0, 16
+; GFX11-NEXT: v_sqrt_f16_e32 v0, s0
+; GFX11-NEXT: v_sqrt_f16_e32 v1, s1
+; GFX11-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX11-NEXT: v_rcp_f32_e32 v2, v2
+; GFX11-NEXT: v_rcp_f32_e32 v3, v3
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f32_e32 v2, v4, v2
+; GFX11-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: ; return to shader part epilog
+ %a = bitcast i32 %a.arg to <2 x half>
+ %sqrt = call <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
+ %fdiv = fdiv <2 x half> <half -1.0, half -1.0>, %sqrt
+ %result = bitcast <2 x half> %fdiv to i32
+ ret i32 %result
+}
+
+; Same 1.0 / sqrt(%a) pattern, but with a plain `half` argument and return
+; value and no amdgpu_ps calling convention. In the checks below every variant
+; begins with s_waitcnt and returns through s_setpc_b64 s[30:31]; the result
+; stays in v0 (no v_readfirstlane_b32).
+; The GFX6 variants take the square root and divide in f32; the later targets
+; use v_sqrt_f16 and the v_rcp_f32-based sequence ending in v_div_fixup_f16.
+define half @v_rsq_f16(half %a) {
+; GFX6-IEEE-LABEL: v_rsq_f16:
+; GFX6-IEEE: ; %bb.0:
+; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
+; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-FLUSH-LABEL: v_rsq_f16:
+; GFX6-FLUSH: ; %bb.0:
+; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
+; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v_rsq_f16:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX89-NEXT: v_cvt_f32_f16_e32 v2, 1.0
+; GFX89-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX89-NEXT: v_rcp_f32_e32 v1, v1
+; GFX89-NEXT: v_mul_f32_e32 v1, v2, v1
+; GFX89-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX89-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
+; GFX89-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_rsq_f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v2, 1.0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX10-NEXT: v_rcp_f32_e32 v1, v1
+; GFX10-NEXT: v_mul_f32_e32 v1, v2, v1
+; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX10-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_rsq_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v2, 1.0
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX11-NEXT: v_rcp_f32_e32 v1, v1
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f32_e32 v1, v2, v1
+; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX11-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %sqrt = call half @llvm.sqrt.f16(half %a)
+ %fdiv = fdiv half 1.0, %sqrt
+ ret half %fdiv
+}
+
+; Negated form of the sqrt-then-divide pattern: -1.0 / sqrt(%a) on a plain
+; `half` argument. In the checks below the -1.0 numerator appears as an inline
+; operand (v_cvt_f32_f16_e32 and, where present, the last operand of
+; v_div_fixup_f16); no variant shows a separate negate instruction.
+define half @v_neg_rsq_f16(half %a) {
+; GFX6-IEEE-LABEL: v_neg_rsq_f16:
+; GFX6-IEEE: ; %bb.0:
+; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
+; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-FLUSH-LABEL: v_neg_rsq_f16:
+; GFX6-FLUSH: ; %bb.0:
+; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
+; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v_neg_rsq_f16:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX89-NEXT: v_cvt_f32_f16_e32 v2, -1.0
+; GFX89-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX89-NEXT: v_rcp_f32_e32 v1, v1
+; GFX89-NEXT: v_mul_f32_e32 v1, v2, v1
+; GFX89-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX89-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
+; GFX89-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_neg_rsq_f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v2, -1.0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX10-NEXT: v_rcp_f32_e32 v1, v1
+; GFX10-NEXT: v_mul_f32_e32 v1, v2, v1
+; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX10-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_neg_rsq_f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v2, -1.0
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX11-NEXT: v_rcp_f32_e32 v1, v1
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f32_e32 v1, v2, v1
+; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX11-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %sqrt = call half @llvm.sqrt.f16(half %a)
+ %fdiv = fdiv half -1.0, %sqrt
+ ret half %fdiv
+}
+
+; -1.0 / sqrt(fabs(%a)): llvm.fabs.f16 is applied to the argument before the
+; square root. In the checks below the fabs shows up only as a |v0| source
+; modifier - on v_cvt_f32_f16_e64 for the GFX6 variants and on v_sqrt_f16_e64
+; for the later targets - rather than as a separate instruction. The rest of
+; each sequence has the same shape as the checks for the non-fabs input.
+define half @v_neg_rsq_f16_fabs(half %a) {
+; GFX6-IEEE-LABEL: v_neg_rsq_f16_fabs:
+; GFX6-IEEE: ; %bb.0:
+; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e64 v0, |v0|
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
+; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-FLUSH-LABEL: v_neg_rsq_f16_fabs:
+; GFX6-FLUSH: ; %bb.0:
+; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e64 v0, |v0|
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
+; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v_neg_rsq_f16_fabs:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT: v_sqrt_f16_e64 v0, |v0|
+; GFX89-NEXT: v_cvt_f32_f16_e32 v2, -1.0
+; GFX89-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX89-NEXT: v_rcp_f32_e32 v1, v1
+; GFX89-NEXT: v_mul_f32_e32 v1, v2, v1
+; GFX89-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX89-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
+; GFX89-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_neg_rsq_f16_fabs:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_sqrt_f16_e64 v0, |v0|
+; GFX10-NEXT: v_cvt_f32_f16_e32 v2, -1.0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX10-NEXT: v_rcp_f32_e32 v1, v1
+; GFX10-NEXT: v_mul_f32_e32 v1, v2, v1
+; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX10-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_neg_rsq_f16_fabs:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_sqrt_f16_e64 v0, |v0|
+; GFX11-NEXT: v_cvt_f32_f16_e32 v2, -1.0
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX11-NEXT: v_rcp_f32_e32 v1, v1
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f32_e32 v1, v2, v1
+; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX11-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %a.fabs = call half @llvm.fabs.f16(half %a)
+ %sqrt = call half @llvm.sqrt.f16(half %a.fabs)
+ %fdiv = fdiv half -1.0, %sqrt
+ ret half %fdiv
+}
+
+define half @v_rsq_f16_arcp(half %a) {
+; GFX6-IEEE-LABEL: v_rsq_f16_arcp:
+; GFX6-IEEE: ; %bb.0:
+; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
+; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-FLUSH-LABEL: v_rsq_f16_arcp:
+; GFX6-FLUSH: ; %bb.0:
+; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
+; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v_rsq_f16_arcp:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX89-NEXT: v_cvt_f32_f16_e32 v2, 1.0
+; GFX89-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX89-NEXT: v_rcp_f32_e32 v1, v1
+; GFX89-NEXT: v_mul_f32_e32 v1, v2, v1
+; GFX89-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX89-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
+; GFX89-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_rsq_f16_arcp:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v2, 1.0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX10-NEXT: v_rcp_f32_e32 v1, v1
+; GFX10-NEXT: v_mul_f32_e32 v1, v2, v1
+; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX10-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_rsq_f16_arcp:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v2, 1.0
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX11-NEXT: v_rcp_f32_e32 v1, v1
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f32_e32 v1, v2, v1
+; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX11-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %sqrt = call half @llvm.sqrt.f16(half %a)
+ %fdiv = fdiv arcp half 1.0, %sqrt
+ ret half %fdiv
+}
+
+define half @v_neg_rsq_f16_arcp(half %a) {
+; GFX6-IEEE-LABEL: v_neg_rsq_f16_arcp:
+; GFX6-IEEE: ; %bb.0:
+; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
+; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-FLUSH-LABEL: v_neg_rsq_f16_arcp:
+; GFX6-FLUSH: ; %bb.0:
+; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
+; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v_neg_rsq_f16_arcp:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX89-NEXT: v_cvt_f32_f16_e32 v2, -1.0
+; GFX89-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX89-NEXT: v_rcp_f32_e32 v1, v1
+; GFX89-NEXT: v_mul_f32_e32 v1, v2, v1
+; GFX89-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX89-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
+; GFX89-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_neg_rsq_f16_arcp:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v2, -1.0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX10-NEXT: v_rcp_f32_e32 v1, v1
+; GFX10-NEXT: v_mul_f32_e32 v1, v2, v1
+; GFX10-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX10-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_neg_rsq_f16_arcp:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v2, -1.0
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX11-NEXT: v_rcp_f32_e32 v1, v1
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f32_e32 v1, v2, v1
+; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX11-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %sqrt = call half @llvm.sqrt.f16(half %a)
+ %fdiv = fdiv arcp half -1.0, %sqrt
+ ret half %fdiv
+}
+
+define half @v_rsq_f16_afn(half %a) {
+; GFX6-LABEL: v_rsq_f16_afn:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, 1.0
+; GFX6-NEXT: v_sqrt_f32_e32 v0, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_rcp_f32_e32 v0, v0
+; GFX6-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v_rsq_f16_afn:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT: v_rsq_f16_e32 v0, v0
+; GFX89-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10PLUS-LABEL: v_rsq_f16_afn:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10PLUS-NEXT: v_rsq_f16_e32 v0, v0
+; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
+ %sqrt = call half @llvm.sqrt.f16(half %a)
+ %fdiv = fdiv afn half 1.0, %sqrt
+ ret half %fdiv
+}
+
+define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
+; GFX6-IEEE-LABEL: v_rsq_v2f16:
+; GFX6-IEEE: ; %bb.0:
+; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0
+; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v1, v1
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v3
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v0, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v1, v1, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v9, -v3, v6, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v6, v9, v6, v6
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v8, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v10, -v3, v9, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v9, v10, v6, v9
+; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v6, v9
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v3, -v5, v8, 1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], v2, v1, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v3, v8, v8
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v7, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v5, v4, v7
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v3, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v5, v4, v7
+; GFX6-IEEE-NEXT: s_mov_b64 vcc, s[4:5]
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v5, v3, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-FLUSH-LABEL: v_rsq_v2f16:
+; GFX6-FLUSH: ; %bb.0:
+; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0
+; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_rsq_v2f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_sqrt_f16_e32 v1, v0
+; GFX8-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX8-NEXT: v_rcp_f32_e32 v1, v1
+; GFX8-NEXT: v_rcp_f32_e32 v3, v3
+; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1
+; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
+; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_rsq_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_sqrt_f16_e32 v1, v0
+; GFX9-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX9-NEXT: v_rcp_f32_e32 v2, v2
+; GFX9-NEXT: v_rcp_f32_e32 v3, v3
+; GFX9-NEXT: v_mul_f32_e32 v2, v4, v2
+; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX9-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
+; GFX9-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0
+; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: v_fdiv_v2f16_afn_ulp25:
+; GFX10-LABEL: v_rsq_v2f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_rcp_f16_e32 v2, v1
-; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0
+; GFX10-NEXT: v_sqrt_f16_e32 v1, v0
+; GFX10-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX10-NEXT: v_rcp_f32_e32 v2, v2
+; GFX10-NEXT: v_rcp_f32_e32 v3, v3
+; GFX10-NEXT: v_mul_f32_e32 v2, v4, v2
+; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX10-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
+; GFX10-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0
+; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fdiv_v2f16_afn_ulp25:
+; GFX11-LABEL: v_rsq_v2f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: v_rcp_f16_e32 v1, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT: v_rcp_f16_e32 v2, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
-; GFX11-NEXT: v_mul_f16_e32 v1, v3, v2
+; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX11-NEXT: v_rcp_f32_e32 v2, v2
+; GFX11-NEXT: v_rcp_f32_e32 v3, v3
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f32_e32 v2, v4, v2
+; GFX11-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
- %fdiv = fdiv afn <2 x half> %a, %b, !fpmath !0
+ %sqrt = call <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
+ %fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %sqrt
ret <2 x half> %fdiv
}
-define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) {
-; GFX6-IEEE-LABEL: v_fdiv_v2f16_arcp_ulp25:
+define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
+; GFX6-IEEE-LABEL: v_neg_rsq_v2f16:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4
-; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v7, v5, v5
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v7, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v8, -v4, v7, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7
-; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, -1.0
+; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v1, v1
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v3
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v0, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v1, v1, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v9, -v3, v6, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v6, v9, v6, v6
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v8, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v10, -v3, v9, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v9, v10, v6, v9
+; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v6, v9
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v3, -v5, v8, 1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], v2, v1, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v3, v8, v8
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v7, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v5, v4, v7
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v3, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v5, v4, v7
+; GFX6-IEEE-NEXT: s_mov_b64 vcc, s[4:5]
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v5, v3, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v3, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-FLUSH-LABEL: v_fdiv_v2f16_arcp_ulp25:
+; GFX6-FLUSH-LABEL: v_neg_rsq_v2f16:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v5, v4
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0
-; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v4, v5, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v7, v5, v5
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v8, -v4, v7, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, v8, v5, v7
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v6
-; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, -1.0
+; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v4, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: v_fdiv_v2f16_arcp_ulp25:
+; GFX8-LABEL: v_neg_rsq_v2f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1
-; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX8-NEXT: v_cvt_f32_f16_e32 v5, v4
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0
-; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX8-NEXT: v_rcp_f32_e32 v2, v2
-; GFX8-NEXT: v_cvt_f32_f16_e32 v7, v6
-; GFX8-NEXT: v_rcp_f32_e32 v5, v5
-; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2
-; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX8-NEXT: v_mul_f32_e32 v3, v7, v5
+; GFX8-NEXT: v_sqrt_f16_e32 v1, v0
+; GFX8-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX8-NEXT: v_rcp_f32_e32 v1, v1
+; GFX8-NEXT: v_rcp_f32_e32 v3, v3
+; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1
+; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0
-; GFX8-NEXT: v_div_fixup_f16 v1, v3, v4, v6
+; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
+; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, -1.0
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX9-LABEL: v_fdiv_v2f16_arcp_ulp25:
+; GFX9-LABEL: v_neg_rsq_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX9-NEXT: v_sqrt_f16_e32 v1, v0
+; GFX9-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v4
-; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0
; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX9-NEXT: v_rcp_f32_e32 v2, v2
-; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v6
-; GFX9-NEXT: v_rcp_f32_e32 v5, v5
-; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2
+; GFX9-NEXT: v_rcp_f32_e32 v3, v3
+; GFX9-NEXT: v_mul_f32_e32 v2, v4, v2
+; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX9-NEXT: v_mul_f32_e32 v3, v7, v5
; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX9-NEXT: v_div_fixup_f16 v0, v2, v1, v0
-; GFX9-NEXT: v_div_fixup_f16 v1, v3, v4, v6
-; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
+; GFX9-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0
+; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: v_fdiv_v2f16_arcp_ulp25:
+; GFX10-LABEL: v_neg_rsq_v2f16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v0
-; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v0
-; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GFX10-NEXT: v_sqrt_f16_e32 v1, v0
+; GFX10-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX10-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX10-NEXT: v_rcp_f32_e32 v2, v2
; GFX10-NEXT: v_rcp_f32_e32 v3, v3
-; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v5
-; GFX10-NEXT: v_rcp_f32_e32 v4, v4
-; GFX10-NEXT: v_mul_f32_e32 v3, v6, v3
-; GFX10-NEXT: v_mul_f32_e32 v4, v7, v4
+; GFX10-NEXT: v_mul_f32_e32 v2, v4, v2
+; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX10-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX10-NEXT: v_div_fixup_f16 v0, v3, v1, v0
-; GFX10-NEXT: v_div_fixup_f16 v1, v4, v2, v5
-; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
+; GFX10-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0
+; GFX10-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fdiv_v2f16_arcp_ulp25:
+; GFX11-LABEL: v_neg_rsq_v2f16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v6, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
-; GFX11-NEXT: v_cvt_f32_f16_e32 v7, v5
-; GFX11-NEXT: v_rcp_f32_e32 v4, v4
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_dual_mul_f32 v3, v6, v3 :: v_dual_mul_f32 v4, v7, v4
+; GFX11-NEXT: v_mul_f32_e32 v2, v4, v2
+; GFX11-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX11-NEXT: v_div_fixup_f16 v0, v3, v1, v0
-; GFX11-NEXT: v_div_fixup_f16 v1, v4, v2, v5
+; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
- %fdiv = fdiv arcp <2 x half> %a, %b, !fpmath !0
+ %sqrt = call <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
+ %fdiv = fdiv <2 x half> <half -1.0, half -1.0>, %sqrt
ret <2 x half> %fdiv
}
-define <2 x half> @v_fdiv_v2f16_arcp_afn_ulp25(<2 x half> %a, <2 x half> %b) {
-; GFX6-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT: v_rcp_f32_e32 v2, v2
-; GFX6-NEXT: v_rcp_f32_e32 v3, v3
-; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_rcp_f16_e32 v2, v1
-; GFX8-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rcp_f16_e32 v2, v1
-; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_rcp_f16_e32 v2, v1
-; GFX10-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX10-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT: v_pack_b32_f16 v0, v2, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_fdiv_v2f16_arcp_afn_ulp25:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: v_rcp_f16_e32 v1, v1
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-NEXT: v_rcp_f16_e32 v2, v2
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
-; GFX11-NEXT: v_mul_f16_e32 v1, v3, v2
-; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- %fdiv = fdiv afn arcp <2 x half> %a, %b, !fpmath !0
- ret <2 x half> %fdiv
-}
+declare half @llvm.fabs.f16(half)
+declare half @llvm.sqrt.f16(half)
+declare <2 x half> @llvm.fabs.v2f16(<2 x half>)
+declare <2 x half> @llvm.sqrt.v2f16(<2 x half>)
!0 = !{float 2.500000e+00}
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
index dabeda5d37fd9e..95290aca2324a6 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -265,6 +265,28 @@ define amdgpu_kernel void @div_afn_neg_k_x_pat_f16(ptr addrspace(1) %out) #0 {
ret void
}
+; GCN-LABEL: {{^}}v_fdiv_f16_arcp:
+; SI: v_rcp_f32
+; SI: v_mul_f32
+
+; GFX8PLUS: v_rcp_f32
+; GFX8PLUS: v_mul_f32
+define half @v_fdiv_f16_arcp(half %x, half %y) {
+ %fdiv = fdiv arcp half %x, %y
+ ret half %fdiv
+}
+
+; GCN-LABEL: {{^}}v_fdiv_f16_afn_nsz:
+; SI: v_rcp_f32
+; SI: v_mul_f32
+
+; GFX8PLUS: v_rcp_f16
+; GFX8PLUS: v_mul_f16
+define half @v_fdiv_f16_afn_nsz(half %x, half %y) {
+ %fdiv = fdiv afn nsz half %x, %y
+ ret half %fdiv
+}
+
declare i32 @llvm.amdgcn.workitem.id.x() #2
declare half @llvm.sqrt.f16(half) #2
declare half @llvm.fabs.f16(half) #2
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.ll b/llvm/test/CodeGen/AMDGPU/fdiv.ll
index 73c759ee51a246..0937462373350f 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.ll
@@ -145,7 +145,27 @@ entry:
ret void
}
-; FUNC-LABEL: {{^}}fdiv_f32_arcp_math:
+; FUNC-LABEL: {{^}}fdiv_f32_arcp_daz:
+; GCN: v_div_scale_f32
+; GCN-DAG: v_rcp_f32
+; GCN-DAG: v_div_scale_f32
+; GCN: {{s_setreg_imm32_b32|s_denorm_mode}}
+; GCN: v_fma{{c?}}_f32
+; GCN: v_fma{{c?}}_f32
+; GCN: v_fma{{c?}}_f32
+; GCN: v_fma{{c?}}_f32
+; GCN: {{s_setreg_imm32_b32|s_denorm_mode}}
+; GCN: v_div_fmas_f32
+; GCN: v_div_fixup_f32
+; GCN-NOT: v_mul_f32
+define amdgpu_kernel void @fdiv_f32_arcp_daz(ptr addrspace(1) %out, float %a, float %b) #0 {
+entry:
+ %fdiv = fdiv arcp float %a, %b
+ store float %fdiv, ptr addrspace(1) %out
+ ret void
+}
+
+; FUNC-LABEL: {{^}}fdiv_f32_arcp_ninf:
; R600-DAG: RECIP_IEEE * T{{[0-9]+\.[XYZW]}}, KC0[2].W
; R600-DAG: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}, PS, KC0[2].Z,
@@ -153,7 +173,7 @@ entry:
; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
; GCN-NOT: [[RESULT]]
; GCN: buffer_store_{{dword|b32}} [[RESULT]]
-define amdgpu_kernel void @fdiv_f32_arcp_math(ptr addrspace(1) %out, float %a, float %b) #0 {
+define amdgpu_kernel void @fdiv_f32_arcp_ninf(ptr addrspace(1) %out, float %a, float %b) #0 {
entry:
%fdiv = fdiv arcp ninf float %a, %b
store float %fdiv, ptr addrspace(1) %out
More information about the llvm-commits
mailing list