[llvm] c9d0d15 - AMDGPU: Refine some rsq formation tests
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 16 10:42:09 PDT 2023
Author: Matt Arsenault
Date: 2023-08-16T13:37:03-04:00
New Revision: c9d0d15e69a9b3375740ff803f78a8a4ecfc0105
URL: https://github.com/llvm/llvm-project/commit/c9d0d15e69a9b3375740ff803f78a8a4ecfc0105
DIFF: https://github.com/llvm/llvm-project/commit/c9d0d15e69a9b3375740ff803f78a8a4ecfc0105.diff
LOG: AMDGPU: Refine some rsq formation tests
Drop unnecessary flags and metadata, add contract flags that should be
necessary.
Added:
Modified:
llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
llvm/test/CodeGen/AMDGPU/rsq.f32.ll
Removed:
################################################################################
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.ll
index cf1747ffe00f3b..b0699c9ebf4160 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.ll
@@ -45,6 +45,49 @@ define amdgpu_cs float @sqrt_rcp(float inreg %arg1) {
ret float %b
}
+define amdgpu_cs float @div_sqrt_contract(float inreg %arg1) {
+; GCN-LABEL: div_sqrt_contract:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_rsq_f32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
+.entry:
+ %a = call contract float @llvm.sqrt.f32(float %arg1)
+ %b = fdiv afn contract float 1.000000e+00, %a
+ ret float %b
+}
+
+define amdgpu_cs float @sqrt_div_contract(float inreg %arg1) {
+; GCN-LABEL: sqrt_div_contract:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_rsq_f32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
+.entry:
+ %a = fdiv afn contract float 1.000000e+00, %arg1
+ %b = call contract float @llvm.sqrt.f32(float %a)
+ ret float %b
+}
+
+define amdgpu_cs float @rcp_sqrt_contract(float inreg %arg1) {
+; GCN-LABEL: rcp_sqrt_contract:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_rsq_f32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
+.entry:
+ %a = call contract float @llvm.sqrt.f32(float %arg1)
+ %b = call contract float @llvm.amdgcn.rcp.f32(float %a)
+ ret float %b
+}
+
+define amdgpu_cs float @sqrt_rcp_contract(float inreg %arg1) {
+; GCN-LABEL: sqrt_rcp_contract:
+; GCN: ; %bb.0: ; %.entry
+; GCN-NEXT: v_rsq_f32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
+.entry:
+ %a = call contract float @llvm.amdgcn.rcp.f32(float %arg1)
+ %b = call contract float @llvm.sqrt.f32(float %a)
+ ret float %b
+}
declare float @llvm.sqrt.f32(float)
declare float @llvm.amdgcn.rcp.f32(float)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.mir
index e6b108a5585383..3e6dc1b76c23b1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.mir
@@ -1,13 +1,13 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
---
-name: rcp_sqrt_test
+name: rcp_sqrt_test_f32
body: |
bb.0:
liveins: $sgpr0
- ; GCN-LABEL: name: rcp_sqrt_test
+ ; GCN-LABEL: name: rcp_sqrt_test_f32
; GCN: liveins: $sgpr0
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
@@ -23,12 +23,33 @@ body: |
...
---
-name: sqrt_rcp_test
+name: contract_afn_rcp_contract_sqrt_test_f32
body: |
bb.0:
liveins: $sgpr0
- ; GCN-LABEL: name: sqrt_rcp_test
+ ; GCN-LABEL: name: contract_afn_rcp_contract_sqrt_test_f32
+ ; GCN: liveins: $sgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+ ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract afn G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[COPY]](s32)
+ ; GCN-NEXT: $vgpr0 = COPY [[INT]](s32)
+ ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ %0:_(s32) = COPY $sgpr0
+ %2:_(s32) = contract G_FSQRT %0:_
+ %3:_(s32) = contract afn G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %2:_(s32)
+ $vgpr0 = COPY %3:_(s32)
+ SI_RETURN_TO_EPILOG implicit $vgpr0
+
+...
+
+---
+name: sqrt_rcp_test_f32
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; GCN-LABEL: name: sqrt_rcp_test_f32
; GCN: liveins: $sgpr0
; GCN-NEXT: {{ $}}
; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
@@ -42,3 +63,103 @@ body: |
SI_RETURN_TO_EPILOG implicit $vgpr0
...
+
+---
+name: afn_rcp_afn_amdgcn_sqrt_test_f32
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; GCN-LABEL: name: afn_rcp_afn_amdgcn_sqrt_test_f32
+ ; GCN: liveins: $sgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+ ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.sqrt), [[COPY]](s32)
+ ; GCN-NEXT: [[INT1:%[0-9]+]]:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[INT]](s32)
+ ; GCN-NEXT: $vgpr0 = COPY [[INT1]](s32)
+ ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ %0:_(s32) = COPY $sgpr0
+ %1:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.sqrt), %0:_(s32)
+ %2:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %1:_(s32)
+ $vgpr0 = COPY %2
+ SI_RETURN_TO_EPILOG implicit $vgpr0
+
+...
+
+---
+name: afn_contract_rcp_afn_contract_amdgcn_sqrt_test_f32
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; GCN-LABEL: name: afn_contract_rcp_afn_contract_amdgcn_sqrt_test_f32
+ ; GCN: liveins: $sgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+ ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract afn G_INTRINSIC intrinsic(@llvm.amdgcn.sqrt), [[COPY]](s32)
+ ; GCN-NEXT: [[INT1:%[0-9]+]]:_(s32) = contract afn G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[INT]](s32)
+ ; GCN-NEXT: $vgpr0 = COPY [[INT1]](s32)
+ ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ %0:_(s32) = COPY $sgpr0
+ %1:_(s32) = afn contract G_INTRINSIC intrinsic(@llvm.amdgcn.sqrt), %0:_(s32)
+ %2:_(s32) = afn contract G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %1:_(s32)
+ $vgpr0 = COPY %2
+ SI_RETURN_TO_EPILOG implicit $vgpr0
+
+...
+
+---
+name: rsq_test_f16
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; GCN-LABEL: name: rsq_test_f16
+ ; GCN: liveins: $sgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+ ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GCN-NEXT: %sqrt:_(s16) = G_FSQRT [[TRUNC]]
+ ; GCN-NEXT: %one:_(s16) = contract G_FCONSTANT half 0xH3C00
+ ; GCN-NEXT: %rsq:_(s16) = contract G_FDIV %one, %sqrt
+ ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
+ ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+ ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ %0:_(s32) = COPY $sgpr0
+ %1:_(s16) = G_TRUNC %0
+ %sqrt:_(s16) = G_FSQRT %1:_
+ %one:_(s16) = contract G_FCONSTANT half 1.0
+ %rsq:_(s16) = contract G_FDIV %one, %sqrt
+ %ext:_(s32) = G_ANYEXT %rsq
+ $vgpr0 = COPY %ext
+ SI_RETURN_TO_EPILOG implicit $vgpr0
+
+...
+
+---
+name: neg_rsq_test_f16
+body: |
+ bb.0:
+ liveins: $sgpr0
+
+ ; GCN-LABEL: name: neg_rsq_test_f16
+ ; GCN: liveins: $sgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+ ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GCN-NEXT: %sqrt:_(s16) = G_FSQRT [[TRUNC]]
+ ; GCN-NEXT: %one:_(s16) = contract G_FCONSTANT half 0xHBC00
+ ; GCN-NEXT: %rsq:_(s16) = contract G_FDIV %one, %sqrt
+ ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
+ ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+ ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ %0:_(s32) = COPY $sgpr0
+ %1:_(s16) = G_TRUNC %0
+ %sqrt:_(s16) = G_FSQRT %1:_
+ %one:_(s16) = contract G_FCONSTANT half -1.0
+ %rsq:_(s16) = contract G_FDIV %one, %sqrt
+ %ext:_(s32) = G_ANYEXT %rsq
+ $vgpr0 = COPY %ext
+ SI_RETURN_TO_EPILOG implicit $vgpr0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
index 997d6ae87086c5..546af4d3453101 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
@@ -237,7 +237,7 @@ define half @v_fdiv_f16_ulp25(half %a, half %b) {
; GFX11-NEXT: v_fma_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
; GFX11-NEXT: v_div_fixup_f16 v0, v2, v1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
- %fdiv = fdiv half %a, %b, !fpmath !0
+ %fdiv = fdiv half %a, %b
ret half %fdiv
}
@@ -491,7 +491,7 @@ define half @v_rcp_f16_ulp25(half %x) {
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: v_rcp_f16_e32 v0, v0
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
- %fdiv = fdiv half 1.0, %x, !fpmath !0
+ %fdiv = fdiv half 1.0, %x
ret half %fdiv
}
@@ -527,7 +527,7 @@ define half @v_fdiv_f16_afn_ulp25(half %a, half %b) {
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
- %fdiv = fdiv afn half %a, %b, !fpmath !0
+ %fdiv = fdiv afn half %a, %b
ret half %fdiv
}
@@ -594,7 +594,7 @@ define half @v_fdiv_f16_arcp_ulp25(half %a, half %b) {
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
- %fdiv = fdiv arcp half %a, %b, !fpmath !0
+ %fdiv = fdiv arcp half %a, %b
ret half %fdiv
}
@@ -986,7 +986,7 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX11-NEXT: v_div_fixup_f16 v1, v4, v2, v5
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
- %fdiv = fdiv <2 x half> %a, %b, !fpmath !0
+ %fdiv = fdiv <2 x half> %a, %b
ret <2 x half> %fdiv
}
@@ -1981,7 +1981,7 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
- %fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x, !fpmath !0
+ %fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x
ret <2 x half> %fdiv
}
@@ -2043,7 +2043,7 @@ define <2 x half> @v_fdiv_v2f16_afn_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX11-NEXT: v_mul_f16_e32 v1, v3, v2
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
- %fdiv = fdiv afn <2 x half> %a, %b, !fpmath !0
+ %fdiv = fdiv afn <2 x half> %a, %b
ret <2 x half> %fdiv
}
@@ -2161,7 +2161,7 @@ define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX11-NEXT: v_mul_f16_e32 v1, v3, v2
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
- %fdiv = fdiv arcp <2 x half> %a, %b, !fpmath !0
+ %fdiv = fdiv arcp <2 x half> %a, %b
ret <2 x half> %fdiv
}
@@ -2223,7 +2223,7 @@ define <2 x half> @v_fdiv_v2f16_arcp_afn_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX11-NEXT: v_mul_f16_e32 v1, v3, v2
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
- %fdiv = fdiv afn arcp <2 x half> %a, %b, !fpmath !0
+ %fdiv = fdiv afn arcp <2 x half> %a, %b
ret <2 x half> %fdiv
}
@@ -2818,8 +2818,8 @@ define amdgpu_ps i16 @s_rsq_f16(i16 inreg %a.arg) {
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%a = bitcast i16 %a.arg to half
- %sqrt = call half @llvm.sqrt.f16(half %a)
- %fdiv = fdiv half 1.0, %sqrt
+ %sqrt = call contract half @llvm.sqrt.f16(half %a)
+ %fdiv = fdiv contract half 1.0, %sqrt
%result = bitcast half %fdiv to i16
ret i16 %result
}
@@ -3008,8 +3008,8 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
%a = bitcast i32 %a.arg to <2 x half>
- %sqrt = call <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
- %fdiv = fdiv <2 x half> <half -1.0, half -1.0>, %sqrt
+ %sqrt = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
+ %fdiv = fdiv contract <2 x half> <half -1.0, half -1.0>, %sqrt
%result = bitcast <2 x half> %fdiv to i32
ret i32 %result
}
@@ -3073,8 +3073,8 @@ define half @v_rsq_f16(half %a) {
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: v_rsq_f16_e32 v0, v0
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
- %sqrt = call half @llvm.sqrt.f16(half %a)
- %fdiv = fdiv half 1.0, %sqrt
+ %sqrt = call contract half @llvm.sqrt.f16(half %a)
+ %fdiv = fdiv contract half 1.0, %sqrt
ret half %fdiv
}
@@ -3147,7 +3147,362 @@ define half @v_neg_rsq_f16(half %a) {
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_rcp_f16_e64 v0, -v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %sqrt = call contract half @llvm.sqrt.f16(half %a)
+ %fdiv = fdiv contract half -1.0, %sqrt
+ ret half %fdiv
+}
+
+define { half, half } @v_rsq_f16_multi_use(half %a) {
+; GFX6-IEEE-LABEL: v_rsq_f16_multi_use:
+; GFX6-IEEE: ; %bb.0:
+; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
+; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v2, v1
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-FLUSH-LABEL: v_rsq_f16_multi_use:
+; GFX6-FLUSH: ; %bb.0:
+; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
+; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v3, v2, v1
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v_rsq_f16_multi_use:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT: v_sqrt_f16_e32 v2, v0
+; GFX89-NEXT: v_rsq_f16_e32 v1, v0
+; GFX89-NEXT: v_mov_b32_e32 v0, v2
+; GFX89-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_rsq_f16_multi_use:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_sqrt_f16_e32 v2, v0
+; GFX10-NEXT: v_rsq_f16_e32 v1, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_rsq_f16_multi_use:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_sqrt_f16_e32 v2, v0
+; GFX11-NEXT: v_rsq_f16_e32 v1, v0
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mov_b32_e32 v0, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %sqrt = call contract half @llvm.sqrt.f16(half %a)
+ %insert.0 = insertvalue { half, half } poison, half %sqrt, 0
+ %fdiv = fdiv contract half 1.0, %sqrt
+ %insert.1 = insertvalue { half, half } %insert.0, half %fdiv, 1
+ ret { half, half } %insert.1
+}
+
+define half @v_rsq_f16_missing_contract0(half %a) {
+; GFX6-IEEE-LABEL: v_rsq_f16_missing_contract0:
+; GFX6-IEEE: ; %bb.0:
+; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
+; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-FLUSH-LABEL: v_rsq_f16_missing_contract0:
+; GFX6-FLUSH: ; %bb.0:
+; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
+; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v_rsq_f16_missing_contract0:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT: v_rsq_f16_e32 v0, v0
+; GFX89-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10PLUS-LABEL: v_rsq_f16_missing_contract0:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10PLUS-NEXT: v_rsq_f16_e32 v0, v0
+; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%sqrt = call half @llvm.sqrt.f16(half %a)
+ %fdiv = fdiv contract half 1.0, %sqrt
+ ret half %fdiv
+}
+
+define half @v_rsq_f16_missing_contract1(half %a) {
+; GFX6-IEEE-LABEL: v_rsq_f16_missing_contract1:
+; GFX6-IEEE: ; %bb.0:
+; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
+; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-FLUSH-LABEL: v_rsq_f16_missing_contract1:
+; GFX6-FLUSH: ; %bb.0:
+; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
+; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v_rsq_f16_missing_contract1:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT: v_rsq_f16_e32 v0, v0
+; GFX89-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10PLUS-LABEL: v_rsq_f16_missing_contract1:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10PLUS-NEXT: v_rsq_f16_e32 v0, v0
+; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
+ %sqrt = call contract half @llvm.sqrt.f16(half %a)
+ %fdiv = fdiv half 1.0, %sqrt
+ ret half %fdiv
+}
+
+define half @v_neg_rsq_f16_missing_contract0(half %a) {
+; GFX6-IEEE-LABEL: v_neg_rsq_f16_missing_contract0:
+; GFX6-IEEE: ; %bb.0:
+; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
+; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-FLUSH-LABEL: v_neg_rsq_f16_missing_contract0:
+; GFX6-FLUSH: ; %bb.0:
+; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
+; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v_neg_rsq_f16_missing_contract0:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX89-NEXT: v_rcp_f16_e64 v0, -v0
+; GFX89-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_neg_rsq_f16_missing_contract0:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX10-NEXT: v_rcp_f16_e64 v0, -v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_neg_rsq_f16_missing_contract0:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_rcp_f16_e64 v0, -v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %sqrt = call half @llvm.sqrt.f16(half %a)
+ %fdiv = fdiv contract half -1.0, %sqrt
+ ret half %fdiv
+}
+
+define half @v_neg_rsq_f16_missing_contract1(half %a) {
+; GFX6-IEEE-LABEL: v_neg_rsq_f16_missing_contract1:
+; GFX6-IEEE: ; %bb.0:
+; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
+; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-FLUSH-LABEL: v_neg_rsq_f16_missing_contract1:
+; GFX6-FLUSH: ; %bb.0:
+; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
+; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v_neg_rsq_f16_missing_contract1:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX89-NEXT: v_rcp_f16_e64 v0, -v0
+; GFX89-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_neg_rsq_f16_missing_contract1:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX10-NEXT: v_rcp_f16_e64 v0, -v0
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_neg_rsq_f16_missing_contract1:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_rcp_f16_e64 v0, -v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %sqrt = call contract half @llvm.sqrt.f16(half %a)
%fdiv = fdiv half -1.0, %sqrt
ret half %fdiv
}
@@ -3222,8 +3577,8 @@ define half @v_neg_rsq_f16_fabs(half %a) {
; GFX11-NEXT: v_rcp_f16_e64 v0, -v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
%a.fabs = call half @llvm.fabs.f16(half %a)
- %sqrt = call half @llvm.sqrt.f16(half %a.fabs)
- %fdiv = fdiv half -1.0, %sqrt
+ %sqrt = call contract half @llvm.sqrt.f16(half %a.fabs)
+ %fdiv = fdiv contract half -1.0, %sqrt
ret half %fdiv
}
@@ -3286,8 +3641,8 @@ define half @v_rsq_f16_arcp(half %a) {
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: v_rsq_f16_e32 v0, v0
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
- %sqrt = call half @llvm.sqrt.f16(half %a)
- %fdiv = fdiv arcp half 1.0, %sqrt
+ %sqrt = call contract half @llvm.sqrt.f16(half %a)
+ %fdiv = fdiv contract arcp half 1.0, %sqrt
ret half %fdiv
}
@@ -3360,8 +3715,8 @@ define half @v_neg_rsq_f16_arcp(half %a) {
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_rcp_f16_e64 v0, -v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
- %sqrt = call half @llvm.sqrt.f16(half %a)
- %fdiv = fdiv arcp half -1.0, %sqrt
+ %sqrt = call contract half @llvm.sqrt.f16(half %a)
+ %fdiv = fdiv contract arcp half -1.0, %sqrt
ret half %fdiv
}
@@ -3389,6 +3744,36 @@ define half @v_rsq_f16_afn(half %a) {
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT: v_rsq_f16_e32 v0, v0
+; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
+ %sqrt = call contract half @llvm.sqrt.f16(half %a)
+ %fdiv = fdiv afn contract half 1.0, %sqrt
+ ret half %fdiv
+}
+
+define half @v_rsq_f16_afn_nocontract(half %a) {
+; GFX6-LABEL: v_rsq_f16_afn_nocontract:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, 1.0
+; GFX6-NEXT: v_sqrt_f32_e32 v0, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_rcp_f32_e32 v0, v0
+; GFX6-NEXT: v_mul_f32_e32 v0, v1, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v_rsq_f16_afn_nocontract:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT: v_rsq_f16_e32 v0, v0
+; GFX89-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10PLUS-LABEL: v_rsq_f16_afn_nocontract:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10PLUS-NEXT: v_rsq_f16_e32 v0, v0
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%sqrt = call half @llvm.sqrt.f16(half %a)
%fdiv = fdiv afn half 1.0, %sqrt
@@ -3568,8 +3953,8 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
- %sqrt = call <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
- %fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %sqrt
+ %sqrt = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
+ %fdiv = fdiv contract <2 x half> <half 1.0, half 1.0>, %sqrt
ret <2 x half> %fdiv
}
@@ -3746,8 +4131,8 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
- %sqrt = call <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
- %fdiv = fdiv <2 x half> <half -1.0, half -1.0>, %sqrt
+ %sqrt = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
+ %fdiv = fdiv contract <2 x half> <half -1.0, half -1.0>, %sqrt
ret <2 x half> %fdiv
}
@@ -3756,7 +4141,6 @@ declare half @llvm.sqrt.f16(half)
declare <2 x half> @llvm.fabs.v2f16(<2 x half>)
declare <2 x half> @llvm.sqrt.v2f16(<2 x half>)
-!0 = !{float 2.500000e+00}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GFX10-FLUSH: {{.*}}
; GFX10-IEEE: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
index 02e8bb070b6aaa..76c79ff055d27f 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -233,7 +233,7 @@ entry:
%gep.b = getelementptr inbounds half, ptr addrspace(1) %b, i64 %tid.ext
%gep.r = getelementptr inbounds half, ptr addrspace(1) %r, i64 %tid.ext
%b.val = load volatile half, ptr addrspace(1) %gep.b
- %r.val = fdiv half 1.0, %b.val, !fpmath !0
+ %r.val = fdiv half 1.0, %b.val
store half %r.val, ptr addrspace(1) %gep.r
ret void
}
@@ -328,12 +328,12 @@ entry:
%gep.r = getelementptr inbounds half, ptr addrspace(1) %r, i64 %tid.ext
%b.val = load volatile half, ptr addrspace(1) %gep.b
%b.abs = call half @llvm.fabs.f16(half %b.val)
- %r.val = fdiv half 1.0, %b.abs, !fpmath !0
+ %r.val = fdiv half 1.0, %b.abs
store half %r.val, ptr addrspace(1) %gep.r
ret void
}
-; We could not do 1/b -> rcp_f16(b) under !fpmath < 1ulp.
+; We could not do 1/b -> rcp_f32(b) under !fpmath < 1ulp.
define amdgpu_kernel void @reciprocal_f16_rounded(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 {
; SI-LABEL: reciprocal_f16_rounded:
@@ -505,7 +505,7 @@ entry:
%gep.b = getelementptr inbounds half, ptr addrspace(1) %b, i64 %tid.ext
%gep.r = getelementptr inbounds half, ptr addrspace(1) %r, i64 %tid.ext
%b.val = load volatile half, ptr addrspace(1) %gep.b
- %r.val = fdiv afn half 1.0, %b.val, !fpmath !0
+ %r.val = fdiv afn half 1.0, %b.val
store half %r.val, ptr addrspace(1) %gep.r
ret void
}
@@ -599,7 +599,7 @@ entry:
%gep.b = getelementptr inbounds half, ptr addrspace(1) %b, i64 %tid.ext
%gep.r = getelementptr inbounds half, ptr addrspace(1) %r, i64 %tid.ext
%b.val = load volatile half, ptr addrspace(1) %gep.b
- %r.val = fdiv half -1.0, %b.val, !fpmath !0
+ %r.val = fdiv half -1.0, %b.val
store half %r.val, ptr addrspace(1) %gep.r
ret void
}
@@ -694,8 +694,8 @@ entry:
%gep.b = getelementptr inbounds half, ptr addrspace(1) %b, i64 %tid.ext
%gep.r = getelementptr inbounds half, ptr addrspace(1) %r, i64 %tid.ext
%b.val = load volatile half, ptr addrspace(1) %gep.b
- %b.sqrt = call half @llvm.sqrt.f16(half %b.val)
- %r.val = fdiv half 1.0, %b.sqrt, !fpmath !0
+ %b.sqrt = call contract half @llvm.sqrt.f16(half %b.val)
+ %r.val = fdiv contract half 1.0, %b.sqrt
store half %r.val, ptr addrspace(1) %gep.r
ret void
}
@@ -789,6 +789,209 @@ define amdgpu_kernel void @v_rsq_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) %
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %gep.b = getelementptr inbounds half, ptr addrspace(1) %b, i64 %tid.ext
+ %gep.r = getelementptr inbounds half, ptr addrspace(1) %r, i64 %tid.ext
+ %b.val = load volatile half, ptr addrspace(1) %gep.b
+ %b.sqrt = call contract half @llvm.sqrt.f16(half %b.val)
+ %r.val = fdiv contract half -1.0, %b.sqrt
+ store half %r.val, ptr addrspace(1) %gep.r
+ ret void
+}
+
+define amdgpu_kernel void @v_rsq_f16_multi_use(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 {
+; SI-LABEL: v_rsq_f16_multi_use:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v2
+; SI-NEXT: v_sqrt_f32_e32 v3, v3
+; SI-NEXT: v_div_scale_f32 v4, s[2:3], v3, v3, 1.0
+; SI-NEXT: v_rcp_f32_e32 v5, v4
+; SI-NEXT: v_div_scale_f32 v6, vcc, 1.0, v3, 1.0
+; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; SI-NEXT: v_fma_f32 v7, -v4, v5, 1.0
+; SI-NEXT: v_fma_f32 v5, v7, v5, v5
+; SI-NEXT: v_mul_f32_e32 v7, v6, v5
+; SI-NEXT: v_fma_f32 v8, -v4, v7, v6
+; SI-NEXT: v_fma_f32 v7, v8, v5, v7
+; SI-NEXT: v_fma_f32 v4, -v4, v7, v6
+; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; SI-NEXT: v_div_fmas_f32 v4, v4, v5, v7
+; SI-NEXT: v_div_fixup_f32 v3, v4, v3, 1.0
+; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_store_short v3, v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_endpgm
+;
+; GFX8-LABEL: v_rsq_f16_multi_use:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ushort v3, v[0:1] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_rsq_f16_e32 v4, v3
+; GFX8-NEXT: flat_store_short v[0:1], v3
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: flat_store_short v[0:1], v4
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_rsq_f16_multi_use:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_rsq_f16_e32 v2, v1
+; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: global_store_short v0, v2, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_rsq_f16_multi_use:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_rsq_f16_e32 v2, v1
+; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_store_short v0, v2, s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_rsq_f16_multi_use:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_rsq_f16_e32 v2, v1
+; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] dlc
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_store_b16 v0, v2, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %gep.b = getelementptr inbounds half, ptr addrspace(1) %b, i64 %tid.ext
+ %gep.r = getelementptr inbounds half, ptr addrspace(1) %r, i64 %tid.ext
+ %b.val = load volatile half, ptr addrspace(1) %gep.b
+ store volatile half %b.val, ptr addrspace(1) %gep.r
+ %b.sqrt = call contract half @llvm.sqrt.f16(half %b.val)
+ %r.val = fdiv contract half 1.0, %b.sqrt
+ store half %r.val, ptr addrspace(1) %gep.r
+ ret void
+}
+
+define amdgpu_kernel void @v_rsq_f16_missing_contract0(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 {
+; SI-LABEL: v_rsq_f16_missing_contract0:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT: v_sqrt_f32_e32 v2, v2
+; SI-NEXT: v_div_scale_f32 v3, s[2:3], v2, v2, 1.0
+; SI-NEXT: v_rcp_f32_e32 v4, v3
+; SI-NEXT: v_div_scale_f32 v5, vcc, 1.0, v2, 1.0
+; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; SI-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; SI-NEXT: v_fma_f32 v4, v6, v4, v4
+; SI-NEXT: v_mul_f32_e32 v6, v5, v4
+; SI-NEXT: v_fma_f32 v7, -v3, v6, v5
+; SI-NEXT: v_fma_f32 v6, v7, v4, v6
+; SI-NEXT: v_fma_f32 v3, -v3, v6, v5
+; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; SI-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; SI-NEXT: v_div_fixup_f32 v2, v3, v2, 1.0
+; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_endpgm
+;
+; GFX8-LABEL: v_rsq_f16_missing_contract0:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_rsq_f16_e32 v3, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_store_short v[0:1], v3
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_rsq_f16_missing_contract0:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_rsq_f16_e32 v1, v1
+; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_rsq_f16_missing_contract0:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_rsq_f16_e32 v1, v1
+; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_rsq_f16_missing_contract0:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_rsq_f16_e32 v1, v1
+; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
@@ -796,7 +999,204 @@ entry:
%gep.r = getelementptr inbounds half, ptr addrspace(1) %r, i64 %tid.ext
%b.val = load volatile half, ptr addrspace(1) %gep.b
%b.sqrt = call half @llvm.sqrt.f16(half %b.val)
- %r.val = fdiv half -1.0, %b.sqrt, !fpmath !0
+ %r.val = fdiv contract half 1.0, %b.sqrt
+ store half %r.val, ptr addrspace(1) %gep.r
+ ret void
+}
+
+define amdgpu_kernel void @v_rsq_f16_missing_contract1(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 {
+; SI-LABEL: v_rsq_f16_missing_contract1:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT: v_sqrt_f32_e32 v2, v2
+; SI-NEXT: v_div_scale_f32 v3, s[2:3], v2, v2, 1.0
+; SI-NEXT: v_rcp_f32_e32 v4, v3
+; SI-NEXT: v_div_scale_f32 v5, vcc, 1.0, v2, 1.0
+; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; SI-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; SI-NEXT: v_fma_f32 v4, v6, v4, v4
+; SI-NEXT: v_mul_f32_e32 v6, v5, v4
+; SI-NEXT: v_fma_f32 v7, -v3, v6, v5
+; SI-NEXT: v_fma_f32 v6, v7, v4, v6
+; SI-NEXT: v_fma_f32 v3, -v3, v6, v5
+; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; SI-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; SI-NEXT: v_div_fixup_f32 v2, v3, v2, 1.0
+; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_endpgm
+;
+; GFX8-LABEL: v_rsq_f16_missing_contract1:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_rsq_f16_e32 v3, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_store_short v[0:1], v3
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_rsq_f16_missing_contract1:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_rsq_f16_e32 v1, v1
+; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_rsq_f16_missing_contract1:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_rsq_f16_e32 v1, v1
+; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_rsq_f16_missing_contract1:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_rsq_f16_e32 v1, v1
+; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %gep.b = getelementptr inbounds half, ptr addrspace(1) %b, i64 %tid.ext
+ %gep.r = getelementptr inbounds half, ptr addrspace(1) %r, i64 %tid.ext
+ %b.val = load volatile half, ptr addrspace(1) %gep.b
+ %b.sqrt = call contract half @llvm.sqrt.f16(half %b.val)
+ %r.val = fdiv half 1.0, %b.sqrt
+ store half %r.val, ptr addrspace(1) %gep.r
+ ret void
+}
+
+define amdgpu_kernel void @v_neg_rsq_f16_missing_contract1(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 {
+; SI-LABEL: v_neg_rsq_f16_missing_contract1:
+; SI: ; %bb.0: ; %entry
+; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; SI-NEXT: v_mov_b32_e32 v1, 0
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT: v_sqrt_f32_e32 v2, v2
+; SI-NEXT: v_div_scale_f32 v3, s[2:3], v2, v2, -1.0
+; SI-NEXT: v_rcp_f32_e32 v4, v3
+; SI-NEXT: v_div_scale_f32 v5, vcc, -1.0, v2, -1.0
+; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; SI-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; SI-NEXT: v_fma_f32 v4, v6, v4, v4
+; SI-NEXT: v_mul_f32_e32 v6, v5, v4
+; SI-NEXT: v_fma_f32 v7, -v3, v6, v5
+; SI-NEXT: v_fma_f32 v6, v7, v4, v6
+; SI-NEXT: v_fma_f32 v3, -v3, v6, v5
+; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; SI-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; SI-NEXT: v_div_fixup_f32 v2, v3, v2, -1.0
+; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: s_mov_b64 s[2:3], s[6:7]
+; SI-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64
+; SI-NEXT: s_endpgm
+;
+; GFX8-LABEL: v_neg_rsq_f16_missing_contract1:
+; GFX8: ; %bb.0: ; %entry
+; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX8-NEXT: s_waitcnt lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_ushort v0, v[0:1] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s1
+; GFX8-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX8-NEXT: v_rcp_f16_e64 v3, -v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_store_short v[0:1], v3
+; GFX8-NEXT: s_endpgm
+;
+; GFX9-LABEL: v_neg_rsq_f16_missing_contract1:
+; GFX9: ; %bb.0: ; %entry
+; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_sqrt_f16_e32 v1, v1
+; GFX9-NEXT: v_rcp_f16_e64 v1, -v1
+; GFX9-NEXT: global_store_short v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: v_neg_rsq_f16_missing_contract1:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_sqrt_f16_e32 v1, v1
+; GFX10-NEXT: v_rcp_f16_e64 v1, -v1
+; GFX10-NEXT: global_store_short v0, v1, s[0:1]
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: v_neg_rsq_f16_missing_contract1:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_rcp_f16_e64 v1, -v1
+; GFX11-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT: s_nop 0
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+entry:
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %tid.ext = sext i32 %tid to i64
+ %gep.b = getelementptr inbounds half, ptr addrspace(1) %b, i64 %tid.ext
+ %gep.r = getelementptr inbounds half, ptr addrspace(1) %r, i64 %tid.ext
+ %b.val = load volatile half, ptr addrspace(1) %gep.b
+ %b.sqrt = call contract half @llvm.sqrt.f16(half %b.val)
+ %r.val = fdiv half -1.0, %b.sqrt
store half %r.val, ptr addrspace(1) %gep.r
ret void
}
@@ -1324,12 +1724,408 @@ define half @v_fdiv_f16_afn_nsz(half %x, half %y) {
ret half %fdiv
}
+define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
+; GFX6-IEEE-LABEL: v_rsq_v2f16:
+; GFX6-IEEE: ; %bb.0:
+; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0
+; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v1, v1
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v3
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v0, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v1, v1, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v9, -v3, v6, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v6, v9, v6, v6
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v8, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v10, -v3, v9, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v9, v10, v6, v9
+; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v6, v9
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v3, -v5, v8, 1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], v2, v1, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v3, v8, v8
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v7, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v5, v4, v7
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v3, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v5, v4, v7
+; GFX6-IEEE-NEXT: s_mov_b64 vcc, s[4:5]
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v5, v3, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-FLUSH-LABEL: v_rsq_v2f16:
+; GFX6-FLUSH: ; %bb.0:
+; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0
+; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-LABEL: v_rsq_v2f16:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_sqrt_f32_e32 v0, v0
+; SI-NEXT: v_sqrt_f32_e32 v1, v1
+; SI-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; SI-NEXT: v_rcp_f32_e32 v3, v2
+; SI-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
+; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; SI-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; SI-NEXT: v_fma_f32 v3, v5, v3, v3
+; SI-NEXT: v_mul_f32_e32 v5, v4, v3
+; SI-NEXT: v_fma_f32 v6, -v2, v5, v4
+; SI-NEXT: v_fma_f32 v5, v6, v3, v5
+; SI-NEXT: v_fma_f32 v2, -v2, v5, v4
+; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; SI-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; SI-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; SI-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
+; SI-NEXT: v_rcp_f32_e32 v2, v3
+; SI-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
+; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; SI-NEXT: v_fma_f32 v5, -v3, v2, 1.0
+; SI-NEXT: v_fma_f32 v2, v5, v2, v2
+; SI-NEXT: v_mul_f32_e32 v5, v4, v2
+; SI-NEXT: v_fma_f32 v6, -v3, v5, v4
+; SI-NEXT: v_fma_f32 v5, v6, v2, v5
+; SI-NEXT: v_fma_f32 v3, -v3, v5, v4
+; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; SI-NEXT: v_div_fmas_f32 v2, v3, v2, v5
+; SI-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_rsq_v2f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_rsq_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_rsq_f16_e32 v0, v0
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_rsq_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_rsq_f16_e32 v0, v0
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_rsq_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_rsq_f16_e32 v0, v0
+; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_rsq_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT: v_rsq_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_rsq_f16_e32 v1, v1
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX9-IEEE-LABEL: v_rsq_v2f16:
+; GFX9-IEEE: ; %bb.0:
+; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-IEEE-NEXT: v_sqrt_f16_e32 v1, v0
+; GFX9-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v2, v2
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v4, v2
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
+; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0
+; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31]
+; GFX9-FLUSH-LABEL: v_rsq_v2f16:
+; GFX9-FLUSH: ; %bb.0:
+; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0
+; GFX9-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
+; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
+; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
+; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0
+; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
+ %sqrt = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
+ %fdiv = fdiv contract <2 x half> <half 1.0, half 1.0>, %sqrt
+ ret <2 x half> %fdiv
+}
+
+define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
+; GFX6-IEEE-LABEL: v_neg_rsq_v2f16:
+; GFX6-IEEE: ; %bb.0:
+; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, -1.0
+; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v1, v1
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v3
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v0, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v1, v1, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v9, -v3, v6, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v6, v9, v6, v6
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v8, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v10, -v3, v9, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v9, v10, v6, v9
+; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v6, v9
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v3, -v5, v8, 1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], v2, v1, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v3, v8, v8
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v7, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v5, v4, v7
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v3, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v5, v4, v7
+; GFX6-IEEE-NEXT: s_mov_b64 vcc, s[4:5]
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v5, v3, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-FLUSH-LABEL: v_neg_rsq_v2f16:
+; GFX6-FLUSH: ; %bb.0:
+; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, -1.0
+; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-LABEL: v_neg_rsq_v2f16:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_sqrt_f32_e32 v0, v0
+; SI-NEXT: v_sqrt_f32_e32 v1, v1
+; SI-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, -1.0
+; SI-NEXT: v_rcp_f32_e32 v3, v2
+; SI-NEXT: v_div_scale_f32 v4, vcc, -1.0, v0, -1.0
+; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; SI-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; SI-NEXT: v_fma_f32 v3, v5, v3, v3
+; SI-NEXT: v_mul_f32_e32 v5, v4, v3
+; SI-NEXT: v_fma_f32 v6, -v2, v5, v4
+; SI-NEXT: v_fma_f32 v5, v6, v3, v5
+; SI-NEXT: v_fma_f32 v2, -v2, v5, v4
+; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; SI-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; SI-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, -1.0
+; SI-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0
+; SI-NEXT: v_rcp_f32_e32 v2, v3
+; SI-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0
+; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; SI-NEXT: v_fma_f32 v5, -v3, v2, 1.0
+; SI-NEXT: v_fma_f32 v2, v5, v2, v2
+; SI-NEXT: v_mul_f32_e32 v5, v4, v2
+; SI-NEXT: v_fma_f32 v6, -v3, v5, v4
+; SI-NEXT: v_fma_f32 v5, v6, v2, v5
+; SI-NEXT: v_fma_f32 v3, -v3, v5, v4
+; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; SI-NEXT: v_div_fmas_f32 v2, v3, v2, v5
+; SI-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_neg_rsq_v2f16:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_sqrt_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX8-NEXT: v_rcp_f16_sdwa v1, -v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT: v_rcp_f16_e64 v0, -v0
+; GFX8-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_neg_rsq_v2f16:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_sqrt_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX9-NEXT: v_rcp_f16_e64 v1, -v1
+; GFX9-NEXT: v_rcp_f16_e64 v0, -v0
+; GFX9-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_neg_rsq_v2f16:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_sqrt_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX10-NEXT: v_rcp_f16_e64 v1, -v1
+; GFX10-NEXT: v_rcp_f16_e64 v0, -v0
+; GFX10-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_neg_rsq_v2f16:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_rcp_f16_e64 v0, -v0
+; GFX11-NEXT: v_rcp_f16_e64 v1, -v1
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX9-IEEE-LABEL: v_neg_rsq_v2f16:
+; GFX9-IEEE: ; %bb.0:
+; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-IEEE-NEXT: v_sqrt_f16_e32 v1, v0
+; GFX9-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v2, v2
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v3, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v4, v2
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
+; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0
+; GFX9-IEEE-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX9-IEEE-NEXT: s_setpc_b64 s[30:31]
+; GFX9-FLUSH-LABEL: v_neg_rsq_v2f16:
+; GFX9-FLUSH: ; %bb.0:
+; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0
+; GFX9-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
+; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
+; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
+; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0
+; GFX9-FLUSH-NEXT: v_pack_b32_f16 v0, v1, v0
+; GFX9-FLUSH-NEXT: s_setpc_b64 s[30:31]
+ %sqrt = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
+ %fdiv = fdiv contract <2 x half> <half -1.0, half -1.0>, %sqrt
+ ret <2 x half> %fdiv
+}
+
declare i32 @llvm.amdgcn.workitem.id.x() #2
declare half @llvm.sqrt.f16(half) #2
declare half @llvm.fabs.f16(half) #2
+declare <2 x half> @llvm.sqrt.v2f16(<2 x half>) #2
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind "unsafe-fp-math"="true" }
-
-!0 = !{float 2.500000e+00}
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
index 97fbdd59f71ee8..ac3596f532293f 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
@@ -849,6 +849,81 @@ define float @v_recip_sqrt_f32_ulp25(float %x) {
ret float %fdiv
}
+define float @v_recip_sqrt_f32_ulp25_contract(float %x) { ; computes 1.0 / sqrt(%x) with 'contract' on both the sqrt call and the fdiv
+; CODEGEN-IEEE-SDAG-LABEL: v_recip_sqrt_f32_ulp25_contract:
+; CODEGEN-IEEE-SDAG: ; %bb.0:
+; CODEGEN-IEEE-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CODEGEN-IEEE-SDAG-NEXT: s_mov_b32 s4, 0x800000
+; CODEGEN-IEEE-SDAG-NEXT: v_mov_b32_e32 v1, 0x4b800000
+; CODEGEN-IEEE-SDAG-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
+; CODEGEN-IEEE-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; CODEGEN-IEEE-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; CODEGEN-IEEE-SDAG-NEXT: v_rsq_f32_e32 v0, v0
+; CODEGEN-IEEE-SDAG-NEXT: v_mov_b32_e32 v1, 0x45800000
+; CODEGEN-IEEE-SDAG-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; CODEGEN-IEEE-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; CODEGEN-IEEE-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; CODEGEN-IEEE-GISEL-LABEL: v_recip_sqrt_f32_ulp25_contract:
+; CODEGEN-IEEE-GISEL: ; %bb.0:
+; CODEGEN-IEEE-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CODEGEN-IEEE-GISEL-NEXT: v_mov_b32_e32 v1, 0x800000
+; CODEGEN-IEEE-GISEL-NEXT: v_mov_b32_e32 v2, 0x4b800000
+; CODEGEN-IEEE-GISEL-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
+; CODEGEN-IEEE-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; CODEGEN-IEEE-GISEL-NEXT: v_rsq_f32_e32 v0, v0
+; CODEGEN-IEEE-GISEL-NEXT: v_mov_b32_e32 v1, 0x45800000
+; CODEGEN-IEEE-GISEL-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; CODEGEN-IEEE-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; CODEGEN-IEEE-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; IR-IEEE-LABEL: v_recip_sqrt_f32_ulp25_contract:
+; IR-IEEE: ; %bb.0:
+; IR-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; IR-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
+; IR-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; IR-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; IR-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; IR-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; IR-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; IR-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; IR-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; IR-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; IR-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; IR-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; IR-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; IR-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; CODEGEN-DAZ-LABEL: v_recip_sqrt_f32_ulp25_contract:
+; CODEGEN-DAZ: ; %bb.0:
+; CODEGEN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CODEGEN-DAZ-NEXT: v_rsq_f32_e32 v0, v0
+; CODEGEN-DAZ-NEXT: s_setpc_b64 s[30:31]
+;
+; IR-DAZ-LABEL: v_recip_sqrt_f32_ulp25_contract:
+; IR-DAZ: ; %bb.0:
+; IR-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; IR-DAZ-NEXT: v_sqrt_f32_e32 v0, v0
+; IR-DAZ-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; IR-DAZ-NEXT: v_rcp_f32_e32 v2, v1
+; IR-DAZ-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; IR-DAZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; IR-DAZ-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; IR-DAZ-NEXT: v_fma_f32 v2, v4, v2, v2
+; IR-DAZ-NEXT: v_mul_f32_e32 v4, v3, v2
+; IR-DAZ-NEXT: v_fma_f32 v5, -v1, v4, v3
+; IR-DAZ-NEXT: v_fma_f32 v4, v5, v2, v4
+; IR-DAZ-NEXT: v_fma_f32 v1, -v1, v4, v3
+; IR-DAZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; IR-DAZ-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; IR-DAZ-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; IR-DAZ-NEXT: s_setpc_b64 s[30:31]
+ %sqrt = call contract float @llvm.sqrt.f32(float %x), !fpmath !0 ; !0 is defined at the end of this test file as !{float 2.500000e+00}
+ %fdiv = fdiv contract float 1.0, %sqrt, !fpmath !0 ; contract only, no afn: the CODEGEN-DAZ expectations are a lone v_rsq_f32, the CODEGEN-IEEE ones wrap v_rsq_f32 in input/result multiplies, and the IR-prefixed ones keep v_sqrt_f32 plus a v_div_scale/v_div_fmas/v_div_fixup sequence
+ ret float %fdiv
+}
+
define float @v_recip_sqrt_f32_afn_ulp25(float %x) {
; CHECK-LABEL: v_recip_sqrt_f32_afn_ulp25:
; CHECK: ; %bb.0:
@@ -860,6 +935,17 @@ define float @v_recip_sqrt_f32_afn_ulp25(float %x) {
ret float %fdiv
}
+define float @v_recip_sqrt_f32_afn_ulp25_contract(float %x) { ; computes 1.0 / sqrt(%x) with both 'contract' and 'afn' on the sqrt call and the fdiv
+; CHECK-LABEL: v_recip_sqrt_f32_afn_ulp25_contract:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_rsq_f32_e32 v0, v0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %sqrt = call contract afn float @llvm.sqrt.f32(float %x), !fpmath !0 ; !0 is defined at the end of this test file as !{float 2.500000e+00}
+ %fdiv = fdiv contract afn float 1.0, %sqrt, !fpmath !0 ; the shared expectations above are a single v_rsq_f32_e32 for this pair
+ ret float %fdiv
+}
+
declare float @llvm.sqrt.f32(float)
!0 = !{float 2.500000e+00}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
index 0f2eb38f44cd8d..6af67b1ddafac3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
@@ -4,6 +4,7 @@ declare float @llvm.amdgcn.rcp.f32(float) #0
declare double @llvm.amdgcn.rcp.f64(double) #0
declare double @llvm.amdgcn.sqrt.f64(double) #0
+declare float @llvm.amdgcn.sqrt.f32(float) #0
declare double @llvm.sqrt.f64(double) #0
declare float @llvm.sqrt.f32(float) #0
@@ -66,8 +67,28 @@ define amdgpu_kernel void @unsafe_f32_denormals_rcp_pat_f32(ptr addrspace(1) %ou
; FUNC-LABEL: {{^}}safe_rsq_rcp_pat_f32:
; SI: v_rsq_f32_e32
define amdgpu_kernel void @safe_rsq_rcp_pat_f32(ptr addrspace(1) %out, float %src) #1 {
- %sqrt = call float @llvm.sqrt.f32(float %src)
- %rcp = call float @llvm.amdgcn.rcp.f32(float %sqrt)
+ %sqrt = call contract float @llvm.sqrt.f32(float %src) ; generic llvm.sqrt, now carrying 'contract' (the removed line had no fast-math flags)
+ %rcp = call contract float @llvm.amdgcn.rcp.f32(float %sqrt) ; rcp of the sqrt result, also 'contract'; the expectation above is v_rsq_f32_e32
+ store float %rcp, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}safe_rsq_rcp_pat_amdgcn_sqrt_f32:
+; SI: v_sqrt_f32_e32
+; SI: v_rcp_f32_e32
+define amdgpu_kernel void @safe_rsq_rcp_pat_amdgcn_sqrt_f32(ptr addrspace(1) %out, float %src) #1 {
+ %sqrt = call contract float @llvm.amdgcn.sqrt.f32(float %src) ; uses the llvm.amdgcn.sqrt intrinsic instead of generic llvm.sqrt
+ %rcp = call contract float @llvm.amdgcn.rcp.f32(float %sqrt) ; both calls carry 'contract', yet the expectations above are v_sqrt_f32_e32 followed by v_rcp_f32_e32
+ store float %rcp, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+; FUNC-LABEL: {{^}}safe_rsq_rcp_pat_amdgcn_sqrt_f32_nocontract:
+; SI: v_sqrt_f32_e32
+; SI: v_rcp_f32_e32
+define amdgpu_kernel void @safe_rsq_rcp_pat_amdgcn_sqrt_f32_nocontract(ptr addrspace(1) %out, float %src) #1 {
+ %sqrt = call float @llvm.amdgcn.sqrt.f32(float %src) ; no fast-math flags on the llvm.amdgcn.sqrt call in this variant
+ %rcp = call contract float @llvm.amdgcn.rcp.f32(float %sqrt) ; only the rcp call has 'contract'; the expectations above are v_sqrt_f32_e32 followed by v_rcp_f32_e32
store float %rcp, ptr addrspace(1) %out, align 4
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f32.ll b/llvm/test/CodeGen/AMDGPU/rsq.f32.ll
index b31af6f8691f48..31c4252303b042 100644
--- a/llvm/test/CodeGen/AMDGPU/rsq.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/rsq.f32.ll
@@ -1167,6 +1167,54 @@ define float @v_rsq_f32(float %val) {
ret float %div
}
+define { float, float } @v_rsq_f32_multi_use(float %val) { ; returns both sqrt(%val) and 1.0 / sqrt(%val), so the sqrt result has two uses
+; GCN-DAZ-LABEL: v_rsq_f32_multi_use:
+; GCN-DAZ: ; %bb.0:
+; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT: v_sqrt_f32_e32 v2, v0
+; GCN-DAZ-NEXT: v_rsq_f32_e32 v1, v0
+; GCN-DAZ-NEXT: v_mov_b32_e32 v0, v2
+; GCN-DAZ-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_multi_use:
+; GCN-IEEE-UNSAFE: ; %bb.0:
+; GCN-IEEE-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-UNSAFE-NEXT: v_sqrt_f32_e32 v2, v0
+; GCN-IEEE-UNSAFE-NEXT: v_rsq_f32_e32 v1, v0
+; GCN-IEEE-UNSAFE-NEXT: v_mov_b32_e32 v0, v2
+; GCN-IEEE-UNSAFE-NEXT: s_setpc_b64 s[30:31]
+;
+; SI-IEEE-SAFE-LABEL: v_rsq_f32_multi_use:
+; SI-IEEE-SAFE: ; %bb.0:
+; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0
+; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000
+; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0
+; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
+; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
+; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
+; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v2, v0
+; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v2, vcc, 0, v2
+; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v1, v2
+; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
+;
+; CI-IEEE-SAFE-LABEL: v_rsq_f32_multi_use:
+; CI-IEEE-SAFE: ; %bb.0:
+; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v0, v0
+; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0
+; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
+; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v2, v0
+; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v2, vcc, 0, v2
+; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v1, v2
+; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
+ %sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1 ; NOTE(review): !1 is not defined in this excerpt; presumably an !fpmath accuracy node elsewhere in the file, confirm there
+ %insert.0 = insertvalue { float, float } poison, float %sqrt, 0 ; first use of %sqrt: returned as element 0
+ %div = fdiv contract float 1.0, %sqrt, !fpmath !1 ; second use of %sqrt: its reciprocal
+ %insert.1 = insertvalue { float, float } %insert.0, float %div, 1 ; reciprocal becomes element 1
+ ret { float, float } %insert.1 ; the DAZ and IEEE-UNSAFE expectations above issue v_sqrt_f32 and v_rsq_f32 both from the input; the IEEE-SAFE ones take v_rcp_f32 of the sqrt result via frexp/ldexp
+}
+
define float @v_rsq_f32_missing_contract0(float %val) {
; GCN-DAZ-LABEL: v_rsq_f32_missing_contract0:
; GCN-DAZ: ; %bb.0:
More information about the llvm-commits mailing list