[llvm] c9d0d15 - AMDGPU: Refine some rsq formation tests

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Wed Aug 16 10:42:09 PDT 2023


Author: Matt Arsenault
Date: 2023-08-16T13:37:03-04:00
New Revision: c9d0d15e69a9b3375740ff803f78a8a4ecfc0105

URL: https://github.com/llvm/llvm-project/commit/c9d0d15e69a9b3375740ff803f78a8a4ecfc0105
DIFF: https://github.com/llvm/llvm-project/commit/c9d0d15e69a9b3375740ff803f78a8a4ecfc0105.diff

LOG: AMDGPU: Refine some rsq formation tests

Drop unnecessary flags and metadata, and add contract flags that should
be necessary.
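
For context, the pattern these tests pin down folds the reciprocal of a
square root into a single hardware rsq. A minimal f32 sketch of the
candidate IR, using the intrinsics already declared in combine-rsq.ll
(the function name @rsq_candidate is illustrative, not part of the
patch):

    declare float @llvm.sqrt.f32(float)

    ; With contract on both the sqrt and the fdiv, the backend may fold
    ; 1.0 / sqrt(x) into a single v_rsq_f32. The missing_contract tests
    ; check what happens when either flag is absent.
    define float @rsq_candidate(float %x) {
      %sqrt = call contract float @llvm.sqrt.f32(float %x)
      %rsq = fdiv contract float 1.000000e+00, %sqrt
      ret float %rsq
    }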

Added: 
    

Modified: 
    llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
    llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
    llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
    llvm/test/CodeGen/AMDGPU/rsq.f32.ll

Removed: 
    


################################################################################
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.ll
index cf1747ffe00f3b..b0699c9ebf4160 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.ll
@@ -45,6 +45,49 @@ define amdgpu_cs float @sqrt_rcp(float inreg %arg1) {
   ret float %b
 }
 
+define amdgpu_cs float @div_sqrt_contract(float inreg %arg1) {
+; GCN-LABEL: div_sqrt_contract:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_rsq_f32_e32 v0, s0
+; GCN-NEXT:    ; return to shader part epilog
+.entry:
+  %a = call contract float @llvm.sqrt.f32(float %arg1)
+  %b = fdiv afn contract float 1.000000e+00, %a
+  ret float %b
+}
+
+define amdgpu_cs float @sqrt_div_contract(float inreg %arg1) {
+; GCN-LABEL: sqrt_div_contract:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_rsq_f32_e32 v0, s0
+; GCN-NEXT:    ; return to shader part epilog
+.entry:
+  %a = fdiv afn contract float 1.000000e+00, %arg1
+  %b = call contract float @llvm.sqrt.f32(float %a)
+  ret float %b
+}
+
+define amdgpu_cs float @rcp_sqrt_contract(float inreg %arg1) {
+; GCN-LABEL: rcp_sqrt_contract:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_rsq_f32_e32 v0, s0
+; GCN-NEXT:    ; return to shader part epilog
+.entry:
+  %a = call contract float @llvm.sqrt.f32(float %arg1)
+  %b = call contract float @llvm.amdgcn.rcp.f32(float %a)
+  ret float %b
+}
+
+define amdgpu_cs float @sqrt_rcp_contract(float inreg %arg1) {
+; GCN-LABEL: sqrt_rcp_contract:
+; GCN:       ; %bb.0: ; %.entry
+; GCN-NEXT:    v_rsq_f32_e32 v0, s0
+; GCN-NEXT:    ; return to shader part epilog
+.entry:
+  %a = call contract float @llvm.amdgcn.rcp.f32(float %arg1)
+  %b = call contract float @llvm.sqrt.f32(float %a)
+  ret float %b
+}
 
 declare float @llvm.sqrt.f32(float)
 declare float @llvm.amdgcn.rcp.f32(float)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.mir
index e6b108a5585383..3e6dc1b76c23b1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rsq.mir
@@ -1,13 +1,13 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
 
 ---
-name:            rcp_sqrt_test
+name:            rcp_sqrt_test_f32
 body:             |
   bb.0:
     liveins: $sgpr0
 
-    ; GCN-LABEL: name: rcp_sqrt_test
+    ; GCN-LABEL: name: rcp_sqrt_test_f32
     ; GCN: liveins: $sgpr0
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
@@ -23,12 +23,33 @@ body:             |
 ...
 
 ---
-name:            sqrt_rcp_test
+name:            contract_afn_rcp_contract_sqrt_test_f32
 body:             |
   bb.0:
     liveins: $sgpr0
 
-    ; GCN-LABEL: name: sqrt_rcp_test
+    ; GCN-LABEL: name: contract_afn_rcp_contract_sqrt_test_f32
+    ; GCN: liveins: $sgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+    ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract afn G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), [[COPY]](s32)
+    ; GCN-NEXT: $vgpr0 = COPY [[INT]](s32)
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+    %0:_(s32) = COPY $sgpr0
+    %2:_(s32) = contract G_FSQRT %0:_
+    %3:_(s32) = contract afn G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %2:_(s32)
+    $vgpr0 = COPY %3:_(s32)
+    SI_RETURN_TO_EPILOG implicit $vgpr0
+
+...
+
+---
+name:            sqrt_rcp_test_f32
+body:             |
+  bb.0:
+    liveins: $sgpr0
+
+    ; GCN-LABEL: name: sqrt_rcp_test_f32
     ; GCN: liveins: $sgpr0
     ; GCN-NEXT: {{  $}}
     ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
@@ -42,3 +63,103 @@ body:             |
     SI_RETURN_TO_EPILOG implicit $vgpr0
 
 ...
+
+---
+name:            afn_rcp_afn_amdgcn_sqrt_test_f32
+body:             |
+  bb.0:
+    liveins: $sgpr0
+
+    ; GCN-LABEL: name: afn_rcp_afn_amdgcn_sqrt_test_f32
+    ; GCN: liveins: $sgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+    ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.sqrt), [[COPY]](s32)
+    ; GCN-NEXT: [[INT1:%[0-9]+]]:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[INT]](s32)
+    ; GCN-NEXT: $vgpr0 = COPY [[INT1]](s32)
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+    %0:_(s32) = COPY $sgpr0
+    %1:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.sqrt), %0:_(s32)
+    %2:_(s32) = afn G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %1:_(s32)
+    $vgpr0 = COPY %2
+    SI_RETURN_TO_EPILOG implicit $vgpr0
+
+...
+
+---
+name:            afn_contract_rcp_afn_contract_amdgcn_sqrt_test_f32
+body:             |
+  bb.0:
+    liveins: $sgpr0
+
+    ; GCN-LABEL: name: afn_contract_rcp_afn_contract_amdgcn_sqrt_test_f32
+    ; GCN: liveins: $sgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+    ; GCN-NEXT: [[INT:%[0-9]+]]:_(s32) = contract afn G_INTRINSIC intrinsic(@llvm.amdgcn.sqrt), [[COPY]](s32)
+    ; GCN-NEXT: [[INT1:%[0-9]+]]:_(s32) = contract afn G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[INT]](s32)
+    ; GCN-NEXT: $vgpr0 = COPY [[INT1]](s32)
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+    %0:_(s32) = COPY $sgpr0
+    %1:_(s32) = afn contract G_INTRINSIC intrinsic(@llvm.amdgcn.sqrt), %0:_(s32)
+    %2:_(s32) = afn contract G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %1:_(s32)
+    $vgpr0 = COPY %2
+    SI_RETURN_TO_EPILOG implicit $vgpr0
+
+...
+
+---
+name:            rsq_test_f16
+body:             |
+  bb.0:
+    liveins: $sgpr0
+
+    ; GCN-LABEL: name: rsq_test_f16
+    ; GCN: liveins: $sgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+    ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; GCN-NEXT: %sqrt:_(s16) = G_FSQRT [[TRUNC]]
+    ; GCN-NEXT: %one:_(s16) = contract G_FCONSTANT half 0xH3C00
+    ; GCN-NEXT: %rsq:_(s16) = contract G_FDIV %one, %sqrt
+    ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
+    ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+    %0:_(s32) = COPY $sgpr0
+    %1:_(s16) = G_TRUNC %0
+    %sqrt:_(s16) = G_FSQRT %1:_
+    %one:_(s16) = contract G_FCONSTANT half 1.0
+    %rsq:_(s16) = contract G_FDIV %one, %sqrt
+    %ext:_(s32) = G_ANYEXT %rsq
+    $vgpr0 = COPY %ext
+    SI_RETURN_TO_EPILOG implicit $vgpr0
+
+...
+
+---
+name:            neg_rsq_test_f16
+body:             |
+  bb.0:
+    liveins: $sgpr0
+
+    ; GCN-LABEL: name: neg_rsq_test_f16
+    ; GCN: liveins: $sgpr0
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+    ; GCN-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; GCN-NEXT: %sqrt:_(s16) = G_FSQRT [[TRUNC]]
+    ; GCN-NEXT: %one:_(s16) = contract G_FCONSTANT half 0xHBC00
+    ; GCN-NEXT: %rsq:_(s16) = contract G_FDIV %one, %sqrt
+    ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
+    ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+    ; GCN-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+    %0:_(s32) = COPY $sgpr0
+    %1:_(s16) = G_TRUNC %0
+    %sqrt:_(s16) = G_FSQRT %1:_
+    %one:_(s16) = contract G_FCONSTANT half -1.0
+    %rsq:_(s16) = contract G_FDIV %one, %sqrt
+    %ext:_(s32) = G_ANYEXT %rsq
+    $vgpr0 = COPY %ext
+    SI_RETURN_TO_EPILOG implicit $vgpr0
+
+...

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
index 997d6ae87086c5..546af4d3453101 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
@@ -237,7 +237,7 @@ define half @v_fdiv_f16_ulp25(half %a, half %b) {
 ; GFX11-NEXT:    v_fma_mixlo_f16 v2, v0, v2, 0 op_sel_hi:[1,0,0]
 ; GFX11-NEXT:    v_div_fixup_f16 v0, v2, v1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %fdiv = fdiv half %a, %b, !fpmath !0
+  %fdiv = fdiv half %a, %b
   ret half %fdiv
 }
 
@@ -491,7 +491,7 @@ define half @v_rcp_f16_ulp25(half %x) {
 ; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_rcp_f16_e32 v0, v0
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
-  %fdiv = fdiv half 1.0, %x, !fpmath !0
+  %fdiv = fdiv half 1.0, %x
   ret half %fdiv
 }
 
@@ -527,7 +527,7 @@ define half @v_fdiv_f16_afn_ulp25(half %a, half %b) {
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %fdiv = fdiv afn half %a, %b, !fpmath !0
+  %fdiv = fdiv afn half %a, %b
   ret half %fdiv
 }
 
@@ -594,7 +594,7 @@ define half @v_fdiv_f16_arcp_ulp25(half %a, half %b) {
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %fdiv = fdiv arcp half %a, %b, !fpmath !0
+  %fdiv = fdiv arcp half %a, %b
   ret half %fdiv
 }
 
@@ -986,7 +986,7 @@ define <2 x half> @v_fdiv_v2f16_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX11-NEXT:    v_div_fixup_f16 v1, v4, v2, v5
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %fdiv = fdiv <2 x half> %a, %b, !fpmath !0
+  %fdiv = fdiv <2 x half> %a, %b
   ret <2 x half> %fdiv
 }
 
@@ -1981,7 +1981,7 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
 ; GFX11-NEXT:    v_div_fixup_f16 v1, v3, v1, 1.0
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x, !fpmath !0
+  %fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x
   ret <2 x half> %fdiv
 }
 
@@ -2043,7 +2043,7 @@ define <2 x half> @v_fdiv_v2f16_afn_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX11-NEXT:    v_mul_f16_e32 v1, v3, v2
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %fdiv = fdiv afn <2 x half> %a, %b, !fpmath !0
+  %fdiv = fdiv afn <2 x half> %a, %b
   ret <2 x half> %fdiv
 }
 
@@ -2161,7 +2161,7 @@ define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX11-NEXT:    v_mul_f16_e32 v1, v3, v2
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %fdiv = fdiv arcp <2 x half> %a, %b, !fpmath !0
+  %fdiv = fdiv arcp <2 x half> %a, %b
   ret <2 x half> %fdiv
 }
 
@@ -2223,7 +2223,7 @@ define <2 x half> @v_fdiv_v2f16_arcp_afn_ulp25(<2 x half> %a, <2 x half> %b) {
 ; GFX11-NEXT:    v_mul_f16_e32 v1, v3, v2
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %fdiv = fdiv afn arcp <2 x half> %a, %b, !fpmath !0
+  %fdiv = fdiv afn arcp <2 x half> %a, %b
   ret <2 x half> %fdiv
 }
 
@@ -2818,8 +2818,8 @@ define amdgpu_ps i16 @s_rsq_f16(i16 inreg %a.arg) {
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %a = bitcast i16 %a.arg to half
-  %sqrt = call half @llvm.sqrt.f16(half %a)
-  %fdiv = fdiv half 1.0, %sqrt
+  %sqrt = call contract half @llvm.sqrt.f16(half %a)
+  %fdiv = fdiv contract half 1.0, %sqrt
   %result = bitcast half %fdiv to i16
   ret i16 %result
 }
@@ -3008,8 +3008,8 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %a = bitcast i32 %a.arg to <2 x half>
-  %sqrt = call <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
-  %fdiv = fdiv <2 x half> <half -1.0, half -1.0>, %sqrt
+  %sqrt = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
+  %fdiv = fdiv contract <2 x half> <half -1.0, half -1.0>, %sqrt
   %result = bitcast <2 x half> %fdiv to i32
   ret i32 %result
 }
@@ -3073,8 +3073,8 @@ define half @v_rsq_f16(half %a) {
 ; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_rsq_f16_e32 v0, v0
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
-  %sqrt = call half @llvm.sqrt.f16(half %a)
-  %fdiv = fdiv half 1.0, %sqrt
+  %sqrt = call contract half @llvm.sqrt.f16(half %a)
+  %fdiv = fdiv contract half 1.0, %sqrt
   ret half %fdiv
 }
 
@@ -3147,7 +3147,362 @@ define half @v_neg_rsq_f16(half %a) {
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_rcp_f16_e64 v0, -v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call contract half @llvm.sqrt.f16(half %a)
+  %fdiv = fdiv contract half -1.0, %sqrt
+  ret half %fdiv
+}
+
+define { half, half } @v_rsq_f16_multi_use(half %a) {
+; GFX6-IEEE-LABEL: v_rsq_f16_multi_use:
+; GFX6-IEEE:       ; %bb.0:
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, 1.0
+; GFX6-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v3, s[4:5], v2, v2, v1
+; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v4, v3
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v5, vcc, v1, v2, v1
+; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-IEEE-NEXT:    v_fma_f32 v4, v6, v4, v4
+; GFX6-IEEE-NEXT:    v_mul_f32_e32 v6, v5, v4
+; GFX6-IEEE-NEXT:    v_fma_f32 v7, -v3, v6, v5
+; GFX6-IEEE-NEXT:    v_fma_f32 v6, v7, v4, v6
+; GFX6-IEEE-NEXT:    v_fma_f32 v3, -v3, v6, v5
+; GFX6-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v3, v2, v1
+; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-FLUSH-LABEL: v_rsq_f16_multi_use:
+; GFX6-FLUSH:       ; %bb.0:
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, 1.0
+; GFX6-FLUSH-NEXT:    v_sqrt_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v3, s[4:5], v2, v2, v1
+; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v4, v3
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v5, vcc, v1, v2, v1
+; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-FLUSH-NEXT:    v_fma_f32 v4, v6, v4, v4
+; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v6, v5, v4
+; GFX6-FLUSH-NEXT:    v_fma_f32 v7, -v3, v6, v5
+; GFX6-FLUSH-NEXT:    v_fma_f32 v6, v7, v4, v6
+; GFX6-FLUSH-NEXT:    v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v3, v2, v1
+; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v_rsq_f16_multi_use:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    v_sqrt_f16_e32 v2, v0
+; GFX89-NEXT:    v_rsq_f16_e32 v1, v0
+; GFX89-NEXT:    v_mov_b32_e32 v0, v2
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_rsq_f16_multi_use:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_sqrt_f16_e32 v2, v0
+; GFX10-NEXT:    v_rsq_f16_e32 v1, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, v2
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_rsq_f16_multi_use:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_sqrt_f16_e32 v2, v0
+; GFX11-NEXT:    v_rsq_f16_e32 v1, v0
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_mov_b32_e32 v0, v2
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call contract half @llvm.sqrt.f16(half %a)
+  %insert.0 = insertvalue { half, half } poison, half %sqrt, 0
+  %fdiv = fdiv contract half 1.0, %sqrt
+  %insert.1 = insertvalue { half, half } %insert.0, half %fdiv, 1
+  ret { half, half } %insert.1
+}
+
+define half @v_rsq_f16_missing_contract0(half %a) {
+; GFX6-IEEE-LABEL: v_rsq_f16_missing_contract0:
+; GFX6-IEEE:       ; %bb.0:
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, 1.0
+; GFX6-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-IEEE-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT:    v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT:    v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT:    v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT:    v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-FLUSH-LABEL: v_rsq_f16_missing_contract0:
+; GFX6-FLUSH:       ; %bb.0:
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, 1.0
+; GFX6-FLUSH-NEXT:    v_sqrt_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT:    v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT:    v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT:    v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v_rsq_f16_missing_contract0:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    v_rsq_f16_e32 v0, v0
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10PLUS-LABEL: v_rsq_f16_missing_contract0:
+; GFX10PLUS:       ; %bb.0:
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10PLUS-NEXT:    v_rsq_f16_e32 v0, v0
+; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call half @llvm.sqrt.f16(half %a)
+  %fdiv = fdiv contract half 1.0, %sqrt
+  ret half %fdiv
+}
+
+define half @v_rsq_f16_missing_contract1(half %a) {
+; GFX6-IEEE-LABEL: v_rsq_f16_missing_contract1:
+; GFX6-IEEE:       ; %bb.0:
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, 1.0
+; GFX6-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-IEEE-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT:    v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT:    v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT:    v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT:    v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-FLUSH-LABEL: v_rsq_f16_missing_contract1:
+; GFX6-FLUSH:       ; %bb.0:
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, 1.0
+; GFX6-FLUSH-NEXT:    v_sqrt_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT:    v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT:    v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT:    v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v_rsq_f16_missing_contract1:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    v_rsq_f16_e32 v0, v0
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10PLUS-LABEL: v_rsq_f16_missing_contract1:
+; GFX10PLUS:       ; %bb.0:
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10PLUS-NEXT:    v_rsq_f16_e32 v0, v0
+; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call contract half @llvm.sqrt.f16(half %a)
+  %fdiv = fdiv half 1.0, %sqrt
+  ret half %fdiv
+}
+
+define half @v_neg_rsq_f16_missing_contract0(half %a) {
+; GFX6-IEEE-LABEL: v_neg_rsq_f16_missing_contract0:
+; GFX6-IEEE:       ; %bb.0:
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, -1.0
+; GFX6-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-IEEE-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT:    v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT:    v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT:    v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT:    v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-FLUSH-LABEL: v_neg_rsq_f16_missing_contract0:
+; GFX6-FLUSH:       ; %bb.0:
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, -1.0
+; GFX6-FLUSH-NEXT:    v_sqrt_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT:    v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT:    v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT:    v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v_neg_rsq_f16_missing_contract0:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    v_sqrt_f16_e32 v0, v0
+; GFX89-NEXT:    v_rcp_f16_e64 v0, -v0
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_neg_rsq_f16_missing_contract0:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_sqrt_f16_e32 v0, v0
+; GFX10-NEXT:    v_rcp_f16_e64 v0, -v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_neg_rsq_f16_missing_contract0:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_sqrt_f16_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_rcp_f16_e64 v0, -v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call half @llvm.sqrt.f16(half %a)
+  %fdiv = fdiv contract half -1.0, %sqrt
+  ret half %fdiv
+}
+
+define half @v_neg_rsq_f16_missing_contract1(half %a) {
+; GFX6-IEEE-LABEL: v_neg_rsq_f16_missing_contract1:
+; GFX6-IEEE:       ; %bb.0:
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, -1.0
+; GFX6-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-IEEE-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT:    v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT:    v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT:    v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT:    v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-FLUSH-LABEL: v_neg_rsq_f16_missing_contract1:
+; GFX6-FLUSH:       ; %bb.0:
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, -1.0
+; GFX6-FLUSH-NEXT:    v_sqrt_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT:    v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT:    v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT:    v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v_neg_rsq_f16_missing_contract1:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    v_sqrt_f16_e32 v0, v0
+; GFX89-NEXT:    v_rcp_f16_e64 v0, -v0
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_neg_rsq_f16_missing_contract1:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_sqrt_f16_e32 v0, v0
+; GFX10-NEXT:    v_rcp_f16_e64 v0, -v0
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_neg_rsq_f16_missing_contract1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_sqrt_f16_e32 v0, v0
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_rcp_f16_e64 v0, -v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call contract half @llvm.sqrt.f16(half %a)
   %fdiv = fdiv half -1.0, %sqrt
   ret half %fdiv
 }
@@ -3222,8 +3577,8 @@ define half @v_neg_rsq_f16_fabs(half %a) {
 ; GFX11-NEXT:    v_rcp_f16_e64 v0, -v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call half @llvm.fabs.f16(half %a)
-  %sqrt = call half @llvm.sqrt.f16(half %a.fabs)
-  %fdiv = fdiv half -1.0, %sqrt
+  %sqrt = call contract half @llvm.sqrt.f16(half %a.fabs)
+  %fdiv = fdiv contract half -1.0, %sqrt
   ret half %fdiv
 }
 
@@ -3286,8 +3641,8 @@ define half @v_rsq_f16_arcp(half %a) {
 ; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_rsq_f16_e32 v0, v0
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
-  %sqrt = call half @llvm.sqrt.f16(half %a)
-  %fdiv = fdiv arcp half 1.0, %sqrt
+  %sqrt = call contract half @llvm.sqrt.f16(half %a)
+  %fdiv = fdiv contract arcp half 1.0, %sqrt
   ret half %fdiv
 }
 
@@ -3360,8 +3715,8 @@ define half @v_neg_rsq_f16_arcp(half %a) {
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX11-NEXT:    v_rcp_f16_e64 v0, -v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %sqrt = call half @llvm.sqrt.f16(half %a)
-  %fdiv = fdiv arcp half -1.0, %sqrt
+  %sqrt = call contract half @llvm.sqrt.f16(half %a)
+  %fdiv = fdiv contract arcp half -1.0, %sqrt
   ret half %fdiv
 }
 
@@ -3389,6 +3744,36 @@ define half @v_rsq_f16_afn(half %a) {
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10PLUS-NEXT:    v_rsq_f16_e32 v0, v0
+; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call contract half @llvm.sqrt.f16(half %a)
+  %fdiv = fdiv afn contract half 1.0, %sqrt
+  ret half %fdiv
+}
+
+define half @v_rsq_f16_afn_nocontract(half %a) {
+; GFX6-LABEL: v_rsq_f16_afn_nocontract:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, 1.0
+; GFX6-NEXT:    v_sqrt_f32_e32 v0, v0
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
+; GFX6-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v_rsq_f16_afn_nocontract:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    v_rsq_f16_e32 v0, v0
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10PLUS-LABEL: v_rsq_f16_afn_nocontract:
+; GFX10PLUS:       ; %bb.0:
+; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10PLUS-NEXT:    v_rsq_f16_e32 v0, v0
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call half @llvm.sqrt.f16(half %a)
   %fdiv = fdiv afn half 1.0, %sqrt
@@ -3568,8 +3953,8 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
 ; GFX11-NEXT:    v_div_fixup_f16 v1, v3, v1, 1.0
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %sqrt = call <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
-  %fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %sqrt
+  %sqrt = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
+  %fdiv = fdiv contract <2 x half> <half 1.0, half 1.0>, %sqrt
   ret <2 x half> %fdiv
 }
 
@@ -3746,8 +4131,8 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
 ; GFX11-NEXT:    v_div_fixup_f16 v1, v3, v1, -1.0
 ; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %sqrt = call <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
-  %fdiv = fdiv <2 x half> <half -1.0, half -1.0>, %sqrt
+  %sqrt = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
+  %fdiv = fdiv contract <2 x half> <half -1.0, half -1.0>, %sqrt
   ret <2 x half> %fdiv
 }
 
@@ -3756,7 +4141,6 @@ declare half @llvm.sqrt.f16(half)
 declare <2 x half> @llvm.fabs.v2f16(<2 x half>)
 declare <2 x half> @llvm.sqrt.v2f16(<2 x half>)
 
-!0 = !{float 2.500000e+00}
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GFX10-FLUSH: {{.*}}
 ; GFX10-IEEE: {{.*}}

diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
index 02e8bb070b6aaa..76c79ff055d27f 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -233,7 +233,7 @@ entry:
   %gep.b = getelementptr inbounds half, ptr addrspace(1) %b, i64 %tid.ext
   %gep.r = getelementptr inbounds half, ptr addrspace(1) %r, i64 %tid.ext
   %b.val = load volatile half, ptr addrspace(1) %gep.b
-  %r.val = fdiv half 1.0, %b.val, !fpmath !0
+  %r.val = fdiv half 1.0, %b.val
   store half %r.val, ptr addrspace(1) %gep.r
   ret void
 }
@@ -328,12 +328,12 @@ entry:
   %gep.r = getelementptr inbounds half, ptr addrspace(1) %r, i64 %tid.ext
   %b.val = load volatile half, ptr addrspace(1) %gep.b
   %b.abs = call half @llvm.fabs.f16(half %b.val)
-  %r.val = fdiv half 1.0, %b.abs, !fpmath !0
+  %r.val = fdiv half 1.0, %b.abs
   store half %r.val, ptr addrspace(1) %gep.r
   ret void
 }
 
-; We could not do 1/b -> rcp_f16(b) under !fpmath < 1ulp.
+; We could not do 1/b -> rcp_f32(b) under !fpmath < 1ulp.
 
 define amdgpu_kernel void @reciprocal_f16_rounded(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 {
 ; SI-LABEL: reciprocal_f16_rounded:
@@ -505,7 +505,7 @@ entry:
   %gep.b = getelementptr inbounds half, ptr addrspace(1) %b, i64 %tid.ext
   %gep.r = getelementptr inbounds half, ptr addrspace(1) %r, i64 %tid.ext
   %b.val = load volatile half, ptr addrspace(1) %gep.b
-  %r.val = fdiv afn half 1.0, %b.val, !fpmath !0
+  %r.val = fdiv afn half 1.0, %b.val
   store half %r.val, ptr addrspace(1) %gep.r
   ret void
 }
@@ -599,7 +599,7 @@ entry:
   %gep.b = getelementptr inbounds half, ptr addrspace(1) %b, i64 %tid.ext
   %gep.r = getelementptr inbounds half, ptr addrspace(1) %r, i64 %tid.ext
   %b.val = load volatile half, ptr addrspace(1) %gep.b
-  %r.val = fdiv half -1.0, %b.val, !fpmath !0
+  %r.val = fdiv half -1.0, %b.val
   store half %r.val, ptr addrspace(1) %gep.r
   ret void
 }
@@ -694,8 +694,8 @@ entry:
   %gep.b = getelementptr inbounds half, ptr addrspace(1) %b, i64 %tid.ext
   %gep.r = getelementptr inbounds half, ptr addrspace(1) %r, i64 %tid.ext
   %b.val = load volatile half, ptr addrspace(1) %gep.b
-  %b.sqrt = call half @llvm.sqrt.f16(half %b.val)
-  %r.val = fdiv half 1.0, %b.sqrt, !fpmath !0
+  %b.sqrt = call contract half @llvm.sqrt.f16(half %b.val)
+  %r.val = fdiv contract half 1.0, %b.sqrt
   store half %r.val, ptr addrspace(1) %gep.r
   ret void
 }
@@ -789,6 +789,209 @@ define amdgpu_kernel void @v_rsq_f16_neg(ptr addrspace(1) %r, ptr addrspace(1) %
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep.b = getelementptr inbounds half, ptr addrspace(1) %b, i64 %tid.ext
+  %gep.r = getelementptr inbounds half, ptr addrspace(1) %r, i64 %tid.ext
+  %b.val = load volatile half, ptr addrspace(1) %gep.b
+  %b.sqrt = call contract half @llvm.sqrt.f16(half %b.val)
+  %r.val = fdiv contract half -1.0, %b.sqrt
+  store half %r.val, ptr addrspace(1) %gep.r
+  ret void
+}
+
+define amdgpu_kernel void @v_rsq_f16_multi_use(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 {
+; SI-LABEL: v_rsq_f16_multi_use:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v2
+; SI-NEXT:    v_sqrt_f32_e32 v3, v3
+; SI-NEXT:    v_div_scale_f32 v4, s[2:3], v3, v3, 1.0
+; SI-NEXT:    v_rcp_f32_e32 v5, v4
+; SI-NEXT:    v_div_scale_f32 v6, vcc, 1.0, v3, 1.0
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; SI-NEXT:    v_fma_f32 v7, -v4, v5, 1.0
+; SI-NEXT:    v_fma_f32 v5, v7, v5, v5
+; SI-NEXT:    v_mul_f32_e32 v7, v6, v5
+; SI-NEXT:    v_fma_f32 v8, -v4, v7, v6
+; SI-NEXT:    v_fma_f32 v7, v8, v5, v7
+; SI-NEXT:    v_fma_f32 v4, -v4, v7, v6
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; SI-NEXT:    v_div_fmas_f32 v4, v4, v5, v7
+; SI-NEXT:    v_div_fixup_f32 v3, v4, v3, 1.0
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_store_short v3, v[0:1], s[0:3], 0 addr64
+; SI-NEXT:    s_endpgm
+;
+; GFX8-LABEL: v_rsq_f16_multi_use:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_ushort v3, v[0:1] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_rsq_f16_e32 v4, v3
+; GFX8-NEXT:    flat_store_short v[0:1], v3
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    flat_store_short v[0:1], v4
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: v_rsq_f16_multi_use:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_rsq_f16_e32 v2, v1
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    global_store_short v0, v2, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: v_rsq_f16_multi_use:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_rsq_f16_e32 v2, v1
+; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_store_short v0, v2, s[0:1]
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: v_rsq_f16_multi_use:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_rsq_f16_e32 v2, v1
+; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1] dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_store_b16 v0, v2, s[0:1]
+; GFX11-NEXT:    s_nop 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep.b = getelementptr inbounds half, ptr addrspace(1) %b, i64 %tid.ext
+  %gep.r = getelementptr inbounds half, ptr addrspace(1) %r, i64 %tid.ext
+  %b.val = load volatile half, ptr addrspace(1) %gep.b
+  store volatile half %b.val, ptr addrspace(1) %gep.r
+  %b.sqrt = call contract half @llvm.sqrt.f16(half %b.val)
+  %r.val = fdiv contract half 1.0, %b.sqrt
+  store half %r.val, ptr addrspace(1) %gep.r
+  ret void
+}
+
+define amdgpu_kernel void @v_rsq_f16_missing_contract0(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 {
+; SI-LABEL: v_rsq_f16_missing_contract0:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT:    v_sqrt_f32_e32 v2, v2
+; SI-NEXT:    v_div_scale_f32 v3, s[2:3], v2, v2, 1.0
+; SI-NEXT:    v_rcp_f32_e32 v4, v3
+; SI-NEXT:    v_div_scale_f32 v5, vcc, 1.0, v2, 1.0
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; SI-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
+; SI-NEXT:    v_fma_f32 v4, v6, v4, v4
+; SI-NEXT:    v_mul_f32_e32 v6, v5, v4
+; SI-NEXT:    v_fma_f32 v7, -v3, v6, v5
+; SI-NEXT:    v_fma_f32 v6, v7, v4, v6
+; SI-NEXT:    v_fma_f32 v3, -v3, v6, v5
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; SI-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
+; SI-NEXT:    v_div_fixup_f32 v2, v3, v2, 1.0
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
+; SI-NEXT:    s_endpgm
+;
+; GFX8-LABEL: v_rsq_f16_missing_contract0:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_rsq_f16_e32 v3, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_store_short v[0:1], v3
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: v_rsq_f16_missing_contract0:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_rsq_f16_e32 v1, v1
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: v_rsq_f16_missing_contract0:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_rsq_f16_e32 v1, v1
+; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: v_rsq_f16_missing_contract0:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_rsq_f16_e32 v1, v1
+; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT:    s_nop 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -796,7 +999,204 @@ entry:
   %gep.r = getelementptr inbounds half, ptr addrspace(1) %r, i64 %tid.ext
   %b.val = load volatile half, ptr addrspace(1) %gep.b
   %b.sqrt = call half @llvm.sqrt.f16(half %b.val)
-  %r.val = fdiv half -1.0, %b.sqrt, !fpmath !0
+  %r.val = fdiv contract half 1.0, %b.sqrt
+  store half %r.val, ptr addrspace(1) %gep.r
+  ret void
+}
+
+define amdgpu_kernel void @v_rsq_f16_missing_contract1(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 {
+; SI-LABEL: v_rsq_f16_missing_contract1:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT:    v_sqrt_f32_e32 v2, v2
+; SI-NEXT:    v_div_scale_f32 v3, s[2:3], v2, v2, 1.0
+; SI-NEXT:    v_rcp_f32_e32 v4, v3
+; SI-NEXT:    v_div_scale_f32 v5, vcc, 1.0, v2, 1.0
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; SI-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
+; SI-NEXT:    v_fma_f32 v4, v6, v4, v4
+; SI-NEXT:    v_mul_f32_e32 v6, v5, v4
+; SI-NEXT:    v_fma_f32 v7, -v3, v6, v5
+; SI-NEXT:    v_fma_f32 v6, v7, v4, v6
+; SI-NEXT:    v_fma_f32 v3, -v3, v6, v5
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; SI-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
+; SI-NEXT:    v_div_fixup_f32 v2, v3, v2, 1.0
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
+; SI-NEXT:    s_endpgm
+;
+; GFX8-LABEL: v_rsq_f16_missing_contract1:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_rsq_f16_e32 v3, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_store_short v[0:1], v3
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: v_rsq_f16_missing_contract1:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_rsq_f16_e32 v1, v1
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: v_rsq_f16_missing_contract1:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_rsq_f16_e32 v1, v1
+; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: v_rsq_f16_missing_contract1:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_rsq_f16_e32 v1, v1
+; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT:    s_nop 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep.b = getelementptr inbounds half, ptr addrspace(1) %b, i64 %tid.ext
+  %gep.r = getelementptr inbounds half, ptr addrspace(1) %r, i64 %tid.ext
+  %b.val = load volatile half, ptr addrspace(1) %gep.b
+  %b.sqrt = call contract half @llvm.sqrt.f16(half %b.val)
+  %r.val = fdiv half 1.0, %b.sqrt
+  store half %r.val, ptr addrspace(1) %gep.r
+  ret void
+}
+
+define amdgpu_kernel void @v_neg_rsq_f16_missing_contract1(ptr addrspace(1) %r, ptr addrspace(1) %b) #0 {
+; SI-LABEL: v_neg_rsq_f16_missing_contract1:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT:    v_sqrt_f32_e32 v2, v2
+; SI-NEXT:    v_div_scale_f32 v3, s[2:3], v2, v2, -1.0
+; SI-NEXT:    v_rcp_f32_e32 v4, v3
+; SI-NEXT:    v_div_scale_f32 v5, vcc, -1.0, v2, -1.0
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; SI-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
+; SI-NEXT:    v_fma_f32 v4, v6, v4, v4
+; SI-NEXT:    v_mul_f32_e32 v6, v5, v4
+; SI-NEXT:    v_fma_f32 v7, -v3, v6, v5
+; SI-NEXT:    v_fma_f32 v6, v7, v4, v6
+; SI-NEXT:    v_fma_f32 v3, -v3, v6, v5
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; SI-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
+; SI-NEXT:    v_div_fixup_f32 v2, v3, v2, -1.0
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-NEXT:    buffer_store_short v2, v[0:1], s[0:3], 0 addr64
+; SI-NEXT:    s_endpgm
+;
+; GFX8-LABEL: v_neg_rsq_f16_missing_contract1:
+; GFX8:       ; %bb.0: ; %entry
+; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s3
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8-NEXT:    v_sqrt_f16_e32 v0, v0
+; GFX8-NEXT:    v_rcp_f16_e64 v3, -v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_store_short v[0:1], v3
+; GFX8-NEXT:    s_endpgm
+;
+; GFX9-LABEL: v_neg_rsq_f16_missing_contract1:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_ushort v1, v0, s[2:3] glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_sqrt_f16_e32 v1, v1
+; GFX9-NEXT:    v_rcp_f16_e64 v1, -v1
+; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX10-LABEL: v_neg_rsq_f16_missing_contract1:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3] glc dlc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_sqrt_f16_e32 v1, v1
+; GFX10-NEXT:    v_rcp_f16_e64 v1, -v1
+; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: v_neg_rsq_f16_missing_contract1:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3] glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_sqrt_f16_e32 v1, v1
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_rcp_f16_e64 v1, -v1
+; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT:    s_nop 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep.b = getelementptr inbounds half, ptr addrspace(1) %b, i64 %tid.ext
+  %gep.r = getelementptr inbounds half, ptr addrspace(1) %r, i64 %tid.ext
+  %b.val = load volatile half, ptr addrspace(1) %gep.b
+  %b.sqrt = call contract half @llvm.sqrt.f16(half %b.val)
+  %r.val = fdiv half -1.0, %b.sqrt
   store half %r.val, ptr addrspace(1) %gep.r
   ret void
 }
@@ -1324,12 +1724,408 @@ define half @v_fdiv_f16_afn_nsz(half %x, half %y) {
   ret half %fdiv
 }
 
+define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
+; GFX6-IEEE-LABEL: v_rsq_v2f16:
+; GFX6-IEEE:       ; %bb.0:
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, 1.0
+; GFX6-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    v_sqrt_f32_e32 v1, v1
+; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v3, s[4:5], v0, v0, v2
+; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v6, v3
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v4, vcc, v2, v0, v2
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v5, s[4:5], v1, v1, v2
+; GFX6-IEEE-NEXT:    v_fma_f32 v9, -v3, v6, 1.0
+; GFX6-IEEE-NEXT:    v_fma_f32 v6, v9, v6, v6
+; GFX6-IEEE-NEXT:    v_mul_f32_e32 v9, v4, v6
+; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v8, v5
+; GFX6-IEEE-NEXT:    v_fma_f32 v10, -v3, v9, v4
+; GFX6-IEEE-NEXT:    v_fma_f32 v9, v10, v6, v9
+; GFX6-IEEE-NEXT:    v_fma_f32 v3, -v3, v9, v4
+; GFX6-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v6, v9
+; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-IEEE-NEXT:    v_fma_f32 v3, -v5, v8, 1.0
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v7, s[4:5], v2, v1, v2
+; GFX6-IEEE-NEXT:    v_fma_f32 v3, v3, v8, v8
+; GFX6-IEEE-NEXT:    v_mul_f32_e32 v4, v7, v3
+; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v5, v4, v7
+; GFX6-IEEE-NEXT:    v_fma_f32 v4, v6, v3, v4
+; GFX6-IEEE-NEXT:    v_fma_f32 v5, -v5, v4, v7
+; GFX6-IEEE-NEXT:    s_mov_b64 vcc, s[4:5]
+; GFX6-IEEE-NEXT:    v_div_fmas_f32 v3, v5, v3, v4
+; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-FLUSH-LABEL: v_rsq_v2f16:
+; GFX6-FLUSH:       ; %bb.0:
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, 1.0
+; GFX6-FLUSH-NEXT:    v_sqrt_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT:    v_sqrt_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v3, s[4:5], v0, v0, v2
+; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v4, v3
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-FLUSH-NEXT:    v_fma_f32 v4, v6, v4, v4
+; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v6, v5, v4
+; GFX6-FLUSH-NEXT:    v_fma_f32 v7, -v3, v6, v5
+; GFX6-FLUSH-NEXT:    v_fma_f32 v6, v7, v4, v6
+; GFX6-FLUSH-NEXT:    v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, 1.0
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v4
+; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT:    v_fma_f32 v3, v6, v3, v3
+; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v6, v5, v3
+; GFX6-FLUSH-NEXT:    v_fma_f32 v7, -v2, v6, v5
+; GFX6-FLUSH-NEXT:    v_fma_f32 v6, v7, v3, v6
+; GFX6-FLUSH-NEXT:    v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v6
+; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-LABEL: v_rsq_v2f16:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_sqrt_f32_e32 v0, v0
+; SI-NEXT:    v_sqrt_f32_e32 v1, v1
+; SI-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; SI-NEXT:    v_rcp_f32_e32 v3, v2
+; SI-NEXT:    v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; SI-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
+; SI-NEXT:    v_fma_f32 v3, v5, v3, v3
+; SI-NEXT:    v_mul_f32_e32 v5, v4, v3
+; SI-NEXT:    v_fma_f32 v6, -v2, v5, v4
+; SI-NEXT:    v_fma_f32 v5, v6, v3, v5
+; SI-NEXT:    v_fma_f32 v2, -v2, v5, v4
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; SI-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; SI-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; SI-NEXT:    v_div_fixup_f32 v0, v2, v0, 1.0
+; SI-NEXT:    v_rcp_f32_e32 v2, v3
+; SI-NEXT:    v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; SI-NEXT:    v_fma_f32 v5, -v3, v2, 1.0
+; SI-NEXT:    v_fma_f32 v2, v5, v2, v2
+; SI-NEXT:    v_mul_f32_e32 v5, v4, v2
+; SI-NEXT:    v_fma_f32 v6, -v3, v5, v4
+; SI-NEXT:    v_fma_f32 v5, v6, v2, v5
+; SI-NEXT:    v_fma_f32 v3, -v3, v5, v4
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; SI-NEXT:    v_div_fmas_f32 v2, v3, v2, v5
+; SI-NEXT:    v_div_fixup_f32 v1, v2, v1, 1.0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_rsq_v2f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_rsq_f16_sdwa v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_rsq_f16_e32 v0, v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_rsq_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_rsq_f16_e32 v0, v0
+; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_rsq_v2f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_rsq_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_rsq_f16_e32 v0, v0
+; GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_rsq_v2f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT:    v_rsq_f16_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_rsq_f16_e32 v1, v1
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-IEEE-LABEL: v_rsq_v2f16:
+; GFX9-IEEE:       ; %bb.0:
+; GFX9-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-IEEE-NEXT:    v_sqrt_f16_e32 v1, v0
+; GFX9-IEEE-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v4, 1.0
+; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v0
+; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v2, v2
+; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v3, v3
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v2, v4, v2
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v3, v4, v3
+; GFX9-IEEE-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX9-IEEE-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX9-IEEE-NEXT:    v_div_fixup_f16 v1, v2, v1, 1.0
+; GFX9-IEEE-NEXT:    v_div_fixup_f16 v0, v3, v0, 1.0
+; GFX9-IEEE-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX9-IEEE-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-FLUSH-LABEL: v_rsq_v2f16:
+; GFX9-FLUSH:       ; %bb.0:
+; GFX9-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLUSH-NEXT:    v_sqrt_f16_e32 v1, v0
+; GFX9-FLUSH-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v0
+; GFX9-FLUSH-NEXT:    v_rcp_f32_e32 v2, v2
+; GFX9-FLUSH-NEXT:    v_rcp_f32_e32 v3, v3
+; GFX9-FLUSH-NEXT:    v_mad_mixlo_f16 v2, 1.0, v2, 0 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT:    v_mad_mixlo_f16 v3, 1.0, v3, 0 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v1, v2, v1, 1.0
+; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v0, v3, v0, 1.0
+; GFX9-FLUSH-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX9-FLUSH-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
+  %fdiv = fdiv contract <2 x half> <half 1.0, half 1.0>, %sqrt
+  ret <2 x half> %fdiv
+}
+
+define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
+; GFX6-IEEE-LABEL: v_neg_rsq_v2f16:
+; GFX6-IEEE:       ; %bb.0:
+; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, -1.0
+; GFX6-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    v_sqrt_f32_e32 v1, v1
+; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v3, s[4:5], v0, v0, v2
+; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v6, v3
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v4, vcc, v2, v0, v2
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v5, s[4:5], v1, v1, v2
+; GFX6-IEEE-NEXT:    v_fma_f32 v9, -v3, v6, 1.0
+; GFX6-IEEE-NEXT:    v_fma_f32 v6, v9, v6, v6
+; GFX6-IEEE-NEXT:    v_mul_f32_e32 v9, v4, v6
+; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v8, v5
+; GFX6-IEEE-NEXT:    v_fma_f32 v10, -v3, v9, v4
+; GFX6-IEEE-NEXT:    v_fma_f32 v9, v10, v6, v9
+; GFX6-IEEE-NEXT:    v_fma_f32 v3, -v3, v9, v4
+; GFX6-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v6, v9
+; GFX6-IEEE-NEXT:    v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-IEEE-NEXT:    v_fma_f32 v3, -v5, v8, 1.0
+; GFX6-IEEE-NEXT:    v_div_scale_f32 v7, s[4:5], v2, v1, v2
+; GFX6-IEEE-NEXT:    v_fma_f32 v3, v3, v8, v8
+; GFX6-IEEE-NEXT:    v_mul_f32_e32 v4, v7, v3
+; GFX6-IEEE-NEXT:    v_fma_f32 v6, -v5, v4, v7
+; GFX6-IEEE-NEXT:    v_fma_f32 v4, v6, v3, v4
+; GFX6-IEEE-NEXT:    v_fma_f32 v5, -v5, v4, v7
+; GFX6-IEEE-NEXT:    s_mov_b64 vcc, s[4:5]
+; GFX6-IEEE-NEXT:    v_div_fmas_f32 v3, v5, v3, v4
+; GFX6-IEEE-NEXT:    v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-FLUSH-LABEL: v_neg_rsq_v2f16:
+; GFX6-FLUSH:       ; %bb.0:
+; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, -1.0
+; GFX6-FLUSH-NEXT:    v_sqrt_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT:    v_sqrt_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v3, s[4:5], v0, v0, v2
+; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v4, v3
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-FLUSH-NEXT:    v_fma_f32 v4, v6, v4, v4
+; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v6, v5, v4
+; GFX6-FLUSH-NEXT:    v_fma_f32 v7, -v3, v6, v5
+; GFX6-FLUSH-NEXT:    v_fma_f32 v6, v7, v4, v6
+; GFX6-FLUSH-NEXT:    v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v4, -1.0
+; GFX6-FLUSH-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v4
+; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT:    v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT:    v_fma_f32 v6, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT:    v_fma_f32 v3, v6, v3, v3
+; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v6, v5, v3
+; GFX6-FLUSH-NEXT:    v_fma_f32 v7, -v2, v6, v5
+; GFX6-FLUSH-NEXT:    v_fma_f32 v6, v7, v3, v6
+; GFX6-FLUSH-NEXT:    v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v3, v6
+; GFX6-FLUSH-NEXT:    v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-LABEL: v_neg_rsq_v2f16:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_sqrt_f32_e32 v0, v0
+; SI-NEXT:    v_sqrt_f32_e32 v1, v1
+; SI-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, -1.0
+; SI-NEXT:    v_rcp_f32_e32 v3, v2
+; SI-NEXT:    v_div_scale_f32 v4, vcc, -1.0, v0, -1.0
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; SI-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
+; SI-NEXT:    v_fma_f32 v3, v5, v3, v3
+; SI-NEXT:    v_mul_f32_e32 v5, v4, v3
+; SI-NEXT:    v_fma_f32 v6, -v2, v5, v4
+; SI-NEXT:    v_fma_f32 v5, v6, v3, v5
+; SI-NEXT:    v_fma_f32 v2, -v2, v5, v4
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; SI-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; SI-NEXT:    v_div_scale_f32 v3, s[4:5], v1, v1, -1.0
+; SI-NEXT:    v_div_fixup_f32 v0, v2, v0, -1.0
+; SI-NEXT:    v_rcp_f32_e32 v2, v3
+; SI-NEXT:    v_div_scale_f32 v4, vcc, -1.0, v1, -1.0
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; SI-NEXT:    v_fma_f32 v5, -v3, v2, 1.0
+; SI-NEXT:    v_fma_f32 v2, v5, v2, v2
+; SI-NEXT:    v_mul_f32_e32 v5, v4, v2
+; SI-NEXT:    v_fma_f32 v6, -v3, v5, v4
+; SI-NEXT:    v_fma_f32 v5, v6, v2, v5
+; SI-NEXT:    v_fma_f32 v3, -v3, v5, v4
+; SI-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; SI-NEXT:    v_div_fmas_f32 v2, v3, v2, v5
+; SI-NEXT:    v_div_fixup_f32 v1, v2, v1, -1.0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_neg_rsq_v2f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_sqrt_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT:    v_sqrt_f16_e32 v0, v0
+; GFX8-NEXT:    v_rcp_f16_sdwa v1, -v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GFX8-NEXT:    v_rcp_f16_e64 v0, -v0
+; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_neg_rsq_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_sqrt_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT:    v_sqrt_f16_e32 v0, v0
+; GFX9-NEXT:    v_rcp_f16_e64 v1, -v1
+; GFX9-NEXT:    v_rcp_f16_e64 v0, -v0
+; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: v_neg_rsq_v2f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_sqrt_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_sqrt_f16_e32 v0, v0
+; GFX10-NEXT:    v_rcp_f16_e64 v1, -v1
+; GFX10-NEXT:    v_rcp_f16_e64 v0, -v0
+; GFX10-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_neg_rsq_v2f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT:    v_sqrt_f16_e32 v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_sqrt_f16_e32 v1, v1
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_rcp_f16_e64 v0, -v0
+; GFX11-NEXT:    v_rcp_f16_e64 v1, -v1
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-IEEE-LABEL: v_neg_rsq_v2f16:
+; GFX9-IEEE:       ; %bb.0:
+; GFX9-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-IEEE-NEXT:    v_sqrt_f16_e32 v1, v0
+; GFX9-IEEE-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v4, -1.0
+; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX9-IEEE-NEXT:    v_cvt_f32_f16_e32 v3, v0
+; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v2, v2
+; GFX9-IEEE-NEXT:    v_rcp_f32_e32 v3, v3
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v2, v4, v2
+; GFX9-IEEE-NEXT:    v_mul_f32_e32 v3, v4, v3
+; GFX9-IEEE-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX9-IEEE-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX9-IEEE-NEXT:    v_div_fixup_f16 v1, v2, v1, -1.0
+; GFX9-IEEE-NEXT:    v_div_fixup_f16 v0, v3, v0, -1.0
+; GFX9-IEEE-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX9-IEEE-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-FLUSH-LABEL: v_neg_rsq_v2f16:
+; GFX9-FLUSH:       ; %bb.0:
+; GFX9-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLUSH-NEXT:    v_sqrt_f16_e32 v1, v0
+; GFX9-FLUSH-NEXT:    v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; GFX9-FLUSH-NEXT:    v_cvt_f32_f16_e32 v3, v0
+; GFX9-FLUSH-NEXT:    v_rcp_f32_e32 v2, v2
+; GFX9-FLUSH-NEXT:    v_rcp_f32_e32 v3, v3
+; GFX9-FLUSH-NEXT:    v_mad_mixlo_f16 v2, -1.0, v2, 0 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT:    v_mad_mixlo_f16 v3, -1.0, v3, 0 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v1, v2, v1, -1.0
+; GFX9-FLUSH-NEXT:    v_div_fixup_f16 v0, v3, v0, -1.0
+; GFX9-FLUSH-NEXT:    v_pack_b32_f16 v0, v1, v0
+; GFX9-FLUSH-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
+  %fdiv = fdiv contract <2 x half> <half -1.0, half -1.0>, %sqrt
+  ret <2 x half> %fdiv
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #2
 declare half @llvm.sqrt.f16(half) #2
 declare half @llvm.fabs.f16(half) #2
+declare <2 x half> @llvm.sqrt.v2f16(<2 x half>) #2
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
 attributes #2 = { nounwind "unsafe-fp-math"="true" }
-
-!0 = !{float 2.500000e+00}

diff  --git a/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
index 97fbdd59f71ee8..ac3596f532293f 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
@@ -849,6 +849,81 @@ define float @v_recip_sqrt_f32_ulp25(float %x) {
   ret float %fdiv
 }
 
+define float @v_recip_sqrt_f32_ulp25_contract(float %x) {
+; CODEGEN-IEEE-SDAG-LABEL: v_recip_sqrt_f32_ulp25_contract:
+; CODEGEN-IEEE-SDAG:       ; %bb.0:
+; CODEGEN-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CODEGEN-IEEE-SDAG-NEXT:    s_mov_b32 s4, 0x800000
+; CODEGEN-IEEE-SDAG-NEXT:    v_mov_b32_e32 v1, 0x4b800000
+; CODEGEN-IEEE-SDAG-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; CODEGEN-IEEE-SDAG-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; CODEGEN-IEEE-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CODEGEN-IEEE-SDAG-NEXT:    v_rsq_f32_e32 v0, v0
+; CODEGEN-IEEE-SDAG-NEXT:    v_mov_b32_e32 v1, 0x45800000
+; CODEGEN-IEEE-SDAG-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; CODEGEN-IEEE-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CODEGEN-IEEE-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CODEGEN-IEEE-GISEL-LABEL: v_recip_sqrt_f32_ulp25_contract:
+; CODEGEN-IEEE-GISEL:       ; %bb.0:
+; CODEGEN-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CODEGEN-IEEE-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; CODEGEN-IEEE-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4b800000
+; CODEGEN-IEEE-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; CODEGEN-IEEE-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
+; CODEGEN-IEEE-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CODEGEN-IEEE-GISEL-NEXT:    v_rsq_f32_e32 v0, v0
+; CODEGEN-IEEE-GISEL-NEXT:    v_mov_b32_e32 v1, 0x45800000
+; CODEGEN-IEEE-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; CODEGEN-IEEE-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CODEGEN-IEEE-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; IR-IEEE-LABEL: v_recip_sqrt_f32_ulp25_contract:
+; IR-IEEE:       ; %bb.0:
+; IR-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; IR-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
+; IR-IEEE-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; IR-IEEE-NEXT:    v_rcp_f32_e32 v2, v1
+; IR-IEEE-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; IR-IEEE-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
+; IR-IEEE-NEXT:    v_fma_f32 v2, v4, v2, v2
+; IR-IEEE-NEXT:    v_mul_f32_e32 v4, v3, v2
+; IR-IEEE-NEXT:    v_fma_f32 v5, -v1, v4, v3
+; IR-IEEE-NEXT:    v_fma_f32 v4, v5, v2, v4
+; IR-IEEE-NEXT:    v_fma_f32 v1, -v1, v4, v3
+; IR-IEEE-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
+; IR-IEEE-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; IR-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; CODEGEN-DAZ-LABEL: v_recip_sqrt_f32_ulp25_contract:
+; CODEGEN-DAZ:       ; %bb.0:
+; CODEGEN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CODEGEN-DAZ-NEXT:    v_rsq_f32_e32 v0, v0
+; CODEGEN-DAZ-NEXT:    s_setpc_b64 s[30:31]
+;
+; IR-DAZ-LABEL: v_recip_sqrt_f32_ulp25_contract:
+; IR-DAZ:       ; %bb.0:
+; IR-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; IR-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
+; IR-DAZ-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; IR-DAZ-NEXT:    v_rcp_f32_e32 v2, v1
+; IR-DAZ-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; IR-DAZ-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; IR-DAZ-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
+; IR-DAZ-NEXT:    v_fma_f32 v2, v4, v2, v2
+; IR-DAZ-NEXT:    v_mul_f32_e32 v4, v3, v2
+; IR-DAZ-NEXT:    v_fma_f32 v5, -v1, v4, v3
+; IR-DAZ-NEXT:    v_fma_f32 v4, v5, v2, v4
+; IR-DAZ-NEXT:    v_fma_f32 v1, -v1, v4, v3
+; IR-DAZ-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; IR-DAZ-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
+; IR-DAZ-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; IR-DAZ-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call contract float @llvm.sqrt.f32(float %x), !fpmath !0
+  %fdiv = fdiv contract float 1.0, %sqrt, !fpmath !0
+  ret float %fdiv
+}
+
 define float @v_recip_sqrt_f32_afn_ulp25(float %x) {
 ; CHECK-LABEL: v_recip_sqrt_f32_afn_ulp25:
 ; CHECK:       ; %bb.0:
@@ -860,6 +935,17 @@ define float @v_recip_sqrt_f32_afn_ulp25(float %x) {
   ret float %fdiv
 }
 
+define float @v_recip_sqrt_f32_afn_ulp25_contract(float %x) {
+; CHECK-LABEL: v_recip_sqrt_f32_afn_ulp25_contract:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_rsq_f32_e32 v0, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call contract afn float @llvm.sqrt.f32(float %x), !fpmath !0
+  %fdiv = fdiv contract afn float 1.0, %sqrt, !fpmath !0
+  ret float %fdiv
+}
+
 declare float @llvm.sqrt.f32(float)
 
 !0 = !{float 2.500000e+00}

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
index 0f2eb38f44cd8d..6af67b1ddafac3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
@@ -4,6 +4,7 @@ declare float @llvm.amdgcn.rcp.f32(float) #0
 declare double @llvm.amdgcn.rcp.f64(double) #0
 
 declare double @llvm.amdgcn.sqrt.f64(double) #0
+declare float @llvm.amdgcn.sqrt.f32(float) #0
 declare double @llvm.sqrt.f64(double) #0
 declare float @llvm.sqrt.f32(float) #0
 
@@ -66,8 +67,28 @@ define amdgpu_kernel void @unsafe_f32_denormals_rcp_pat_f32(ptr addrspace(1) %ou
 ; FUNC-LABEL: {{^}}safe_rsq_rcp_pat_f32:
 ; SI: v_rsq_f32_e32
 define amdgpu_kernel void @safe_rsq_rcp_pat_f32(ptr addrspace(1) %out, float %src) #1 {
-  %sqrt = call float @llvm.sqrt.f32(float %src)
-  %rcp = call float @llvm.amdgcn.rcp.f32(float %sqrt)
+  %sqrt = call contract float @llvm.sqrt.f32(float %src)
+  %rcp = call contract float @llvm.amdgcn.rcp.f32(float %sqrt)
+  store float %rcp, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}safe_rsq_rcp_pat_amdgcn_sqrt_f32:
+; SI: v_sqrt_f32_e32
+; SI: v_rcp_f32_e32
+define amdgpu_kernel void @safe_rsq_rcp_pat_amdgcn_sqrt_f32(ptr addrspace(1) %out, float %src) #1 {
+  %sqrt = call contract float @llvm.amdgcn.sqrt.f32(float %src)
+  %rcp = call contract float @llvm.amdgcn.rcp.f32(float %sqrt)
+  store float %rcp, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}safe_rsq_rcp_pat_amdgcn_sqrt_f32_nocontract:
+; SI: v_sqrt_f32_e32
+; SI: v_rcp_f32_e32
+define amdgpu_kernel void @safe_rsq_rcp_pat_amdgcn_sqrt_f32_nocontract(ptr addrspace(1) %out, float %src) #1 {
+  %sqrt = call float @llvm.amdgcn.sqrt.f32(float %src)
+  %rcp = call contract float @llvm.amdgcn.rcp.f32(float %sqrt)
   store float %rcp, ptr addrspace(1) %out, align 4
   ret void
 }

diff  --git a/llvm/test/CodeGen/AMDGPU/rsq.f32.ll b/llvm/test/CodeGen/AMDGPU/rsq.f32.ll
index b31af6f8691f48..31c4252303b042 100644
--- a/llvm/test/CodeGen/AMDGPU/rsq.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/rsq.f32.ll
@@ -1167,6 +1167,54 @@ define float @v_rsq_f32(float %val) {
   ret float %div
 }
 
+define { float, float } @v_rsq_f32_multi_use(float %val) {
+; GCN-DAZ-LABEL: v_rsq_f32_multi_use:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT:    v_sqrt_f32_e32 v2, v0
+; GCN-DAZ-NEXT:    v_rsq_f32_e32 v1, v0
+; GCN-DAZ-NEXT:    v_mov_b32_e32 v0, v2
+; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_multi_use:
+; GCN-IEEE-UNSAFE:       ; %bb.0:
+; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-UNSAFE-NEXT:    v_sqrt_f32_e32 v2, v0
+; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e32 v1, v0
+; GCN-IEEE-UNSAFE-NEXT:    v_mov_b32_e32 v0, v2
+; GCN-IEEE-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-IEEE-SAFE-LABEL: v_rsq_f32_multi_use:
+; SI-IEEE-SAFE:       ; %bb.0:
+; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v0, v0
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x7f800000
+; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e32 v1, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
+; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
+; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
+; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v2, vcc, 0, v2
+; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v1, v1, v2
+; SI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+;
+; CI-IEEE-SAFE-LABEL: v_rsq_f32_multi_use:
+; CI-IEEE-SAFE:       ; %bb.0:
+; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v0, v0
+; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e32 v1, v0
+; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
+; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
+; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v2, vcc, 0, v2
+; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v1, v1, v2
+; CI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1
+  %insert.0 = insertvalue { float, float } poison, float %sqrt, 0
+  %div = fdiv contract float 1.0, %sqrt, !fpmath !1
+  %insert.1 = insertvalue { float, float } %insert.0, float %div, 1
+  ret { float, float } %insert.1
+}
+
 define float @v_rsq_f32_missing_contract0(float %val) {
 ; GCN-DAZ-LABEL: v_rsq_f32_missing_contract0:
 ; GCN-DAZ:       ; %bb.0:
