[llvm] 8ce75ac - AMDGPU: Expand and modernize llvm.sqrt.f32 tests

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Wed Aug 23 17:41:49 PDT 2023


Author: Matt Arsenault
Date: 2023-08-23T20:39:18-04:00
New Revision: 8ce75acd1a5d2affd36697220d595d62adf0df55

URL: https://github.com/llvm/llvm-project/commit/8ce75acd1a5d2affd36697220d595d62adf0df55
DIFF: https://github.com/llvm/llvm-project/commit/8ce75acd1a5d2affd36697220d595d62adf0df55.diff

LOG: AMDGPU: Expand and modernize llvm.sqrt.f32 tests

Added: 
    llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-sqrt.ll
    llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
    llvm/test/CodeGen/AMDGPU/fsqrt.r600.ll

Modified: 
    

Removed: 
    llvm/test/CodeGen/AMDGPU/fsqrt.ll
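
A note on what these tests exercise: !fpmath metadata on a call gives the
maximum error, in ulps, that the result is allowed to have, and the
amdgpu-codegenprepare pass uses it to decide whether llvm.sqrt.f32 may be
expanded into a faster approximate sequence. A minimal sketch of the
pattern (hypothetical function name; the metadata value matches the nodes
used throughout the new tests):

  ; 2.0 ulp or more of allowed error is loose enough for the expansion;
  ; 0.5 ulp demands a correctly rounded result and keeps the plain sqrt.
  declare float @llvm.sqrt.f32(float)

  define float @sqrt_approx(float %x) {
    %r = call float @llvm.sqrt.f32(float %x), !fpmath !0
    ret float %r
  }

  !0 = !{float 2.500000e+00}  ; max error in ulps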


################################################################################
diff  --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-sqrt.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-sqrt.ll
new file mode 100644
index 00000000000000..a34e447d202bdd
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-sqrt.ll
@@ -0,0 +1,383 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt -S -mtriple=amdgcn-- -mcpu=hawaii -passes=amdgpu-codegenprepare -denormal-fp-math-f32=ieee %s | FileCheck -check-prefixes=CHECK,IEEE %s
+; RUN: opt -S -mtriple=amdgcn-- -mcpu=hawaii -passes=amdgpu-codegenprepare -denormal-fp-math-f32=dynamic %s | FileCheck -check-prefixes=CHECK,IEEE %s
+; RUN: opt -S -mtriple=amdgcn-- -mcpu=hawaii -passes=amdgpu-codegenprepare -denormal-fp-math-f32=preserve-sign %s | FileCheck -check-prefixes=CHECK,DAZ %s
+
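+; The ieee and dynamic f32 denormal modes share the IEEE check prefixes;
+; only preserve-sign (denormals flushed to zero) takes the DAZ prefixes.
+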
+define amdgpu_kernel void @noop_sqrt_fpmath(ptr addrspace(1) %out, float %x) #0 {
+; CHECK-LABEL: define amdgpu_kernel void @noop_sqrt_fpmath
+; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[MD_25ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !0
+; CHECK-NEXT:    store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+  %md.25ulp = call float @llvm.sqrt.f32(float %x), !fpmath !3
+  store volatile float %md.25ulp, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @sqrt_fpmath_f32(ptr addrspace(1) %out, float %x) {
+; CHECK-LABEL: define amdgpu_kernel void @sqrt_fpmath_f32
+; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:    [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
+; CHECK-NEXT:    store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; CHECK-NEXT:    store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2
+; CHECK-NEXT:    store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_25ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !3
+; CHECK-NEXT:    store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_3ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !0
+; CHECK-NEXT:    store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_2ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !4
+; CHECK-NEXT:    store volatile float [[MD_2ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+  %no.md = call float @llvm.sqrt.f32(float %x)
+  store volatile float %no.md, ptr addrspace(1) %out, align 4
+
+  %md.half.ulp = call float @llvm.sqrt.f32(float %x), !fpmath !1
+  store volatile float %md.half.ulp, ptr addrspace(1) %out, align 4
+
+  %md.1ulp = call float @llvm.sqrt.f32(float %x), !fpmath !2
+  store volatile float %md.1ulp, ptr addrspace(1) %out, align 4
+
+  %md.25ulp = call float @llvm.sqrt.f32(float %x), !fpmath !0
+  store volatile float %md.25ulp, ptr addrspace(1) %out, align 4
+
+  %md.3ulp = call float @llvm.sqrt.f32(float %x), !fpmath !3
+  store volatile float %md.3ulp, ptr addrspace(1) %out, align 4
+
+  %md.2ulp = call float @llvm.sqrt.f32(float %x), !fpmath !4
+  store volatile float %md.2ulp, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @sqrt_fpmath_v2f32(ptr addrspace(1) %out, <2 x float> %x) {
+; CHECK-LABEL: define amdgpu_kernel void @sqrt_fpmath_v2f32
+; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[NO_MD:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]])
+; CHECK-NEXT:    store volatile <2 x float> [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_HALF_ULP:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !1
+; CHECK-NEXT:    store volatile <2 x float> [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_1ULP:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !2
+; CHECK-NEXT:    store volatile <2 x float> [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_25ULP:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !3
+; CHECK-NEXT:    store volatile <2 x float> [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_3ULP:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !0
+; CHECK-NEXT:    store volatile <2 x float> [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_2ULP:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !4
+; CHECK-NEXT:    store volatile <2 x float> [[MD_2ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+  %no.md = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %x)
+  store volatile <2 x float> %no.md, ptr addrspace(1) %out, align 4
+
+  %md.half.ulp = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %x), !fpmath !1
+  store volatile <2 x float> %md.half.ulp, ptr addrspace(1) %out, align 4
+
+  %md.1ulp = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %x), !fpmath !2
+  store volatile <2 x float> %md.1ulp, ptr addrspace(1) %out, align 4
+
+  %md.25ulp = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %x), !fpmath !0
+  store volatile <2 x float> %md.25ulp, ptr addrspace(1) %out, align 4
+
+  %md.3ulp = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %x), !fpmath !3
+  store volatile <2 x float> %md.3ulp, ptr addrspace(1) %out, align 4
+
+  %md.2ulp = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %x), !fpmath !4
+  store volatile <2 x float> %md.2ulp, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @sqrt_fpmath_f32_known_nosub(ptr addrspace(1) %out, float nofpclass(sub) %x) {
+; CHECK-LABEL: define amdgpu_kernel void @sqrt_fpmath_f32_known_nosub
+; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(sub) [[X:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
+; CHECK-NEXT:    store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; CHECK-NEXT:    store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2
+; CHECK-NEXT:    store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_25ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !3
+; CHECK-NEXT:    store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_3ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !0
+; CHECK-NEXT:    store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_2ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !4
+; CHECK-NEXT:    store volatile float [[MD_2ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+  %no.md = call float @llvm.sqrt.f32(float %x)
+  store volatile float %no.md, ptr addrspace(1) %out, align 4
+
+  %md.half.ulp = call float @llvm.sqrt.f32(float %x), !fpmath !1
+  store volatile float %md.half.ulp, ptr addrspace(1) %out, align 4
+
+  %md.1ulp = call float @llvm.sqrt.f32(float %x), !fpmath !2
+  store volatile float %md.1ulp, ptr addrspace(1) %out, align 4
+
+  %md.25ulp = call float @llvm.sqrt.f32(float %x), !fpmath !0
+  store volatile float %md.25ulp, ptr addrspace(1) %out, align 4
+
+  %md.3ulp = call float @llvm.sqrt.f32(float %x), !fpmath !3
+  store volatile float %md.3ulp, ptr addrspace(1) %out, align 4
+
+  %md.2ulp = call float @llvm.sqrt.f32(float %x), !fpmath !4
+  store volatile float %md.2ulp, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero(ptr addrspace(1) %out, float nofpclass(nzero) %x) {
+; CHECK-LABEL: define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero
+; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(nzero) [[X:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
+; CHECK-NEXT:    store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; CHECK-NEXT:    store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2
+; CHECK-NEXT:    store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_25ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !3
+; CHECK-NEXT:    store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_3ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !0
+; CHECK-NEXT:    store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_2ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !4
+; CHECK-NEXT:    store volatile float [[MD_2ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+  %no.md = call float @llvm.sqrt.f32(float %x)
+  store volatile float %no.md, ptr addrspace(1) %out, align 4
+
+  %md.half.ulp = call float @llvm.sqrt.f32(float %x), !fpmath !1
+  store volatile float %md.half.ulp, ptr addrspace(1) %out, align 4
+
+  %md.1ulp = call float @llvm.sqrt.f32(float %x), !fpmath !2
+  store volatile float %md.1ulp, ptr addrspace(1) %out, align 4
+
+  %md.25ulp = call float @llvm.sqrt.f32(float %x), !fpmath !0
+  store volatile float %md.25ulp, ptr addrspace(1) %out, align 4
+
+  %md.3ulp = call float @llvm.sqrt.f32(float %x), !fpmath !3
+  store volatile float %md.3ulp, ptr addrspace(1) %out, align 4
+
+  %md.2ulp = call float @llvm.sqrt.f32(float %x), !fpmath !4
+  store volatile float %md.2ulp, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero_nonsub(ptr addrspace(1) %out, float nofpclass(nzero nsub) %x) {
+; CHECK-LABEL: define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero_nonsub
+; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(nzero nsub) [[X:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
+; CHECK-NEXT:    store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; CHECK-NEXT:    store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2
+; CHECK-NEXT:    store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_25ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !3
+; CHECK-NEXT:    store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_3ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !0
+; CHECK-NEXT:    store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_2ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !4
+; CHECK-NEXT:    store volatile float [[MD_2ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+  %no.md = call float @llvm.sqrt.f32(float %x)
+  store volatile float %no.md, ptr addrspace(1) %out, align 4
+
+  %md.half.ulp = call float @llvm.sqrt.f32(float %x), !fpmath !1
+  store volatile float %md.half.ulp, ptr addrspace(1) %out, align 4
+
+  %md.1ulp = call float @llvm.sqrt.f32(float %x), !fpmath !2
+  store volatile float %md.1ulp, ptr addrspace(1) %out, align 4
+
+  %md.25ulp = call float @llvm.sqrt.f32(float %x), !fpmath !0
+  store volatile float %md.25ulp, ptr addrspace(1) %out, align 4
+
+  %md.3ulp = call float @llvm.sqrt.f32(float %x), !fpmath !3
+  store volatile float %md.3ulp, ptr addrspace(1) %out, align 4
+
+  %md.2ulp = call float @llvm.sqrt.f32(float %x), !fpmath !4
+  store volatile float %md.2ulp, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero_nonsub_noinf(ptr addrspace(1) %out, float nofpclass(nzero nsub inf) %x) {
+; CHECK-LABEL: define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero_nonsub_noinf
+; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(inf nzero nsub) [[X:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
+; CHECK-NEXT:    store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; CHECK-NEXT:    store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2
+; CHECK-NEXT:    store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_25ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !3
+; CHECK-NEXT:    store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_3ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !0
+; CHECK-NEXT:    store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_2ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !4
+; CHECK-NEXT:    store volatile float [[MD_2ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+  %no.md = call float @llvm.sqrt.f32(float %x)
+  store volatile float %no.md, ptr addrspace(1) %out, align 4
+
+  %md.half.ulp = call float @llvm.sqrt.f32(float %x), !fpmath !1
+  store volatile float %md.half.ulp, ptr addrspace(1) %out, align 4
+
+  %md.1ulp = call float @llvm.sqrt.f32(float %x), !fpmath !2
+  store volatile float %md.1ulp, ptr addrspace(1) %out, align 4
+
+  %md.25ulp = call float @llvm.sqrt.f32(float %x), !fpmath !0
+  store volatile float %md.25ulp, ptr addrspace(1) %out, align 4
+
+  %md.3ulp = call float @llvm.sqrt.f32(float %x), !fpmath !3
+  store volatile float %md.3ulp, ptr addrspace(1) %out, align 4
+
+  %md.2ulp = call float @llvm.sqrt.f32(float %x), !fpmath !4
+  store volatile float %md.2ulp, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @sqrt_fpmath_f32_known_nopsub(ptr addrspace(1) %out, float nofpclass(psub) %x) {
+; CHECK-LABEL: define amdgpu_kernel void @sqrt_fpmath_f32_known_nopsub
+; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(psub) [[X:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
+; CHECK-NEXT:    store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; CHECK-NEXT:    store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2
+; CHECK-NEXT:    store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_25ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !3
+; CHECK-NEXT:    store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_3ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !0
+; CHECK-NEXT:    store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_2ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !4
+; CHECK-NEXT:    store volatile float [[MD_2ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+  %no.md = call float @llvm.sqrt.f32(float %x)
+  store volatile float %no.md, ptr addrspace(1) %out, align 4
+
+  %md.half.ulp = call float @llvm.sqrt.f32(float %x), !fpmath !1
+  store volatile float %md.half.ulp, ptr addrspace(1) %out, align 4
+
+  %md.1ulp = call float @llvm.sqrt.f32(float %x), !fpmath !2
+  store volatile float %md.1ulp, ptr addrspace(1) %out, align 4
+
+  %md.25ulp = call float @llvm.sqrt.f32(float %x), !fpmath !0
+  store volatile float %md.25ulp, ptr addrspace(1) %out, align 4
+
+  %md.3ulp = call float @llvm.sqrt.f32(float %x), !fpmath !3
+  store volatile float %md.3ulp, ptr addrspace(1) %out, align 4
+
+  %md.2ulp = call float @llvm.sqrt.f32(float %x), !fpmath !4
+  store volatile float %md.2ulp, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @sqrt_fpmath_f32_afn(ptr addrspace(1) %out, float %x) {
+; CHECK-LABEL: define amdgpu_kernel void @sqrt_fpmath_f32_afn
+; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[NO_MD:%.*]] = call afn float @llvm.sqrt.f32(float [[X]])
+; CHECK-NEXT:    store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_HALF_ULP:%.*]] = call afn float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; CHECK-NEXT:    store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_1ULP:%.*]] = call afn float @llvm.sqrt.f32(float [[X]]), !fpmath !2
+; CHECK-NEXT:    store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_25ULP:%.*]] = call afn float @llvm.sqrt.f32(float [[X]]), !fpmath !3
+; CHECK-NEXT:    store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_3ULP:%.*]] = call afn float @llvm.sqrt.f32(float [[X]]), !fpmath !0
+; CHECK-NEXT:    store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_2ULP:%.*]] = call afn float @llvm.sqrt.f32(float [[X]]), !fpmath !4
+; CHECK-NEXT:    store volatile float [[MD_2ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+  %no.md = call afn float @llvm.sqrt.f32(float %x)
+  store volatile float %no.md, ptr addrspace(1) %out, align 4
+
+  %md.half.ulp = call afn float @llvm.sqrt.f32(float %x), !fpmath !1
+  store volatile float %md.half.ulp, ptr addrspace(1) %out, align 4
+
+  %md.1ulp = call afn float @llvm.sqrt.f32(float %x), !fpmath !2
+  store volatile float %md.1ulp, ptr addrspace(1) %out, align 4
+
+  %md.25ulp = call afn float @llvm.sqrt.f32(float %x), !fpmath !0
+  store volatile float %md.25ulp, ptr addrspace(1) %out, align 4
+
+  %md.3ulp = call afn float @llvm.sqrt.f32(float %x), !fpmath !3
+  store volatile float %md.3ulp, ptr addrspace(1) %out, align 4
+
+  %md.2ulp = call afn float @llvm.sqrt.f32(float %x), !fpmath !4
+  store volatile float %md.2ulp, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @sqrt_fpmath_f32_assume_nosub(ptr addrspace(1) %out, float %x) {
+; CHECK-LABEL: define amdgpu_kernel void @sqrt_fpmath_f32_assume_nosub
+; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT:    [[FABS_X:%.*]] = call float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT:    [[IS_NOT_SUBNORMAL:%.*]] = fcmp oge float [[FABS_X]], 0x3810000000000000
+; CHECK-NEXT:    call void @llvm.assume(i1 [[IS_NOT_SUBNORMAL]])
+; CHECK-NEXT:    [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
+; CHECK-NEXT:    store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; CHECK-NEXT:    store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2
+; CHECK-NEXT:    store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_25ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !3
+; CHECK-NEXT:    store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_3ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !0
+; CHECK-NEXT:    store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_2ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !4
+; CHECK-NEXT:    store volatile float [[MD_2ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[MD_3ULP_AFN:%.*]] = call afn float @llvm.sqrt.f32(float [[X]]), !fpmath !0
+; CHECK-NEXT:    store volatile float [[MD_3ULP_AFN]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    [[NO_MD_AFN:%.*]] = call afn float @llvm.sqrt.f32(float [[X]])
+; CHECK-NEXT:    store volatile float [[NO_MD_AFN]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+  %fabs.x = call float @llvm.fabs.f32(float %x)
+  %is.not.subnormal = fcmp oge float %fabs.x, 0x3810000000000000
+  call void @llvm.assume(i1 %is.not.subnormal)
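+  ; 0x3810000000000000 is the f64 hex for 2^-126, the smallest normal f32,
+  ; so the assume guarantees |x| is never subnormal.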
+
+  %no.md = call float @llvm.sqrt.f32(float %x)
+  store volatile float %no.md, ptr addrspace(1) %out, align 4
+
+  %md.half.ulp = call float @llvm.sqrt.f32(float %x), !fpmath !1
+  store volatile float %md.half.ulp, ptr addrspace(1) %out, align 4
+
+  %md.1ulp = call float @llvm.sqrt.f32(float %x), !fpmath !2
+  store volatile float %md.1ulp, ptr addrspace(1) %out, align 4
+
+  %md.25ulp = call float @llvm.sqrt.f32(float %x), !fpmath !0
+  store volatile float %md.25ulp, ptr addrspace(1) %out, align 4
+
+  %md.3ulp = call float @llvm.sqrt.f32(float %x), !fpmath !3
+  store volatile float %md.3ulp, ptr addrspace(1) %out, align 4
+
+  %md.2ulp = call float @llvm.sqrt.f32(float %x), !fpmath !4
+  store volatile float %md.2ulp, ptr addrspace(1) %out, align 4
+
+  %md.3ulp.afn = call afn float @llvm.sqrt.f32(float %x), !fpmath !3
+  store volatile float %md.3ulp.afn, ptr addrspace(1) %out, align 4
+
+  %no.md.afn = call afn float @llvm.sqrt.f32(float %x)
+  store volatile float %no.md.afn, ptr addrspace(1) %out, align 4
+
+  ret void
+}
+
+declare float @llvm.sqrt.f32(float)
+declare <2 x float> @llvm.sqrt.v2f32(<2 x float>)
+declare float @llvm.fabs.f32(float)
+declare void @llvm.assume(i1 noundef)
+
+attributes #0 = { optnone noinline }
+
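+; The !fpmath payloads below give the maximum permitted error in ulps.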
+!0 = !{float 2.500000e+00}
+!1 = !{float 5.000000e-01}
+!2 = !{float 1.000000e+00}
+!3 = !{float 3.000000e+00}
+!4 = !{float 2.000000e+00}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; DAZ: {{.*}}
+; IEEE: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
new file mode 100644
index 00000000000000..5942c778c3d47f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
@@ -0,0 +1,1316 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SDAG,GCN-IEEE,SDAG-IEEE %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GISEL,GCN-IEEE,GISEL-IEEE %s
+
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GCN,SDAG,GCN-DAZ,SDAG-DAZ %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GCN,GISEL,GCN-DAZ,GISEL-DAZ %s
+
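+; Four configurations: SelectionDAG (SDAG) vs. GlobalISel (GISEL), each with
+; the default IEEE f32 denormal mode and with preserve-sign (DAZ).
+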
+define float @v_sqrt_f32(float %x) {
+; GCN-LABEL: v_sqrt_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call float @llvm.sqrt.f32(float %x)
+  ret float %result
+}
+
+define float @v_sqrt_f32_fneg(float %x) {
+; GCN-LABEL: v_sqrt_f32_fneg:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e64 v0, -v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %x.neg = fneg float %x
+  %result = call float @llvm.sqrt.f32(float %x.neg)
+  ret float %result
+}
+
+define float @v_sqrt_f32_fabs(float %x) {
+; GCN-LABEL: v_sqrt_f32_fabs:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e64 v0, |v0|
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %x.fabs = call float @llvm.fabs.f32(float %x)
+  %result = call float @llvm.sqrt.f32(float %x.fabs)
+  ret float %result
+}
+
+define float @v_sqrt_f32_fneg_fabs(float %x) {
+; GCN-LABEL: v_sqrt_f32_fneg_fabs:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e64 v0, -|v0|
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %x.fabs = call float @llvm.fabs.f32(float %x)
+  %x.fabs.neg = fneg float %x.fabs
+  %result = call float @llvm.sqrt.f32(float %x.fabs.neg)
+  ret float %result
+}
+
+define float @v_sqrt_f32_ninf(float %x) {
+; GCN-LABEL: v_sqrt_f32_ninf:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call ninf float @llvm.sqrt.f32(float %x)
+  ret float %result
+}
+
+define float @v_sqrt_f32_no_infs_attribute(float %x) #5 {
+; GCN-LABEL: v_sqrt_f32_no_infs_attribute:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call ninf float @llvm.sqrt.f32(float %x)
+  ret float %result
+}
+
+define float @v_sqrt_f32_nnan(float %x) {
+; GCN-LABEL: v_sqrt_f32_nnan:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call nnan float @llvm.sqrt.f32(float %x)
+  ret float %result
+}
+
+define amdgpu_ps i32 @s_sqrt_f32(float inreg %x) {
+; GCN-LABEL: s_sqrt_f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_sqrt_f32_e32 v0, s0
+; GCN-NEXT:    v_readfirstlane_b32 s0, v0
+; GCN-NEXT:    ; return to shader part epilog
+  %result = call float @llvm.sqrt.f32(float %x)
+  %cast = bitcast float %result to i32
+  %firstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
+  ret i32 %firstlane
+}
+
+define amdgpu_ps i32 @s_sqrt_f32_ninf(float inreg %x) {
+; GCN-LABEL: s_sqrt_f32_ninf:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_sqrt_f32_e32 v0, s0
+; GCN-NEXT:    v_readfirstlane_b32 s0, v0
+; GCN-NEXT:    ; return to shader part epilog
+  %result = call ninf float @llvm.sqrt.f32(float %x)
+  %cast = bitcast float %result to i32
+  %firstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
+  ret i32 %firstlane
+}
+
+define amdgpu_ps i32 @s_sqrt_f32_afn(float inreg %x) {
+; GCN-LABEL: s_sqrt_f32_afn:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_sqrt_f32_e32 v0, s0
+; GCN-NEXT:    v_readfirstlane_b32 s0, v0
+; GCN-NEXT:    ; return to shader part epilog
+  %result = call afn float @llvm.sqrt.f32(float %x)
+  %cast = bitcast float %result to i32
+  %firstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
+  ret i32 %firstlane
+}
+
+define amdgpu_ps i32 @s_sqrt_f32_afn_nnan_ninf(float inreg %x) {
+; GCN-LABEL: s_sqrt_f32_afn_nnan_ninf:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_sqrt_f32_e32 v0, s0
+; GCN-NEXT:    v_readfirstlane_b32 s0, v0
+; GCN-NEXT:    ; return to shader part epilog
+  %result = call afn nnan ninf float @llvm.sqrt.f32(float %x)
+  %cast = bitcast float %result to i32
+  %firstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
+  ret i32 %firstlane
+}
+
+define float @v_sqrt_f32_nsz(float %x) {
+; GCN-LABEL: v_sqrt_f32_nsz:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call nsz float @llvm.sqrt.f32(float %x)
+  ret float %result
+}
+
+define float @v_sqrt_f32_nnan_ninf(float %x) {
+; GCN-LABEL: v_sqrt_f32_nnan_ninf:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call nnan ninf float @llvm.sqrt.f32(float %x)
+  ret float %result
+}
+
+define float @v_sqrt_f32_nnan_ninf_nsz(float %x) {
+; GCN-LABEL: v_sqrt_f32_nnan_ninf_nsz:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call nnan ninf nsz float @llvm.sqrt.f32(float %x)
+  ret float %result
+}
+
+define float @v_sqrt_f32_afn(float %x) {
+; GCN-LABEL: v_sqrt_f32_afn:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call afn float @llvm.sqrt.f32(float %x)
+  ret float %result
+}
+
+define float @v_sqrt_f32_afn_nsz(float %x) {
+; GCN-LABEL: v_sqrt_f32_afn_nsz:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call afn nsz float @llvm.sqrt.f32(float %x)
+  ret float %result
+}
+
+define <2 x float> @v_sqrt_v2f32_afn(<2 x float> %x) {
+; GCN-LABEL: v_sqrt_v2f32_afn:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    v_sqrt_f32_e32 v1, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call afn <2 x float> @llvm.sqrt.v2f32(<2 x float> %x)
+  ret <2 x float> %result
+}
+
+define float @v_sqrt_f32_afn_nnan(float %x) {
+; GCN-LABEL: v_sqrt_f32_afn_nnan:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call afn nnan float @llvm.sqrt.f32(float %x)
+  ret float %result
+}
+
+define float @v_sqrt_f32_fabs_afn_ninf(float %x) {
+; GCN-LABEL: v_sqrt_f32_fabs_afn_ninf:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e64 v0, |v0|
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %fabs = call float @llvm.fabs.f32(float %x)
+  %result = call afn ninf float @llvm.sqrt.f32(float %fabs)
+  ret float %result
+}
+
+define float @v_sqrt_f32_afn_nnan_ninf(float %x) {
+; GCN-LABEL: v_sqrt_f32_afn_nnan_ninf:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call afn nnan ninf float @llvm.sqrt.f32(float %x)
+  ret float %result
+}
+
+define <2 x float> @v_sqrt_v2f32_afn_nnan_ninf(<2 x float> %x) {
+; GCN-LABEL: v_sqrt_v2f32_afn_nnan_ninf:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    v_sqrt_f32_e32 v1, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call afn nnan ninf <2 x float> @llvm.sqrt.v2f32(<2 x float> %x)
+  ret <2 x float> %result
+}
+
+define float @v_sqrt_f32_afn_nnan_ninf_nsz(float %x) {
+; GCN-LABEL: v_sqrt_f32_afn_nnan_ninf_nsz:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call afn nnan ninf nsz float @llvm.sqrt.f32(float %x)
+  ret float %result
+}
+
+define float @v_sqrt_f32__approx_func_fp_math(float %x) #2 {
+; GCN-LABEL: v_sqrt_f32__approx_func_fp_math:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call nsz float @llvm.sqrt.f32(float %x)
+  ret float %result
+}
+
+define float @v_sqrt_f32__enough_unsafe_attrs(float %x) #3 {
+; GCN-LABEL: v_sqrt_f32__enough_unsafe_attrs:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call nsz float @llvm.sqrt.f32(float %x)
+  ret float %result
+}
+
+define float @v_sqrt_f32__unsafe_attr(float %x) #4 {
+; GCN-LABEL: v_sqrt_f32__unsafe_attr:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call nsz float @llvm.sqrt.f32(float %x)
+  ret float %result
+}
+
+define <2 x float> @v_sqrt_v2f32(<2 x float> %x) {
+; GCN-LABEL: v_sqrt_v2f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    v_sqrt_f32_e32 v1, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %x)
+  ret <2 x float> %result
+}
+
+define <3 x float> @v_sqrt_v3f32(<3 x float> %x) {
+; GCN-LABEL: v_sqrt_v3f32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    v_sqrt_f32_e32 v1, v1
+; GCN-NEXT:    v_sqrt_f32_e32 v2, v2
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <3 x float> @llvm.sqrt.v3f32(<3 x float> %x)
+  ret <3 x float> %result
+}
+
+; fpmath should be ignored
+define float @v_sqrt_f32_ulp05(float %x) {
+; GCN-LABEL: v_sqrt_f32_ulp05:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call float @llvm.sqrt.f32(float %x), !fpmath !0
+  ret float %result
+}
+
+; fpmath should be used with DAZ only
+define float @v_sqrt_f32_ulp1(float %x) {
+; GCN-LABEL: v_sqrt_f32_ulp1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call float @llvm.sqrt.f32(float %x), !fpmath !1
+  ret float %result
+}
+
+; fpmath should always be used
+define float @v_sqrt_f32_ulp2(float %x) {
+; GCN-LABEL: v_sqrt_f32_ulp2:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call float @llvm.sqrt.f32(float %x), !fpmath !2
+  ret float %result
+}
+
+; fpmath should always be used
+define float @v_sqrt_f32_ulp25(float %x) {
+; GCN-LABEL: v_sqrt_f32_ulp25:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call float @llvm.sqrt.f32(float %x), !fpmath !3
+  ret float %result
+}
+
+; fpmath should always be used
+define float @v_sqrt_f32_ulp3(float %x) {
+; GCN-LABEL: v_sqrt_f32_ulp3:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call float @llvm.sqrt.f32(float %x), !fpmath !4
+  ret float %result
+}
+
+define float @v_sqrt_f32_ulp2_fabs(float %x) {
+; GCN-LABEL: v_sqrt_f32_ulp2_fabs:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e64 v0, |v0|
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %x.fabs = call float @llvm.fabs.f32(float %x)
+  %result = call float @llvm.sqrt.f32(float %x.fabs), !fpmath !2
+  ret float %result
+}
+
+define <2 x float> @v_sqrt_v2f32_ulp1(<2 x float> %x) {
+; GCN-LABEL: v_sqrt_v2f32_ulp1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    v_sqrt_f32_e32 v1, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %x), !fpmath !1
+  ret <2 x float> %result
+}
+
+; fpmath should always be used
+define <2 x float> @v_sqrt_v2f32_ulp2(<2 x float> %x) {
+; GCN-LABEL: v_sqrt_v2f32_ulp2:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    v_sqrt_f32_e32 v1, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %x), !fpmath !2
+  ret <2 x float> %result
+}
+
+define <2 x float> @v_sqrt_v2f32_ulp1_fabs(<2 x float> %x) {
+; GCN-LABEL: v_sqrt_v2f32_ulp1_fabs:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e64 v0, |v0|
+; GCN-NEXT:    v_sqrt_f32_e64 v1, |v1|
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %x.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %x)
+  %result = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %x.fabs), !fpmath !1
+  ret <2 x float> %result
+}
+
+; fpmath should always be used
+define <2 x float> @v_sqrt_v2f32_ulp2_fabs(<2 x float> %x) {
+; GCN-LABEL: v_sqrt_v2f32_ulp2_fabs:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e64 v0, |v0|
+; GCN-NEXT:    v_sqrt_f32_e64 v1, |v1|
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %x.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %x)
+  %result = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %x.fabs), !fpmath !2
+  ret <2 x float> %result
+}
+
+; afn is stronger than the fpmath metadata
+define float @v_sqrt_f32_afn_ulp1(float %x) {
+; GCN-LABEL: v_sqrt_f32_afn_ulp1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call afn float @llvm.sqrt.f32(float %x), !fpmath !1
+  ret float %result
+}
+
+; afn is stronger than the fpmath metadata
+define float @v_sqrt_f32_afn_ulp2(float %x) {
+; GCN-LABEL: v_sqrt_f32_afn_ulp2:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call afn float @llvm.sqrt.f32(float %x), !fpmath !2
+  ret float %result
+}
+
+define <2 x float> @v_sqrt_v2f32_afn_ulp1(<2 x float> %x) {
+; GCN-LABEL: v_sqrt_v2f32_afn_ulp1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    v_sqrt_f32_e32 v1, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call afn <2 x float> @llvm.sqrt.v2f32(<2 x float> %x), !fpmath !1
+  ret <2 x float> %result
+}
+
+; fpmath should always be used
+define <2 x float> @v_sqrt_v2f32_afn_ulp2(<2 x float> %x) {
+; GCN-LABEL: v_sqrt_v2f32_afn_ulp2:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    v_sqrt_f32_e32 v1, v1
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call afn <2 x float> @llvm.sqrt.v2f32(<2 x float> %x), !fpmath !2
+  ret <2 x float> %result
+}
+
+define float @v_sqrt_f32_ulp2_noncontractable_rcp(float %x) {
+; SDAG-IEEE-LABEL: v_sqrt_f32_ulp2_noncontractable_rcp:
+; SDAG-IEEE:       ; %bb.0:
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
+; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0x7f800000
+; SDAG-IEEE-NEXT:    v_frexp_mant_f32_e32 v1, v0
+; SDAG-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; SDAG-IEEE-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
+; SDAG-IEEE-NEXT:    v_rcp_f32_e32 v1, v1
+; SDAG-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; SDAG-IEEE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
+; SDAG-IEEE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-IEEE-LABEL: v_sqrt_f32_ulp2_noncontractable_rcp:
+; GISEL-IEEE:       ; %bb.0:
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
+; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x7f800000
+; GISEL-IEEE-NEXT:    v_frexp_mant_f32_e32 v2, v0
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v1
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v1, v0, v2, vcc
+; GISEL-IEEE-NEXT:    v_rcp_f32_e32 v1, v1
+; GISEL-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; GISEL-IEEE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-DAZ-LABEL: v_sqrt_f32_ulp2_noncontractable_rcp:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    v_rcp_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call contract float @llvm.sqrt.f32(float %x), !fpmath !4
+  %result = fdiv float 1.0, %sqrt, !fpmath !3
+  ret float %result
+}
+
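+; With contract on both the sqrt and the fdiv, 1.0 / sqrt(x) folds to a
+; single v_rsq_f32 under DAZ. The IEEE path scales inputs below 2^-126
+; (bit pattern 0x800000) up by 2^24 (0x4b800000) and rescales the rsq
+; result by 2^12 (0x45800000), since rsq(x * 2^24) = rsq(x) / 2^12.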
+define float @v_sqrt_f32_ulp2_contractable_rcp(float %x) {
+; SDAG-IEEE-LABEL: v_sqrt_f32_ulp2_contractable_rcp:
+; SDAG-IEEE:       ; %bb.0:
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0x800000
+; SDAG-IEEE-NEXT:    v_mov_b32_e32 v1, 0x4b800000
+; SDAG-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; SDAG-IEEE-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; SDAG-IEEE-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SDAG-IEEE-NEXT:    v_rsq_f32_e32 v0, v0
+; SDAG-IEEE-NEXT:    v_mov_b32_e32 v1, 0x45800000
+; SDAG-IEEE-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; SDAG-IEEE-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-IEEE-LABEL: v_sqrt_f32_ulp2_contractable_rcp:
+; GISEL-IEEE:       ; %bb.0:
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x800000
+; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x4b800000
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v1
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
+; GISEL-IEEE-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GISEL-IEEE-NEXT:    v_rsq_f32_e32 v0, v0
+; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x45800000
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; GISEL-IEEE-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-DAZ-LABEL: v_sqrt_f32_ulp2_contractable_rcp:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call contract float @llvm.sqrt.f32(float %x), !fpmath !4
+  %result = fdiv contract float 1.0, %sqrt, !fpmath !3
+  ret float %result
+}
+
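+; In the DAZ division lowering, denominators larger than 2^96 (0x6f800000)
+; are scaled by 2^-32 (0x2f800000) before v_rcp_f32 to stay in range, and
+; the quotient is rescaled by the same factor.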
+define float @v_sqrt_f32_ulp2_noncontractable_fdiv(float %x, float %y) {
+; GCN-IEEE-LABEL: v_sqrt_f32_ulp2_noncontractable_fdiv:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-IEEE-NEXT:    s_mov_b32 s4, 0x7f800000
+; GCN-IEEE-NEXT:    v_frexp_mant_f32_e32 v3, v1
+; GCN-IEEE-NEXT:    v_frexp_mant_f32_e32 v2, v0
+; GCN-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; GCN-IEEE-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; GCN-IEEE-NEXT:    v_rcp_f32_e32 v2, v2
+; GCN-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
+; GCN-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; GCN-IEEE-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
+; GCN-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
+; GCN-IEEE-NEXT:    v_mul_f32_e32 v2, v3, v2
+; GCN-IEEE-NEXT:    v_sub_i32_e32 v0, vcc, v1, v0
+; GCN-IEEE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; GCN-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; SDAG-DAZ-LABEL: v_sqrt_f32_ulp2_noncontractable_fdiv:
+; SDAG-DAZ:       ; %bb.0:
+; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
+; SDAG-DAZ-NEXT:    s_mov_b32 s4, 0x6f800000
+; SDAG-DAZ-NEXT:    v_mov_b32_e32 v2, 0x2f800000
+; SDAG-DAZ-NEXT:    v_cmp_gt_f32_e64 vcc, |v0|, s4
+; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
+; SDAG-DAZ-NEXT:    v_mul_f32_e32 v0, v0, v2
+; SDAG-DAZ-NEXT:    v_rcp_f32_e32 v0, v0
+; SDAG-DAZ-NEXT:    v_mul_f32_e32 v0, v1, v0
+; SDAG-DAZ-NEXT:    v_mul_f32_e32 v0, v2, v0
+; SDAG-DAZ-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-DAZ-LABEL: v_sqrt_f32_ulp2_noncontractable_fdiv:
+; GISEL-DAZ:       ; %bb.0:
+; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
+; GISEL-DAZ-NEXT:    v_mov_b32_e32 v2, 0x6f800000
+; GISEL-DAZ-NEXT:    v_mov_b32_e32 v3, 0x2f800000
+; GISEL-DAZ-NEXT:    v_cmp_gt_f32_e64 vcc, |v0|, v2
+; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
+; GISEL-DAZ-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GISEL-DAZ-NEXT:    v_rcp_f32_e32 v0, v0
+; GISEL-DAZ-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GISEL-DAZ-NEXT:    v_mul_f32_e32 v0, v2, v0
+; GISEL-DAZ-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call contract float @llvm.sqrt.f32(float %x), !fpmath !4
+  %result = fdiv float %y, %sqrt, !fpmath !3
+  ret float %result
+}
+
+define float @v_sqrt_f32_ulp2_contractable_fdiv(float %x, float %y) {
+; GCN-IEEE-LABEL: v_sqrt_f32_ulp2_contractable_fdiv:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-IEEE-NEXT:    s_mov_b32 s4, 0x7f800000
+; GCN-IEEE-NEXT:    v_frexp_mant_f32_e32 v3, v1
+; GCN-IEEE-NEXT:    v_frexp_mant_f32_e32 v2, v0
+; GCN-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; GCN-IEEE-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; GCN-IEEE-NEXT:    v_rcp_f32_e32 v2, v2
+; GCN-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
+; GCN-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; GCN-IEEE-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
+; GCN-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
+; GCN-IEEE-NEXT:    v_mul_f32_e32 v2, v3, v2
+; GCN-IEEE-NEXT:    v_sub_i32_e32 v0, vcc, v1, v0
+; GCN-IEEE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; GCN-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; SDAG-DAZ-LABEL: v_sqrt_f32_ulp2_contractable_fdiv:
+; SDAG-DAZ:       ; %bb.0:
+; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
+; SDAG-DAZ-NEXT:    s_mov_b32 s4, 0x6f800000
+; SDAG-DAZ-NEXT:    v_mov_b32_e32 v2, 0x2f800000
+; SDAG-DAZ-NEXT:    v_cmp_gt_f32_e64 vcc, |v0|, s4
+; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
+; SDAG-DAZ-NEXT:    v_mul_f32_e32 v0, v0, v2
+; SDAG-DAZ-NEXT:    v_rcp_f32_e32 v0, v0
+; SDAG-DAZ-NEXT:    v_mul_f32_e32 v0, v1, v0
+; SDAG-DAZ-NEXT:    v_mul_f32_e32 v0, v2, v0
+; SDAG-DAZ-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-DAZ-LABEL: v_sqrt_f32_ulp2_contractable_fdiv:
+; GISEL-DAZ:       ; %bb.0:
+; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
+; GISEL-DAZ-NEXT:    v_mov_b32_e32 v2, 0x6f800000
+; GISEL-DAZ-NEXT:    v_mov_b32_e32 v3, 0x2f800000
+; GISEL-DAZ-NEXT:    v_cmp_gt_f32_e64 vcc, |v0|, v2
+; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
+; GISEL-DAZ-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GISEL-DAZ-NEXT:    v_rcp_f32_e32 v0, v0
+; GISEL-DAZ-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GISEL-DAZ-NEXT:    v_mul_f32_e32 v0, v2, v0
+; GISEL-DAZ-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call contract float @llvm.sqrt.f32(float %x), !fpmath !4
+  %result = fdiv contract float %y, %sqrt, !fpmath !3
+  ret float %result
+}
+
+define float @v_sqrt_f32_ulp2_contractable_fdiv_arcp(float %x, float %y) {
+; SDAG-IEEE-LABEL: v_sqrt_f32_ulp2_contractable_fdiv_arcp:
+; SDAG-IEEE:       ; %bb.0:
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
+; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0x7f800000
+; SDAG-IEEE-NEXT:    v_frexp_mant_f32_e32 v2, v0
+; SDAG-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; SDAG-IEEE-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; SDAG-IEEE-NEXT:    v_rcp_f32_e32 v2, v2
+; SDAG-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; SDAG-IEEE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
+; SDAG-IEEE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; SDAG-IEEE-NEXT:    v_mul_f32_e32 v0, v1, v0
+; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-IEEE-LABEL: v_sqrt_f32_ulp2_contractable_fdiv_arcp:
+; GISEL-IEEE:       ; %bb.0:
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
+; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x7f800000
+; GISEL-IEEE-NEXT:    v_frexp_mant_f32_e32 v3, v0
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v2, v0, v3, vcc
+; GISEL-IEEE-NEXT:    v_rcp_f32_e32 v2, v2
+; GISEL-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; GISEL-IEEE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; GISEL-IEEE-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; SDAG-DAZ-LABEL: v_sqrt_f32_ulp2_contractable_fdiv_arcp:
+; SDAG-DAZ:       ; %bb.0:
+; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
+; SDAG-DAZ-NEXT:    v_rcp_f32_e32 v0, v0
+; SDAG-DAZ-NEXT:    v_mul_f32_e32 v0, v1, v0
+; SDAG-DAZ-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-DAZ-LABEL: v_sqrt_f32_ulp2_contractable_fdiv_arcp:
+; GISEL-DAZ:       ; %bb.0:
+; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-DAZ-NEXT:    v_rsq_f32_e32 v0, v0
+; GISEL-DAZ-NEXT:    v_mul_f32_e32 v0, v1, v0
+; GISEL-DAZ-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call contract float @llvm.sqrt.f32(float %x), !fpmath !4
+  %result = fdiv arcp contract float %y, %sqrt, !fpmath !3
+  ret float %result
+}
+
+define <2 x float> @v_sqrt_v2f32_ulp2_noncontractable_rcp(<2 x float> %x) {
+; GCN-IEEE-LABEL: v_sqrt_v2f32_ulp2_noncontractable_rcp:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-IEEE-NEXT:    s_mov_b32 s4, 0x7f800000
+; GCN-IEEE-NEXT:    v_sqrt_f32_e32 v1, v1
+; GCN-IEEE-NEXT:    v_frexp_mant_f32_e32 v2, v0
+; GCN-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; GCN-IEEE-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; GCN-IEEE-NEXT:    v_rcp_f32_e32 v2, v2
+; GCN-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; GCN-IEEE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
+; GCN-IEEE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; GCN-IEEE-NEXT:    v_frexp_mant_f32_e32 v2, v1
+; GCN-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
+; GCN-IEEE-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
+; GCN-IEEE-NEXT:    v_rcp_f32_e32 v2, v2
+; GCN-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
+; GCN-IEEE-NEXT:    v_sub_i32_e32 v1, vcc, 0, v1
+; GCN-IEEE-NEXT:    v_ldexp_f32_e32 v1, v2, v1
+; GCN-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-DAZ-LABEL: v_sqrt_v2f32_ulp2_noncontractable_rcp:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    v_sqrt_f32_e32 v1, v1
+; GCN-DAZ-NEXT:    v_rcp_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    v_rcp_f32_e32 v1, v1
+; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %x), !fpmath !4
+  %result = fdiv <2 x float> <float 1.0, float 1.0>, %sqrt, !fpmath !3
+  ret <2 x float> %result
+}
+
+define <2 x float> @v_sqrt_v2f32_ulp2_contractable_rcp(<2 x float> %x) {
+; GCN-IEEE-LABEL: v_sqrt_v2f32_ulp2_contractable_rcp:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT:    s_mov_b32 s4, 0x800000
+; GCN-IEEE-NEXT:    v_mov_b32_e32 v2, 0x4b800000
+; GCN-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GCN-IEEE-NEXT:    v_cndmask_b32_e32 v3, 1.0, v2, vcc
+; GCN-IEEE-NEXT:    v_cmp_gt_f32_e64 s[4:5], s4, v1
+; GCN-IEEE-NEXT:    v_mul_f32_e32 v0, v0, v3
+; GCN-IEEE-NEXT:    v_cndmask_b32_e64 v2, 1.0, v2, s[4:5]
+; GCN-IEEE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GCN-IEEE-NEXT:    v_rsq_f32_e32 v1, v1
+; GCN-IEEE-NEXT:    v_mov_b32_e32 v3, 0x45800000
+; GCN-IEEE-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
+; GCN-IEEE-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GCN-IEEE-NEXT:    v_cndmask_b32_e64 v2, 1.0, v3, s[4:5]
+; GCN-IEEE-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GCN-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-DAZ-LABEL: v_sqrt_v2f32_ulp2_contractable_rcp:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    v_rsq_f32_e32 v1, v1
+; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %x), !fpmath !4
+  %result = fdiv contract <2 x float> <float 1.0, float 1.0>, %sqrt, !fpmath !3
+  ret <2 x float> %result
+}
+
+define <2 x float> @v_sqrt_v2f32_ulp2_contractable_fdiv(<2 x float> %x, <2 x float> %y) {
+; GCN-IEEE-LABEL: v_sqrt_v2f32_ulp2_contractable_fdiv:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-IEEE-NEXT:    s_mov_b32 s4, 0x7f800000
+; GCN-IEEE-NEXT:    v_sqrt_f32_e32 v1, v1
+; GCN-IEEE-NEXT:    v_frexp_mant_f32_e32 v5, v2
+; GCN-IEEE-NEXT:    v_frexp_mant_f32_e32 v4, v0
+; GCN-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; GCN-IEEE-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc
+; GCN-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v2|, s4
+; GCN-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; GCN-IEEE-NEXT:    v_cndmask_b32_e32 v5, v2, v5, vcc
+; GCN-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v2, v2
+; GCN-IEEE-NEXT:    v_rcp_f32_e32 v4, v4
+; GCN-IEEE-NEXT:    v_sub_i32_e32 v0, vcc, v2, v0
+; GCN-IEEE-NEXT:    v_frexp_mant_f32_e32 v2, v1
+; GCN-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
+; GCN-IEEE-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
+; GCN-IEEE-NEXT:    v_rcp_f32_e32 v2, v2
+; GCN-IEEE-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GCN-IEEE-NEXT:    v_ldexp_f32_e32 v0, v4, v0
+; GCN-IEEE-NEXT:    v_frexp_mant_f32_e32 v4, v3
+; GCN-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v3|, s4
+; GCN-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
+; GCN-IEEE-NEXT:    v_cndmask_b32_e32 v4, v3, v4, vcc
+; GCN-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v3, v3
+; GCN-IEEE-NEXT:    v_mul_f32_e32 v2, v4, v2
+; GCN-IEEE-NEXT:    v_sub_i32_e32 v1, vcc, v3, v1
+; GCN-IEEE-NEXT:    v_ldexp_f32_e32 v1, v2, v1
+; GCN-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-DAZ-LABEL: v_sqrt_v2f32_ulp2_contractable_fdiv:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    v_sqrt_f32_e32 v1, v1
+; GCN-DAZ-NEXT:    s_mov_b32 s4, 0x6f800000
+; GCN-DAZ-NEXT:    v_mov_b32_e32 v4, 0x2f800000
+; GCN-DAZ-NEXT:    v_cmp_gt_f32_e64 vcc, |v0|, s4
+; GCN-DAZ-NEXT:    v_cndmask_b32_e32 v5, 1.0, v4, vcc
+; GCN-DAZ-NEXT:    v_cmp_gt_f32_e64 vcc, |v1|, s4
+; GCN-DAZ-NEXT:    v_cndmask_b32_e32 v4, 1.0, v4, vcc
+; GCN-DAZ-NEXT:    v_mul_f32_e32 v0, v0, v5
+; GCN-DAZ-NEXT:    v_mul_f32_e32 v1, v1, v4
+; GCN-DAZ-NEXT:    v_rcp_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    v_rcp_f32_e32 v1, v1
+; GCN-DAZ-NEXT:    v_mul_f32_e32 v0, v2, v0
+; GCN-DAZ-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GCN-DAZ-NEXT:    v_mul_f32_e32 v0, v5, v0
+; GCN-DAZ-NEXT:    v_mul_f32_e32 v1, v4, v1
+; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %x), !fpmath !4
+  %result = fdiv contract <2 x float> %y, %sqrt, !fpmath !3
+  ret <2 x float> %result
+}
+
+define <2 x float> @v_sqrt_v2f32_ulp2_contractable_fdiv_arcp(<2 x float> %x, <2 x float> %y) {
+; GCN-IEEE-LABEL: v_sqrt_v2f32_ulp2_contractable_fdiv_arcp:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-IEEE-NEXT:    s_mov_b32 s4, 0x7f800000
+; GCN-IEEE-NEXT:    v_sqrt_f32_e32 v1, v1
+; GCN-IEEE-NEXT:    v_frexp_mant_f32_e32 v4, v0
+; GCN-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; GCN-IEEE-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc
+; GCN-IEEE-NEXT:    v_rcp_f32_e32 v4, v4
+; GCN-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; GCN-IEEE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
+; GCN-IEEE-NEXT:    v_ldexp_f32_e32 v0, v4, v0
+; GCN-IEEE-NEXT:    v_mul_f32_e32 v0, v2, v0
+; GCN-IEEE-NEXT:    v_frexp_mant_f32_e32 v2, v1
+; GCN-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
+; GCN-IEEE-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
+; GCN-IEEE-NEXT:    v_rcp_f32_e32 v2, v2
+; GCN-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
+; GCN-IEEE-NEXT:    v_sub_i32_e32 v1, vcc, 0, v1
+; GCN-IEEE-NEXT:    v_ldexp_f32_e32 v1, v2, v1
+; GCN-IEEE-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GCN-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; SDAG-DAZ-LABEL: v_sqrt_v2f32_ulp2_contractable_fdiv_arcp:
+; SDAG-DAZ:       ; %bb.0:
+; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
+; SDAG-DAZ-NEXT:    v_sqrt_f32_e32 v1, v1
+; SDAG-DAZ-NEXT:    v_rcp_f32_e32 v0, v0
+; SDAG-DAZ-NEXT:    v_rcp_f32_e32 v1, v1
+; SDAG-DAZ-NEXT:    v_mul_f32_e32 v0, v2, v0
+; SDAG-DAZ-NEXT:    v_mul_f32_e32 v1, v3, v1
+; SDAG-DAZ-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-DAZ-LABEL: v_sqrt_v2f32_ulp2_contractable_fdiv_arcp:
+; GISEL-DAZ:       ; %bb.0:
+; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-DAZ-NEXT:    v_rsq_f32_e32 v0, v0
+; GISEL-DAZ-NEXT:    v_rsq_f32_e32 v1, v1
+; GISEL-DAZ-NEXT:    v_mul_f32_e32 v0, v2, v0
+; GISEL-DAZ-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GISEL-DAZ-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %x), !fpmath !4
+  %result = fdiv arcp contract <2 x float> %y, %sqrt, !fpmath !3
+  ret <2 x float> %result
+}
+
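+; The s_sqrt_* tests take a uniform (inreg) input; the sqrt is still computed
+; in a VGPR, and readfirstlane moves the result back to an SGPR.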
+define amdgpu_ps i32 @s_sqrt_f32_ulp1(float inreg %x) {
+; GCN-LABEL: s_sqrt_f32_ulp1:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_sqrt_f32_e32 v0, s0
+; GCN-NEXT:    v_readfirstlane_b32 s0, v0
+; GCN-NEXT:    ; return to shader part epilog
+  %result = call afn float @llvm.sqrt.f32(float %x), !fpmath !1
+  %cast = bitcast float %result to i32
+  %firstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
+  ret i32 %firstlane
+}
+
+define amdgpu_ps i32 @s_sqrt_f32_ulp2(float inreg %x) {
+; GCN-LABEL: s_sqrt_f32_ulp2:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_sqrt_f32_e32 v0, s0
+; GCN-NEXT:    v_readfirstlane_b32 s0, v0
+; GCN-NEXT:    ; return to shader part epilog
+  %result = call afn float @llvm.sqrt.f32(float %x), !fpmath !2
+  %cast = bitcast float %result to i32
+  %firstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
+  ret i32 %firstlane
+}
+
+define amdgpu_ps i32 @s_sqrt_f32_ulp3(float inreg %x) {
+; GCN-LABEL: s_sqrt_f32_ulp3:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    v_sqrt_f32_e32 v0, s0
+; GCN-NEXT:    v_readfirstlane_b32 s0, v0
+; GCN-NEXT:    ; return to shader part epilog
+  %result = call afn float @llvm.sqrt.f32(float %x), !fpmath !4
+  %cast = bitcast float %result to i32
+  %firstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
+  ret i32 %firstlane
+}
+
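+; If the input is known never to be a (positive) denormal, the denormal
+; scaling in the expansion should be unnecessary and a bare v_sqrt suffices.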
+define float @v_sqrt_f32_known_never_posdenormal_ulp2(float nofpclass(psub) %x) {
+; GCN-LABEL: v_sqrt_f32_known_never_posdenormal_ulp2:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call float @llvm.sqrt.f32(float %x), !fpmath !2
+  ret float %result
+}
+
+define float @v_sqrt_f32_nsz_known_never_posdenormal_ulp2(float nofpclass(psub) %x) {
+; GCN-LABEL: v_sqrt_f32_nsz_known_never_posdenormal_ulp2:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call nsz float @llvm.sqrt.f32(float %x), !fpmath !2
+  ret float %result
+}
+
+define float @v_sqrt_f32_known_never_negdenormal(float nofpclass(nsub) %x) {
+; GCN-LABEL: v_sqrt_f32_known_never_negdenormal:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call float @llvm.sqrt.f32(float %x), !fpmath !2
+  ret float %result
+}
+
+define float @v_sqrt_f32_known_never_denormal(float nofpclass(sub) %x) {
+; GCN-LABEL: v_sqrt_f32_known_never_denormal:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call float @llvm.sqrt.f32(float %x), !fpmath !2
+  ret float %result
+}
+
+define float @v_sqrt_f32_ninf_known_never_zero(float nofpclass(zero) %x) {
+; GCN-LABEL: v_sqrt_f32_ninf_known_never_zero:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call ninf float @llvm.sqrt.f32(float %x)
+  ret float %result
+}
+
+define float @v_sqrt_f32_known_never_zero(float nofpclass(zero) %x) {
+; GCN-LABEL: v_sqrt_f32_known_never_zero:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call float @llvm.sqrt.f32(float %x)
+  ret float %result
+}
+
+define float @v_sqrt_f32_known_never_zero_never_inf(float nofpclass(zero inf) %x) {
+; GCN-LABEL: v_sqrt_f32_known_never_zero_never_inf:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call float @llvm.sqrt.f32(float %x)
+  ret float %result
+}
+
+define float @v_sqrt_f32_known_never_zero_never_ninf(float nofpclass(zero ninf) %x) {
+; GCN-LABEL: v_sqrt_f32_known_never_zero_never_ninf:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call float @llvm.sqrt.f32(float %x)
+  ret float %result
+}
+
+define float @v_sqrt_f32_known_never_zero_never_pinf(float nofpclass(zero pinf) %x) {
+; GCN-LABEL: v_sqrt_f32_known_never_zero_never_pinf:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call float @llvm.sqrt.f32(float %x)
+  ret float %result
+}
+
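+; The mantissa returned by frexp lies in [0.5, 1.0) for finite nonzero
+; inputs, so taking its sqrt should not need the denormal scaling sequence.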
+define float @v_sqrt_f32_frexp_src(float %x) {
+; SDAG-LABEL: v_sqrt_f32_frexp_src:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0x7f800000
+; SDAG-NEXT:    v_frexp_mant_f32_e32 v1, v0
+; SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SDAG-NEXT:    v_sqrt_f32_e32 v0, v0
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f32_frexp_src:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
+; GISEL-NEXT:    v_frexp_mant_f32_e32 v1, v0
+; GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GISEL-NEXT:    v_sqrt_f32_e32 v0, v0
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %frexp = call { float, i32 } @llvm.frexp.f32.i32(float %x)
+  %frexp.mant = extractvalue { float, i32 } %frexp, 0
+  %result = call float @llvm.sqrt.f32(float %frexp.mant)
+  ret float %result
+}
+
+define float @v_sqrt_f32_ulp3_frexp_src(float %x) {
+; SDAG-LABEL: v_sqrt_f32_ulp3_frexp_src:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    s_mov_b32 s4, 0x7f800000
+; SDAG-NEXT:    v_frexp_mant_f32_e32 v1, v0
+; SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SDAG-NEXT:    v_sqrt_f32_e32 v0, v0
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f32_ulp3_frexp_src:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
+; GISEL-NEXT:    v_frexp_mant_f32_e32 v1, v0
+; GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GISEL-NEXT:    v_sqrt_f32_e32 v0, v0
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %frexp = call { float, i32 } @llvm.frexp.f32.i32(float %x)
+  %frexp.mant = extractvalue { float, i32 } %frexp, 0
+  %result = call float @llvm.sqrt.f32(float %frexp.mant), !fpmath !4
+  ret float %result
+}
+
+define float @v_sqrt_f32_known_never_zero_never_ninf_ulp2(float nofpclass(zero ninf) %x) {
+; GCN-LABEL: v_sqrt_f32_known_never_zero_never_ninf_ulp2:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call float @llvm.sqrt.f32(float %x), !fpmath !2
+  ret float %result
+}
+
+define float @v_sqrt_f32_known_never_ninf_ulp2(float nofpclass(ninf) %x) {
+; GCN-LABEL: v_sqrt_f32_known_never_ninf_ulp2:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call float @llvm.sqrt.f32(float %x), !fpmath !2
+  ret float %result
+}
+
+define float @v_sqrt_f32_nsz_known_never_ninf_ulp2(float nofpclass(ninf) %x) {
+; GCN-LABEL: v_sqrt_f32_nsz_known_never_ninf_ulp2:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %result = call nsz float @llvm.sqrt.f32(float %x), !fpmath !2
+  ret float %result
+}
+
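+; sqrt already returns NaN for inputs less than negative zero, so the
+; explicit NaN-guarding compare and select should be redundant; the DAG
+; folds it away, but GlobalISel currently keeps the cndmask.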
+define float @v_elim_redun_check_ult_sqrt(float %in) {
+; SDAG-LABEL: v_elim_redun_check_ult_sqrt:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_sqrt_f32_e32 v0, v0
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_elim_redun_check_ult_sqrt:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_sqrt_f32_e32 v1, v0
+; GISEL-NEXT:    v_bfrev_b32_e32 v2, 1
+; GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GISEL-NEXT:    v_cmp_nge_f32_e32 vcc, v0, v2
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call float @llvm.sqrt.f32(float %in)
+  %cmp = fcmp ult float %in, -0.000000e+00
+  %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt
+  ret float %res
+}
+
+define float @v_elim_redun_check_ult_sqrt_ulp3(float %in) {
+; SDAG-LABEL: v_elim_redun_check_ult_sqrt_ulp3:
+; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT:    v_sqrt_f32_e32 v0, v0
+; SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_elim_redun_check_ult_sqrt_ulp3:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT:    v_sqrt_f32_e32 v1, v0
+; GISEL-NEXT:    v_bfrev_b32_e32 v2, 1
+; GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GISEL-NEXT:    v_cmp_nge_f32_e32 vcc, v0, v2
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call float @llvm.sqrt.f32(float %in), !fpmath !4
+  %cmp = fcmp ult float %in, -0.000000e+00
+  %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt
+  ret float %res
+}
+
+define amdgpu_kernel void @elim_redun_check_neg0(ptr addrspace(1) %out, float %in) {
+; SDAG-LABEL: elim_redun_check_neg0:
+; SDAG:       ; %bb.0: ; %entry
+; SDAG-NEXT:    s_load_dword s2, s[0:1], 0xb
+; SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SDAG-NEXT:    s_mov_b32 s3, 0xf000
+; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SDAG-NEXT:    v_sqrt_f32_e32 v0, s2
+; SDAG-NEXT:    s_mov_b32 s2, -1
+; SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SDAG-NEXT:    s_endpgm
+;
+; GISEL-LABEL: elim_redun_check_neg0:
+; GISEL:       ; %bb.0: ; %entry
+; GISEL-NEXT:    s_load_dword s3, s[0:1], 0xb
+; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GISEL-NEXT:    v_bfrev_b32_e32 v0, 1
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GISEL-NEXT:    s_mov_b32 s2, -1
+; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-NEXT:    v_sqrt_f32_e32 v1, s3
+; GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v0
+; GISEL-NEXT:    s_mov_b32 s3, 0xf000
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GISEL-NEXT:    s_endpgm
+entry:
+  %sqrt = call float @llvm.sqrt.f32(float %in)
+  %cmp = fcmp olt float %in, -0.000000e+00
+  %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt
+  store float %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @elim_redun_check_pos0(ptr addrspace(1) %out, float %in) {
+; SDAG-LABEL: elim_redun_check_pos0:
+; SDAG:       ; %bb.0: ; %entry
+; SDAG-NEXT:    s_load_dword s2, s[0:1], 0xb
+; SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SDAG-NEXT:    s_mov_b32 s3, 0xf000
+; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SDAG-NEXT:    v_sqrt_f32_e32 v0, s2
+; SDAG-NEXT:    s_mov_b32 s2, -1
+; SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SDAG-NEXT:    s_endpgm
+;
+; GISEL-LABEL: elim_redun_check_pos0:
+; GISEL:       ; %bb.0: ; %entry
+; GISEL-NEXT:    s_load_dword s3, s[0:1], 0xb
+; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GISEL-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
+; GISEL-NEXT:    s_mov_b32 s2, -1
+; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-NEXT:    v_sqrt_f32_e32 v0, s3
+; GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, s3, 0
+; GISEL-NEXT:    s_mov_b32 s3, 0xf000
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GISEL-NEXT:    s_endpgm
+entry:
+  %sqrt = call float @llvm.sqrt.f32(float %in)
+  %cmp = fcmp olt float %in, 0.000000e+00
+  %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt
+  store float %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @elim_redun_check_ult(ptr addrspace(1) %out, float %in) {
+; SDAG-LABEL: elim_redun_check_ult:
+; SDAG:       ; %bb.0: ; %entry
+; SDAG-NEXT:    s_load_dword s2, s[0:1], 0xb
+; SDAG-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SDAG-NEXT:    s_mov_b32 s3, 0xf000
+; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SDAG-NEXT:    v_sqrt_f32_e32 v0, s2
+; SDAG-NEXT:    s_mov_b32 s2, -1
+; SDAG-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SDAG-NEXT:    s_endpgm
+;
+; GISEL-LABEL: elim_redun_check_ult:
+; GISEL:       ; %bb.0: ; %entry
+; GISEL-NEXT:    s_load_dword s3, s[0:1], 0xb
+; GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GISEL-NEXT:    v_bfrev_b32_e32 v0, 1
+; GISEL-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GISEL-NEXT:    s_mov_b32 s2, -1
+; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-NEXT:    v_sqrt_f32_e32 v1, s3
+; GISEL-NEXT:    v_cmp_nge_f32_e32 vcc, s3, v0
+; GISEL-NEXT:    s_mov_b32 s3, 0xf000
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GISEL-NEXT:    s_endpgm
+entry:
+  %sqrt = call float @llvm.sqrt.f32(float %in)
+  %cmp = fcmp ult float %in, -0.000000e+00
+  %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt
+  store float %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @elim_redun_check_v2(ptr addrspace(1) %out, <2 x float> %in) {
+; SDAG-LABEL: elim_redun_check_v2:
+; SDAG:       ; %bb.0: ; %entry
+; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SDAG-NEXT:    s_mov_b32 s6, -1
+; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SDAG-NEXT:    v_sqrt_f32_e32 v1, s3
+; SDAG-NEXT:    v_sqrt_f32_e32 v0, s2
+; SDAG-NEXT:    s_mov_b32 s4, s0
+; SDAG-NEXT:    s_mov_b32 s5, s1
+; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SDAG-NEXT:    s_endpgm
+;
+; GISEL-LABEL: elim_redun_check_v2:
+; GISEL:       ; %bb.0: ; %entry
+; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GISEL-NEXT:    s_mov_b32 s4, 0x80000000
+; GISEL-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
+; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-NEXT:    v_sqrt_f32_e32 v2, s2
+; GISEL-NEXT:    v_sqrt_f32_e32 v4, s3
+; GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GISEL-NEXT:    s_mov_b32 s2, -1
+; GISEL-NEXT:    s_mov_b32 s3, 0xf000
+; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GISEL-NEXT:    s_endpgm
+entry:
+  %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
+  %cmp = fcmp olt <2 x float> %in, <float -0.000000e+00, float -0.000000e+00>
+  %res = select <2 x i1> %cmp, <2 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000>, <2 x float> %sqrt
+  store <2 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @elim_redun_check_v2_ult(ptr addrspace(1) %out, <2 x float> %in) {
+; SDAG-LABEL: elim_redun_check_v2_ult:
+; SDAG:       ; %bb.0: ; %entry
+; SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SDAG-NEXT:    s_mov_b32 s7, 0xf000
+; SDAG-NEXT:    s_mov_b32 s6, -1
+; SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; SDAG-NEXT:    v_sqrt_f32_e32 v1, s3
+; SDAG-NEXT:    v_sqrt_f32_e32 v0, s2
+; SDAG-NEXT:    s_mov_b32 s4, s0
+; SDAG-NEXT:    s_mov_b32 s5, s1
+; SDAG-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SDAG-NEXT:    s_endpgm
+;
+; GISEL-LABEL: elim_redun_check_v2_ult:
+; GISEL:       ; %bb.0: ; %entry
+; GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GISEL-NEXT:    s_mov_b32 s4, 0x80000000
+; GISEL-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
+; GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GISEL-NEXT:    v_sqrt_f32_e32 v2, s2
+; GISEL-NEXT:    v_sqrt_f32_e32 v4, s3
+; GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GISEL-NEXT:    v_cmp_nle_f32_e32 vcc, s4, v0
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GISEL-NEXT:    v_cmp_nle_f32_e32 vcc, s4, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GISEL-NEXT:    s_mov_b32 s2, -1
+; GISEL-NEXT:    s_mov_b32 s3, 0xf000
+; GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GISEL-NEXT:    s_endpgm
+entry:
+  %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
+  %cmp = fcmp ult <2 x float> %in, <float -0.000000e+00, float -0.000000e+00>
+  %res = select <2 x i1> %cmp, <2 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000>, <2 x float> %sqrt
+  store <2 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+declare float @llvm.fabs.f32(float) #0
+declare float @llvm.sqrt.f32(float) #0
+declare <2 x float> @llvm.fabs.v2f32(<2 x float>) #0
+declare <2 x float> @llvm.sqrt.v2f32(<2 x float>) #0
+declare <3 x float> @llvm.sqrt.v3f32(<3 x float>) #0
+declare i32 @llvm.amdgcn.readfirstlane(i32) #1
+
+declare { float, i32 } @llvm.frexp.f32.i32(float) #0
+
+attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { convergent nounwind willreturn memory(none) }
+attributes #2 = { "approx-func-fp-math"="true" }
+attributes #3 = { "approx-func-fp-math"="true" "no-nans-fp-math"="true" "no-infs-fp-math"="true" }
+attributes #4 = { "unsafe-fp-math"="true" }
+attributes #5 = { "no-infs-fp-math"="true" }
+
+!0 = !{float 0.5}
+!1 = !{float 1.0}
+!2 = !{float 2.0}
+!3 = !{float 2.5}
+!4 = !{float 3.0}

diff  --git a/llvm/test/CodeGen/AMDGPU/fsqrt.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.ll
deleted file mode 100644
index 356eed7b31a71d..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/fsqrt.ll
+++ /dev/null
@@ -1,153 +0,0 @@
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
-
-
-; Run with unsafe-fp-math to make sure nothing tries to turn this into 1 / rsqrt(x)
-
-; FUNC-LABEL: {{^}}v_safe_fsqrt_f32:
-; GCN: v_sqrt_f32_e32 {{v[0-9]+, v[0-9]+}}
-define amdgpu_kernel void @v_safe_fsqrt_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
-  %r0 = load float, ptr addrspace(1) %in
-  %r1 = call float @llvm.sqrt.f32(float %r0)
-  store float %r1, ptr addrspace(1) %out
-  ret void
-}
-
-; FUNC-LABEL: {{^}}v_unsafe_fsqrt_f32:
-; GCN: v_sqrt_f32_e32 {{v[0-9]+, v[0-9]+}}
-define amdgpu_kernel void @v_unsafe_fsqrt_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #2 {
-  %r0 = load float, ptr addrspace(1) %in
-  %r1 = call float @llvm.sqrt.f32(float %r0)
-  store float %r1, ptr addrspace(1) %out
-  ret void
-}
-
-
-; FUNC-LABEL: {{^}}s_sqrt_f32:
-; GCN: v_sqrt_f32_e32
-
-; R600: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[2].Z
-; R600: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, PS
-define amdgpu_kernel void @s_sqrt_f32(ptr addrspace(1) %out, float %in) #1 {
-entry:
-  %fdiv = call float @llvm.sqrt.f32(float %in)
-  store float %fdiv, ptr addrspace(1) %out
-  ret void
-}
-
-; FUNC-LABEL: {{^}}s_sqrt_v2f32:
-; GCN: v_sqrt_f32_e32
-; GCN: v_sqrt_f32_e32
-
-; R600-DAG: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[2].W
-; R600-DAG: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, PS
-; R600-DAG: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[3].X
-; R600-DAG: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}
-define amdgpu_kernel void @s_sqrt_v2f32(ptr addrspace(1) %out, <2 x float> %in) #1 {
-entry:
-  %fdiv = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
-  store <2 x float> %fdiv, ptr addrspace(1) %out
-  ret void
-}
-
-; FUNC-LABEL: {{^}}s_sqrt_v4f32:
-; GCN: v_sqrt_f32_e32
-; GCN: v_sqrt_f32_e32
-; GCN: v_sqrt_f32_e32
-; GCN: v_sqrt_f32_e32
-
-; R600-DAG: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[3].Y
-; R600-DAG: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, PS
-; R600-DAG: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[3].Z
-; R600-DAG: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}
-; R600-DAG: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[3].W
-; R600-DAG: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}
-; R600-DAG: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[4].X
-; R600-DAG: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}
-define amdgpu_kernel void @s_sqrt_v4f32(ptr addrspace(1) %out, <4 x float> %in) #1 {
-entry:
-  %fdiv = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %in)
-  store <4 x float> %fdiv, ptr addrspace(1) %out
-  ret void
-}
-
-; FUNC-LABEL: {{^}}elim_redun_check_neg0:
-; GCN: v_sqrt_f32_e32
-; GCN-NOT: v_cndmask
-define amdgpu_kernel void @elim_redun_check_neg0(ptr addrspace(1) %out, float %in) #1 {
-entry:
-  %sqrt = call float @llvm.sqrt.f32(float %in)
-  %cmp = fcmp olt float %in, -0.000000e+00
-  %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt
-  store float %res, ptr addrspace(1) %out
-  ret void
-}
-
-; FUNC-LABEL: {{^}}elim_redun_check_pos0:
-; GCN: v_sqrt_f32_e32
-; GCN-NOT: v_cndmask
-define amdgpu_kernel void @elim_redun_check_pos0(ptr addrspace(1) %out, float %in) #1 {
-entry:
-  %sqrt = call float @llvm.sqrt.f32(float %in)
-  %cmp = fcmp olt float %in, 0.000000e+00
-  %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt
-  store float %res, ptr addrspace(1) %out
-  ret void
-}
-
-; FUNC-LABEL: {{^}}elim_redun_check_ult:
-; GCN: v_sqrt_f32_e32
-; GCN-NOT: v_cndmask
-define amdgpu_kernel void @elim_redun_check_ult(ptr addrspace(1) %out, float %in) #1 {
-entry:
-  %sqrt = call float @llvm.sqrt.f32(float %in)
-  %cmp = fcmp ult float %in, -0.000000e+00
-  %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt
-  store float %res, ptr addrspace(1) %out
-  ret void
-}
-
-; FUNC-LABEL: {{^}}elim_redun_check_v2:
-; GCN: v_sqrt_f32_e32
-; GCN: v_sqrt_f32_e32
-; GCN-NOT: v_cndmask
-define amdgpu_kernel void @elim_redun_check_v2(ptr addrspace(1) %out, <2 x float> %in) #1 {
-entry:
-  %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
-  %cmp = fcmp olt <2 x float> %in, <float -0.000000e+00, float -0.000000e+00>
-  %res = select <2 x i1> %cmp, <2 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000>, <2 x float> %sqrt
-  store <2 x float> %res, ptr addrspace(1) %out
-  ret void
-}
-
-; FUNC-LABEL: {{^}}elim_redun_check_v2_ult
-; GCN: v_sqrt_f32_e32
-; GCN: v_sqrt_f32_e32
-; GCN-NOT: v_cndmask
-define amdgpu_kernel void @elim_redun_check_v2_ult(ptr addrspace(1) %out, <2 x float> %in) #1 {
-entry:
-  %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
-  %cmp = fcmp ult <2 x float> %in, <float -0.000000e+00, float -0.000000e+00>
-  %res = select <2 x i1> %cmp, <2 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000>, <2 x float> %sqrt
-  store <2 x float> %res, ptr addrspace(1) %out
-  ret void
-}
-
-; FUNC-LABEL: {{^}}recip_sqrt:
-; R600: RECIPSQRT_IEEE
-; R600-NOT: RECIP_IEEE
-define amdgpu_kernel void @recip_sqrt(ptr addrspace(1) %out, float %src) nounwind {
-  %sqrt = call float @llvm.sqrt.f32(float %src)
-  %recipsqrt = fdiv fast float 1.0, %sqrt
-  store float %recipsqrt, ptr addrspace(1) %out, align 4
-  ret void
-}
-
-declare float @llvm.sqrt.f32(float %in) #0
-declare <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) #0
-declare <4 x float> @llvm.sqrt.v4f32(<4 x float> %in) #0
-
-attributes #0 = { nounwind readnone }
-attributes #1 = { nounwind "unsafe-fp-math"="false" }
-attributes #2 = { nounwind "unsafe-fp-math"="true" }

diff  --git a/llvm/test/CodeGen/AMDGPU/fsqrt.r600.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.r600.ll
new file mode 100644
index 00000000000000..05a758c1a0fbb6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fsqrt.r600.ll
@@ -0,0 +1,242 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 %s
+
+; v_unsafe_fsqrt_f32 enables unsafe-fp-math (via a function attribute) to
+; make sure nothing tries to turn this into 1 / rsqrt(x).
+
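+; On redwood, sqrt(x) is lowered to 1 / rsqrt(x), i.e. RECIPSQRT_IEEE
+; followed by RECIP_IEEE; the safe and unsafe variants produce the same
+; sequence.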
+define amdgpu_kernel void @v_safe_fsqrt_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; R600-LABEL: v_safe_fsqrt_f32:
+; R600:       ; %bb.0:
+; R600-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
+; R600-NEXT:    TEX 0 @6
+; R600-NEXT:    ALU 3, @9, KC0[CB0:0-32], KC1[]
+; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; R600-NEXT:    CF_END
+; R600-NEXT:    PAD
+; R600-NEXT:    Fetch clause starting at 6:
+; R600-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
+; R600-NEXT:    ALU clause starting at 8:
+; R600-NEXT:     MOV * T0.X, KC0[2].Z,
+; R600-NEXT:    ALU clause starting at 9:
+; R600-NEXT:     RECIPSQRT_IEEE * T0.X, T0.X,
+; R600-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
+; R600-NEXT:     RECIP_IEEE * T0.X, PS,
+; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+  %r0 = load float, ptr addrspace(1) %in
+  %r1 = call float @llvm.sqrt.f32(float %r0)
+  store float %r1, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v_unsafe_fsqrt_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; R600-LABEL: v_unsafe_fsqrt_f32:
+; R600:       ; %bb.0:
+; R600-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
+; R600-NEXT:    TEX 0 @6
+; R600-NEXT:    ALU 3, @9, KC0[CB0:0-32], KC1[]
+; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; R600-NEXT:    CF_END
+; R600-NEXT:    PAD
+; R600-NEXT:    Fetch clause starting at 6:
+; R600-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
+; R600-NEXT:    ALU clause starting at 8:
+; R600-NEXT:     MOV * T0.X, KC0[2].Z,
+; R600-NEXT:    ALU clause starting at 9:
+; R600-NEXT:     RECIPSQRT_IEEE * T0.X, T0.X,
+; R600-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
+; R600-NEXT:     RECIP_IEEE * T0.X, PS,
+; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+  %r0 = load float, ptr addrspace(1) %in
+  %r1 = call float @llvm.sqrt.f32(float %r0)
+  store float %r1, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @s_sqrt_f32(ptr addrspace(1) %out, float %in) {
+; R600-LABEL: s_sqrt_f32:
+; R600:       ; %bb.0: ; %entry
+; R600-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; R600-NEXT:    CF_END
+; R600-NEXT:    PAD
+; R600-NEXT:    ALU clause starting at 4:
+; R600-NEXT:     RECIPSQRT_IEEE * T0.X, KC0[2].Z,
+; R600-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
+; R600-NEXT:     RECIP_IEEE * T0.X, PS,
+; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+entry:
+  %fdiv = call float @llvm.sqrt.f32(float %in)
+  store float %fdiv, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @s_sqrt_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
+; R600-LABEL: s_sqrt_v2f32:
+; R600:       ; %bb.0: ; %entry
+; R600-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; R600-NEXT:    CF_END
+; R600-NEXT:    PAD
+; R600-NEXT:    ALU clause starting at 4:
+; R600-NEXT:     RECIPSQRT_IEEE * T0.X, KC0[2].W,
+; R600-NEXT:     RECIPSQRT_IEEE * T0.Y, KC0[3].X,
+; R600-NEXT:     RECIP_IEEE * T0.Y, PS,
+; R600-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
+; R600-NEXT:     RECIP_IEEE * T0.X, T0.X,
+; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+entry:
+  %fdiv = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
+  store <2 x float> %fdiv, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @s_sqrt_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
+; R600-LABEL: s_sqrt_v4f32:
+; R600:       ; %bb.0: ; %entry
+; R600-NEXT:    ALU 9, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
+; R600-NEXT:    CF_END
+; R600-NEXT:    PAD
+; R600-NEXT:    ALU clause starting at 4:
+; R600-NEXT:     RECIPSQRT_IEEE * T0.X, KC0[3].Y,
+; R600-NEXT:     RECIPSQRT_IEEE * T0.Y, KC0[3].Z,
+; R600-NEXT:     RECIPSQRT_IEEE * T0.Z, KC0[3].W,
+; R600-NEXT:     RECIPSQRT_IEEE * T0.W, KC0[4].X,
+; R600-NEXT:     RECIP_IEEE * T0.W, PS,
+; R600-NEXT:     RECIP_IEEE * T0.Z, T0.Z,
+; R600-NEXT:     RECIP_IEEE * T0.Y, T0.Y,
+; R600-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
+; R600-NEXT:     RECIP_IEEE * T0.X, T0.X,
+; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+entry:
+  %fdiv = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %in)
+  store <4 x float> %fdiv, ptr addrspace(1) %out
+  ret void
+}
+
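+; As on GCN, the NaN-guarding select on a negative input should fold away:
+; sqrt already produces NaN below negative zero, so only the sqrt lowering
+; remains.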
+define amdgpu_kernel void @elim_redun_check_neg0(ptr addrspace(1) %out, float %in) {
+; R600-LABEL: elim_redun_check_neg0:
+; R600:       ; %bb.0: ; %entry
+; R600-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; R600-NEXT:    CF_END
+; R600-NEXT:    PAD
+; R600-NEXT:    ALU clause starting at 4:
+; R600-NEXT:     RECIPSQRT_IEEE * T0.X, KC0[2].Z,
+; R600-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
+; R600-NEXT:     RECIP_IEEE * T0.X, PS,
+; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+entry:
+  %sqrt = call float @llvm.sqrt.f32(float %in)
+  %cmp = fcmp olt float %in, -0.000000e+00
+  %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt
+  store float %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @elim_redun_check_pos0(ptr addrspace(1) %out, float %in) {
+; R600-LABEL: elim_redun_check_pos0:
+; R600:       ; %bb.0: ; %entry
+; R600-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; R600-NEXT:    CF_END
+; R600-NEXT:    PAD
+; R600-NEXT:    ALU clause starting at 4:
+; R600-NEXT:     RECIPSQRT_IEEE * T0.X, KC0[2].Z,
+; R600-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
+; R600-NEXT:     RECIP_IEEE * T0.X, PS,
+; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+entry:
+  %sqrt = call float @llvm.sqrt.f32(float %in)
+  %cmp = fcmp olt float %in, 0.000000e+00
+  %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt
+  store float %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @elim_redun_check_ult(ptr addrspace(1) %out, float %in) {
+; R600-LABEL: elim_redun_check_ult:
+; R600:       ; %bb.0: ; %entry
+; R600-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; R600-NEXT:    CF_END
+; R600-NEXT:    PAD
+; R600-NEXT:    ALU clause starting at 4:
+; R600-NEXT:     RECIPSQRT_IEEE * T0.X, KC0[2].Z,
+; R600-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
+; R600-NEXT:     RECIP_IEEE * T0.X, PS,
+; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+entry:
+  %sqrt = call float @llvm.sqrt.f32(float %in)
+  %cmp = fcmp ult float %in, -0.000000e+00
+  %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt
+  store float %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @elim_redun_check_v2(ptr addrspace(1) %out, <2 x float> %in) {
+; R600-LABEL: elim_redun_check_v2:
+; R600:       ; %bb.0: ; %entry
+; R600-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; R600-NEXT:    CF_END
+; R600-NEXT:    PAD
+; R600-NEXT:    ALU clause starting at 4:
+; R600-NEXT:     RECIPSQRT_IEEE * T0.X, KC0[2].W,
+; R600-NEXT:     RECIPSQRT_IEEE * T0.Y, KC0[3].X,
+; R600-NEXT:     RECIP_IEEE * T0.Y, PS,
+; R600-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
+; R600-NEXT:     RECIP_IEEE * T0.X, T0.X,
+; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+entry:
+  %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
+  %cmp = fcmp olt <2 x float> %in, <float -0.000000e+00, float -0.000000e+00>
+  %res = select <2 x i1> %cmp, <2 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000>, <2 x float> %sqrt
+  store <2 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @elim_redun_check_v2_ult(ptr addrspace(1) %out, <2 x float> %in) {
+; R600-LABEL: elim_redun_check_v2_ult:
+; R600:       ; %bb.0: ; %entry
+; R600-NEXT:    ALU 5, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; R600-NEXT:    CF_END
+; R600-NEXT:    PAD
+; R600-NEXT:    ALU clause starting at 4:
+; R600-NEXT:     RECIPSQRT_IEEE * T0.X, KC0[2].W,
+; R600-NEXT:     RECIPSQRT_IEEE * T0.Y, KC0[3].X,
+; R600-NEXT:     RECIP_IEEE * T0.Y, PS,
+; R600-NEXT:     LSHR T1.X, KC0[2].Y, literal.x,
+; R600-NEXT:     RECIP_IEEE * T0.X, T0.X,
+; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+entry:
+  %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
+  %cmp = fcmp ult <2 x float> %in, <float -0.000000e+00, float -0.000000e+00>
+  %res = select <2 x i1> %cmp, <2 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000>, <2 x float> %sqrt
+  store <2 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
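+; 1.0 / sqrt(x) with fast math should collapse to a single RECIPSQRT_IEEE
+; with no trailing RECIP_IEEE.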
+define amdgpu_kernel void @recip_sqrt(ptr addrspace(1) %out, float %src) nounwind {
+; R600-LABEL: recip_sqrt:
+; R600:       ; %bb.0:
+; R600-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; R600-NEXT:    CF_END
+; R600-NEXT:    PAD
+; R600-NEXT:    ALU clause starting at 4:
+; R600-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
+; R600-NEXT:     RECIPSQRT_IEEE * T1.X, KC0[2].Z,
+; R600-NEXT:    2(2.802597e-45), 0(0.000000e+00)
+  %sqrt = call float @llvm.sqrt.f32(float %src)
+  %recipsqrt = fdiv fast float 1.0, %sqrt
+  store float %recipsqrt, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+declare float @llvm.sqrt.f32(float %in) #0
+declare <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) #0
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float> %in) #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind "unsafe-fp-math"="true" }
