[llvm] 8ce75ac - AMDGPU: Expand and modernize llvm.sqrt.f32 tests
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 23 17:41:49 PDT 2023
Author: Matt Arsenault
Date: 2023-08-23T20:39:18-04:00
New Revision: 8ce75acd1a5d2affd36697220d595d62adf0df55
URL: https://github.com/llvm/llvm-project/commit/8ce75acd1a5d2affd36697220d595d62adf0df55
DIFF: https://github.com/llvm/llvm-project/commit/8ce75acd1a5d2affd36697220d595d62adf0df55.diff
LOG: AMDGPU: Expand and modernize llvm.sqrt.f32 tests
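
These tests exercise the !fpmath metadata on llvm.sqrt.f32, which bounds the
maximum acceptable error of the result in ULPs. A minimal sketch of the
annotation (with a hypothetical 2.0 ulp bound):

  %r = call float @llvm.sqrt.f32(float %x), !fpmath !0
  !0 = !{float 2.000000e+00}

The new tests sweep this bound from 0.5 through 3.0 ulps, crossed with f32
denormal modes and nofpclass assumptions on the input.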
Added:
llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-sqrt.ll
llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
llvm/test/CodeGen/AMDGPU/fsqrt.r600.ll
Modified:
Removed:
llvm/test/CodeGen/AMDGPU/fsqrt.ll
################################################################################
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-sqrt.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-sqrt.ll
new file mode 100644
index 00000000000000..a34e447d202bdd
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-sqrt.ll
@@ -0,0 +1,383 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt -S -mtriple=amdgcn-- -mcpu=hawaii -passes=amdgpu-codegenprepare -denormal-fp-math-f32=ieee %s | FileCheck -check-prefixes=CHECK,IEEE %s
+; RUN: opt -S -mtriple=amdgcn-- -mcpu=hawaii -passes=amdgpu-codegenprepare -denormal-fp-math-f32=dynamic %s | FileCheck -check-prefixes=CHECK,IEEE %s
+; RUN: opt -S -mtriple=amdgcn-- -mcpu=hawaii -passes=amdgpu-codegenprepare -denormal-fp-math-f32=preserve-sign %s | FileCheck -check-prefixes=CHECK,DAZ %s
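+; Note: the dynamic denormal mode shares the IEEE check prefixes, since a
+; dynamically selected mode cannot be assumed to flush denormals.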
+
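+; optnone (attribute #0) should keep the pass from touching the call at all.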
+define amdgpu_kernel void @noop_sqrt_fpmath(ptr addrspace(1) %out, float %x) #0 {
+; CHECK-LABEL: define amdgpu_kernel void @noop_sqrt_fpmath
+; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[MD_25ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !0
+; CHECK-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
+ %md.25ulp = call float @llvm.sqrt.f32(float %x), !fpmath !3
+ store volatile float %md.25ulp, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @sqrt_fpmath_f32(ptr addrspace(1) %out, float %x) {
+; CHECK-LABEL: define amdgpu_kernel void @sqrt_fpmath_f32
+; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
+; CHECK-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; CHECK-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2
+; CHECK-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_25ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !3
+; CHECK-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_3ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !0
+; CHECK-NEXT: store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_2ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !4
+; CHECK-NEXT: store volatile float [[MD_2ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
+ %no.md = call float @llvm.sqrt.f32(float %x)
+ store volatile float %no.md, ptr addrspace(1) %out, align 4
+
+ %md.half.ulp = call float @llvm.sqrt.f32(float %x), !fpmath !1
+ store volatile float %md.half.ulp, ptr addrspace(1) %out, align 4
+
+ %md.1ulp = call float @llvm.sqrt.f32(float %x), !fpmath !2
+ store volatile float %md.1ulp, ptr addrspace(1) %out, align 4
+
+ %md.25ulp = call float @llvm.sqrt.f32(float %x), !fpmath !0
+ store volatile float %md.25ulp, ptr addrspace(1) %out, align 4
+
+ %md.3ulp = call float @llvm.sqrt.f32(float %x), !fpmath !3
+ store volatile float %md.3ulp, ptr addrspace(1) %out, align 4
+
+ %md.2ulp = call float @llvm.sqrt.f32(float %x), !fpmath !4
+ store volatile float %md.2ulp, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @sqrt_fpmath_v2f32(ptr addrspace(1) %out, <2 x float> %x) {
+; CHECK-LABEL: define amdgpu_kernel void @sqrt_fpmath_v2f32
+; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], <2 x float> [[X:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[NO_MD:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]])
+; CHECK-NEXT: store volatile <2 x float> [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_HALF_ULP:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !1
+; CHECK-NEXT: store volatile <2 x float> [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_1ULP:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !2
+; CHECK-NEXT: store volatile <2 x float> [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_25ULP:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !3
+; CHECK-NEXT: store volatile <2 x float> [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_3ULP:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !0
+; CHECK-NEXT: store volatile <2 x float> [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_2ULP:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[X]]), !fpmath !4
+; CHECK-NEXT: store volatile <2 x float> [[MD_2ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
+ %no.md = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %x)
+ store volatile <2 x float> %no.md, ptr addrspace(1) %out, align 4
+
+ %md.half.ulp = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %x), !fpmath !1
+ store volatile <2 x float> %md.half.ulp, ptr addrspace(1) %out, align 4
+
+ %md.1ulp = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %x), !fpmath !2
+ store volatile <2 x float> %md.1ulp, ptr addrspace(1) %out, align 4
+
+ %md.25ulp = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %x), !fpmath !0
+ store volatile <2 x float> %md.25ulp, ptr addrspace(1) %out, align 4
+
+ %md.3ulp = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %x), !fpmath !3
+ store volatile <2 x float> %md.3ulp, ptr addrspace(1) %out, align 4
+
+ %md.2ulp = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %x), !fpmath !4
+ store volatile <2 x float> %md.2ulp, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @sqrt_fpmath_f32_known_nosub(ptr addrspace(1) %out, float nofpclass(sub) %x) {
+; CHECK-LABEL: define amdgpu_kernel void @sqrt_fpmath_f32_known_nosub
+; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(sub) [[X:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
+; CHECK-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; CHECK-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2
+; CHECK-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_25ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !3
+; CHECK-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_3ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !0
+; CHECK-NEXT: store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_2ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !4
+; CHECK-NEXT: store volatile float [[MD_2ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
+ %no.md = call float @llvm.sqrt.f32(float %x)
+ store volatile float %no.md, ptr addrspace(1) %out, align 4
+
+ %md.half.ulp = call float @llvm.sqrt.f32(float %x), !fpmath !1
+ store volatile float %md.half.ulp, ptr addrspace(1) %out, align 4
+
+ %md.1ulp = call float @llvm.sqrt.f32(float %x), !fpmath !2
+ store volatile float %md.1ulp, ptr addrspace(1) %out, align 4
+
+ %md.25ulp = call float @llvm.sqrt.f32(float %x), !fpmath !0
+ store volatile float %md.25ulp, ptr addrspace(1) %out, align 4
+
+ %md.3ulp = call float @llvm.sqrt.f32(float %x), !fpmath !3
+ store volatile float %md.3ulp, ptr addrspace(1) %out, align 4
+
+ %md.2ulp = call float @llvm.sqrt.f32(float %x), !fpmath !4
+ store volatile float %md.2ulp, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero(ptr addrspace(1) %out, float nofpclass(nzero) %x) {
+; CHECK-LABEL: define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero
+; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(nzero) [[X:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
+; CHECK-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; CHECK-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2
+; CHECK-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_25ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !3
+; CHECK-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_3ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !0
+; CHECK-NEXT: store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_2ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !4
+; CHECK-NEXT: store volatile float [[MD_2ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
+ %no.md = call float @llvm.sqrt.f32(float %x)
+ store volatile float %no.md, ptr addrspace(1) %out, align 4
+
+ %md.half.ulp = call float @llvm.sqrt.f32(float %x), !fpmath !1
+ store volatile float %md.half.ulp, ptr addrspace(1) %out, align 4
+
+ %md.1ulp = call float @llvm.sqrt.f32(float %x), !fpmath !2
+ store volatile float %md.1ulp, ptr addrspace(1) %out, align 4
+
+ %md.25ulp = call float @llvm.sqrt.f32(float %x), !fpmath !0
+ store volatile float %md.25ulp, ptr addrspace(1) %out, align 4
+
+ %md.3ulp = call float @llvm.sqrt.f32(float %x), !fpmath !3
+ store volatile float %md.3ulp, ptr addrspace(1) %out, align 4
+
+ %md.2ulp = call float @llvm.sqrt.f32(float %x), !fpmath !4
+ store volatile float %md.2ulp, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero_nonsub(ptr addrspace(1) %out, float nofpclass(nzero nsub) %x) {
+; CHECK-LABEL: define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero_nonsub
+; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(nzero nsub) [[X:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
+; CHECK-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; CHECK-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2
+; CHECK-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_25ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !3
+; CHECK-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_3ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !0
+; CHECK-NEXT: store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_2ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !4
+; CHECK-NEXT: store volatile float [[MD_2ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
+ %no.md = call float @llvm.sqrt.f32(float %x)
+ store volatile float %no.md, ptr addrspace(1) %out, align 4
+
+ %md.half.ulp = call float @llvm.sqrt.f32(float %x), !fpmath !1
+ store volatile float %md.half.ulp, ptr addrspace(1) %out, align 4
+
+ %md.1ulp = call float @llvm.sqrt.f32(float %x), !fpmath !2
+ store volatile float %md.1ulp, ptr addrspace(1) %out, align 4
+
+ %md.25ulp = call float @llvm.sqrt.f32(float %x), !fpmath !0
+ store volatile float %md.25ulp, ptr addrspace(1) %out, align 4
+
+ %md.3ulp = call float @llvm.sqrt.f32(float %x), !fpmath !3
+ store volatile float %md.3ulp, ptr addrspace(1) %out, align 4
+
+ %md.2ulp = call float @llvm.sqrt.f32(float %x), !fpmath !4
+ store volatile float %md.2ulp, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero_nonsub_noinf(ptr addrspace(1) %out, float nofpclass(nzero nsub inf) %x) {
+; CHECK-LABEL: define amdgpu_kernel void @sqrt_fpmath_f32_known_nonzero_nonsub_noinf
+; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(inf nzero nsub) [[X:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
+; CHECK-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; CHECK-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2
+; CHECK-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_25ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !3
+; CHECK-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_3ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !0
+; CHECK-NEXT: store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_2ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !4
+; CHECK-NEXT: store volatile float [[MD_2ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
+ %no.md = call float @llvm.sqrt.f32(float %x)
+ store volatile float %no.md, ptr addrspace(1) %out, align 4
+
+ %md.half.ulp = call float @llvm.sqrt.f32(float %x), !fpmath !1
+ store volatile float %md.half.ulp, ptr addrspace(1) %out, align 4
+
+ %md.1ulp = call float @llvm.sqrt.f32(float %x), !fpmath !2
+ store volatile float %md.1ulp, ptr addrspace(1) %out, align 4
+
+ %md.25ulp = call float @llvm.sqrt.f32(float %x), !fpmath !0
+ store volatile float %md.25ulp, ptr addrspace(1) %out, align 4
+
+ %md.3ulp = call float @llvm.sqrt.f32(float %x), !fpmath !3
+ store volatile float %md.3ulp, ptr addrspace(1) %out, align 4
+
+ %md.2ulp = call float @llvm.sqrt.f32(float %x), !fpmath !4
+ store volatile float %md.2ulp, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @sqrt_fpmath_f32_known_nopsub(ptr addrspace(1) %out, float nofpclass(psub) %x) {
+; CHECK-LABEL: define amdgpu_kernel void @sqrt_fpmath_f32_known_nopsub
+; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], float nofpclass(psub) [[X:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
+; CHECK-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; CHECK-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2
+; CHECK-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_25ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !3
+; CHECK-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_3ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !0
+; CHECK-NEXT: store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_2ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !4
+; CHECK-NEXT: store volatile float [[MD_2ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
+ %no.md = call float @llvm.sqrt.f32(float %x)
+ store volatile float %no.md, ptr addrspace(1) %out, align 4
+
+ %md.half.ulp = call float @llvm.sqrt.f32(float %x), !fpmath !1
+ store volatile float %md.half.ulp, ptr addrspace(1) %out, align 4
+
+ %md.1ulp = call float @llvm.sqrt.f32(float %x), !fpmath !2
+ store volatile float %md.1ulp, ptr addrspace(1) %out, align 4
+
+ %md.25ulp = call float @llvm.sqrt.f32(float %x), !fpmath !0
+ store volatile float %md.25ulp, ptr addrspace(1) %out, align 4
+
+ %md.3ulp = call float @llvm.sqrt.f32(float %x), !fpmath !3
+ store volatile float %md.3ulp, ptr addrspace(1) %out, align 4
+
+ %md.2ulp = call float @llvm.sqrt.f32(float %x), !fpmath !4
+ store volatile float %md.2ulp, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @sqrt_fpmath_f32_afn(ptr addrspace(1) %out, float %x) {
+; CHECK-LABEL: define amdgpu_kernel void @sqrt_fpmath_f32_afn
+; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[NO_MD:%.*]] = call afn float @llvm.sqrt.f32(float [[X]])
+; CHECK-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_HALF_ULP:%.*]] = call afn float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; CHECK-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_1ULP:%.*]] = call afn float @llvm.sqrt.f32(float [[X]]), !fpmath !2
+; CHECK-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_25ULP:%.*]] = call afn float @llvm.sqrt.f32(float [[X]]), !fpmath !3
+; CHECK-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_3ULP:%.*]] = call afn float @llvm.sqrt.f32(float [[X]]), !fpmath !0
+; CHECK-NEXT: store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_2ULP:%.*]] = call afn float @llvm.sqrt.f32(float [[X]]), !fpmath !4
+; CHECK-NEXT: store volatile float [[MD_2ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
+ %no.md = call afn float @llvm.sqrt.f32(float %x)
+ store volatile float %no.md, ptr addrspace(1) %out, align 4
+
+ %md.half.ulp = call afn float @llvm.sqrt.f32(float %x), !fpmath !1
+ store volatile float %md.half.ulp, ptr addrspace(1) %out, align 4
+
+ %md.1ulp = call afn float @llvm.sqrt.f32(float %x), !fpmath !2
+ store volatile float %md.1ulp, ptr addrspace(1) %out, align 4
+
+ %md.25ulp = call afn float @llvm.sqrt.f32(float %x), !fpmath !0
+ store volatile float %md.25ulp, ptr addrspace(1) %out, align 4
+
+ %md.3ulp = call afn float @llvm.sqrt.f32(float %x), !fpmath !3
+ store volatile float %md.3ulp, ptr addrspace(1) %out, align 4
+
+ %md.2ulp = call afn float @llvm.sqrt.f32(float %x), !fpmath !4
+ store volatile float %md.2ulp, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+define amdgpu_kernel void @sqrt_fpmath_f32_assume_nosub(ptr addrspace(1) %out, float %x) {
+; CHECK-LABEL: define amdgpu_kernel void @sqrt_fpmath_f32_assume_nosub
+; CHECK-SAME: (ptr addrspace(1) [[OUT:%.*]], float [[X:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[FABS_X:%.*]] = call float @llvm.fabs.f32(float [[X]])
+; CHECK-NEXT: [[IS_NOT_SUBNORMAL:%.*]] = fcmp oge float [[FABS_X]], 0x3810000000000000
+; CHECK-NEXT: call void @llvm.assume(i1 [[IS_NOT_SUBNORMAL]])
+; CHECK-NEXT: [[NO_MD:%.*]] = call float @llvm.sqrt.f32(float [[X]])
+; CHECK-NEXT: store volatile float [[NO_MD]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_HALF_ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !1
+; CHECK-NEXT: store volatile float [[MD_HALF_ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_1ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !2
+; CHECK-NEXT: store volatile float [[MD_1ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_25ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !3
+; CHECK-NEXT: store volatile float [[MD_25ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_3ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !0
+; CHECK-NEXT: store volatile float [[MD_3ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_2ULP:%.*]] = call float @llvm.sqrt.f32(float [[X]]), !fpmath !4
+; CHECK-NEXT: store volatile float [[MD_2ULP]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[MD_3ULP_AFN:%.*]] = call afn float @llvm.sqrt.f32(float [[X]]), !fpmath !0
+; CHECK-NEXT: store volatile float [[MD_3ULP_AFN]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: [[NO_MD_AFN:%.*]] = call afn float @llvm.sqrt.f32(float [[X]])
+; CHECK-NEXT: store volatile float [[NO_MD_AFN]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
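+ ; The assume below establishes |x| >= 2^-126 (0x3810000000000000 is the
+ ; smallest normal f32 magnitude, written as an f64 immediate), i.e. x is
+ ; never subnormal.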
+ %fabs.x = call float @llvm.fabs.f32(float %x)
+ %is.not.subnormal = fcmp oge float %fabs.x, 0x3810000000000000
+ call void @llvm.assume(i1 %is.not.subnormal)
+
+ %no.md = call float @llvm.sqrt.f32(float %x)
+ store volatile float %no.md, ptr addrspace(1) %out, align 4
+
+ %md.half.ulp = call float @llvm.sqrt.f32(float %x), !fpmath !1
+ store volatile float %md.half.ulp, ptr addrspace(1) %out, align 4
+
+ %md.1ulp = call float @llvm.sqrt.f32(float %x), !fpmath !2
+ store volatile float %md.1ulp, ptr addrspace(1) %out, align 4
+
+ %md.25ulp = call float @llvm.sqrt.f32(float %x), !fpmath !0
+ store volatile float %md.25ulp, ptr addrspace(1) %out, align 4
+
+ %md.3ulp = call float @llvm.sqrt.f32(float %x), !fpmath !3
+ store volatile float %md.3ulp, ptr addrspace(1) %out, align 4
+
+ %md.2ulp = call float @llvm.sqrt.f32(float %x), !fpmath !4
+ store volatile float %md.2ulp, ptr addrspace(1) %out, align 4
+
+ %md.3ulp.afn = call afn float @llvm.sqrt.f32(float %x), !fpmath !3
+ store volatile float %md.3ulp.afn, ptr addrspace(1) %out, align 4
+
+ %no.md.afn = call afn float @llvm.sqrt.f32(float %x)
+ store volatile float %no.md.afn, ptr addrspace(1) %out, align 4
+
+ ret void
+}
+
+declare float @llvm.sqrt.f32(float)
+declare <2 x float> @llvm.sqrt.v2f32(<2 x float>)
+declare float @llvm.fabs.f32(float)
+declare void @llvm.assume(i1 noundef)
+
+attributes #0 = { optnone noinline }
+
+!0 = !{float 2.500000e+00}
+!1 = !{float 5.000000e-01}
+!2 = !{float 1.000000e+00}
+!3 = !{float 3.000000e+00}
+!4 = !{float 2.000000e+00}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; DAZ: {{.*}}
+; IEEE: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
new file mode 100644
index 00000000000000..5942c778c3d47f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
@@ -0,0 +1,1316 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SDAG,GCN-IEEE,SDAG-IEEE %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GISEL,GCN-IEEE,GISEL-IEEE %s
+
+; RUN: llc -global-isel=0 -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GCN,SDAG,GCN-DAZ,SDAG-DAZ %s
+; RUN: llc -global-isel=1 -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GCN,GISEL,GCN-DAZ,GISEL-DAZ %s
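+; The check prefixes form a 2x2 matrix: SelectionDAG (SDAG) vs. GlobalISel
+; (GISEL), crossed with ieee (default) vs. preserve-sign (DAZ) f32 denormal
+; handling.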
+
+define float @v_sqrt_f32(float %x) {
+; GCN-LABEL: v_sqrt_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call float @llvm.sqrt.f32(float %x)
+ ret float %result
+}
+
+define float @v_sqrt_f32_fneg(float %x) {
+; GCN-LABEL: v_sqrt_f32_fneg:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e64 v0, -v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %x.neg = fneg float %x
+ %result = call float @llvm.sqrt.f32(float %x.neg)
+ ret float %result
+}
+
+define float @v_sqrt_f32_fabs(float %x) {
+; GCN-LABEL: v_sqrt_f32_fabs:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e64 v0, |v0|
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %x.fabs = call float @llvm.fabs.f32(float %x)
+ %result = call float @llvm.sqrt.f32(float %x.fabs)
+ ret float %result
+}
+
+define float @v_sqrt_f32_fneg_fabs(float %x) {
+; GCN-LABEL: v_sqrt_f32_fneg_fabs:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e64 v0, -|v0|
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %x.fabs = call float @llvm.fabs.f32(float %x)
+ %x.fabs.neg = fneg float %x.fabs
+ %result = call float @llvm.sqrt.f32(float %x.fabs.neg)
+ ret float %result
+}
+
+define float @v_sqrt_f32_ninf(float %x) {
+; GCN-LABEL: v_sqrt_f32_ninf:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call ninf float @llvm.sqrt.f32(float %x)
+ ret float %result
+}
+
+define float @v_sqrt_f32_no_infs_attribute(float %x) #5 {
+; GCN-LABEL: v_sqrt_f32_no_infs_attribute:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call ninf float @llvm.sqrt.f32(float %x)
+ ret float %result
+}
+
+define float @v_sqrt_f32_nnan(float %x) {
+; GCN-LABEL: v_sqrt_f32_nnan:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call nnan float @llvm.sqrt.f32(float %x)
+ ret float %result
+}
+
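+; Uniform (SGPR) inputs still go through the VALU instruction; the result is
+; copied back to an SGPR with v_readfirstlane_b32.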
+define amdgpu_ps i32 @s_sqrt_f32(float inreg %x) {
+; GCN-LABEL: s_sqrt_f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_sqrt_f32_e32 v0, s0
+; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: ; return to shader part epilog
+ %result = call float @llvm.sqrt.f32(float %x)
+ %cast = bitcast float %result to i32
+ %firstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
+ ret i32 %firstlane
+}
+
+define amdgpu_ps i32 @s_sqrt_f32_ninf(float inreg %x) {
+; GCN-LABEL: s_sqrt_f32_ninf:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_sqrt_f32_e32 v0, s0
+; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: ; return to shader part epilog
+ %result = call ninf float @llvm.sqrt.f32(float %x)
+ %cast = bitcast float %result to i32
+ %firstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
+ ret i32 %firstlane
+}
+
+define amdgpu_ps i32 @s_sqrt_f32_afn(float inreg %x) {
+; GCN-LABEL: s_sqrt_f32_afn:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_sqrt_f32_e32 v0, s0
+; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: ; return to shader part epilog
+ %result = call afn float @llvm.sqrt.f32(float %x)
+ %cast = bitcast float %result to i32
+ %firstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
+ ret i32 %firstlane
+}
+
+define amdgpu_ps i32 @s_sqrt_f32_afn_nnan_ninf(float inreg %x) {
+; GCN-LABEL: s_sqrt_f32_afn_nnan_ninf:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_sqrt_f32_e32 v0, s0
+; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: ; return to shader part epilog
+ %result = call afn nnan ninf float @llvm.sqrt.f32(float %x)
+ %cast = bitcast float %result to i32
+ %firstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
+ ret i32 %firstlane
+}
+
+define float @v_sqrt_f32_nsz(float %x) {
+; GCN-LABEL: v_sqrt_f32_nsz:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call nsz float @llvm.sqrt.f32(float %x)
+ ret float %result
+}
+
+define float @v_sqrt_f32_nnan_ninf(float %x) {
+; GCN-LABEL: v_sqrt_f32_nnan_ninf:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call nnan ninf float @llvm.sqrt.f32(float %x)
+ ret float %result
+}
+
+define float @v_sqrt_f32_nnan_ninf_nsz(float %x) {
+; GCN-LABEL: v_sqrt_f32_nnan_ninf_nsz:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call nnan ninf nsz float @llvm.sqrt.f32(float %x)
+ ret float %result
+}
+
+define float @v_sqrt_f32_afn(float %x) {
+; GCN-LABEL: v_sqrt_f32_afn:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call afn float @llvm.sqrt.f32(float %x)
+ ret float %result
+}
+
+define float @v_sqrt_f32_afn_nsz(float %x) {
+; GCN-LABEL: v_sqrt_f32_afn_nsz:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call afn nsz float @llvm.sqrt.f32(float %x)
+ ret float %result
+}
+
+define <2 x float> @v_sqrt_v2f32_afn(<2 x float> %x) {
+; GCN-LABEL: v_sqrt_v2f32_afn:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-NEXT: v_sqrt_f32_e32 v1, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call afn <2 x float> @llvm.sqrt.v2f32(<2 x float> %x)
+ ret <2 x float> %result
+}
+
+define float @v_sqrt_f32_afn_nnan(float %x) {
+; GCN-LABEL: v_sqrt_f32_afn_nnan:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call afn nnan float @llvm.sqrt.f32(float %x)
+ ret float %result
+}
+
+define float @v_sqrt_f32_fabs_afn_ninf(float %x) {
+; GCN-LABEL: v_sqrt_f32_fabs_afn_ninf:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e64 v0, |v0|
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %fabs = call float @llvm.fabs.f32(float %x)
+ %result = call afn ninf float @llvm.sqrt.f32(float %fabs)
+ ret float %result
+}
+
+define float @v_sqrt_f32_afn_nnan_ninf(float %x) {
+; GCN-LABEL: v_sqrt_f32_afn_nnan_ninf:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call afn nnan ninf float @llvm.sqrt.f32(float %x)
+ ret float %result
+}
+
+define <2 x float> @v_sqrt_v2f32_afn_nnan_ninf(<2 x float> %x) {
+; GCN-LABEL: v_sqrt_v2f32_afn_nnan_ninf:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-NEXT: v_sqrt_f32_e32 v1, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call afn nnan ninf <2 x float> @llvm.sqrt.v2f32(<2 x float> %x)
+ ret <2 x float> %result
+}
+
+define float @v_sqrt_f32_afn_nnan_ninf_nsz(float %x) {
+; GCN-LABEL: v_sqrt_f32_afn_nnan_ninf_nsz:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call afn nnan ninf nsz float @llvm.sqrt.f32(float %x)
+ ret float %result
+}
+
+define float @v_sqrt_f32__approx_func_fp_math(float %x) #2 {
+; GCN-LABEL: v_sqrt_f32__approx_func_fp_math:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call nsz float @llvm.sqrt.f32(float %x)
+ ret float %result
+}
+
+define float @v_sqrt_f32__enough_unsafe_attrs(float %x) #3 {
+; GCN-LABEL: v_sqrt_f32__enough_unsafe_attrs:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call nsz float @llvm.sqrt.f32(float %x)
+ ret float %result
+}
+
+define float @v_sqrt_f32__unsafe_attr(float %x) #4 {
+; GCN-LABEL: v_sqrt_f32__unsafe_attr:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call nsz float @llvm.sqrt.f32(float %x)
+ ret float %result
+}
+
+define <2 x float> @v_sqrt_v2f32(<2 x float> %x) {
+; GCN-LABEL: v_sqrt_v2f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-NEXT: v_sqrt_f32_e32 v1, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %x)
+ ret <2 x float> %result
+}
+
+define <3 x float> @v_sqrt_v3f32(<3 x float> %x) {
+; GCN-LABEL: v_sqrt_v3f32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-NEXT: v_sqrt_f32_e32 v1, v1
+; GCN-NEXT: v_sqrt_f32_e32 v2, v2
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <3 x float> @llvm.sqrt.v3f32(<3 x float> %x)
+ ret <3 x float> %result
+}
+
+; fpmath should be ignored: a 0.5 ulp bound requires a correctly rounded result
+define float @v_sqrt_f32_ulp05(float %x) {
+; GCN-LABEL: v_sqrt_f32_ulp05:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call float @llvm.sqrt.f32(float %x), !fpmath !0
+ ret float %result
+}
+
+; fpmath should be used with DAZ only
+define float @v_sqrt_f32_ulp1(float %x) {
+; GCN-LABEL: v_sqrt_f32_ulp1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call float @llvm.sqrt.f32(float %x), !fpmath !1
+ ret float %result
+}
+
+; fpmath should always be used
+define float @v_sqrt_f32_ulp2(float %x) {
+; GCN-LABEL: v_sqrt_f32_ulp2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call float @llvm.sqrt.f32(float %x), !fpmath !2
+ ret float %result
+}
+
+; fpmath should always be used
+define float @v_sqrt_f32_ulp25(float %x) {
+; GCN-LABEL: v_sqrt_f32_ulp25:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call float @llvm.sqrt.f32(float %x), !fpmath !3
+ ret float %result
+}
+
+; fpmath should always be used
+define float @v_sqrt_f32_ulp3(float %x) {
+; GCN-LABEL: v_sqrt_f32_ulp3:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call float @llvm.sqrt.f32(float %x), !fpmath !4
+ ret float %result
+}
+
+define float @v_sqrt_f32_ulp2_fabs(float %x) {
+; GCN-LABEL: v_sqrt_f32_ulp2_fabs:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e64 v0, |v0|
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %x.fabs = call float @llvm.fabs.f32(float %x)
+ %result = call float @llvm.sqrt.f32(float %x.fabs), !fpmath !2
+ ret float %result
+}
+
+define <2 x float> @v_sqrt_v2f32_ulp1(<2 x float> %x) {
+; GCN-LABEL: v_sqrt_v2f32_ulp1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-NEXT: v_sqrt_f32_e32 v1, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %x), !fpmath !1
+ ret <2 x float> %result
+}
+
+; fpmath should always be used
+define <2 x float> @v_sqrt_v2f32_ulp2(<2 x float> %x) {
+; GCN-LABEL: v_sqrt_v2f32_ulp2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-NEXT: v_sqrt_f32_e32 v1, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %x), !fpmath !2
+ ret <2 x float> %result
+}
+
+define <2 x float> @v_sqrt_v2f32_ulp1_fabs(<2 x float> %x) {
+; GCN-LABEL: v_sqrt_v2f32_ulp1_fabs:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e64 v0, |v0|
+; GCN-NEXT: v_sqrt_f32_e64 v1, |v1|
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %x.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %x)
+ %result = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %x.fabs), !fpmath !1
+ ret <2 x float> %result
+}
+
+; fpmath should always be used
+define <2 x float> @v_sqrt_v2f32_ulp2_fabs(<2 x float> %x) {
+; GCN-LABEL: v_sqrt_v2f32_ulp2_fabs:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e64 v0, |v0|
+; GCN-NEXT: v_sqrt_f32_e64 v1, |v1|
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %x.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %x)
+ %result = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %x.fabs), !fpmath !2
+ ret <2 x float> %result
+}
+
+; afn takes precedence over the !fpmath bound
+define float @v_sqrt_f32_afn_ulp1(float %x) {
+; GCN-LABEL: v_sqrt_f32_afn_ulp1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call afn float @llvm.sqrt.f32(float %x), !fpmath !1
+ ret float %result
+}
+
+; afn takes precedence over the !fpmath bound
+define float @v_sqrt_f32_afn_ulp2(float %x) {
+; GCN-LABEL: v_sqrt_f32_afn_ulp2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call afn float @llvm.sqrt.f32(float %x), !fpmath !2
+ ret float %result
+}
+
+define <2 x float> @v_sqrt_v2f32_afn_ulp1(<2 x float> %x) {
+; GCN-LABEL: v_sqrt_v2f32_afn_ulp1:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-NEXT: v_sqrt_f32_e32 v1, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call afn <2 x float> @llvm.sqrt.v2f32(<2 x float> %x), !fpmath !1
+ ret <2 x float> %result
+}
+
+; fpmath should always be used
+define <2 x float> @v_sqrt_v2f32_afn_ulp2(<2 x float> %x) {
+; GCN-LABEL: v_sqrt_v2f32_afn_ulp2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-NEXT: v_sqrt_f32_e32 v1, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call afn <2 x float> @llvm.sqrt.v2f32(<2 x float> %x), !fpmath !2
+ ret <2 x float> %result
+}
+
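+; The fdiv carries no contract flag, so sqrt followed by rcp is not combined
+; into rsq even in DAZ mode.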
+define float @v_sqrt_f32_ulp2_noncontractable_rcp(float %x) {
+; SDAG-IEEE-LABEL: v_sqrt_f32_ulp2_noncontractable_rcp:
+; SDAG-IEEE: ; %bb.0:
+; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
+; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x7f800000
+; SDAG-IEEE-NEXT: v_frexp_mant_f32_e32 v1, v0
+; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
+; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
+; SDAG-IEEE-NEXT: v_rcp_f32_e32 v1, v1
+; SDAG-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
+; SDAG-IEEE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
+; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v1, v0
+; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-IEEE-LABEL: v_sqrt_f32_ulp2_noncontractable_rcp:
+; GISEL-IEEE: ; %bb.0:
+; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
+; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x7f800000
+; GISEL-IEEE-NEXT: v_frexp_mant_f32_e32 v2, v0
+; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v1
+; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v1, v0, v2, vcc
+; GISEL-IEEE-NEXT: v_rcp_f32_e32 v1, v1
+; GISEL-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
+; GISEL-IEEE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
+; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v1, v0
+; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-DAZ-LABEL: v_sqrt_f32_ulp2_noncontractable_rcp:
+; GCN-DAZ: ; %bb.0:
+; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-DAZ-NEXT: v_rcp_f32_e32 v0, v0
+; GCN-DAZ-NEXT: s_setpc_b64 s[30:31]
+ %sqrt = call contract float @llvm.sqrt.f32(float %x), !fpmath !4
+ %result = fdiv float 1.0, %sqrt, !fpmath !3
+ ret float %result
+}
+
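+; With contract on both the sqrt and the fdiv, DAZ mode folds the pair into a
+; single v_rsq_f32; IEEE mode still needs input/result scaling around rsq.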
+define float @v_sqrt_f32_ulp2_contractable_rcp(float %x) {
+; SDAG-IEEE-LABEL: v_sqrt_f32_ulp2_contractable_rcp:
+; SDAG-IEEE: ; %bb.0:
+; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x800000
+; SDAG-IEEE-NEXT: v_mov_b32_e32 v1, 0x4b800000
+; SDAG-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
+; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; SDAG-IEEE-NEXT: v_mul_f32_e32 v0, v0, v1
+; SDAG-IEEE-NEXT: v_rsq_f32_e32 v0, v0
+; SDAG-IEEE-NEXT: v_mov_b32_e32 v1, 0x45800000
+; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; SDAG-IEEE-NEXT: v_mul_f32_e32 v0, v0, v1
+; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-IEEE-LABEL: v_sqrt_f32_ulp2_contractable_rcp:
+; GISEL-IEEE: ; %bb.0:
+; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x800000
+; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x4b800000
+; GISEL-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
+; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v1, 1.0, v2, vcc
+; GISEL-IEEE-NEXT: v_mul_f32_e32 v0, v0, v1
+; GISEL-IEEE-NEXT: v_rsq_f32_e32 v0, v0
+; GISEL-IEEE-NEXT: v_mov_b32_e32 v1, 0x45800000
+; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; GISEL-IEEE-NEXT: v_mul_f32_e32 v0, v0, v1
+; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-DAZ-LABEL: v_sqrt_f32_ulp2_contractable_rcp:
+; GCN-DAZ: ; %bb.0:
+; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0
+; GCN-DAZ-NEXT: s_setpc_b64 s[30:31]
+ %sqrt = call contract float @llvm.sqrt.f32(float %x), !fpmath !4
+ %result = fdiv contract float 1.0, %sqrt, !fpmath !3
+ ret float %result
+}
+
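+; General y / sqrt(x): without contract on the fdiv, the division is expanded
+; instead of being turned into rsq.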
+define float @v_sqrt_f32_ulp2_noncontractable_fdiv(float %x, float %y) {
+; GCN-IEEE-LABEL: v_sqrt_f32_ulp2_noncontractable_fdiv:
+; GCN-IEEE: ; %bb.0:
+; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-IEEE-NEXT: s_mov_b32 s4, 0x7f800000
+; GCN-IEEE-NEXT: v_frexp_mant_f32_e32 v3, v1
+; GCN-IEEE-NEXT: v_frexp_mant_f32_e32 v2, v0
+; GCN-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
+; GCN-IEEE-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
+; GCN-IEEE-NEXT: v_rcp_f32_e32 v2, v2
+; GCN-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4
+; GCN-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
+; GCN-IEEE-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc
+; GCN-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
+; GCN-IEEE-NEXT: v_mul_f32_e32 v2, v3, v2
+; GCN-IEEE-NEXT: v_sub_i32_e32 v0, vcc, v1, v0
+; GCN-IEEE-NEXT: v_ldexp_f32_e32 v0, v2, v0
+; GCN-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-DAZ-LABEL: v_sqrt_f32_ulp2_noncontractable_fdiv:
+; SDAG-DAZ: ; %bb.0:
+; SDAG-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-DAZ-NEXT: v_sqrt_f32_e32 v0, v0
+; SDAG-DAZ-NEXT: s_mov_b32 s4, 0x6f800000
+; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x2f800000
+; SDAG-DAZ-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4
+; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
+; SDAG-DAZ-NEXT: v_mul_f32_e32 v0, v0, v2
+; SDAG-DAZ-NEXT: v_rcp_f32_e32 v0, v0
+; SDAG-DAZ-NEXT: v_mul_f32_e32 v0, v1, v0
+; SDAG-DAZ-NEXT: v_mul_f32_e32 v0, v2, v0
+; SDAG-DAZ-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-DAZ-LABEL: v_sqrt_f32_ulp2_noncontractable_fdiv:
+; GISEL-DAZ: ; %bb.0:
+; GISEL-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-DAZ-NEXT: v_sqrt_f32_e32 v0, v0
+; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, 0x6f800000
+; GISEL-DAZ-NEXT: v_mov_b32_e32 v3, 0x2f800000
+; GISEL-DAZ-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, v2
+; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc
+; GISEL-DAZ-NEXT: v_mul_f32_e32 v0, v0, v2
+; GISEL-DAZ-NEXT: v_rcp_f32_e32 v0, v0
+; GISEL-DAZ-NEXT: v_mul_f32_e32 v0, v1, v0
+; GISEL-DAZ-NEXT: v_mul_f32_e32 v0, v2, v0
+; GISEL-DAZ-NEXT: s_setpc_b64 s[30:31]
+ %sqrt = call contract float @llvm.sqrt.f32(float %x), !fpmath !4
+ %result = fdiv float %y, %sqrt, !fpmath !3
+ ret float %result
+}
+
+define float @v_sqrt_f32_ulp2_contractable_fdiv(float %x, float %y) {
+; GCN-IEEE-LABEL: v_sqrt_f32_ulp2_contractable_fdiv:
+; GCN-IEEE: ; %bb.0:
+; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-IEEE-NEXT: s_mov_b32 s4, 0x7f800000
+; GCN-IEEE-NEXT: v_frexp_mant_f32_e32 v3, v1
+; GCN-IEEE-NEXT: v_frexp_mant_f32_e32 v2, v0
+; GCN-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
+; GCN-IEEE-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
+; GCN-IEEE-NEXT: v_rcp_f32_e32 v2, v2
+; GCN-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4
+; GCN-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
+; GCN-IEEE-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc
+; GCN-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
+; GCN-IEEE-NEXT: v_mul_f32_e32 v2, v3, v2
+; GCN-IEEE-NEXT: v_sub_i32_e32 v0, vcc, v1, v0
+; GCN-IEEE-NEXT: v_ldexp_f32_e32 v0, v2, v0
+; GCN-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-DAZ-LABEL: v_sqrt_f32_ulp2_contractable_fdiv:
+; SDAG-DAZ: ; %bb.0:
+; SDAG-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-DAZ-NEXT: v_sqrt_f32_e32 v0, v0
+; SDAG-DAZ-NEXT: s_mov_b32 s4, 0x6f800000
+; SDAG-DAZ-NEXT: v_mov_b32_e32 v2, 0x2f800000
+; SDAG-DAZ-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4
+; SDAG-DAZ-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
+; SDAG-DAZ-NEXT: v_mul_f32_e32 v0, v0, v2
+; SDAG-DAZ-NEXT: v_rcp_f32_e32 v0, v0
+; SDAG-DAZ-NEXT: v_mul_f32_e32 v0, v1, v0
+; SDAG-DAZ-NEXT: v_mul_f32_e32 v0, v2, v0
+; SDAG-DAZ-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-DAZ-LABEL: v_sqrt_f32_ulp2_contractable_fdiv:
+; GISEL-DAZ: ; %bb.0:
+; GISEL-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-DAZ-NEXT: v_sqrt_f32_e32 v0, v0
+; GISEL-DAZ-NEXT: v_mov_b32_e32 v2, 0x6f800000
+; GISEL-DAZ-NEXT: v_mov_b32_e32 v3, 0x2f800000
+; GISEL-DAZ-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, v2
+; GISEL-DAZ-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc
+; GISEL-DAZ-NEXT: v_mul_f32_e32 v0, v0, v2
+; GISEL-DAZ-NEXT: v_rcp_f32_e32 v0, v0
+; GISEL-DAZ-NEXT: v_mul_f32_e32 v0, v1, v0
+; GISEL-DAZ-NEXT: v_mul_f32_e32 v0, v2, v0
+; GISEL-DAZ-NEXT: s_setpc_b64 s[30:31]
+ %sqrt = call contract float @llvm.sqrt.f32(float %x), !fpmath !4
+ %result = fdiv contract float %y, %sqrt, !fpmath !3
+ ret float %result
+}
+
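+; arcp permits y * (1 / sqrt(x)) in place of the full division.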
+define float @v_sqrt_f32_ulp2_contractable_fdiv_arcp(float %x, float %y) {
+; SDAG-IEEE-LABEL: v_sqrt_f32_ulp2_contractable_fdiv_arcp:
+; SDAG-IEEE: ; %bb.0:
+; SDAG-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
+; SDAG-IEEE-NEXT: s_mov_b32 s4, 0x7f800000
+; SDAG-IEEE-NEXT: v_frexp_mant_f32_e32 v2, v0
+; SDAG-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
+; SDAG-IEEE-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
+; SDAG-IEEE-NEXT: v_rcp_f32_e32 v2, v2
+; SDAG-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
+; SDAG-IEEE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
+; SDAG-IEEE-NEXT: v_ldexp_f32_e32 v0, v2, v0
+; SDAG-IEEE-NEXT: v_mul_f32_e32 v0, v1, v0
+; SDAG-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-IEEE-LABEL: v_sqrt_f32_ulp2_contractable_fdiv_arcp:
+; GISEL-IEEE: ; %bb.0:
+; GISEL-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
+; GISEL-IEEE-NEXT: v_mov_b32_e32 v2, 0x7f800000
+; GISEL-IEEE-NEXT: v_frexp_mant_f32_e32 v3, v0
+; GISEL-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v2
+; GISEL-IEEE-NEXT: v_cndmask_b32_e32 v2, v0, v3, vcc
+; GISEL-IEEE-NEXT: v_rcp_f32_e32 v2, v2
+; GISEL-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
+; GISEL-IEEE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
+; GISEL-IEEE-NEXT: v_ldexp_f32_e32 v0, v2, v0
+; GISEL-IEEE-NEXT: v_mul_f32_e32 v0, v1, v0
+; GISEL-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-DAZ-LABEL: v_sqrt_f32_ulp2_contractable_fdiv_arcp:
+; SDAG-DAZ: ; %bb.0:
+; SDAG-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-DAZ-NEXT: v_sqrt_f32_e32 v0, v0
+; SDAG-DAZ-NEXT: v_rcp_f32_e32 v0, v0
+; SDAG-DAZ-NEXT: v_mul_f32_e32 v0, v1, v0
+; SDAG-DAZ-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-DAZ-LABEL: v_sqrt_f32_ulp2_contractable_fdiv_arcp:
+; GISEL-DAZ: ; %bb.0:
+; GISEL-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-DAZ-NEXT: v_rsq_f32_e32 v0, v0
+; GISEL-DAZ-NEXT: v_mul_f32_e32 v0, v1, v0
+; GISEL-DAZ-NEXT: s_setpc_b64 s[30:31]
+ %sqrt = call contract float @llvm.sqrt.f32(float %x), !fpmath !4
+ %result = fdiv arcp contract float %y, %sqrt, !fpmath !3
+ ret float %result
+}
+
+define <2 x float> @v_sqrt_v2f32_ulp2_noncontractable_rcp(<2 x float> %x) {
+; GCN-IEEE-LABEL: v_sqrt_v2f32_ulp2_noncontractable_rcp:
+; GCN-IEEE: ; %bb.0:
+; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-IEEE-NEXT: s_mov_b32 s4, 0x7f800000
+; GCN-IEEE-NEXT: v_sqrt_f32_e32 v1, v1
+; GCN-IEEE-NEXT: v_frexp_mant_f32_e32 v2, v0
+; GCN-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
+; GCN-IEEE-NEXT: v_cndmask_b32_e32 v2, v0, v2, vcc
+; GCN-IEEE-NEXT: v_rcp_f32_e32 v2, v2
+; GCN-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
+; GCN-IEEE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
+; GCN-IEEE-NEXT: v_ldexp_f32_e32 v0, v2, v0
+; GCN-IEEE-NEXT: v_frexp_mant_f32_e32 v2, v1
+; GCN-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4
+; GCN-IEEE-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc
+; GCN-IEEE-NEXT: v_rcp_f32_e32 v2, v2
+; GCN-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
+; GCN-IEEE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1
+; GCN-IEEE-NEXT: v_ldexp_f32_e32 v1, v2, v1
+; GCN-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-DAZ-LABEL: v_sqrt_v2f32_ulp2_noncontractable_rcp:
+; GCN-DAZ: ; %bb.0:
+; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-DAZ-NEXT: v_sqrt_f32_e32 v1, v1
+; GCN-DAZ-NEXT: v_rcp_f32_e32 v0, v0
+; GCN-DAZ-NEXT: v_rcp_f32_e32 v1, v1
+; GCN-DAZ-NEXT: s_setpc_b64 s[30:31]
+ %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %x), !fpmath !4
+ %result = fdiv <2 x float> <float 1.0, float 1.0>, %sqrt, !fpmath !3
+ ret <2 x float> %result
+}
+
+define <2 x float> @v_sqrt_v2f32_ulp2_contractable_rcp(<2 x float> %x) {
+; GCN-IEEE-LABEL: v_sqrt_v2f32_ulp2_contractable_rcp:
+; GCN-IEEE: ; %bb.0:
+; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT: s_mov_b32 s4, 0x800000
+; GCN-IEEE-NEXT: v_mov_b32_e32 v2, 0x4b800000
+; GCN-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
+; GCN-IEEE-NEXT: v_cndmask_b32_e32 v3, 1.0, v2, vcc
+; GCN-IEEE-NEXT: v_cmp_gt_f32_e64 s[4:5], s4, v1
+; GCN-IEEE-NEXT: v_mul_f32_e32 v0, v0, v3
+; GCN-IEEE-NEXT: v_cndmask_b32_e64 v2, 1.0, v2, s[4:5]
+; GCN-IEEE-NEXT: v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT: v_mul_f32_e32 v1, v1, v2
+; GCN-IEEE-NEXT: v_rsq_f32_e32 v1, v1
+; GCN-IEEE-NEXT: v_mov_b32_e32 v3, 0x45800000
+; GCN-IEEE-NEXT: v_cndmask_b32_e32 v2, 1.0, v3, vcc
+; GCN-IEEE-NEXT: v_mul_f32_e32 v0, v0, v2
+; GCN-IEEE-NEXT: v_cndmask_b32_e64 v2, 1.0, v3, s[4:5]
+; GCN-IEEE-NEXT: v_mul_f32_e32 v1, v1, v2
+; GCN-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-DAZ-LABEL: v_sqrt_v2f32_ulp2_contractable_rcp:
+; GCN-DAZ: ; %bb.0:
+; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0
+; GCN-DAZ-NEXT: v_rsq_f32_e32 v1, v1
+; GCN-DAZ-NEXT: s_setpc_b64 s[30:31]
+ %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %x), !fpmath !4
+ %result = fdiv contract <2 x float> <float 1.0, float 1.0>, %sqrt, !fpmath !3
+ ret <2 x float> %result
+}
+
+define <2 x float> @v_sqrt_v2f32_ulp2_contractable_fdiv(<2 x float> %x, <2 x float> %y) {
+; GCN-IEEE-LABEL: v_sqrt_v2f32_ulp2_contractable_fdiv:
+; GCN-IEEE: ; %bb.0:
+; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-IEEE-NEXT: s_mov_b32 s4, 0x7f800000
+; GCN-IEEE-NEXT: v_sqrt_f32_e32 v1, v1
+; GCN-IEEE-NEXT: v_frexp_mant_f32_e32 v5, v2
+; GCN-IEEE-NEXT: v_frexp_mant_f32_e32 v4, v0
+; GCN-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
+; GCN-IEEE-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc
+; GCN-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v2|, s4
+; GCN-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
+; GCN-IEEE-NEXT: v_cndmask_b32_e32 v5, v2, v5, vcc
+; GCN-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v2, v2
+; GCN-IEEE-NEXT: v_rcp_f32_e32 v4, v4
+; GCN-IEEE-NEXT: v_sub_i32_e32 v0, vcc, v2, v0
+; GCN-IEEE-NEXT: v_frexp_mant_f32_e32 v2, v1
+; GCN-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4
+; GCN-IEEE-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc
+; GCN-IEEE-NEXT: v_rcp_f32_e32 v2, v2
+; GCN-IEEE-NEXT: v_mul_f32_e32 v4, v5, v4
+; GCN-IEEE-NEXT: v_ldexp_f32_e32 v0, v4, v0
+; GCN-IEEE-NEXT: v_frexp_mant_f32_e32 v4, v3
+; GCN-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v3|, s4
+; GCN-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
+; GCN-IEEE-NEXT: v_cndmask_b32_e32 v4, v3, v4, vcc
+; GCN-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v3, v3
+; GCN-IEEE-NEXT: v_mul_f32_e32 v2, v4, v2
+; GCN-IEEE-NEXT: v_sub_i32_e32 v1, vcc, v3, v1
+; GCN-IEEE-NEXT: v_ldexp_f32_e32 v1, v2, v1
+; GCN-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GCN-DAZ-LABEL: v_sqrt_v2f32_ulp2_contractable_fdiv:
+; GCN-DAZ: ; %bb.0:
+; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-DAZ-NEXT: v_sqrt_f32_e32 v1, v1
+; GCN-DAZ-NEXT: s_mov_b32 s4, 0x6f800000
+; GCN-DAZ-NEXT: v_mov_b32_e32 v4, 0x2f800000
+; GCN-DAZ-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, s4
+; GCN-DAZ-NEXT: v_cndmask_b32_e32 v5, 1.0, v4, vcc
+; GCN-DAZ-NEXT: v_cmp_gt_f32_e64 vcc, |v1|, s4
+; GCN-DAZ-NEXT: v_cndmask_b32_e32 v4, 1.0, v4, vcc
+; GCN-DAZ-NEXT: v_mul_f32_e32 v0, v0, v5
+; GCN-DAZ-NEXT: v_mul_f32_e32 v1, v1, v4
+; GCN-DAZ-NEXT: v_rcp_f32_e32 v0, v0
+; GCN-DAZ-NEXT: v_rcp_f32_e32 v1, v1
+; GCN-DAZ-NEXT: v_mul_f32_e32 v0, v2, v0
+; GCN-DAZ-NEXT: v_mul_f32_e32 v1, v3, v1
+; GCN-DAZ-NEXT: v_mul_f32_e32 v0, v5, v0
+; GCN-DAZ-NEXT: v_mul_f32_e32 v1, v4, v1
+; GCN-DAZ-NEXT: s_setpc_b64 s[30:31]
+ %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %x), !fpmath !4
+ %result = fdiv contract <2 x float> %y, %sqrt, !fpmath !3
+ ret <2 x float> %result
+}
+
+define <2 x float> @v_sqrt_v2f32_ulp2_contractable_fdiv_arcp(<2 x float> %x, <2 x float> %y) {
+; GCN-IEEE-LABEL: v_sqrt_v2f32_ulp2_contractable_fdiv_arcp:
+; GCN-IEEE: ; %bb.0:
+; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-IEEE-NEXT: s_mov_b32 s4, 0x7f800000
+; GCN-IEEE-NEXT: v_sqrt_f32_e32 v1, v1
+; GCN-IEEE-NEXT: v_frexp_mant_f32_e32 v4, v0
+; GCN-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
+; GCN-IEEE-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc
+; GCN-IEEE-NEXT: v_rcp_f32_e32 v4, v4
+; GCN-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
+; GCN-IEEE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
+; GCN-IEEE-NEXT: v_ldexp_f32_e32 v0, v4, v0
+; GCN-IEEE-NEXT: v_mul_f32_e32 v0, v2, v0
+; GCN-IEEE-NEXT: v_frexp_mant_f32_e32 v2, v1
+; GCN-IEEE-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, s4
+; GCN-IEEE-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc
+; GCN-IEEE-NEXT: v_rcp_f32_e32 v2, v2
+; GCN-IEEE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
+; GCN-IEEE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1
+; GCN-IEEE-NEXT: v_ldexp_f32_e32 v1, v2, v1
+; GCN-IEEE-NEXT: v_mul_f32_e32 v1, v3, v1
+; GCN-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-DAZ-LABEL: v_sqrt_v2f32_ulp2_contractable_fdiv_arcp:
+; SDAG-DAZ: ; %bb.0:
+; SDAG-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-DAZ-NEXT: v_sqrt_f32_e32 v0, v0
+; SDAG-DAZ-NEXT: v_sqrt_f32_e32 v1, v1
+; SDAG-DAZ-NEXT: v_rcp_f32_e32 v0, v0
+; SDAG-DAZ-NEXT: v_rcp_f32_e32 v1, v1
+; SDAG-DAZ-NEXT: v_mul_f32_e32 v0, v2, v0
+; SDAG-DAZ-NEXT: v_mul_f32_e32 v1, v3, v1
+; SDAG-DAZ-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-DAZ-LABEL: v_sqrt_v2f32_ulp2_contractable_fdiv_arcp:
+; GISEL-DAZ: ; %bb.0:
+; GISEL-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-DAZ-NEXT: v_rsq_f32_e32 v0, v0
+; GISEL-DAZ-NEXT: v_rsq_f32_e32 v1, v1
+; GISEL-DAZ-NEXT: v_mul_f32_e32 v0, v2, v0
+; GISEL-DAZ-NEXT: v_mul_f32_e32 v1, v3, v1
+; GISEL-DAZ-NEXT: s_setpc_b64 s[30:31]
+ %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %x), !fpmath !4
+ %result = fdiv arcp contract <2 x float> %y, %sqrt, !fpmath !3
+ ret <2 x float> %result
+}
+
+define amdgpu_ps i32 @s_sqrt_f32_ulp1(float inreg %x) {
+; GCN-LABEL: s_sqrt_f32_ulp1:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_sqrt_f32_e32 v0, s0
+; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: ; return to shader part epilog
+ %result = call afn float @llvm.sqrt.f32(float %x), !fpmath !1
+ %cast = bitcast float %result to i32
+ %firstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
+ ret i32 %firstlane
+}
+
+define amdgpu_ps i32 @s_sqrt_f32_ulp2(float inreg %x) {
+; GCN-LABEL: s_sqrt_f32_ulp2:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_sqrt_f32_e32 v0, s0
+; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: ; return to shader part epilog
+ %result = call afn float @llvm.sqrt.f32(float %x), !fpmath !2
+ %cast = bitcast float %result to i32
+ %firstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
+ ret i32 %firstlane
+}
+
+define amdgpu_ps i32 @s_sqrt_f32_ulp3(float inreg %x) {
+; GCN-LABEL: s_sqrt_f32_ulp3:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_sqrt_f32_e32 v0, s0
+; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: ; return to shader part epilog
+ %result = call afn float @llvm.sqrt.f32(float %x), !fpmath !4
+ %cast = bitcast float %result to i32
+ %firstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %cast)
+ ret i32 %firstlane
+}
+
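+; nofpclass variants: inputs known not to be (positive) denormal should never
+; need the scaled expansion, so the plain instruction suffices.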
+define float @v_sqrt_f32_known_never_posdenormal_ulp2(float nofpclass(psub) %x) {
+; GCN-LABEL: v_sqrt_f32_known_never_posdenormal_ulp2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call float @llvm.sqrt.f32(float %x), !fpmath !2
+ ret float %result
+}
+
+define float @v_sqrt_f32_nsz_known_never_posdenormal_ulp2(float nofpclass(psub) %x) {
+; GCN-LABEL: v_sqrt_f32_nsz_known_never_posdenormal_ulp2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call nsz float @llvm.sqrt.f32(float %x), !fpmath !2
+ ret float %result
+}
+
+define float @v_sqrt_f32_known_never_negdenormal(float nofpclass(nsub) %x) {
+; GCN-LABEL: v_sqrt_f32_known_never_negdenormal:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call float @llvm.sqrt.f32(float %x), !fpmath !2
+ ret float %result
+}
+
+define float @v_sqrt_f32_known_never_denormal(float nofpclass(sub) %x) {
+; GCN-LABEL: v_sqrt_f32_known_never_denormal:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call float @llvm.sqrt.f32(float %x), !fpmath !2
+ ret float %result
+}
+
+define float @v_sqrt_f32_ninf_known_never_zero(float nofpclass(zero) %x) {
+; GCN-LABEL: v_sqrt_f32_ninf_known_never_zero:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call ninf float @llvm.sqrt.f32(float %x)
+ ret float %result
+}
+
+define float @v_sqrt_f32_known_never_zero(float nofpclass(zero) %x) {
+; GCN-LABEL: v_sqrt_f32_known_never_zero:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call float @llvm.sqrt.f32(float %x)
+ ret float %result
+}
+
+define float @v_sqrt_f32_known_never_zero_never_inf(float nofpclass(zero inf) %x) {
+; GCN-LABEL: v_sqrt_f32_known_never_zero_never_inf:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call float @llvm.sqrt.f32(float %x)
+ ret float %result
+}
+
+define float @v_sqrt_f32_known_never_zero_never_ninf(float nofpclass(zero ninf) %x) {
+; GCN-LABEL: v_sqrt_f32_known_never_zero_never_ninf:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call float @llvm.sqrt.f32(float %x)
+ ret float %result
+}
+
+define float @v_sqrt_f32_known_never_zero_never_pinf(float nofpclass(zero pinf) %x) {
+; GCN-LABEL: v_sqrt_f32_known_never_zero_never_pinf:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call float @llvm.sqrt.f32(float %x)
+ ret float %result
+}
+
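+; The mantissa result of llvm.frexp has magnitude in [0.5, 1) for finite
+; nonzero inputs, so it is never a denormal and the sqrt expansion's denormal
+; scaling should not be needed.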
+define float @v_sqrt_f32_frexp_src(float %x) {
+; SDAG-LABEL: v_sqrt_f32_frexp_src:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0x7f800000
+; SDAG-NEXT: v_frexp_mant_f32_e32 v1, v0
+; SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; SDAG-NEXT: v_sqrt_f32_e32 v0, v0
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f32_frexp_src:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000
+; GISEL-NEXT: v_frexp_mant_f32_e32 v1, v0
+; GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v2
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GISEL-NEXT: v_sqrt_f32_e32 v0, v0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %frexp = call { float, i32 } @llvm.frexp.f32.i32(float %x)
+ %frexp.mant = extractvalue { float, i32 } %frexp, 0
+ %result = call float @llvm.sqrt.f32(float %frexp.mant)
+ ret float %result
+}
+
+define float @v_sqrt_f32_ulp3_frexp_src(float %x) {
+; SDAG-LABEL: v_sqrt_f32_ulp3_frexp_src:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 s4, 0x7f800000
+; SDAG-NEXT: v_frexp_mant_f32_e32 v1, v0
+; SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; SDAG-NEXT: v_sqrt_f32_e32 v0, v0
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sqrt_f32_ulp3_frexp_src:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v2, 0x7f800000
+; GISEL-NEXT: v_frexp_mant_f32_e32 v1, v0
+; GISEL-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, v2
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GISEL-NEXT: v_sqrt_f32_e32 v0, v0
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %frexp = call { float, i32 } @llvm.frexp.f32.i32(float %x)
+ %frexp.mant = extractvalue { float, i32 } %frexp, 0
+ %result = call float @llvm.sqrt.f32(float %frexp.mant), !fpmath !4
+ ret float %result
+}
+
+define float @v_sqrt_f32_known_never_zero_never_ninf_ulp2(float nofpclass(zero ninf) %x) {
+; GCN-LABEL: v_sqrt_f32_known_never_zero_never_ninf_ulp2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call float @llvm.sqrt.f32(float %x), !fpmath !2
+ ret float %result
+}
+
+define float @v_sqrt_f32_known_never_ninf_ulp2(float nofpclass(ninf) %x) {
+; GCN-LABEL: v_sqrt_f32_known_never_ninf_ulp2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call float @llvm.sqrt.f32(float %x), !fpmath !2
+ ret float %result
+}
+
+define float @v_sqrt_f32_nsz_known_never_ninf_ulp2(float nofpclass(ninf) %x) {
+; GCN-LABEL: v_sqrt_f32_nsz_known_never_ninf_ulp2:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = call nsz float @llvm.sqrt.f32(float %x), !fpmath !2
+ ret float %result
+}
+
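+; elim_redun tests: sqrt returns NaN for inputs below -0.0 (and for NaN), so
+; the explicit compare-and-select guarding it is redundant. The SelectionDAG
+; path folds away the v_cndmask; GlobalISel currently does not.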
+define float @v_elim_redun_check_ult_sqrt(float %in) {
+; SDAG-LABEL: v_elim_redun_check_ult_sqrt:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_sqrt_f32_e32 v0, v0
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_elim_redun_check_ult_sqrt:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_sqrt_f32_e32 v1, v0
+; GISEL-NEXT: v_bfrev_b32_e32 v2, 1
+; GISEL-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GISEL-NEXT: v_cmp_nge_f32_e32 vcc, v0, v2
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %sqrt = call float @llvm.sqrt.f32(float %in)
+ %cmp = fcmp ult float %in, -0.000000e+00
+ %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt
+ ret float %res
+}
+
+define float @v_elim_redun_check_ult_sqrt_ulp3(float %in) {
+; SDAG-LABEL: v_elim_redun_check_ult_sqrt_ulp3:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_sqrt_f32_e32 v0, v0
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_elim_redun_check_ult_sqrt_ulp3:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_sqrt_f32_e32 v1, v0
+; GISEL-NEXT: v_bfrev_b32_e32 v2, 1
+; GISEL-NEXT: v_mov_b32_e32 v3, 0x7fc00000
+; GISEL-NEXT: v_cmp_nge_f32_e32 vcc, v0, v2
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %sqrt = call float @llvm.sqrt.f32(float %in), !fpmath !4
+ %cmp = fcmp ult float %in, -0.000000e+00
+ %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt
+ ret float %res
+}
+
+define amdgpu_kernel void @elim_redun_check_neg0(ptr addrspace(1) %out, float %in) {
+; SDAG-LABEL: elim_redun_check_neg0:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_load_dword s2, s[0:1], 0xb
+; SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_sqrt_f32_e32 v0, s2
+; SDAG-NEXT: s_mov_b32 s2, -1
+; SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: elim_redun_check_neg0:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_load_dword s3, s[0:1], 0xb
+; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GISEL-NEXT: v_bfrev_b32_e32 v0, 1
+; GISEL-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GISEL-NEXT: s_mov_b32 s2, -1
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_sqrt_f32_e32 v1, s3
+; GISEL-NEXT: v_cmp_lt_f32_e32 vcc, s3, v0
+; GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GISEL-NEXT: s_endpgm
+entry:
+ %sqrt = call float @llvm.sqrt.f32(float %in)
+ %cmp = fcmp olt float %in, -0.000000e+00
+ %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt
+ store float %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @elim_redun_check_pos0(ptr addrspace(1) %out, float %in) {
+; SDAG-LABEL: elim_redun_check_pos0:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_load_dword s2, s[0:1], 0xb
+; SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_sqrt_f32_e32 v0, s2
+; SDAG-NEXT: s_mov_b32 s2, -1
+; SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: elim_redun_check_pos0:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_load_dword s3, s[0:1], 0xb
+; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GISEL-NEXT: v_mov_b32_e32 v1, 0x7fc00000
+; GISEL-NEXT: s_mov_b32 s2, -1
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_sqrt_f32_e32 v0, s3
+; GISEL-NEXT: v_cmp_lt_f32_e64 vcc, s3, 0
+; GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GISEL-NEXT: s_endpgm
+entry:
+ %sqrt = call float @llvm.sqrt.f32(float %in)
+ %cmp = fcmp olt float %in, 0.000000e+00
+ %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt
+ store float %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @elim_redun_check_ult(ptr addrspace(1) %out, float %in) {
+; SDAG-LABEL: elim_redun_check_ult:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_load_dword s2, s[0:1], 0xb
+; SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_sqrt_f32_e32 v0, s2
+; SDAG-NEXT: s_mov_b32 s2, -1
+; SDAG-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: elim_redun_check_ult:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_load_dword s3, s[0:1], 0xb
+; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GISEL-NEXT: v_bfrev_b32_e32 v0, 1
+; GISEL-NEXT: v_mov_b32_e32 v2, 0x7fc00000
+; GISEL-NEXT: s_mov_b32 s2, -1
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_sqrt_f32_e32 v1, s3
+; GISEL-NEXT: v_cmp_nge_f32_e32 vcc, s3, v0
+; GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GISEL-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GISEL-NEXT: s_endpgm
+entry:
+ %sqrt = call float @llvm.sqrt.f32(float %in)
+ %cmp = fcmp ult float %in, -0.000000e+00
+ %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt
+ store float %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @elim_redun_check_v2(ptr addrspace(1) %out, <2 x float> %in) {
+; SDAG-LABEL: elim_redun_check_v2:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SDAG-NEXT: s_mov_b32 s7, 0xf000
+; SDAG-NEXT: s_mov_b32 s6, -1
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_sqrt_f32_e32 v1, s3
+; SDAG-NEXT: v_sqrt_f32_e32 v0, s2
+; SDAG-NEXT: s_mov_b32 s4, s0
+; SDAG-NEXT: s_mov_b32 s5, s1
+; SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: elim_redun_check_v2:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GISEL-NEXT: s_mov_b32 s4, 0x80000000
+; GISEL-NEXT: v_mov_b32_e32 v1, 0x7fc00000
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_sqrt_f32_e32 v2, s2
+; GISEL-NEXT: v_sqrt_f32_e32 v4, s3
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: v_mov_b32_e32 v3, s3
+; GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GISEL-NEXT: v_cmp_gt_f32_e32 vcc, s4, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; GISEL-NEXT: s_mov_b32 s2, -1
+; GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GISEL-NEXT: s_endpgm
+entry:
+ %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
+ %cmp = fcmp olt <2 x float> %in, <float -0.000000e+00, float -0.000000e+00>
+ %res = select <2 x i1> %cmp, <2 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000>, <2 x float> %sqrt
+ store <2 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @elim_redun_check_v2_ult(ptr addrspace(1) %out, <2 x float> %in) {
+; SDAG-LABEL: elim_redun_check_v2_ult:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SDAG-NEXT: s_mov_b32 s7, 0xf000
+; SDAG-NEXT: s_mov_b32 s6, -1
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_sqrt_f32_e32 v1, s3
+; SDAG-NEXT: v_sqrt_f32_e32 v0, s2
+; SDAG-NEXT: s_mov_b32 s4, s0
+; SDAG-NEXT: s_mov_b32 s5, s1
+; SDAG-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: elim_redun_check_v2_ult:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GISEL-NEXT: s_mov_b32 s4, 0x80000000
+; GISEL-NEXT: v_mov_b32_e32 v1, 0x7fc00000
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_sqrt_f32_e32 v2, s2
+; GISEL-NEXT: v_sqrt_f32_e32 v4, s3
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: v_mov_b32_e32 v3, s3
+; GISEL-NEXT: v_cmp_nle_f32_e32 vcc, s4, v0
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
+; GISEL-NEXT: v_cmp_nle_f32_e32 vcc, s4, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc
+; GISEL-NEXT: s_mov_b32 s2, -1
+; GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GISEL-NEXT: s_endpgm
+entry:
+ %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
+ %cmp = fcmp ult <2 x float> %in, <float -0.000000e+00, float -0.000000e+00>
+ %res = select <2 x i1> %cmp, <2 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000>, <2 x float> %sqrt
+ store <2 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+declare float @llvm.fabs.f32(float) #0
+declare float @llvm.sqrt.f32(float) #0
+declare <2 x float> @llvm.fabs.v2f32(<2 x float>) #0
+declare <2 x float> @llvm.sqrt.v2f32(<2 x float>) #0
+declare <3 x float> @llvm.sqrt.v3f32(<3 x float>) #0
+declare i32 @llvm.amdgcn.readfirstlane(i32) #1
+
+declare { float, i32 } @llvm.frexp.f32.i32(float) #0
+
+attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+attributes #1 = { convergent nounwind willreturn memory(none) }
+attributes #2 = { "approx-func-fp-math"="true" }
+attributes #3 = { "approx-func-fp-math"="true" "no-nans-fp-math"="true" "no-infs-fp-math"="true" }
+attributes #4 = { "unsafe-fp-math"="true" }
+attributes #5 = { "no-infs-fp-math"="true" }
+
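+; The !fpmath operand is the maximum acceptable error of the annotated call,
+; in ULPs.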
+!0 = !{float 0.5}
+!1 = !{float 1.0}
+!2 = !{float 2.0}
+!3 = !{float 2.5}
+!4 = !{float 3.0}
diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.ll
deleted file mode 100644
index 356eed7b31a71d..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/fsqrt.ll
+++ /dev/null
@@ -1,153 +0,0 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
-
-
-; Run with unsafe-fp-math to make sure nothing tries to turn this into 1 / rsqrt(x)
-
-; FUNC-LABEL: {{^}}v_safe_fsqrt_f32:
-; GCN: v_sqrt_f32_e32 {{v[0-9]+, v[0-9]+}}
-define amdgpu_kernel void @v_safe_fsqrt_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
- %r0 = load float, ptr addrspace(1) %in
- %r1 = call float @llvm.sqrt.f32(float %r0)
- store float %r1, ptr addrspace(1) %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}v_unsafe_fsqrt_f32:
-; GCN: v_sqrt_f32_e32 {{v[0-9]+, v[0-9]+}}
-define amdgpu_kernel void @v_unsafe_fsqrt_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #2 {
- %r0 = load float, ptr addrspace(1) %in
- %r1 = call float @llvm.sqrt.f32(float %r0)
- store float %r1, ptr addrspace(1) %out
- ret void
-}
-
-
-; FUNC-LABEL: {{^}}s_sqrt_f32:
-; GCN: v_sqrt_f32_e32
-
-; R600: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[2].Z
-; R600: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, PS
-define amdgpu_kernel void @s_sqrt_f32(ptr addrspace(1) %out, float %in) #1 {
-entry:
- %fdiv = call float @llvm.sqrt.f32(float %in)
- store float %fdiv, ptr addrspace(1) %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}s_sqrt_v2f32:
-; GCN: v_sqrt_f32_e32
-; GCN: v_sqrt_f32_e32
-
-; R600-DAG: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[2].W
-; R600-DAG: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, PS
-; R600-DAG: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[3].X
-; R600-DAG: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}
-define amdgpu_kernel void @s_sqrt_v2f32(ptr addrspace(1) %out, <2 x float> %in) #1 {
-entry:
- %fdiv = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
- store <2 x float> %fdiv, ptr addrspace(1) %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}s_sqrt_v4f32:
-; GCN: v_sqrt_f32_e32
-; GCN: v_sqrt_f32_e32
-; GCN: v_sqrt_f32_e32
-; GCN: v_sqrt_f32_e32
-
-; R600-DAG: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[3].Y
-; R600-DAG: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, PS
-; R600-DAG: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[3].Z
-; R600-DAG: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}
-; R600-DAG: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[3].W
-; R600-DAG: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}
-; R600-DAG: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[4].X
-; R600-DAG: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}
-define amdgpu_kernel void @s_sqrt_v4f32(ptr addrspace(1) %out, <4 x float> %in) #1 {
-entry:
- %fdiv = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %in)
- store <4 x float> %fdiv, ptr addrspace(1) %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}elim_redun_check_neg0:
-; GCN: v_sqrt_f32_e32
-; GCN-NOT: v_cndmask
-define amdgpu_kernel void @elim_redun_check_neg0(ptr addrspace(1) %out, float %in) #1 {
-entry:
- %sqrt = call float @llvm.sqrt.f32(float %in)
- %cmp = fcmp olt float %in, -0.000000e+00
- %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt
- store float %res, ptr addrspace(1) %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}elim_redun_check_pos0:
-; GCN: v_sqrt_f32_e32
-; GCN-NOT: v_cndmask
-define amdgpu_kernel void @elim_redun_check_pos0(ptr addrspace(1) %out, float %in) #1 {
-entry:
- %sqrt = call float @llvm.sqrt.f32(float %in)
- %cmp = fcmp olt float %in, 0.000000e+00
- %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt
- store float %res, ptr addrspace(1) %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}elim_redun_check_ult:
-; GCN: v_sqrt_f32_e32
-; GCN-NOT: v_cndmask
-define amdgpu_kernel void @elim_redun_check_ult(ptr addrspace(1) %out, float %in) #1 {
-entry:
- %sqrt = call float @llvm.sqrt.f32(float %in)
- %cmp = fcmp ult float %in, -0.000000e+00
- %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt
- store float %res, ptr addrspace(1) %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}elim_redun_check_v2:
-; GCN: v_sqrt_f32_e32
-; GCN: v_sqrt_f32_e32
-; GCN-NOT: v_cndmask
-define amdgpu_kernel void @elim_redun_check_v2(ptr addrspace(1) %out, <2 x float> %in) #1 {
-entry:
- %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
- %cmp = fcmp olt <2 x float> %in, <float -0.000000e+00, float -0.000000e+00>
- %res = select <2 x i1> %cmp, <2 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000>, <2 x float> %sqrt
- store <2 x float> %res, ptr addrspace(1) %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}elim_redun_check_v2_ult
-; GCN: v_sqrt_f32_e32
-; GCN: v_sqrt_f32_e32
-; GCN-NOT: v_cndmask
-define amdgpu_kernel void @elim_redun_check_v2_ult(ptr addrspace(1) %out, <2 x float> %in) #1 {
-entry:
- %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
- %cmp = fcmp ult <2 x float> %in, <float -0.000000e+00, float -0.000000e+00>
- %res = select <2 x i1> %cmp, <2 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000>, <2 x float> %sqrt
- store <2 x float> %res, ptr addrspace(1) %out
- ret void
-}
-
-; FUNC-LABEL: {{^}}recip_sqrt:
-; R600: RECIPSQRT_IEEE
-; R600-NOT: RECIP_IEEE
-define amdgpu_kernel void @recip_sqrt(ptr addrspace(1) %out, float %src) nounwind {
- %sqrt = call float @llvm.sqrt.f32(float %src)
- %recipsqrt = fdiv fast float 1.0, %sqrt
- store float %recipsqrt, ptr addrspace(1) %out, align 4
- ret void
-}
-
-declare float @llvm.sqrt.f32(float %in) #0
-declare <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) #0
-declare <4 x float> @llvm.sqrt.v4f32(<4 x float> %in) #0
-
-attributes #0 = { nounwind readnone }
-attributes #1 = { nounwind "unsafe-fp-math"="false" }
-attributes #2 = { nounwind "unsafe-fp-math"="true" }
diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.r600.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.r600.ll
new file mode 100644
index 00000000000000..05a758c1a0fbb6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fsqrt.r600.ll
@@ -0,0 +1,242 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 %s
+
+; Run with unsafe-fp-math to make sure nothing tries to turn sqrt(x) into 1 / rsqrt(x)
+
+define amdgpu_kernel void @v_safe_fsqrt_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; R600-LABEL: v_safe_fsqrt_f32:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
+; R600-NEXT: TEX 0 @6
+; R600-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: Fetch clause starting at 6:
+; R600-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
+; R600-NEXT: ALU clause starting at 8:
+; R600-NEXT: MOV * T0.X, KC0[2].Z,
+; R600-NEXT: ALU clause starting at 9:
+; R600-NEXT: RECIPSQRT_IEEE * T0.X, T0.X,
+; R600-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
+; R600-NEXT: RECIP_IEEE * T0.X, PS,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %r0 = load float, ptr addrspace(1) %in
+ %r1 = call float @llvm.sqrt.f32(float %r0)
+ store float %r1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_unsafe_fsqrt_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #1 {
+; R600-LABEL: v_unsafe_fsqrt_f32:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
+; R600-NEXT: TEX 0 @6
+; R600-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: Fetch clause starting at 6:
+; R600-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1
+; R600-NEXT: ALU clause starting at 8:
+; R600-NEXT: MOV * T0.X, KC0[2].Z,
+; R600-NEXT: ALU clause starting at 9:
+; R600-NEXT: RECIPSQRT_IEEE * T0.X, T0.X,
+; R600-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
+; R600-NEXT: RECIP_IEEE * T0.X, PS,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %r0 = load float, ptr addrspace(1) %in
+ %r1 = call float @llvm.sqrt.f32(float %r0)
+ store float %r1, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @s_sqrt_f32(ptr addrspace(1) %out, float %in) {
+; R600-LABEL: s_sqrt_f32:
+; R600: ; %bb.0: ; %entry
+; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: RECIPSQRT_IEEE * T0.X, KC0[2].Z,
+; R600-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
+; R600-NEXT: RECIP_IEEE * T0.X, PS,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+entry:
+ %fdiv = call float @llvm.sqrt.f32(float %in)
+ store float %fdiv, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @s_sqrt_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
+; R600-LABEL: s_sqrt_v2f32:
+; R600: ; %bb.0: ; %entry
+; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: RECIPSQRT_IEEE * T0.X, KC0[2].W,
+; R600-NEXT: RECIPSQRT_IEEE * T0.Y, KC0[3].X,
+; R600-NEXT: RECIP_IEEE * T0.Y, PS,
+; R600-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
+; R600-NEXT: RECIP_IEEE * T0.X, T0.X,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+entry:
+ %fdiv = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
+ store <2 x float> %fdiv, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @s_sqrt_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
+; R600-LABEL: s_sqrt_v4f32:
+; R600: ; %bb.0: ; %entry
+; R600-NEXT: ALU 9, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: RECIPSQRT_IEEE * T0.X, KC0[3].Y,
+; R600-NEXT: RECIPSQRT_IEEE * T0.Y, KC0[3].Z,
+; R600-NEXT: RECIPSQRT_IEEE * T0.Z, KC0[3].W,
+; R600-NEXT: RECIPSQRT_IEEE * T0.W, KC0[4].X,
+; R600-NEXT: RECIP_IEEE * T0.W, PS,
+; R600-NEXT: RECIP_IEEE * T0.Z, T0.Z,
+; R600-NEXT: RECIP_IEEE * T0.Y, T0.Y,
+; R600-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
+; R600-NEXT: RECIP_IEEE * T0.X, T0.X,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+entry:
+ %fdiv = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %in)
+ store <4 x float> %fdiv, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @elim_redun_check_neg0(ptr addrspace(1) %out, float %in) {
+; R600-LABEL: elim_redun_check_neg0:
+; R600: ; %bb.0: ; %entry
+; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: RECIPSQRT_IEEE * T0.X, KC0[2].Z,
+; R600-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
+; R600-NEXT: RECIP_IEEE * T0.X, PS,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+entry:
+ %sqrt = call float @llvm.sqrt.f32(float %in)
+ %cmp = fcmp olt float %in, -0.000000e+00
+ %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt
+ store float %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @elim_redun_check_pos0(ptr addrspace(1) %out, float %in) {
+; R600-LABEL: elim_redun_check_pos0:
+; R600: ; %bb.0: ; %entry
+; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: RECIPSQRT_IEEE * T0.X, KC0[2].Z,
+; R600-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
+; R600-NEXT: RECIP_IEEE * T0.X, PS,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+entry:
+ %sqrt = call float @llvm.sqrt.f32(float %in)
+ %cmp = fcmp olt float %in, 0.000000e+00
+ %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt
+ store float %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @elim_redun_check_ult(ptr addrspace(1) %out, float %in) {
+; R600-LABEL: elim_redun_check_ult:
+; R600: ; %bb.0: ; %entry
+; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: RECIPSQRT_IEEE * T0.X, KC0[2].Z,
+; R600-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
+; R600-NEXT: RECIP_IEEE * T0.X, PS,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+entry:
+ %sqrt = call float @llvm.sqrt.f32(float %in)
+ %cmp = fcmp ult float %in, -0.000000e+00
+ %res = select i1 %cmp, float 0x7FF8000000000000, float %sqrt
+ store float %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @elim_redun_check_v2(ptr addrspace(1) %out, <2 x float> %in) {
+; R600-LABEL: elim_redun_check_v2:
+; R600: ; %bb.0: ; %entry
+; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: RECIPSQRT_IEEE * T0.X, KC0[2].W,
+; R600-NEXT: RECIPSQRT_IEEE * T0.Y, KC0[3].X,
+; R600-NEXT: RECIP_IEEE * T0.Y, PS,
+; R600-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
+; R600-NEXT: RECIP_IEEE * T0.X, T0.X,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+entry:
+ %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
+ %cmp = fcmp olt <2 x float> %in, <float -0.000000e+00, float -0.000000e+00>
+ %res = select <2 x i1> %cmp, <2 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000>, <2 x float> %sqrt
+ store <2 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @elim_redun_check_v2_ult(ptr addrspace(1) %out, <2 x float> %in) {
+; R600-LABEL: elim_redun_check_v2_ult:
+; R600: ; %bb.0: ; %entry
+; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: RECIPSQRT_IEEE * T0.X, KC0[2].W,
+; R600-NEXT: RECIPSQRT_IEEE * T0.Y, KC0[3].X,
+; R600-NEXT: RECIP_IEEE * T0.Y, PS,
+; R600-NEXT: LSHR T1.X, KC0[2].Y, literal.x,
+; R600-NEXT: RECIP_IEEE * T0.X, T0.X,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+entry:
+ %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
+ %cmp = fcmp ult <2 x float> %in, <float -0.000000e+00, float -0.000000e+00>
+ %res = select <2 x i1> %cmp, <2 x float> <float 0x7FF8000000000000, float 0x7FF8000000000000>, <2 x float> %sqrt
+ store <2 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @recip_sqrt(ptr addrspace(1) %out, float %src) nounwind {
+; R600-LABEL: recip_sqrt:
+; R600: ; %bb.0:
+; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[]
+; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
+; R600-NEXT: CF_END
+; R600-NEXT: PAD
+; R600-NEXT: ALU clause starting at 4:
+; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x,
+; R600-NEXT: RECIPSQRT_IEEE * T1.X, KC0[2].Z,
+; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %sqrt = call float @llvm.sqrt.f32(float %src)
+ %recipsqrt = fdiv fast float 1.0, %sqrt
+ store float %recipsqrt, ptr addrspace(1) %out, align 4
+ ret void
+}
+
+declare float @llvm.sqrt.f32(float %in) #0
+declare <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) #0
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float> %in) #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind "unsafe-fp-math"="true" }