[llvm-branch-commits] [llvm] AMDGPU: Stop requiring afn for f32 rsq formation (PR #172082)
Matt Arsenault via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Dec 22 04:51:09 PST 2025
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/172082
>From 20ff83e7e44422d4872d18e922758837f03c4036 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Fri, 12 Dec 2025 19:08:40 +0100
Subject: [PATCH] AMDGPU: Stop requiring afn for f32 rsq formation
We were checking for afn or !fpmath attached to the sqrt. We
are not trying to replace a correctly rounded rsqrt; we're replacing
the two correctly rounded operations with the contracted operation.
It's a net improvement in precision, so contract on both instructions should
be sufficient. Both the contracted and uncontracted sequences pass
the OpenCL conformance test, with a lower maximum error when contracted.
---
.../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 31 +-
.../AMDGPU/amdgpu-codegenprepare-fdiv.ll | 97 +-
llvm/test/CodeGen/AMDGPU/rsq.f32-safe.ll | 2039 ++++-------------
3 files changed, 565 insertions(+), 1602 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 0a262b41ab330..5318b86554ce3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -210,8 +210,7 @@ class AMDGPUCodeGenPrepareImpl
Value *matchFractPat(IntrinsicInst &I);
Value *applyFractPat(IRBuilder<> &Builder, Value *FractArg);
- bool canOptimizeWithRsq(const FPMathOperator *SqrtOp, FastMathFlags DivFMF,
- FastMathFlags SqrtFMF) const;
+ bool canOptimizeWithRsq(FastMathFlags DivFMF, FastMathFlags SqrtFMF) const;
Value *optimizeWithRsq(IRBuilder<> &Builder, Value *Num, Value *Den,
FastMathFlags DivFMF, FastMathFlags SqrtFMF,
@@ -698,29 +697,11 @@ Value *AMDGPUCodeGenPrepareImpl::emitRsqF64(IRBuilder<> &Builder, Value *X,
return Builder.CreateFMA(Y0E, EFMA, IsNegative ? NegY0 : Y0);
}
-bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp,
- FastMathFlags DivFMF,
+bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(FastMathFlags DivFMF,
FastMathFlags SqrtFMF) const {
- // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
- if (!DivFMF.allowContract() || !SqrtFMF.allowContract())
- return false;
-
- Type *EltTy = SqrtOp->getType()->getScalarType();
- switch (EltTy->getTypeID()) {
- case Type::FloatTyID:
- // v_rsq_f32 gives 1ulp
- // Separate correctly rounded fdiv + sqrt give ~1.81 ulp.
-
- // FIXME: rsq formation should not depend on approx func or the fpmath
- // accuracy. This strictly improves precision.
- return SqrtFMF.approxFunc() || SqrtOp->getFPAccuracy() >= 1.0f;
- case Type::DoubleTyID:
- return true;
- default:
- return false;
- }
-
- llvm_unreachable("covered switch");
+ // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp for f32 and
+ // f64.
+ return DivFMF.allowContract() && SqrtFMF.allowContract();
}
Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
@@ -929,7 +910,7 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
DenII->hasOneUse()) {
const auto *SqrtOp = cast<FPMathOperator>(DenII);
SqrtFMF = SqrtOp->getFastMathFlags();
- if (canOptimizeWithRsq(SqrtOp, DivFMF, SqrtFMF))
+ if (canOptimizeWithRsq(DivFMF, SqrtFMF))
RsqOp = SqrtOp->getOperand(0);
}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
index 7ff86ac152feb..cc0d279fe4ec8 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
@@ -1563,13 +1563,12 @@ define amdgpu_kernel void @rsq_f32_fpmath(ptr addrspace(1) %out, float %x) {
; IEEE-GOODFREXP-NEXT: [[TMP25:%.*]] = select contract i1 [[TMP21]], float -4.096000e+03, float -1.000000e+00
; IEEE-GOODFREXP-NEXT: [[NEG_FDIV_OPENCL:%.*]] = fmul contract float [[TMP24]], [[TMP25]]
; IEEE-GOODFREXP-NEXT: store volatile float [[NEG_FDIV_OPENCL]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-GOODFREXP-NEXT: [[SQRT_X_HALF_ULP:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1]]
-; IEEE-GOODFREXP-NEXT: [[TMP26:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[SQRT_X_HALF_ULP]])
-; IEEE-GOODFREXP-NEXT: [[TMP27:%.*]] = extractvalue { float, i32 } [[TMP26]], 0
-; IEEE-GOODFREXP-NEXT: [[TMP28:%.*]] = extractvalue { float, i32 } [[TMP26]], 1
-; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = sub i32 0, [[TMP28]]
-; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP27]])
-; IEEE-GOODFREXP-NEXT: [[FDIV_SQRT_MISMATCH_MD0:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP30]], i32 [[TMP29]])
+; IEEE-GOODFREXP-NEXT: [[TMP26:%.*]] = fcmp contract olt float [[X]], 0x3810000000000000
+; IEEE-GOODFREXP-NEXT: [[TMP27:%.*]] = select contract i1 [[TMP26]], float 0x4170000000000000, float 1.000000e+00
+; IEEE-GOODFREXP-NEXT: [[TMP28:%.*]] = fmul contract float [[X]], [[TMP27]]
+; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP28]])
+; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = select contract i1 [[TMP26]], float 4.096000e+03, float 1.000000e+00
+; IEEE-GOODFREXP-NEXT: [[FDIV_SQRT_MISMATCH_MD0:%.*]] = fmul contract float [[TMP29]], [[TMP30]]
; IEEE-GOODFREXP-NEXT: store volatile float [[FDIV_SQRT_MISMATCH_MD0]], ptr addrspace(1) [[OUT]], align 4
; IEEE-GOODFREXP-NEXT: [[SQRT_MISMATCH_MD1:%.*]] = call afn float @llvm.sqrt.f32(float [[X]])
; IEEE-GOODFREXP-NEXT: [[TMP31:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[SQRT_MISMATCH_MD1]])
@@ -1644,13 +1643,12 @@ define amdgpu_kernel void @rsq_f32_fpmath(ptr addrspace(1) %out, float %x) {
; IEEE-BADFREXP-NEXT: [[TMP25:%.*]] = select contract i1 [[TMP21]], float -4.096000e+03, float -1.000000e+00
; IEEE-BADFREXP-NEXT: [[NEG_FDIV_OPENCL:%.*]] = fmul contract float [[TMP24]], [[TMP25]]
; IEEE-BADFREXP-NEXT: store volatile float [[NEG_FDIV_OPENCL]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-BADFREXP-NEXT: [[SQRT_X_HALF_ULP:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1]]
-; IEEE-BADFREXP-NEXT: [[TMP26:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[SQRT_X_HALF_ULP]])
-; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = extractvalue { float, i32 } [[TMP26]], 0
-; IEEE-BADFREXP-NEXT: [[TMP28:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[SQRT_X_HALF_ULP]])
-; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = sub i32 0, [[TMP28]]
-; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP27]])
-; IEEE-BADFREXP-NEXT: [[FDIV_SQRT_MISMATCH_MD0:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP30]], i32 [[TMP29]])
+; IEEE-BADFREXP-NEXT: [[TMP26:%.*]] = fcmp contract olt float [[X]], 0x3810000000000000
+; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = select contract i1 [[TMP26]], float 0x4170000000000000, float 1.000000e+00
+; IEEE-BADFREXP-NEXT: [[TMP28:%.*]] = fmul contract float [[X]], [[TMP27]]
+; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP28]])
+; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = select contract i1 [[TMP26]], float 4.096000e+03, float 1.000000e+00
+; IEEE-BADFREXP-NEXT: [[FDIV_SQRT_MISMATCH_MD0:%.*]] = fmul contract float [[TMP29]], [[TMP30]]
; IEEE-BADFREXP-NEXT: store volatile float [[FDIV_SQRT_MISMATCH_MD0]], ptr addrspace(1) [[OUT]], align 4
; IEEE-BADFREXP-NEXT: [[SQRT_MISMATCH_MD1:%.*]] = call afn float @llvm.sqrt.f32(float [[X]])
; IEEE-BADFREXP-NEXT: [[TMP31:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[SQRT_MISMATCH_MD1]])
@@ -1701,8 +1699,7 @@ define amdgpu_kernel void @rsq_f32_fpmath(ptr addrspace(1) %out, float %x) {
; DAZ-NEXT: [[TMP1:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[X]])
; DAZ-NEXT: [[NEG_FDIV_OPENCL:%.*]] = fneg contract float [[TMP1]]
; DAZ-NEXT: store volatile float [[NEG_FDIV_OPENCL]], ptr addrspace(1) [[OUT]], align 4
-; DAZ-NEXT: [[SQRT_X_HALF_ULP:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1]]
-; DAZ-NEXT: [[FDIV_SQRT_MISMATCH_MD0:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[SQRT_X_HALF_ULP]])
+; DAZ-NEXT: [[FDIV_SQRT_MISMATCH_MD0:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[X]])
; DAZ-NEXT: store volatile float [[FDIV_SQRT_MISMATCH_MD0]], ptr addrspace(1) [[OUT]], align 4
; DAZ-NEXT: [[SQRT_MISMATCH_MD1:%.*]] = call afn float @llvm.sqrt.f32(float [[X]])
; DAZ-NEXT: [[FDIV_SQRT_MISMATCH_MD1:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[SQRT_MISMATCH_MD1]])
@@ -3490,19 +3487,22 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_correct_sqrt(<4 x fl
; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[DENOM]], i64 1
; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[DENOM]], i64 2
; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 3
-; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP1]])
-; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = extractvalue { float, i32 } [[TMP5]], 0
-; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = extractvalue { float, i32 } [[TMP5]], 1
-; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = sub i32 0, [[TMP7]]
-; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP6]])
-; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP9]], i32 [[TMP8]])
-; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = fneg contract float [[TMP2]]
-; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP11]])
-; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0
-; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP12]], 1
-; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = sub i32 0, [[TMP14]]
-; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP13]])
-; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP16]], i32 [[TMP15]])
+; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[ARG]], i64 0
+; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[ARG]], i64 1
+; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[ARG]], i64 2
+; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[ARG]], i64 3
+; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = fcmp contract olt float [[TMP5]], 0x3810000000000000
+; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = select contract i1 [[TMP9]], float 0x4170000000000000, float 1.000000e+00
+; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = fmul contract float [[TMP5]], [[TMP14]]
+; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP11]])
+; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = select contract i1 [[TMP9]], float 4.096000e+03, float 1.000000e+00
+; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = fmul contract float [[TMP12]], [[TMP13]]
+; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = fcmp contract olt float [[TMP6]], 0x3810000000000000
+; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = select contract i1 [[TMP15]], float 0x4170000000000000, float 1.000000e+00
+; IEEE-GOODFREXP-NEXT: [[TMP41:%.*]] = fmul contract float [[TMP6]], [[TMP16]]
+; IEEE-GOODFREXP-NEXT: [[TMP42:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP41]])
+; IEEE-GOODFREXP-NEXT: [[TMP43:%.*]] = select contract i1 [[TMP15]], float -4.096000e+03, float -1.000000e+00
+; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = fmul contract float [[TMP42]], [[TMP43]]
; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]])
; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = extractvalue { float, i32 } [[TMP18]], 0
; IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = extractvalue { float, i32 } [[TMP18]], 1
@@ -3536,19 +3536,22 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_correct_sqrt(<4 x fl
; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[DENOM]], i64 1
; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[DENOM]], i64 2
; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 3
-; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP1]])
-; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = extractvalue { float, i32 } [[TMP5]], 0
-; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP1]])
-; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = sub i32 0, [[TMP7]]
-; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP6]])
-; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP9]], i32 [[TMP8]])
-; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = fneg contract float [[TMP2]]
-; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP11]])
-; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0
-; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP11]])
-; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = sub i32 0, [[TMP14]]
-; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP13]])
-; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP16]], i32 [[TMP15]])
+; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[ARG]], i64 0
+; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[ARG]], i64 1
+; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[ARG]], i64 2
+; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[ARG]], i64 3
+; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = fcmp contract olt float [[TMP5]], 0x3810000000000000
+; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = select contract i1 [[TMP9]], float 0x4170000000000000, float 1.000000e+00
+; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = fmul contract float [[TMP5]], [[TMP14]]
+; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP11]])
+; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = select contract i1 [[TMP9]], float 4.096000e+03, float 1.000000e+00
+; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = fmul contract float [[TMP12]], [[TMP13]]
+; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = fcmp contract olt float [[TMP6]], 0x3810000000000000
+; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = select contract i1 [[TMP15]], float 0x4170000000000000, float 1.000000e+00
+; IEEE-BADFREXP-NEXT: [[TMP41:%.*]] = fmul contract float [[TMP6]], [[TMP16]]
+; IEEE-BADFREXP-NEXT: [[TMP42:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP41]])
+; IEEE-BADFREXP-NEXT: [[TMP43:%.*]] = select contract i1 [[TMP15]], float -4.096000e+03, float -1.000000e+00
+; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = fmul contract float [[TMP42]], [[TMP43]]
; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]])
; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = extractvalue { float, i32 } [[TMP18]], 0
; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP3]])
@@ -3582,9 +3585,13 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_correct_sqrt(<4 x fl
; DAZ-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[DENOM]], i64 1
; DAZ-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[DENOM]], i64 2
; DAZ-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 3
-; DAZ-NEXT: [[TMP5:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP1]])
-; DAZ-NEXT: [[TMP6:%.*]] = fneg contract float [[TMP2]]
-; DAZ-NEXT: [[TMP7:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP6]])
+; DAZ-NEXT: [[TMP31:%.*]] = extractelement <4 x float> [[ARG]], i64 0
+; DAZ-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[ARG]], i64 1
+; DAZ-NEXT: [[TMP32:%.*]] = extractelement <4 x float> [[ARG]], i64 2
+; DAZ-NEXT: [[TMP33:%.*]] = extractelement <4 x float> [[ARG]], i64 3
+; DAZ-NEXT: [[TMP5:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP31]])
+; DAZ-NEXT: [[TMP34:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP6]])
+; DAZ-NEXT: [[TMP7:%.*]] = fneg contract float [[TMP34]]
; DAZ-NEXT: [[TMP8:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]])
; DAZ-NEXT: [[TMP9:%.*]] = extractvalue { float, i32 } [[TMP8]], 0
; DAZ-NEXT: [[TMP10:%.*]] = extractvalue { float, i32 } [[TMP8]], 1
diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f32-safe.ll b/llvm/test/CodeGen/AMDGPU/rsq.f32-safe.ll
index 7f822c135ffb4..d9fdfb38ef344 100644
--- a/llvm/test/CodeGen/AMDGPU/rsq.f32-safe.ll
+++ b/llvm/test/CodeGen/AMDGPU/rsq.f32-safe.ll
@@ -12,123 +12,48 @@ declare float @llvm.sqrt.f32(float) nounwind readnone
declare <2 x float> @llvm.sqrt.v2f32(<2 x float>) nounwind readnone
define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) {
-; GCN-DAZ-SAFE-LABEL: rsq_f32:
-; GCN-DAZ-SAFE: ; %bb.0:
-; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s7, 0xf000
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s6, -1
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s10, s6
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s11, s7
-; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s8, s2
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s9, s3
-; GCN-DAZ-SAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, 0xf800000
-; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, s0
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s5, s1
-; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0)
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
-; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s2, v0
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v1, v0
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v0, v1
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0.5, v1
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v1, v3, 0.5
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v1, v4, v1
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v3, v3, v0
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v4, v1, v3
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v1
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v0, v0
-; GCN-DAZ-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; GCN-DAZ-SAFE-NEXT: s_endpgm
-;
-; SI-IEEE-SAFE-LABEL: rsq_f32:
-; SI-IEEE-SAFE: ; %bb.0:
-; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7
-; SI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0)
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11
-; SI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0xf800000
-; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, 0x7f800000
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9
-; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0)
-; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0
-; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v0
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0
-; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, -1, v2
-; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v4, vcc, 1, v2
-; SI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v3, v2, v0
-; SI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v4, v2, v0
-; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v6
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s2
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
-; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
-; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
-; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
-; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
-; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; SI-IEEE-SAFE-NEXT: s_endpgm
+; GCN-DAZ-LABEL: rsq_f32:
+; GCN-DAZ: ; %bb.0:
+; GCN-DAZ-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-DAZ-NEXT: s_mov_b32 s7, 0xf000
+; GCN-DAZ-NEXT: s_mov_b32 s6, -1
+; GCN-DAZ-NEXT: s_mov_b32 s10, s6
+; GCN-DAZ-NEXT: s_mov_b32 s11, s7
+; GCN-DAZ-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-DAZ-NEXT: s_mov_b32 s8, s2
+; GCN-DAZ-NEXT: s_mov_b32 s9, s3
+; GCN-DAZ-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; GCN-DAZ-NEXT: s_mov_b32 s4, s0
+; GCN-DAZ-NEXT: s_mov_b32 s5, s1
+; GCN-DAZ-NEXT: s_waitcnt vmcnt(0)
+; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0
+; GCN-DAZ-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-DAZ-NEXT: s_endpgm
;
-; CI-IEEE-SAFE-LABEL: rsq_f32:
-; CI-IEEE-SAFE: ; %bb.0:
-; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
-; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000
-; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1
-; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6
-; CI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7
-; CI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0)
-; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10
-; CI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11
-; CI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0
-; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0xf800000
-; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260
-; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8
-; CI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9
-; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0)
-; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0
-; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v0
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0
-; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, -1, v2
-; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v4, vcc, 1, v2
-; CI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v3, v2, v0
-; CI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v4, v2, v0
-; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v6
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0
-; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
-; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
-; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
-; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
-; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; CI-IEEE-SAFE-NEXT: s_endpgm
+; GCN-IEEE-LABEL: rsq_f32:
+; GCN-IEEE: ; %bb.0:
+; GCN-IEEE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-IEEE-NEXT: s_mov_b32 s7, 0xf000
+; GCN-IEEE-NEXT: s_mov_b32 s6, -1
+; GCN-IEEE-NEXT: s_mov_b32 s10, s6
+; GCN-IEEE-NEXT: s_mov_b32 s11, s7
+; GCN-IEEE-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-IEEE-NEXT: s_mov_b32 s8, s2
+; GCN-IEEE-NEXT: s_mov_b32 s9, s3
+; GCN-IEEE-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; GCN-IEEE-NEXT: s_mov_b32 s2, 0x800000
+; GCN-IEEE-NEXT: s_mov_b32 s4, s0
+; GCN-IEEE-NEXT: s_mov_b32 s5, s1
+; GCN-IEEE-NEXT: s_waitcnt vmcnt(0)
+; GCN-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s2, v0
+; GCN-IEEE-NEXT: s_and_b64 s[2:3], vcc, exec
+; GCN-IEEE-NEXT: s_cselect_b32 s2, 24, 0
+; GCN-IEEE-NEXT: v_ldexp_f32_e64 v0, v0, s2
+; GCN-IEEE-NEXT: v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT: s_cselect_b32 s0, 12, 0
+; GCN-IEEE-NEXT: v_ldexp_f32_e64 v0, v0, s0
+; GCN-IEEE-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-IEEE-NEXT: s_endpgm
%val = load float, ptr addrspace(1) %in, align 4
%sqrt = call contract float @llvm.sqrt.f32(float %val) nounwind readnone
%div = fdiv contract float 1.0, %sqrt, !fpmath !0
@@ -137,109 +62,35 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace(
}
define amdgpu_kernel void @rsq_f32_sgpr(ptr addrspace(1) noalias %out, float %val) {
-; GCN-DAZ-SAFE-LABEL: rsq_f32_sgpr:
-; GCN-DAZ-SAFE: ; %bb.0:
-; GCN-DAZ-SAFE-NEXT: s_load_dword s0, s[4:5], 0xb
-; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v0, 0xf800000
-; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s3, 0xf000
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, -1
-; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, s0, v1
-; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, s0
-; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v1, v0
-; GCN-DAZ-SAFE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, v0, v1
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0.5, v1
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, -v1, v2, 0.5
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v3, v2
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v1, v3, v1
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, -v2, v2, v0
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v3, v1, v2
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
-; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v0, v0
-; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT: buffer_store_dword v0, off, s[0:3], 0
-; GCN-DAZ-SAFE-NEXT: s_endpgm
-;
-; SI-IEEE-SAFE-LABEL: rsq_f32_sgpr:
-; SI-IEEE-SAFE: ; %bb.0:
-; SI-IEEE-SAFE-NEXT: s_load_dword s0, s[4:5], 0xb
-; SI-IEEE-SAFE-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
-; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v0, 0xf800000
-; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000
-; SI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0)
-; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, s0, v1
-; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, s0
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1]
-; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1
-; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v2, vcc, -1, v1
-; SI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0
-; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v3
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc
-; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, 1, v1
-; SI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v1
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
-; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0x7f800000
-; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s0
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
-; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
-; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
-; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
-; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
-; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; SI-IEEE-SAFE-NEXT: s_endpgm
-;
-; CI-IEEE-SAFE-LABEL: rsq_f32_sgpr:
-; CI-IEEE-SAFE: ; %bb.0:
-; CI-IEEE-SAFE-NEXT: s_load_dword s0, s[4:5], 0xb
-; CI-IEEE-SAFE-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9
-; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v0, 0xf800000
-; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x4f800000
-; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000
-; CI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0)
-; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, s0, v1
-; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, s0
-; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[0:1]
-; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0
-; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1
-; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v2, vcc, -1, v1
-; CI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0
-; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v3
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc
-; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, 1, v1
-; CI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0
-; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v1
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
-; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0
-; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
-; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
-; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
-; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
-; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; CI-IEEE-SAFE-NEXT: s_endpgm
+; GCN-DAZ-LABEL: rsq_f32_sgpr:
+; GCN-DAZ: ; %bb.0:
+; GCN-DAZ-NEXT: s_load_dword s2, s[4:5], 0xb
+; GCN-DAZ-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GCN-DAZ-NEXT: s_mov_b32 s3, 0xf000
+; GCN-DAZ-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, s2
+; GCN-DAZ-NEXT: s_mov_b32 s2, -1
+; GCN-DAZ-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-DAZ-NEXT: s_endpgm
+;
+; GCN-IEEE-LABEL: rsq_f32_sgpr:
+; GCN-IEEE: ; %bb.0:
+; GCN-IEEE-NEXT: s_load_dword s6, s[4:5], 0xb
+; GCN-IEEE-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GCN-IEEE-NEXT: v_mov_b32_e32 v0, 0x800000
+; GCN-IEEE-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
+; GCN-IEEE-NEXT: s_and_b64 s[2:3], vcc, exec
+; GCN-IEEE-NEXT: s_cselect_b32 s2, 24, 0
+; GCN-IEEE-NEXT: v_mov_b32_e32 v0, s2
+; GCN-IEEE-NEXT: v_ldexp_f32_e32 v0, s6, v0
+; GCN-IEEE-NEXT: v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT: s_cselect_b32 s4, 12, 0
+; GCN-IEEE-NEXT: s_mov_b32 s3, 0xf000
+; GCN-IEEE-NEXT: s_mov_b32 s2, -1
+; GCN-IEEE-NEXT: v_ldexp_f32_e64 v0, v0, s4
+; GCN-IEEE-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GCN-IEEE-NEXT: s_endpgm
%sqrt = call contract float @llvm.sqrt.f32(float %val) nounwind readnone
%div = fdiv contract float 1.0, %sqrt, !fpmath !0
store float %div, ptr addrspace(1) %out, align 4
@@ -251,106 +102,106 @@ define amdgpu_kernel void @rsq_f32_sgpr(ptr addrspace(1) noalias %out, float %va
; NOTE: c * rcp( sqrt(a) * b ) is generated when we move rcp generation to AMGGPUCogenPrepare.
define amdgpu_kernel void @rsqrt_fmul(ptr addrspace(1) %out, ptr addrspace(1) %in) {
-; GCN-DAZ-SAFE-LABEL: rsqrt_fmul:
-; GCN-DAZ-SAFE: ; %bb.0:
-; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s3, 0xf000
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, 0
-; GCN-DAZ-SAFE-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v1, 0
-; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GCN-DAZ-SAFE-NEXT: s_mov_b64 s[10:11], s[2:3]
-; GCN-DAZ-SAFE-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0)
-; GCN-DAZ-SAFE-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:4 glc
-; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0)
-; GCN-DAZ-SAFE-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:8 glc
-; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0)
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s0, 0xf800000
-; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v6, 0x260
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0x4f800000, v2
-; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s0, v2
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
-; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v5, v2
-; GCN-DAZ-SAFE-NEXT: s_mov_b64 s[0:1], s[4:5]
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v7, v2, v5
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0.5, v5
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v8, -v5, v7, 0.5
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v7, v7, v8, v7
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, v5, v8, v5
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v8, -v7, v7, v2
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, v8, v5, v7
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v7, 0x37800000, v5
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
-; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v2, v6
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, v2, v3
-; GCN-DAZ-SAFE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v4
-; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v5, v3
-; GCN-DAZ-SAFE-NEXT: v_div_scale_f32 v6, vcc, v4, v2, v4
-; GCN-DAZ-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v7, -v3, v5, 1.0
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, v7, v5, v5
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v7, v6, v5
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v8, -v3, v7, v6
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v7, v8, v5, v7
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, -v3, v7, v6
-; GCN-DAZ-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GCN-DAZ-SAFE-NEXT: v_div_fmas_f32 v3, v3, v5, v7
-; GCN-DAZ-SAFE-NEXT: v_div_fixup_f32 v2, v3, v2, v4
-; GCN-DAZ-SAFE-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; GCN-DAZ-SAFE-NEXT: s_endpgm
-;
-; GCN-IEEE-SAFE-LABEL: rsqrt_fmul:
-; GCN-IEEE-SAFE: ; %bb.0:
-; GCN-IEEE-SAFE-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
-; GCN-IEEE-SAFE-NEXT: s_mov_b32 s3, 0xf000
-; GCN-IEEE-SAFE-NEXT: s_mov_b32 s2, 0
-; GCN-IEEE-SAFE-NEXT: v_lshlrev_b32_e32 v0, 2, v0
-; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0
-; GCN-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-IEEE-SAFE-NEXT: s_mov_b64 s[8:9], s[6:7]
-; GCN-IEEE-SAFE-NEXT: s_mov_b64 s[10:11], s[2:3]
-; GCN-IEEE-SAFE-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0)
-; GCN-IEEE-SAFE-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:4 glc
-; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0)
-; GCN-IEEE-SAFE-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:8 glc
-; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0)
-; GCN-IEEE-SAFE-NEXT: s_mov_b32 s0, 0xf800000
-; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v6, 0x260
-; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v5, 0x4f800000, v2
-; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s0, v2
-; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
-; GCN-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v5, v2
-; GCN-IEEE-SAFE-NEXT: v_add_i32_e64 v7, s[0:1], -1, v5
-; GCN-IEEE-SAFE-NEXT: v_add_i32_e64 v8, s[0:1], 1, v5
-; GCN-IEEE-SAFE-NEXT: v_fma_f32 v9, -v7, v5, v2
-; GCN-IEEE-SAFE-NEXT: v_fma_f32 v10, -v8, v5, v2
-; GCN-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[0:1], 0, v9
-; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[0:1]
-; GCN-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], 0, v10
-; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[0:1]
-; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v7, 0x37800000, v5
-; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
-; GCN-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v2, v6
-; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
-; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, v2, v3
-; GCN-IEEE-SAFE-NEXT: v_div_scale_f32 v3, s[0:1], v2, v2, v4
-; GCN-IEEE-SAFE-NEXT: v_rcp_f32_e32 v5, v3
-; GCN-IEEE-SAFE-NEXT: v_div_scale_f32 v6, vcc, v4, v2, v4
-; GCN-IEEE-SAFE-NEXT: s_mov_b64 s[0:1], s[4:5]
-; GCN-IEEE-SAFE-NEXT: v_fma_f32 v7, -v3, v5, 1.0
-; GCN-IEEE-SAFE-NEXT: v_fma_f32 v5, v7, v5, v5
-; GCN-IEEE-SAFE-NEXT: v_mul_f32_e32 v7, v6, v5
-; GCN-IEEE-SAFE-NEXT: v_fma_f32 v8, -v3, v7, v6
-; GCN-IEEE-SAFE-NEXT: v_fma_f32 v7, v8, v5, v7
-; GCN-IEEE-SAFE-NEXT: v_fma_f32 v3, -v3, v7, v6
-; GCN-IEEE-SAFE-NEXT: v_div_fmas_f32 v3, v3, v5, v7
-; GCN-IEEE-SAFE-NEXT: v_div_fixup_f32 v2, v3, v2, v4
-; GCN-IEEE-SAFE-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; GCN-IEEE-SAFE-NEXT: s_endpgm
+; GCN-DAZ-LABEL: rsqrt_fmul:
+; GCN-DAZ: ; %bb.0:
+; GCN-DAZ-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; GCN-DAZ-NEXT: s_mov_b32 s3, 0xf000
+; GCN-DAZ-NEXT: s_mov_b32 s2, 0
+; GCN-DAZ-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-DAZ-NEXT: v_mov_b32_e32 v1, 0
+; GCN-DAZ-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-DAZ-NEXT: s_mov_b64 s[8:9], s[6:7]
+; GCN-DAZ-NEXT: s_mov_b64 s[10:11], s[2:3]
+; GCN-DAZ-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; GCN-DAZ-NEXT: s_waitcnt vmcnt(0)
+; GCN-DAZ-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:4 glc
+; GCN-DAZ-NEXT: s_waitcnt vmcnt(0)
+; GCN-DAZ-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:8 glc
+; GCN-DAZ-NEXT: s_waitcnt vmcnt(0)
+; GCN-DAZ-NEXT: s_mov_b32 s0, 0xf800000
+; GCN-DAZ-NEXT: v_mov_b32_e32 v6, 0x260
+; GCN-DAZ-NEXT: v_mul_f32_e32 v5, 0x4f800000, v2
+; GCN-DAZ-NEXT: v_cmp_gt_f32_e32 vcc, s0, v2
+; GCN-DAZ-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
+; GCN-DAZ-NEXT: v_rsq_f32_e32 v5, v2
+; GCN-DAZ-NEXT: s_mov_b64 s[0:1], s[4:5]
+; GCN-DAZ-NEXT: v_mul_f32_e32 v7, v2, v5
+; GCN-DAZ-NEXT: v_mul_f32_e32 v5, 0.5, v5
+; GCN-DAZ-NEXT: v_fma_f32 v8, -v5, v7, 0.5
+; GCN-DAZ-NEXT: v_fma_f32 v7, v7, v8, v7
+; GCN-DAZ-NEXT: v_fma_f32 v5, v5, v8, v5
+; GCN-DAZ-NEXT: v_fma_f32 v8, -v7, v7, v2
+; GCN-DAZ-NEXT: v_fma_f32 v5, v8, v5, v7
+; GCN-DAZ-NEXT: v_mul_f32_e32 v7, 0x37800000, v5
+; GCN-DAZ-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
+; GCN-DAZ-NEXT: v_cmp_class_f32_e32 vcc, v2, v6
+; GCN-DAZ-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; GCN-DAZ-NEXT: v_mul_f32_e32 v2, v2, v3
+; GCN-DAZ-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v4
+; GCN-DAZ-NEXT: v_rcp_f32_e32 v5, v3
+; GCN-DAZ-NEXT: v_div_scale_f32 v6, vcc, v4, v2, v4
+; GCN-DAZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GCN-DAZ-NEXT: v_fma_f32 v7, -v3, v5, 1.0
+; GCN-DAZ-NEXT: v_fma_f32 v5, v7, v5, v5
+; GCN-DAZ-NEXT: v_mul_f32_e32 v7, v6, v5
+; GCN-DAZ-NEXT: v_fma_f32 v8, -v3, v7, v6
+; GCN-DAZ-NEXT: v_fma_f32 v7, v8, v5, v7
+; GCN-DAZ-NEXT: v_fma_f32 v3, -v3, v7, v6
+; GCN-DAZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GCN-DAZ-NEXT: v_div_fmas_f32 v3, v3, v5, v7
+; GCN-DAZ-NEXT: v_div_fixup_f32 v2, v3, v2, v4
+; GCN-DAZ-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-DAZ-NEXT: s_endpgm
+;
+; GCN-IEEE-LABEL: rsqrt_fmul:
+; GCN-IEEE: ; %bb.0:
+; GCN-IEEE-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
+; GCN-IEEE-NEXT: s_mov_b32 s3, 0xf000
+; GCN-IEEE-NEXT: s_mov_b32 s2, 0
+; GCN-IEEE-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-IEEE-NEXT: v_mov_b32_e32 v1, 0
+; GCN-IEEE-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-IEEE-NEXT: s_mov_b64 s[8:9], s[6:7]
+; GCN-IEEE-NEXT: s_mov_b64 s[10:11], s[2:3]
+; GCN-IEEE-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; GCN-IEEE-NEXT: s_waitcnt vmcnt(0)
+; GCN-IEEE-NEXT: buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:4 glc
+; GCN-IEEE-NEXT: s_waitcnt vmcnt(0)
+; GCN-IEEE-NEXT: buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:8 glc
+; GCN-IEEE-NEXT: s_waitcnt vmcnt(0)
+; GCN-IEEE-NEXT: s_mov_b32 s0, 0xf800000
+; GCN-IEEE-NEXT: v_mov_b32_e32 v6, 0x260
+; GCN-IEEE-NEXT: v_mul_f32_e32 v5, 0x4f800000, v2
+; GCN-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s0, v2
+; GCN-IEEE-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
+; GCN-IEEE-NEXT: v_sqrt_f32_e32 v5, v2
+; GCN-IEEE-NEXT: v_add_i32_e64 v7, s[0:1], -1, v5
+; GCN-IEEE-NEXT: v_add_i32_e64 v8, s[0:1], 1, v5
+; GCN-IEEE-NEXT: v_fma_f32 v9, -v7, v5, v2
+; GCN-IEEE-NEXT: v_fma_f32 v10, -v8, v5, v2
+; GCN-IEEE-NEXT: v_cmp_ge_f32_e64 s[0:1], 0, v9
+; GCN-IEEE-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[0:1]
+; GCN-IEEE-NEXT: v_cmp_lt_f32_e64 s[0:1], 0, v10
+; GCN-IEEE-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[0:1]
+; GCN-IEEE-NEXT: v_mul_f32_e32 v7, 0x37800000, v5
+; GCN-IEEE-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
+; GCN-IEEE-NEXT: v_cmp_class_f32_e32 vcc, v2, v6
+; GCN-IEEE-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; GCN-IEEE-NEXT: v_mul_f32_e32 v2, v2, v3
+; GCN-IEEE-NEXT: v_div_scale_f32 v3, s[0:1], v2, v2, v4
+; GCN-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GCN-IEEE-NEXT: v_div_scale_f32 v6, vcc, v4, v2, v4
+; GCN-IEEE-NEXT: s_mov_b64 s[0:1], s[4:5]
+; GCN-IEEE-NEXT: v_fma_f32 v7, -v3, v5, 1.0
+; GCN-IEEE-NEXT: v_fma_f32 v5, v7, v5, v5
+; GCN-IEEE-NEXT: v_mul_f32_e32 v7, v6, v5
+; GCN-IEEE-NEXT: v_fma_f32 v8, -v3, v7, v6
+; GCN-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7
+; GCN-IEEE-NEXT: v_fma_f32 v3, -v3, v7, v6
+; GCN-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v7
+; GCN-IEEE-NEXT: v_div_fixup_f32 v2, v3, v2, v4
+; GCN-IEEE-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-IEEE-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
%gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
@@ -369,123 +220,49 @@ define amdgpu_kernel void @rsqrt_fmul(ptr addrspace(1) %out, ptr addrspace(1) %i
}
define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) {
-; GCN-DAZ-SAFE-LABEL: neg_rsq_f32:
-; GCN-DAZ-SAFE: ; %bb.0:
-; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s7, 0xf000
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s6, -1
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s10, s6
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s11, s7
-; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s8, s2
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s9, s3
-; GCN-DAZ-SAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, 0xf800000
-; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, s0
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s5, s1
-; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0)
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
-; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s2, v0
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v1, v0
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v0, v1
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0.5, v1
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v1, v3, 0.5
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v1, v4, v1
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v3, v3, v0
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v4, v1, v3
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v1
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0
-; GCN-DAZ-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; GCN-DAZ-SAFE-NEXT: s_endpgm
-;
-; SI-IEEE-SAFE-LABEL: neg_rsq_f32:
-; SI-IEEE-SAFE: ; %bb.0:
-; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7
-; SI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0)
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11
-; SI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0xf800000
-; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, 0x7f800000
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9
-; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0)
-; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0
-; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v0
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0
-; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, -1, v2
-; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v4, vcc, 1, v2
-; SI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v3, v2, v0
-; SI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v4, v2, v0
-; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v6
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s2
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v0, v1, s[0:1]
-; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
-; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
-; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
-; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
-; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; SI-IEEE-SAFE-NEXT: s_endpgm
+; GCN-DAZ-LABEL: neg_rsq_f32:
+; GCN-DAZ: ; %bb.0:
+; GCN-DAZ-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-DAZ-NEXT: s_mov_b32 s7, 0xf000
+; GCN-DAZ-NEXT: s_mov_b32 s6, -1
+; GCN-DAZ-NEXT: s_mov_b32 s10, s6
+; GCN-DAZ-NEXT: s_mov_b32 s11, s7
+; GCN-DAZ-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-DAZ-NEXT: s_mov_b32 s8, s2
+; GCN-DAZ-NEXT: s_mov_b32 s9, s3
+; GCN-DAZ-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; GCN-DAZ-NEXT: s_mov_b32 s4, s0
+; GCN-DAZ-NEXT: s_mov_b32 s5, s1
+; GCN-DAZ-NEXT: s_waitcnt vmcnt(0)
+; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0
+; GCN-DAZ-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-DAZ-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-DAZ-NEXT: s_endpgm
;
-; CI-IEEE-SAFE-LABEL: neg_rsq_f32:
-; CI-IEEE-SAFE: ; %bb.0:
-; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
-; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000
-; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1
-; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6
-; CI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7
-; CI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0)
-; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10
-; CI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11
-; CI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0
-; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0xf800000
-; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260
-; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8
-; CI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9
-; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0)
-; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0
-; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v0
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0
-; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, -1, v2
-; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v4, vcc, 1, v2
-; CI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v3, v2, v0
-; CI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v4, v2, v0
-; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v6
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0
-; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
-; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
-; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
-; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
-; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; CI-IEEE-SAFE-NEXT: s_endpgm
+; GCN-IEEE-LABEL: neg_rsq_f32:
+; GCN-IEEE: ; %bb.0:
+; GCN-IEEE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-IEEE-NEXT: s_mov_b32 s7, 0xf000
+; GCN-IEEE-NEXT: s_mov_b32 s6, -1
+; GCN-IEEE-NEXT: s_mov_b32 s10, s6
+; GCN-IEEE-NEXT: s_mov_b32 s11, s7
+; GCN-IEEE-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-IEEE-NEXT: s_mov_b32 s8, s2
+; GCN-IEEE-NEXT: s_mov_b32 s9, s3
+; GCN-IEEE-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; GCN-IEEE-NEXT: s_mov_b32 s2, 0x800000
+; GCN-IEEE-NEXT: s_mov_b32 s4, s0
+; GCN-IEEE-NEXT: s_mov_b32 s5, s1
+; GCN-IEEE-NEXT: s_waitcnt vmcnt(0)
+; GCN-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s2, v0
+; GCN-IEEE-NEXT: s_and_b64 s[2:3], vcc, exec
+; GCN-IEEE-NEXT: s_cselect_b32 s2, 24, 0
+; GCN-IEEE-NEXT: v_ldexp_f32_e64 v0, v0, s2
+; GCN-IEEE-NEXT: v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT: s_cselect_b32 s0, 12, 0
+; GCN-IEEE-NEXT: v_ldexp_f32_e64 v0, -v0, s0
+; GCN-IEEE-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-IEEE-NEXT: s_endpgm
%val = load float, ptr addrspace(1) %in, align 4
%sqrt = call contract float @llvm.sqrt.f32(float %val)
%div = fdiv contract float -1.0, %sqrt, !fpmath !0
@@ -494,123 +271,49 @@ define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrsp
}
define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) {
-; GCN-DAZ-SAFE-LABEL: neg_rsq_neg_f32:
-; GCN-DAZ-SAFE: ; %bb.0:
-; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s7, 0xf000
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s6, -1
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s10, s6
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s11, s7
-; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s8, s2
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s9, s3
-; GCN-DAZ-SAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, 0x8f800000
-; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, s0
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s5, s1
-; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0)
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0xcf800000, v0
-; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc
-; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v1, v0
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v0, v1
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0.5, v1
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v1, v3, 0.5
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v1, v4, v1
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v3, v3, v0
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v4, v1, v3
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v1
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0
-; GCN-DAZ-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; GCN-DAZ-SAFE-NEXT: s_endpgm
-;
-; SI-IEEE-SAFE-LABEL: neg_rsq_neg_f32:
-; SI-IEEE-SAFE: ; %bb.0:
-; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7
-; SI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0)
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11
-; SI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0x8f800000
-; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, 0x7f800000
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9
-; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0)
-; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0xcf800000, v0
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v2, s[0:1]
-; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0
-; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, -1, v2
-; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v4, vcc, 1, v2
-; SI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v3, v2, v0
-; SI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v4, v2, v0
-; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v6
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], |v0|, s2
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v0, v1, s[0:1]
-; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
-; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
-; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
-; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
-; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; SI-IEEE-SAFE-NEXT: s_endpgm
-;
-; CI-IEEE-SAFE-LABEL: neg_rsq_neg_f32:
-; CI-IEEE-SAFE: ; %bb.0:
-; CI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
-; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000
-; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1
-; CI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6
-; CI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7
-; CI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0)
-; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10
-; CI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11
-; CI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0
-; CI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0x8f800000
-; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260
-; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8
-; CI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9
-; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0)
-; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0xcf800000, v0
-; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[0:1], s0, v0
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v2, s[0:1]
-; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0
-; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, -1, v2
-; CI-IEEE-SAFE-NEXT: v_add_i32_e32 v4, vcc, 1, v2
-; CI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v3, v2, v0
-; CI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v4, v2, v0
-; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v6
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0
-; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
-; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
-; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
-; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
-; CI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; CI-IEEE-SAFE-NEXT: s_endpgm
+; GCN-DAZ-LABEL: neg_rsq_neg_f32:
+; GCN-DAZ: ; %bb.0:
+; GCN-DAZ-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-DAZ-NEXT: s_mov_b32 s7, 0xf000
+; GCN-DAZ-NEXT: s_mov_b32 s6, -1
+; GCN-DAZ-NEXT: s_mov_b32 s10, s6
+; GCN-DAZ-NEXT: s_mov_b32 s11, s7
+; GCN-DAZ-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-DAZ-NEXT: s_mov_b32 s8, s2
+; GCN-DAZ-NEXT: s_mov_b32 s9, s3
+; GCN-DAZ-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; GCN-DAZ-NEXT: s_mov_b32 s4, s0
+; GCN-DAZ-NEXT: s_mov_b32 s5, s1
+; GCN-DAZ-NEXT: s_waitcnt vmcnt(0)
+; GCN-DAZ-NEXT: v_rsq_f32_e64 v0, -v0
+; GCN-DAZ-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-DAZ-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-DAZ-NEXT: s_endpgm
+;
+; GCN-IEEE-LABEL: neg_rsq_neg_f32:
+; GCN-IEEE: ; %bb.0:
+; GCN-IEEE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-IEEE-NEXT: s_mov_b32 s7, 0xf000
+; GCN-IEEE-NEXT: s_mov_b32 s6, -1
+; GCN-IEEE-NEXT: s_mov_b32 s10, s6
+; GCN-IEEE-NEXT: s_mov_b32 s11, s7
+; GCN-IEEE-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-IEEE-NEXT: s_mov_b32 s8, s2
+; GCN-IEEE-NEXT: s_mov_b32 s9, s3
+; GCN-IEEE-NEXT: buffer_load_dword v0, off, s[8:11], 0
+; GCN-IEEE-NEXT: s_mov_b32 s2, 0x80800000
+; GCN-IEEE-NEXT: s_mov_b32 s4, s0
+; GCN-IEEE-NEXT: s_mov_b32 s5, s1
+; GCN-IEEE-NEXT: s_waitcnt vmcnt(0)
+; GCN-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0
+; GCN-IEEE-NEXT: s_and_b64 s[2:3], vcc, exec
+; GCN-IEEE-NEXT: s_cselect_b32 s2, 24, 0
+; GCN-IEEE-NEXT: v_ldexp_f32_e64 v0, -v0, s2
+; GCN-IEEE-NEXT: v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT: s_cselect_b32 s0, 12, 0
+; GCN-IEEE-NEXT: v_ldexp_f32_e64 v0, -v0, s0
+; GCN-IEEE-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GCN-IEEE-NEXT: s_endpgm
%val = load float, ptr addrspace(1) %in, align 4
%val.fneg = fneg float %val
%sqrt = call contract float @llvm.sqrt.f32(float %val.fneg)
@@ -620,87 +323,24 @@ define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr ad
}
define float @v_neg_rsq_neg_f32(float %val) {
-; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_f32:
-; GCN-DAZ-SAFE: ; %bb.0:
-; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0x8f800000
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0xcf800000, v0
-; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc
-; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v1, v0
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, v0, v1
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0.5, v1
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, -v1, v2, 0.5
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v3, v2
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v2, v2, v0
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v1, v3, v1
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v4, v1, v2
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
-; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0
-; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31]
-;
-; SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32:
-; SI-IEEE-SAFE: ; %bb.0:
-; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x8f800000
-; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0xcf800000, v0
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc
-; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0
-; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1
-; SI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0
-; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
-; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1
-; SI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
-; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
-; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000
-; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v0, v1, s[4:5]
-; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
-; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
-; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
-; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
-; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
+; GCN-DAZ-LABEL: v_neg_rsq_neg_f32:
+; GCN-DAZ: ; %bb.0:
+; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT: v_rsq_f32_e64 v0, -v0
+; GCN-DAZ-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-DAZ-NEXT: s_setpc_b64 s[30:31]
;
-; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32:
-; CI-IEEE-SAFE: ; %bb.0:
-; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x8f800000
-; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0xcf800000, v0
-; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc
-; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0
-; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1
-; CI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0
-; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
-; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1
-; CI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0
-; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
-; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
-; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0
-; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
-; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
-; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
-; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
-; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
+; GCN-IEEE-LABEL: v_neg_rsq_neg_f32:
+; GCN-IEEE: ; %bb.0:
+; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT: s_mov_b32 s4, 0x80800000
+; GCN-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
+; GCN-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 24, vcc
+; GCN-IEEE-NEXT: v_ldexp_f32_e64 v0, -v0, v1
+; GCN-IEEE-NEXT: v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 12, vcc
+; GCN-IEEE-NEXT: v_ldexp_f32_e64 v0, -v0, v1
+; GCN-IEEE-NEXT: s_setpc_b64 s[30:31]
%val.fneg = fneg float %val
%sqrt = call contract float @llvm.sqrt.f32(float %val.fneg)
%div = fdiv contract float -1.0, %sqrt, !fpmath !0
@@ -708,150 +348,32 @@ define float @v_neg_rsq_neg_f32(float %val) {
}
define <2 x float> @v_neg_rsq_neg_v2f32(<2 x float> %val) {
-; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_v2f32:
-; GCN-DAZ-SAFE: ; %bb.0:
-; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0x8f800000
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s5, 0x4f800000
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e64 v2, -v1, s5
-; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v1, -v1, v2, vcc
-; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v2, v1
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v1, v2
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0.5, v2
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v2, v3, 0.5
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, -v3, v3, v1
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v4, v2
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v5, v2, v3
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e64 v3, -v0, s5
-; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v3, vcc
-; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v3, v0
-; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v4, 0x260
-; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v4
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[4:5]
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, v0, v3
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0.5, v3
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, -v3, v2, 0.5
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v5, v2
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v6, -v2, v2, v0
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v5, v3
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v6, v3, v2
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v4
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0
-; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v1, -v1
-; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31]
-;
-; SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32:
-; SI-IEEE-SAFE: ; %bb.0:
-; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x8f800000
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0x4f800000
-; SI-IEEE-SAFE-NEXT: v_mul_f32_e64 v2, -v1, s7
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v1
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v1, v2, vcc
-; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v1
-; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2
-; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v1
-; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5]
-; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2
-; SI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v1
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5]
-; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; SI-IEEE-SAFE-NEXT: v_mul_f32_e64 v4, -v0, s7
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v4, vcc
-; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v0
-; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260
-; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v3
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[4:5]
-; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v4
-; SI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v2, v4, v0
-; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v5
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5]
-; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], 1, v4
-; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v5, v4, v0
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5]
-; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x37800000, v2
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x7f800000
-; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s6
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, -v0, v2, s[4:5]
-; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2
-; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
-; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
-; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0
-; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v1
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v1|, s6
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, -v1, v2, s[4:5]
-; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2
-; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
-; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1
-; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v2, v1
-; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
+; GCN-DAZ-LABEL: v_neg_rsq_neg_v2f32:
+; GCN-DAZ: ; %bb.0:
+; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT: v_rsq_f32_e64 v0, -v0
+; GCN-DAZ-NEXT: v_rsq_f32_e64 v1, -v1
+; GCN-DAZ-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-DAZ-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-DAZ-NEXT: s_setpc_b64 s[30:31]
;
-; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32:
-; CI-IEEE-SAFE: ; %bb.0:
-; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x8f800000
-; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0x4f800000
-; CI-IEEE-SAFE-NEXT: v_mul_f32_e64 v2, -v1, s7
-; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v1
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v1, v2, vcc
-; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v1
-; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2
-; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v1
-; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5]
-; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2
-; CI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v1
-; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5]
-; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; CI-IEEE-SAFE-NEXT: v_mul_f32_e64 v4, -v0, s7
-; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v4, vcc
-; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v0
-; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260
-; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v3
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[4:5]
-; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v4
-; CI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v2, v4, v0
-; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v5
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5]
-; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], 1, v4
-; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v5, v4, v0
-; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5]
-; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x37800000, v2
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0
-; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2
-; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
-; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
-; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0
-; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v1
-; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2
-; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
-; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1
-; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v2, v1
-; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
+; GCN-IEEE-LABEL: v_neg_rsq_neg_v2f32:
+; GCN-IEEE: ; %bb.0:
+; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT: s_mov_b32 s4, 0x80800000
+; GCN-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
+; GCN-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 24, vcc
+; GCN-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], s4, v1
+; GCN-IEEE-NEXT: v_ldexp_f32_e64 v0, -v0, v2
+; GCN-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 24, s[4:5]
+; GCN-IEEE-NEXT: v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT: v_ldexp_f32_e64 v1, -v1, v2
+; GCN-IEEE-NEXT: v_rsq_f32_e32 v1, v1
+; GCN-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 12, vcc
+; GCN-IEEE-NEXT: v_ldexp_f32_e64 v0, -v0, v2
+; GCN-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 12, s[4:5]
+; GCN-IEEE-NEXT: v_ldexp_f32_e64 v1, -v1, v2
+; GCN-IEEE-NEXT: s_setpc_b64 s[30:31]
%val.fneg = fneg <2 x float> %val
%sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val.fneg)
%div = fdiv contract <2 x float> <float -1.0, float -1.0>, %sqrt, !fpmath !0
@@ -859,90 +381,25 @@ define <2 x float> @v_neg_rsq_neg_v2f32(<2 x float> %val) {
}
define float @v_neg_rsq_neg_f32_foldable_user(float %val0, float %val1) {
-; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_f32_foldable_user:
-; GCN-DAZ-SAFE: ; %bb.0:
-; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0x8f800000
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0xcf800000, v0
-; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v2, vcc
-; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v2, v0
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v0, v2
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0.5, v2
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v2, v3, 0.5
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, -v3, v3, v0
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v4, v2
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v5, v2, v3
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v3, 0x260
-; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1
-; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31]
-;
-; SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32_foldable_user:
-; SI-IEEE-SAFE: ; %bb.0:
-; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x8f800000
-; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0xcf800000, v0
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v2, vcc
-; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0
-; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2
-; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v0
-; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5]
-; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2
-; SI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v0
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5]
-; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260
-; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000
-; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, -v0, v2, s[4:5]
-; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2
-; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
-; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
-; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0
-; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1
-; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
+; GCN-DAZ-LABEL: v_neg_rsq_neg_f32_foldable_user:
+; GCN-DAZ: ; %bb.0:
+; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT: v_rsq_f32_e64 v0, -v0
+; GCN-DAZ-NEXT: v_mul_f32_e64 v0, -v0, v1
+; GCN-DAZ-NEXT: s_setpc_b64 s[30:31]
;
-; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32_foldable_user:
-; CI-IEEE-SAFE: ; %bb.0:
-; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x8f800000
-; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0xcf800000, v0
-; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v2, vcc
-; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0
-; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2
-; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v0
-; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5]
-; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2
-; CI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v0
-; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5]
-; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260
-; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0
-; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2
-; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
-; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
-; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0
-; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1
-; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
+; GCN-IEEE-LABEL: v_neg_rsq_neg_f32_foldable_user:
+; GCN-IEEE: ; %bb.0:
+; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT: s_mov_b32 s4, 0x80800000
+; GCN-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
+; GCN-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 24, vcc
+; GCN-IEEE-NEXT: v_ldexp_f32_e64 v0, -v0, v2
+; GCN-IEEE-NEXT: v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 12, vcc
+; GCN-IEEE-NEXT: v_ldexp_f32_e64 v0, -v0, v2
+; GCN-IEEE-NEXT: v_mul_f32_e32 v0, v0, v1
+; GCN-IEEE-NEXT: s_setpc_b64 s[30:31]
%val0.neg = fneg float %val0
%sqrt = call contract float @llvm.sqrt.f32(float %val0.neg)
%div = fdiv contract float -1.0, %sqrt, !fpmath !0
@@ -951,156 +408,34 @@ define float @v_neg_rsq_neg_f32_foldable_user(float %val0, float %val1) {
}
define <2 x float> @v_neg_rsq_neg_v2f32_foldable_user(<2 x float> %val0, <2 x float> %val1) {
-; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user:
-; GCN-DAZ-SAFE: ; %bb.0:
-; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0x8f800000
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s5, 0x4f800000
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e64 v4, -v1, s5
-; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v1
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v1, -v1, v4, vcc
-; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v4, v1
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, v1, v4
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v4, 0.5, v4
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v6, -v4, v5, 0.5
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, v5, v6, v5
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v7, -v5, v5, v1
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v4, v6, v4
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v7, v4, v5
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0x37800000, v4
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e64 v5, -v0, s5
-; GCN-DAZ-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v5, vcc
-; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v5, v0
-; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v6, 0x260
-; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v6
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5]
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v4, v0, v5
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0.5, v5
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v7, -v5, v4, 0.5
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v4, v7, v4
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v8, -v4, v4, v0
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, v5, v7, v5
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v8, v5, v4
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0x37800000, v4
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
-; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v6
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0
-; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v1, -v1
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, v1, v3
-; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31]
-;
-; SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user:
-; SI-IEEE-SAFE: ; %bb.0:
-; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x8f800000
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0x4f800000
-; SI-IEEE-SAFE-NEXT: v_mul_f32_e64 v4, -v1, s7
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v1
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v1, v4, vcc
-; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v1
-; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], -1, v4
-; SI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v5, v4, v1
-; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v6
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[4:5]
-; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v6, s[4:5], 1, v4
-; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v6, v4, v1
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v5, v6, s[4:5]
-; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v5, 0x37800000, v4
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
-; SI-IEEE-SAFE-NEXT: v_mul_f32_e64 v6, -v0, s7
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v6, vcc
-; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v6, v0
-; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v5, 0x260
-; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v5
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5]
-; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], -1, v6
-; SI-IEEE-SAFE-NEXT: v_fma_f32 v7, -v4, v6, v0
-; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v7
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5]
-; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v7, s[4:5], 1, v6
-; SI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v7, v6, v0
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v6
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[4:5]
-; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v6, 0x37800000, v4
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v5
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x7f800000
-; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v0
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s6
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, -v0, v4, s[4:5]
-; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4
-; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
-; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
-; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v4, v0
-; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v1
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v1|, s6
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, -v1, v4, s[4:5]
-; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4
-; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
-; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1
-; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v4, v1
-; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2
-; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, v1, v3
-; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
+; GCN-DAZ-LABEL: v_neg_rsq_neg_v2f32_foldable_user:
+; GCN-DAZ: ; %bb.0:
+; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT: v_rsq_f32_e64 v0, -v0
+; GCN-DAZ-NEXT: v_rsq_f32_e64 v1, -v1
+; GCN-DAZ-NEXT: v_mul_f32_e64 v0, -v0, v2
+; GCN-DAZ-NEXT: v_mul_f32_e64 v1, -v1, v3
+; GCN-DAZ-NEXT: s_setpc_b64 s[30:31]
;
-; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user:
-; CI-IEEE-SAFE: ; %bb.0:
-; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x8f800000
-; CI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0x4f800000
-; CI-IEEE-SAFE-NEXT: v_mul_f32_e64 v4, -v1, s7
-; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v1
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v1, v4, vcc
-; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v1
-; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], -1, v4
-; CI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v5, v4, v1
-; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v6
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[4:5]
-; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v6, s[4:5], 1, v4
-; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v6, v4, v1
-; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v5, v6, s[4:5]
-; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v5, 0x37800000, v4
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
-; CI-IEEE-SAFE-NEXT: v_mul_f32_e64 v6, -v0, s7
-; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, s6, v0
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v6, vcc
-; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v6, v0
-; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v5, 0x260
-; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v5
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5]
-; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], -1, v6
-; CI-IEEE-SAFE-NEXT: v_fma_f32 v7, -v4, v6, v0
-; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v7
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5]
-; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v7, s[4:5], 1, v6
-; CI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v7, v6, v0
-; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v6
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[4:5]
-; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v6, 0x37800000, v4
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v5
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v0
-; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4
-; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
-; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
-; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v4, v0
-; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v1
-; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4
-; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
-; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1
-; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v4, v1
-; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2
-; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, v1, v3
-; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
+; GCN-IEEE-LABEL: v_neg_rsq_neg_v2f32_foldable_user:
+; GCN-IEEE: ; %bb.0:
+; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT: s_mov_b32 s4, 0x80800000
+; GCN-IEEE-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0
+; GCN-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, 24, vcc
+; GCN-IEEE-NEXT: v_cmp_lt_f32_e64 s[4:5], s4, v1
+; GCN-IEEE-NEXT: v_ldexp_f32_e64 v0, -v0, v4
+; GCN-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, 24, s[4:5]
+; GCN-IEEE-NEXT: v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT: v_ldexp_f32_e64 v1, -v1, v4
+; GCN-IEEE-NEXT: v_rsq_f32_e32 v1, v1
+; GCN-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, 12, vcc
+; GCN-IEEE-NEXT: v_ldexp_f32_e64 v0, -v0, v4
+; GCN-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, 12, s[4:5]
+; GCN-IEEE-NEXT: v_ldexp_f32_e64 v1, -v1, v4
+; GCN-IEEE-NEXT: v_mul_f32_e32 v0, v0, v2
+; GCN-IEEE-NEXT: v_mul_f32_e32 v1, v1, v3
+; GCN-IEEE-NEXT: s_setpc_b64 s[30:31]
%val0.fneg = fneg <2 x float> %val0
%sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val0.fneg)
%div = fdiv contract <2 x float> <float -1.0, float -1.0>, %sqrt, !fpmath !0
@@ -1109,324 +444,81 @@ define <2 x float> @v_neg_rsq_neg_v2f32_foldable_user(<2 x float> %val0, <2 x fl
}
define float @v_neg_rsq_f32(float %val) {
-; GCN-DAZ-SAFE-LABEL: v_neg_rsq_f32:
-; GCN-DAZ-SAFE: ; %bb.0:
-; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0xf800000
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
-; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v1, v0
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, v0, v1
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0.5, v1
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, -v1, v2, 0.5
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v3, v2
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v2, v2, v0
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v1, v3, v1
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v4, v1, v2
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
-; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0
-; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31]
-;
-; SI-IEEE-SAFE-LABEL: v_neg_rsq_f32:
-; SI-IEEE-SAFE: ; %bb.0:
-; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000
-; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
-; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0
-; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1
-; SI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0
-; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
-; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1
-; SI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
-; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
-; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000
-; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, -v0, v1, s[4:5]
-; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
-; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
-; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
-; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
-; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
+; GCN-DAZ-LABEL: v_neg_rsq_f32:
+; GCN-DAZ: ; %bb.0:
+; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0
+; GCN-DAZ-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-DAZ-NEXT: s_setpc_b64 s[30:31]
;
-; CI-IEEE-SAFE-LABEL: v_neg_rsq_f32:
-; CI-IEEE-SAFE: ; %bb.0:
-; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000
-; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
-; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v1, v0
-; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v1
-; CI-IEEE-SAFE-NEXT: v_fma_f32 v3, -v2, v1, v0
-; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v3
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5]
-; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], 1, v1
-; CI-IEEE-SAFE-NEXT: v_fma_f32 v1, -v3, v1, v0
-; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v1
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[4:5]
-; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
-; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v1, -v0
-; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
-; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
-; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
-; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
-; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
+; GCN-IEEE-LABEL: v_neg_rsq_f32:
+; GCN-IEEE: ; %bb.0:
+; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT: s_mov_b32 s4, 0x800000
+; GCN-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
+; GCN-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 24, vcc
+; GCN-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GCN-IEEE-NEXT: v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 12, vcc
+; GCN-IEEE-NEXT: v_ldexp_f32_e64 v0, -v0, v1
+; GCN-IEEE-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract float @llvm.sqrt.f32(float %val)
%div = fdiv contract float -1.0, %sqrt, !fpmath !0
ret float %div
}
define <2 x float> @v_neg_rsq_v2f32(<2 x float> %val) {
-; GCN-DAZ-SAFE-LABEL: v_neg_rsq_v2f32:
-; GCN-DAZ-SAFE: ; %bb.0:
-; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0xf800000
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v1
-; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v2, v1
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v1, v2
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0.5, v2
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v2, v3, 0.5
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, -v3, v3, v1
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v4, v2
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v5, v2, v3
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x4f800000, v0
-; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v3, v0
-; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v4, 0x260
-; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v4
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[4:5]
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, v0, v3
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0.5, v3
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, -v3, v2, 0.5
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v5, v2
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v6, -v2, v2, v0
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v5, v3
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v6, v3, v2
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v4
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0
-; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v1, -v1
-; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31]
-;
-; SI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32:
-; SI-IEEE-SAFE: ; %bb.0:
-; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0xf800000
-; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v1
-; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v1
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v1
-; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2
-; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v1
-; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5]
-; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2
-; SI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v1
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5]
-; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x4f800000, v0
-; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v0
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v0
-; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260
-; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v3
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[4:5]
-; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v4
-; SI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v2, v4, v0
-; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v5
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5]
-; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], 1, v4
-; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v5, v4, v0
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5]
-; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x37800000, v2
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x7f800000
-; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s6
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, -v0, v2, s[4:5]
-; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2
-; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
-; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
-; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0
-; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v1
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v1|, s6
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, -v1, v2, s[4:5]
-; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2
-; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
-; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1
-; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v2, v1
-; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
+; GCN-DAZ-LABEL: v_neg_rsq_v2f32:
+; GCN-DAZ: ; %bb.0:
+; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0
+; GCN-DAZ-NEXT: v_rsq_f32_e32 v1, v1
+; GCN-DAZ-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-DAZ-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-DAZ-NEXT: s_setpc_b64 s[30:31]
;
-; CI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32:
-; CI-IEEE-SAFE: ; %bb.0:
-; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0xf800000
-; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v1
-; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v1
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v1
-; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2
-; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v1
-; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5]
-; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2
-; CI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v1
-; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5]
-; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x4f800000, v0
-; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v0
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v0
-; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260
-; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v3
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[4:5]
-; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v2, s[4:5], -1, v4
-; CI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v2, v4, v0
-; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v5
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[4:5]
-; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], 1, v4
-; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v5, v4, v0
-; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5]
-; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x37800000, v2
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0
-; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2
-; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
-; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
-; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0
-; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v1
-; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2
-; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
-; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1
-; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v2, v1
-; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
+; GCN-IEEE-LABEL: v_neg_rsq_v2f32:
+; GCN-IEEE: ; %bb.0:
+; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT: s_mov_b32 s4, 0x800000
+; GCN-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
+; GCN-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 24, vcc
+; GCN-IEEE-NEXT: v_cmp_gt_f32_e64 s[4:5], s4, v1
+; GCN-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2
+; GCN-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 24, s[4:5]
+; GCN-IEEE-NEXT: v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v2
+; GCN-IEEE-NEXT: v_rsq_f32_e32 v1, v1
+; GCN-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 12, vcc
+; GCN-IEEE-NEXT: v_ldexp_f32_e64 v0, -v0, v2
+; GCN-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 12, s[4:5]
+; GCN-IEEE-NEXT: v_ldexp_f32_e64 v1, -v1, v2
+; GCN-IEEE-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val)
%div = fdiv contract <2 x float> <float -1.0, float -1.0>, %sqrt, !fpmath !0
ret <2 x float> %div
}
define float @v_neg_rsq_f32_foldable_user(float %val0, float %val1) {
-; GCN-DAZ-SAFE-LABEL: v_neg_rsq_f32_foldable_user:
-; GCN-DAZ-SAFE: ; %bb.0:
-; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0xf800000
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0
-; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v2, v0
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v0, v2
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v2, 0.5, v2
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v2, v3, 0.5
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, -v3, v3, v0
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v2, v4, v2
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v2, v5, v2, v3
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v3, 0x260
-; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1
-; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31]
-;
-; SI-IEEE-SAFE-LABEL: v_neg_rsq_f32_foldable_user:
-; SI-IEEE-SAFE: ; %bb.0:
-; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000
-; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0
-; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0
-; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2
-; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v0
-; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5]
-; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2
-; SI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v0
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5]
-; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260
-; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x7f800000
-; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s4
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, -v0, v2, s[4:5]
-; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2
-; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
-; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
-; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0
-; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1
-; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
+; GCN-DAZ-LABEL: v_neg_rsq_f32_foldable_user:
+; GCN-DAZ: ; %bb.0:
+; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0
+; GCN-DAZ-NEXT: v_mul_f32_e64 v0, -v0, v1
+; GCN-DAZ-NEXT: s_setpc_b64 s[30:31]
;
-; CI-IEEE-SAFE-LABEL: v_neg_rsq_f32_foldable_user:
-; CI-IEEE-SAFE: ; %bb.0:
-; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-IEEE-SAFE-NEXT: s_mov_b32 s4, 0xf800000
-; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0
-; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0
-; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v3, s[4:5], -1, v2
-; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v3, v2, v0
-; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v4
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5]
-; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], 1, v2
-; CI-IEEE-SAFE-NEXT: v_fma_f32 v2, -v4, v2, v0
-; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v2
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[4:5]
-; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v3, 0x260
-; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v3
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v2, -v0
-; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v2, v2
-; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
-; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
-; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v2, v0
-; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v1
-; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
+; GCN-IEEE-LABEL: v_neg_rsq_f32_foldable_user:
+; GCN-IEEE: ; %bb.0:
+; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT: s_mov_b32 s4, 0x800000
+; GCN-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
+; GCN-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 24, vcc
+; GCN-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2
+; GCN-IEEE-NEXT: v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 12, vcc
+; GCN-IEEE-NEXT: v_ldexp_f32_e64 v0, -v0, v2
+; GCN-IEEE-NEXT: v_mul_f32_e32 v0, v0, v1
+; GCN-IEEE-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract float @llvm.sqrt.f32(float %val0)
%div = fdiv contract float -1.0, %sqrt, !fpmath !0
%user = fmul contract float %div, %val1
@@ -1434,153 +526,34 @@ define float @v_neg_rsq_f32_foldable_user(float %val0, float %val1) {
}
define <2 x float> @v_neg_rsq_v2f32_foldable_user(<2 x float> %val0, <2 x float> %val1) {
-; GCN-DAZ-SAFE-LABEL: v_neg_rsq_v2f32_foldable_user:
-; GCN-DAZ-SAFE: ; %bb.0:
-; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, 0xf800000
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v4, 0x4f800000, v1
-; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v1
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
-; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v4, v1
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, v1, v4
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v4, 0.5, v4
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v6, -v4, v5, 0.5
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, v5, v6, v5
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v7, -v5, v5, v1
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v4, v6, v4
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v7, v4, v5
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0x37800000, v4
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0x4f800000, v0
-; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
-; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v5, v0
-; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v6, 0x260
-; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v6
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5]
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v4, v0, v5
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0.5, v5
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v7, -v5, v4, 0.5
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v4, v7, v4
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v8, -v4, v4, v0
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v5, v5, v7, v5
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, v8, v5, v4
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v5, 0x37800000, v4
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
-; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v6
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v0, -v0
-; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e64 v1, -v1
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, v1, v3
-; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31]
-;
-; SI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32_foldable_user:
-; SI-IEEE-SAFE: ; %bb.0:
-; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0xf800000
-; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x4f800000, v1
-; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v1
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
-; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v1
-; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], -1, v4
-; SI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v5, v4, v1
-; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v6
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[4:5]
-; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v6, s[4:5], 1, v4
-; SI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v6, v4, v1
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v5, v6, s[4:5]
-; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v5, 0x37800000, v4
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
-; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v6, 0x4f800000, v0
-; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v0
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
-; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v6, v0
-; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v5, 0x260
-; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v5
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5]
-; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], -1, v6
-; SI-IEEE-SAFE-NEXT: v_fma_f32 v7, -v4, v6, v0
-; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v7
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5]
-; SI-IEEE-SAFE-NEXT: v_add_i32_e64 v7, s[4:5], 1, v6
-; SI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v7, v6, v0
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v6
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[4:5]
-; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v6, 0x37800000, v4
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v5
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0x7f800000
-; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v0
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s6
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, -v0, v4, s[4:5]
-; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4
-; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
-; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
-; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v4, v0
-; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v1
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], |v1|, s6
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, -v1, v4, s[4:5]
-; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4
-; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
-; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1
-; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v4, v1
-; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2
-; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, v1, v3
-; SI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
+; GCN-DAZ-LABEL: v_neg_rsq_v2f32_foldable_user:
+; GCN-DAZ: ; %bb.0:
+; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0
+; GCN-DAZ-NEXT: v_rsq_f32_e32 v1, v1
+; GCN-DAZ-NEXT: v_mul_f32_e64 v0, -v0, v2
+; GCN-DAZ-NEXT: v_mul_f32_e64 v1, -v1, v3
+; GCN-DAZ-NEXT: s_setpc_b64 s[30:31]
;
-; CI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32_foldable_user:
-; CI-IEEE-SAFE: ; %bb.0:
-; CI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-IEEE-SAFE-NEXT: s_mov_b32 s6, 0xf800000
-; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v4, 0x4f800000, v1
-; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v1
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
-; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v4, v1
-; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v5, s[4:5], -1, v4
-; CI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v5, v4, v1
-; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v6
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v5, v4, v5, s[4:5]
-; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v6, s[4:5], 1, v4
-; CI-IEEE-SAFE-NEXT: v_fma_f32 v4, -v6, v4, v1
-; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v4
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v5, v6, s[4:5]
-; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v5, 0x37800000, v4
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
-; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v6, 0x4f800000, v0
-; CI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s6, v0
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
-; CI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v6, v0
-; CI-IEEE-SAFE-NEXT: v_mov_b32_e32 v5, 0x260
-; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e64 s[4:5], v1, v5
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5]
-; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v4, s[4:5], -1, v6
-; CI-IEEE-SAFE-NEXT: v_fma_f32 v7, -v4, v6, v0
-; CI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e64 s[4:5], 0, v7
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[4:5]
-; CI-IEEE-SAFE-NEXT: v_add_i32_e64 v7, s[4:5], 1, v6
-; CI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v7, v6, v0
-; CI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 s[4:5], 0, v6
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[4:5]
-; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v6, 0x37800000, v4
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; CI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v5
-; CI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
-; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v0
-; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4
-; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
-; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
-; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v4, v0
-; CI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e64 v4, -v1
-; CI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v4, v4
-; CI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
-; CI-IEEE-SAFE-NEXT: v_sub_i32_e32 v1, vcc, 0, v1
-; CI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v1, v4, v1
-; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v0, v0, v2
-; CI-IEEE-SAFE-NEXT: v_mul_f32_e32 v1, v1, v3
-; CI-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
+; GCN-IEEE-LABEL: v_neg_rsq_v2f32_foldable_user:
+; GCN-IEEE: ; %bb.0:
+; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT: s_mov_b32 s4, 0x800000
+; GCN-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
+; GCN-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, 24, vcc
+; GCN-IEEE-NEXT: v_cmp_gt_f32_e64 s[4:5], s4, v1
+; GCN-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v4
+; GCN-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, 24, s[4:5]
+; GCN-IEEE-NEXT: v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT: v_ldexp_f32_e32 v1, v1, v4
+; GCN-IEEE-NEXT: v_rsq_f32_e32 v1, v1
+; GCN-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, 12, vcc
+; GCN-IEEE-NEXT: v_ldexp_f32_e64 v0, -v0, v4
+; GCN-IEEE-NEXT: v_cndmask_b32_e64 v4, 0, 12, s[4:5]
+; GCN-IEEE-NEXT: v_ldexp_f32_e64 v1, -v1, v4
+; GCN-IEEE-NEXT: v_mul_f32_e32 v0, v0, v2
+; GCN-IEEE-NEXT: v_mul_f32_e32 v1, v1, v3
+; GCN-IEEE-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val0)
%div = fdiv contract <2 x float> <float -1.0, float -1.0>, %sqrt, !fpmath !0
%user = fmul contract <2 x float> %div, %val1
@@ -1594,29 +567,29 @@ define float @v_rsq_f32(float %val) {
; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0
; GCN-DAZ-NEXT: s_setpc_b64 s[30:31]
;
-; GCN-IEEE-SAFE-LABEL: v_rsq_f32:
-; GCN-IEEE-SAFE: ; %bb.0:
-; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x800000
-; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, 0, 24, vcc
-; GCN-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; GCN-IEEE-SAFE-NEXT: v_rsq_f32_e32 v0, v0
-; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, 0, 12, vcc
-; GCN-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
+; GCN-IEEE-LABEL: v_rsq_f32:
+; GCN-IEEE: ; %bb.0:
+; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT: s_mov_b32 s4, 0x800000
+; GCN-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
+; GCN-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 24, vcc
+; GCN-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GCN-IEEE-NEXT: v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 12, vcc
+; GCN-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GCN-IEEE-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1
%div = fdiv contract float 1.0, %sqrt, !fpmath !1
ret float %div
}
define { float, float } @v_rsq_f32_multi_use(float %val) {
-; GCN-DAZ-SAFE-LABEL: v_rsq_f32_multi_use:
-; GCN-DAZ-SAFE: ; %bb.0:
-; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e32 v0, v0
-; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v1, v0
-; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31]
+; GCN-DAZ-LABEL: v_rsq_f32_multi_use:
+; GCN-DAZ: ; %bb.0:
+; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-DAZ-NEXT: v_rcp_f32_e32 v1, v0
+; GCN-DAZ-NEXT: s_setpc_b64 s[30:31]
;
; SI-IEEE-SAFE-LABEL: v_rsq_f32_multi_use:
; SI-IEEE-SAFE: ; %bb.0:
@@ -1684,12 +657,12 @@ define { float, float } @v_rsq_f32_multi_use(float %val) {
}
define float @v_rsq_f32_missing_contract0(float %val) {
-; GCN-DAZ-SAFE-LABEL: v_rsq_f32_missing_contract0:
-; GCN-DAZ-SAFE: ; %bb.0:
-; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e32 v0, v0
-; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v0, v0
-; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31]
+; GCN-DAZ-LABEL: v_rsq_f32_missing_contract0:
+; GCN-DAZ: ; %bb.0:
+; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-DAZ-NEXT: v_rcp_f32_e32 v0, v0
+; GCN-DAZ-NEXT: s_setpc_b64 s[30:31]
;
; SI-IEEE-SAFE-LABEL: v_rsq_f32_missing_contract0:
; SI-IEEE-SAFE: ; %bb.0:
@@ -1755,12 +728,12 @@ define float @v_rsq_f32_missing_contract0(float %val) {
}
define float @v_rsq_f32_missing_contract1(float %val) {
-; GCN-DAZ-SAFE-LABEL: v_rsq_f32_missing_contract1:
-; GCN-DAZ-SAFE: ; %bb.0:
-; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT: v_sqrt_f32_e32 v0, v0
-; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v0, v0
-; GCN-DAZ-SAFE-NEXT: s_setpc_b64 s[30:31]
+; GCN-DAZ-LABEL: v_rsq_f32_missing_contract1:
+; GCN-DAZ: ; %bb.0:
+; GCN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT: v_sqrt_f32_e32 v0, v0
+; GCN-DAZ-NEXT: v_rcp_f32_e32 v0, v0
+; GCN-DAZ-NEXT: s_setpc_b64 s[30:31]
;
; SI-IEEE-SAFE-LABEL: v_rsq_f32_missing_contract1:
; SI-IEEE-SAFE: ; %bb.0:
@@ -1835,18 +808,18 @@ define float @v_rsq_f32_contractable_user(float %val0, float %val1) {
; GCN-DAZ-NEXT: v_add_f32_e32 v0, v0, v1
; GCN-DAZ-NEXT: s_setpc_b64 s[30:31]
;
-; GCN-IEEE-SAFE-LABEL: v_rsq_f32_contractable_user:
-; GCN-IEEE-SAFE: ; %bb.0:
-; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x800000
-; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 24, vcc
-; GCN-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v0, v2
-; GCN-IEEE-SAFE-NEXT: v_rsq_f32_e32 v0, v0
-; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x45800000
-; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
-; GCN-IEEE-SAFE-NEXT: v_fma_f32 v0, v0, v2, v1
-; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
+; GCN-IEEE-LABEL: v_rsq_f32_contractable_user:
+; GCN-IEEE: ; %bb.0:
+; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT: s_mov_b32 s4, 0x800000
+; GCN-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
+; GCN-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 24, vcc
+; GCN-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2
+; GCN-IEEE-NEXT: v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT: v_mov_b32_e32 v2, 0x45800000
+; GCN-IEEE-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
+; GCN-IEEE-NEXT: v_fma_f32 v0, v0, v2, v1
+; GCN-IEEE-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract float @llvm.sqrt.f32(float %val0), !fpmath !1
%div = fdiv contract float 1.0, %sqrt, !fpmath !1
%add = fadd contract float %div, %val1
@@ -1862,18 +835,18 @@ define float @v_rsq_f32_contractable_user_missing_contract0(float %val0, float %
; GCN-DAZ-NEXT: v_add_f32_e32 v0, v0, v1
; GCN-DAZ-NEXT: s_setpc_b64 s[30:31]
;
-; GCN-IEEE-SAFE-LABEL: v_rsq_f32_contractable_user_missing_contract0:
-; GCN-IEEE-SAFE: ; %bb.0:
-; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x800000
-; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 24, vcc
-; GCN-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v0, v2
-; GCN-IEEE-SAFE-NEXT: v_rsq_f32_e32 v0, v0
-; GCN-IEEE-SAFE-NEXT: v_mov_b32_e32 v2, 0x45800000
-; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
-; GCN-IEEE-SAFE-NEXT: v_fma_f32 v0, v0, v2, v1
-; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
+; GCN-IEEE-LABEL: v_rsq_f32_contractable_user_missing_contract0:
+; GCN-IEEE: ; %bb.0:
+; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT: s_mov_b32 s4, 0x800000
+; GCN-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
+; GCN-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 24, vcc
+; GCN-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2
+; GCN-IEEE-NEXT: v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT: v_mov_b32_e32 v2, 0x45800000
+; GCN-IEEE-NEXT: v_cndmask_b32_e32 v2, 1.0, v2, vcc
+; GCN-IEEE-NEXT: v_fma_f32 v0, v0, v2, v1
+; GCN-IEEE-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract float @llvm.sqrt.f32(float %val0), !fpmath !1
%div = fdiv contract float 1.0, %sqrt, !fpmath !1
%add = fadd contract float %div, %val1
@@ -1889,18 +862,18 @@ define float @v_rsq_f32_contractable_user_missing_contract1(float %val0, float %
; GCN-DAZ-NEXT: v_add_f32_e32 v0, v0, v1
; GCN-DAZ-NEXT: s_setpc_b64 s[30:31]
;
-; GCN-IEEE-SAFE-LABEL: v_rsq_f32_contractable_user_missing_contract1:
-; GCN-IEEE-SAFE: ; %bb.0:
-; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x800000
-; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 24, vcc
-; GCN-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v0, v2
-; GCN-IEEE-SAFE-NEXT: v_rsq_f32_e32 v0, v0
-; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, 0, 12, vcc
-; GCN-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v0, v2
-; GCN-IEEE-SAFE-NEXT: v_add_f32_e32 v0, v0, v1
-; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
+; GCN-IEEE-LABEL: v_rsq_f32_contractable_user_missing_contract1:
+; GCN-IEEE: ; %bb.0:
+; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT: s_mov_b32 s4, 0x800000
+; GCN-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
+; GCN-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 24, vcc
+; GCN-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2
+; GCN-IEEE-NEXT: v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT: v_cndmask_b32_e64 v2, 0, 12, vcc
+; GCN-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v2
+; GCN-IEEE-NEXT: v_add_f32_e32 v0, v0, v1
+; GCN-IEEE-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract float @llvm.sqrt.f32(float %val0), !fpmath !1
%div = fdiv contract float 1.0, %sqrt, !fpmath !1
%add = fadd float %div, %val1
@@ -1931,17 +904,17 @@ define float @v_rsq_f32_known_never_posdenormal(float nofpclass(psub) %val) {
; GCN-DAZ-NEXT: v_rsq_f32_e32 v0, v0
; GCN-DAZ-NEXT: s_setpc_b64 s[30:31]
;
-; GCN-IEEE-SAFE-LABEL: v_rsq_f32_known_never_posdenormal:
-; GCN-IEEE-SAFE: ; %bb.0:
-; GCN-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IEEE-SAFE-NEXT: s_mov_b32 s4, 0x800000
-; GCN-IEEE-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, 0, 24, vcc
-; GCN-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; GCN-IEEE-SAFE-NEXT: v_rsq_f32_e32 v0, v0
-; GCN-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v1, 0, 12, vcc
-; GCN-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; GCN-IEEE-SAFE-NEXT: s_setpc_b64 s[30:31]
+; GCN-IEEE-LABEL: v_rsq_f32_known_never_posdenormal:
+; GCN-IEEE: ; %bb.0:
+; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT: s_mov_b32 s4, 0x800000
+; GCN-IEEE-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
+; GCN-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 24, vcc
+; GCN-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GCN-IEEE-NEXT: v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT: v_cndmask_b32_e64 v1, 0, 12, vcc
+; GCN-IEEE-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GCN-IEEE-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1
%div = fdiv contract float 1.0, %sqrt, !fpmath !1
ret float %div
@@ -1953,4 +926,6 @@ define float @v_rsq_f32_known_never_posdenormal(float nofpclass(psub) %val) {
attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; CI-DAZ-SAFE: {{.*}}
+; GCN-DAZ-SAFE: {{.*}}
+; GCN-IEEE-SAFE: {{.*}}
; SI-DAZ-SAFE: {{.*}}
More information about the llvm-branch-commits
mailing list