[llvm-branch-commits] [llvm] AMDGPU: Stop requiring afn for f32 rsq formation (PR #172082)

Matt Arsenault via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Mon Dec 22 04:51:09 PST 2025


https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/172082

>From 20ff83e7e44422d4872d18e922758837f03c4036 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Fri, 12 Dec 2025 19:08:40 +0100
Subject: [PATCH] AMDGPU: Stop requiring afn for f32 rsq formation

We were checking for afn or !fpmath attached to the sqrt. We
are not trying to replace a correctly rounded rsqrt; we're replacing
the two correctly rounded operations with the contracted operation.
This is a net improvement in precision, so contract on both instructions
should be sufficient. Both the contracted and uncontracted sequences pass
the OpenCL conformance test, with a lower maximum error when contracted.
---
 .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp    |   31 +-
 .../AMDGPU/amdgpu-codegenprepare-fdiv.ll      |   97 +-
 llvm/test/CodeGen/AMDGPU/rsq.f32-safe.ll      | 2039 ++++-------------
 3 files changed, 565 insertions(+), 1602 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 0a262b41ab330..5318b86554ce3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -210,8 +210,7 @@ class AMDGPUCodeGenPrepareImpl
   Value *matchFractPat(IntrinsicInst &I);
   Value *applyFractPat(IRBuilder<> &Builder, Value *FractArg);
 
-  bool canOptimizeWithRsq(const FPMathOperator *SqrtOp, FastMathFlags DivFMF,
-                          FastMathFlags SqrtFMF) const;
+  bool canOptimizeWithRsq(FastMathFlags DivFMF, FastMathFlags SqrtFMF) const;
 
   Value *optimizeWithRsq(IRBuilder<> &Builder, Value *Num, Value *Den,
                          FastMathFlags DivFMF, FastMathFlags SqrtFMF,
@@ -698,29 +697,11 @@ Value *AMDGPUCodeGenPrepareImpl::emitRsqF64(IRBuilder<> &Builder, Value *X,
   return Builder.CreateFMA(Y0E, EFMA, IsNegative ? NegY0 : Y0);
 }
 
-bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp,
-                                                  FastMathFlags DivFMF,
+bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(FastMathFlags DivFMF,
                                                   FastMathFlags SqrtFMF) const {
-  // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
-  if (!DivFMF.allowContract() || !SqrtFMF.allowContract())
-    return false;
-
-  Type *EltTy = SqrtOp->getType()->getScalarType();
-  switch (EltTy->getTypeID()) {
-  case Type::FloatTyID:
-    // v_rsq_f32 gives 1ulp
-    // Separate correctly rounded fdiv + sqrt give ~1.81 ulp.
-
-    // FIXME: rsq formation should not depend on approx func or the fpmath
-    // accuracy. This strictly improves precision.
-    return SqrtFMF.approxFunc() || SqrtOp->getFPAccuracy() >= 1.0f;
-  case Type::DoubleTyID:
-    return true;
-  default:
-    return false;
-  }
-
-  llvm_unreachable("covered switch");
+  // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp for f32 and
+  // f64.
+  return DivFMF.allowContract() && SqrtFMF.allowContract();
 }
 
 Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
@@ -929,7 +910,7 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
       DenII->hasOneUse()) {
     const auto *SqrtOp = cast<FPMathOperator>(DenII);
     SqrtFMF = SqrtOp->getFastMathFlags();
-    if (canOptimizeWithRsq(SqrtOp, DivFMF, SqrtFMF))
+    if (canOptimizeWithRsq(DivFMF, SqrtFMF))
       RsqOp = SqrtOp->getOperand(0);
   }
 
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
index 7ff86ac152feb..cc0d279fe4ec8 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
@@ -1563,13 +1563,12 @@ define amdgpu_kernel void @rsq_f32_fpmath(ptr addrspace(1) %out, float %x) {
 ; IEEE-GOODFREXP-NEXT:    [[TMP25:%.*]] = select contract i1 [[TMP21]], float -4.096000e+03, float -1.000000e+00
 ; IEEE-GOODFREXP-NEXT:    [[NEG_FDIV_OPENCL:%.*]] = fmul contract float [[TMP24]], [[TMP25]]
 ; IEEE-GOODFREXP-NEXT:    store volatile float [[NEG_FDIV_OPENCL]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-GOODFREXP-NEXT:    [[SQRT_X_HALF_ULP:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1]]
-; IEEE-GOODFREXP-NEXT:    [[TMP26:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[SQRT_X_HALF_ULP]])
-; IEEE-GOODFREXP-NEXT:    [[TMP27:%.*]] = extractvalue { float, i32 } [[TMP26]], 0
-; IEEE-GOODFREXP-NEXT:    [[TMP28:%.*]] = extractvalue { float, i32 } [[TMP26]], 1
-; IEEE-GOODFREXP-NEXT:    [[TMP29:%.*]] = sub i32 0, [[TMP28]]
-; IEEE-GOODFREXP-NEXT:    [[TMP30:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP27]])
-; IEEE-GOODFREXP-NEXT:    [[FDIV_SQRT_MISMATCH_MD0:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP30]], i32 [[TMP29]])
+; IEEE-GOODFREXP-NEXT:    [[TMP26:%.*]] = fcmp contract olt float [[X]], 0x3810000000000000
+; IEEE-GOODFREXP-NEXT:    [[TMP27:%.*]] = select contract i1 [[TMP26]], float 0x4170000000000000, float 1.000000e+00
+; IEEE-GOODFREXP-NEXT:    [[TMP28:%.*]] = fmul contract float [[X]], [[TMP27]]
+; IEEE-GOODFREXP-NEXT:    [[TMP29:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP28]])
+; IEEE-GOODFREXP-NEXT:    [[TMP30:%.*]] = select contract i1 [[TMP26]], float 4.096000e+03, float 1.000000e+00
+; IEEE-GOODFREXP-NEXT:    [[FDIV_SQRT_MISMATCH_MD0:%.*]] = fmul contract float [[TMP29]], [[TMP30]]
 ; IEEE-GOODFREXP-NEXT:    store volatile float [[FDIV_SQRT_MISMATCH_MD0]], ptr addrspace(1) [[OUT]], align 4
 ; IEEE-GOODFREXP-NEXT:    [[SQRT_MISMATCH_MD1:%.*]] = call afn float @llvm.sqrt.f32(float [[X]])
 ; IEEE-GOODFREXP-NEXT:    [[TMP31:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[SQRT_MISMATCH_MD1]])
@@ -1644,13 +1643,12 @@ define amdgpu_kernel void @rsq_f32_fpmath(ptr addrspace(1) %out, float %x) {
 ; IEEE-BADFREXP-NEXT:    [[TMP25:%.*]] = select contract i1 [[TMP21]], float -4.096000e+03, float -1.000000e+00
 ; IEEE-BADFREXP-NEXT:    [[NEG_FDIV_OPENCL:%.*]] = fmul contract float [[TMP24]], [[TMP25]]
 ; IEEE-BADFREXP-NEXT:    store volatile float [[NEG_FDIV_OPENCL]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-BADFREXP-NEXT:    [[SQRT_X_HALF_ULP:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1]]
-; IEEE-BADFREXP-NEXT:    [[TMP26:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[SQRT_X_HALF_ULP]])
-; IEEE-BADFREXP-NEXT:    [[TMP27:%.*]] = extractvalue { float, i32 } [[TMP26]], 0
-; IEEE-BADFREXP-NEXT:    [[TMP28:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[SQRT_X_HALF_ULP]])
-; IEEE-BADFREXP-NEXT:    [[TMP29:%.*]] = sub i32 0, [[TMP28]]
-; IEEE-BADFREXP-NEXT:    [[TMP30:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP27]])
-; IEEE-BADFREXP-NEXT:    [[FDIV_SQRT_MISMATCH_MD0:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP30]], i32 [[TMP29]])
+; IEEE-BADFREXP-NEXT:    [[TMP26:%.*]] = fcmp contract olt float [[X]], 0x3810000000000000
+; IEEE-BADFREXP-NEXT:    [[TMP27:%.*]] = select contract i1 [[TMP26]], float 0x4170000000000000, float 1.000000e+00
+; IEEE-BADFREXP-NEXT:    [[TMP28:%.*]] = fmul contract float [[X]], [[TMP27]]
+; IEEE-BADFREXP-NEXT:    [[TMP29:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP28]])
+; IEEE-BADFREXP-NEXT:    [[TMP30:%.*]] = select contract i1 [[TMP26]], float 4.096000e+03, float 1.000000e+00
+; IEEE-BADFREXP-NEXT:    [[FDIV_SQRT_MISMATCH_MD0:%.*]] = fmul contract float [[TMP29]], [[TMP30]]
 ; IEEE-BADFREXP-NEXT:    store volatile float [[FDIV_SQRT_MISMATCH_MD0]], ptr addrspace(1) [[OUT]], align 4
 ; IEEE-BADFREXP-NEXT:    [[SQRT_MISMATCH_MD1:%.*]] = call afn float @llvm.sqrt.f32(float [[X]])
 ; IEEE-BADFREXP-NEXT:    [[TMP31:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[SQRT_MISMATCH_MD1]])
@@ -1701,8 +1699,7 @@ define amdgpu_kernel void @rsq_f32_fpmath(ptr addrspace(1) %out, float %x) {
 ; DAZ-NEXT:    [[TMP1:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[X]])
 ; DAZ-NEXT:    [[NEG_FDIV_OPENCL:%.*]] = fneg contract float [[TMP1]]
 ; DAZ-NEXT:    store volatile float [[NEG_FDIV_OPENCL]], ptr addrspace(1) [[OUT]], align 4
-; DAZ-NEXT:    [[SQRT_X_HALF_ULP:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1]]
-; DAZ-NEXT:    [[FDIV_SQRT_MISMATCH_MD0:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[SQRT_X_HALF_ULP]])
+; DAZ-NEXT:    [[FDIV_SQRT_MISMATCH_MD0:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[X]])
 ; DAZ-NEXT:    store volatile float [[FDIV_SQRT_MISMATCH_MD0]], ptr addrspace(1) [[OUT]], align 4
 ; DAZ-NEXT:    [[SQRT_MISMATCH_MD1:%.*]] = call afn float @llvm.sqrt.f32(float [[X]])
 ; DAZ-NEXT:    [[FDIV_SQRT_MISMATCH_MD1:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[SQRT_MISMATCH_MD1]])
@@ -3490,19 +3487,22 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_correct_sqrt(<4 x fl
 ; IEEE-GOODFREXP-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[DENOM]], i64 1
 ; IEEE-GOODFREXP-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[DENOM]], i64 2
 ; IEEE-GOODFREXP-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 3
-; IEEE-GOODFREXP-NEXT:    [[TMP5:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP1]])
-; IEEE-GOODFREXP-NEXT:    [[TMP6:%.*]] = extractvalue { float, i32 } [[TMP5]], 0
-; IEEE-GOODFREXP-NEXT:    [[TMP7:%.*]] = extractvalue { float, i32 } [[TMP5]], 1
-; IEEE-GOODFREXP-NEXT:    [[TMP8:%.*]] = sub i32 0, [[TMP7]]
-; IEEE-GOODFREXP-NEXT:    [[TMP9:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP6]])
-; IEEE-GOODFREXP-NEXT:    [[TMP10:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP9]], i32 [[TMP8]])
-; IEEE-GOODFREXP-NEXT:    [[TMP11:%.*]] = fneg contract float [[TMP2]]
-; IEEE-GOODFREXP-NEXT:    [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP11]])
-; IEEE-GOODFREXP-NEXT:    [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0
-; IEEE-GOODFREXP-NEXT:    [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP12]], 1
-; IEEE-GOODFREXP-NEXT:    [[TMP15:%.*]] = sub i32 0, [[TMP14]]
-; IEEE-GOODFREXP-NEXT:    [[TMP16:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP13]])
-; IEEE-GOODFREXP-NEXT:    [[TMP17:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP16]], i32 [[TMP15]])
+; IEEE-GOODFREXP-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[ARG]], i64 0
+; IEEE-GOODFREXP-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[ARG]], i64 1
+; IEEE-GOODFREXP-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[ARG]], i64 2
+; IEEE-GOODFREXP-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[ARG]], i64 3
+; IEEE-GOODFREXP-NEXT:    [[TMP9:%.*]] = fcmp contract olt float [[TMP5]], 0x3810000000000000
+; IEEE-GOODFREXP-NEXT:    [[TMP14:%.*]] = select contract i1 [[TMP9]], float 0x4170000000000000, float 1.000000e+00
+; IEEE-GOODFREXP-NEXT:    [[TMP11:%.*]] = fmul contract float [[TMP5]], [[TMP14]]
+; IEEE-GOODFREXP-NEXT:    [[TMP12:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP11]])
+; IEEE-GOODFREXP-NEXT:    [[TMP13:%.*]] = select contract i1 [[TMP9]], float 4.096000e+03, float 1.000000e+00
+; IEEE-GOODFREXP-NEXT:    [[TMP10:%.*]] = fmul contract float [[TMP12]], [[TMP13]]
+; IEEE-GOODFREXP-NEXT:    [[TMP15:%.*]] = fcmp contract olt float [[TMP6]], 0x3810000000000000
+; IEEE-GOODFREXP-NEXT:    [[TMP16:%.*]] = select contract i1 [[TMP15]], float 0x4170000000000000, float 1.000000e+00
+; IEEE-GOODFREXP-NEXT:    [[TMP41:%.*]] = fmul contract float [[TMP6]], [[TMP16]]
+; IEEE-GOODFREXP-NEXT:    [[TMP42:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP41]])
+; IEEE-GOODFREXP-NEXT:    [[TMP43:%.*]] = select contract i1 [[TMP15]], float -4.096000e+03, float -1.000000e+00
+; IEEE-GOODFREXP-NEXT:    [[TMP17:%.*]] = fmul contract float [[TMP42]], [[TMP43]]
 ; IEEE-GOODFREXP-NEXT:    [[TMP18:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]])
 ; IEEE-GOODFREXP-NEXT:    [[TMP19:%.*]] = extractvalue { float, i32 } [[TMP18]], 0
 ; IEEE-GOODFREXP-NEXT:    [[TMP20:%.*]] = extractvalue { float, i32 } [[TMP18]], 1
@@ -3536,19 +3536,22 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_correct_sqrt(<4 x fl
 ; IEEE-BADFREXP-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[DENOM]], i64 1
 ; IEEE-BADFREXP-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[DENOM]], i64 2
 ; IEEE-BADFREXP-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 3
-; IEEE-BADFREXP-NEXT:    [[TMP5:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP1]])
-; IEEE-BADFREXP-NEXT:    [[TMP6:%.*]] = extractvalue { float, i32 } [[TMP5]], 0
-; IEEE-BADFREXP-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP1]])
-; IEEE-BADFREXP-NEXT:    [[TMP8:%.*]] = sub i32 0, [[TMP7]]
-; IEEE-BADFREXP-NEXT:    [[TMP9:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP6]])
-; IEEE-BADFREXP-NEXT:    [[TMP10:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP9]], i32 [[TMP8]])
-; IEEE-BADFREXP-NEXT:    [[TMP11:%.*]] = fneg contract float [[TMP2]]
-; IEEE-BADFREXP-NEXT:    [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP11]])
-; IEEE-BADFREXP-NEXT:    [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0
-; IEEE-BADFREXP-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP11]])
-; IEEE-BADFREXP-NEXT:    [[TMP15:%.*]] = sub i32 0, [[TMP14]]
-; IEEE-BADFREXP-NEXT:    [[TMP16:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP13]])
-; IEEE-BADFREXP-NEXT:    [[TMP17:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP16]], i32 [[TMP15]])
+; IEEE-BADFREXP-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[ARG]], i64 0
+; IEEE-BADFREXP-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[ARG]], i64 1
+; IEEE-BADFREXP-NEXT:    [[TMP7:%.*]] = extractelement <4 x float> [[ARG]], i64 2
+; IEEE-BADFREXP-NEXT:    [[TMP8:%.*]] = extractelement <4 x float> [[ARG]], i64 3
+; IEEE-BADFREXP-NEXT:    [[TMP9:%.*]] = fcmp contract olt float [[TMP5]], 0x3810000000000000
+; IEEE-BADFREXP-NEXT:    [[TMP14:%.*]] = select contract i1 [[TMP9]], float 0x4170000000000000, float 1.000000e+00
+; IEEE-BADFREXP-NEXT:    [[TMP11:%.*]] = fmul contract float [[TMP5]], [[TMP14]]
+; IEEE-BADFREXP-NEXT:    [[TMP12:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP11]])
+; IEEE-BADFREXP-NEXT:    [[TMP13:%.*]] = select contract i1 [[TMP9]], float 4.096000e+03, float 1.000000e+00
+; IEEE-BADFREXP-NEXT:    [[TMP10:%.*]] = fmul contract float [[TMP12]], [[TMP13]]
+; IEEE-BADFREXP-NEXT:    [[TMP15:%.*]] = fcmp contract olt float [[TMP6]], 0x3810000000000000
+; IEEE-BADFREXP-NEXT:    [[TMP16:%.*]] = select contract i1 [[TMP15]], float 0x4170000000000000, float 1.000000e+00
+; IEEE-BADFREXP-NEXT:    [[TMP41:%.*]] = fmul contract float [[TMP6]], [[TMP16]]
+; IEEE-BADFREXP-NEXT:    [[TMP42:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP41]])
+; IEEE-BADFREXP-NEXT:    [[TMP43:%.*]] = select contract i1 [[TMP15]], float -4.096000e+03, float -1.000000e+00
+; IEEE-BADFREXP-NEXT:    [[TMP17:%.*]] = fmul contract float [[TMP42]], [[TMP43]]
 ; IEEE-BADFREXP-NEXT:    [[TMP18:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]])
 ; IEEE-BADFREXP-NEXT:    [[TMP19:%.*]] = extractvalue { float, i32 } [[TMP18]], 0
 ; IEEE-BADFREXP-NEXT:    [[TMP20:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP3]])
@@ -3582,9 +3585,13 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_correct_sqrt(<4 x fl
 ; DAZ-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[DENOM]], i64 1
 ; DAZ-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[DENOM]], i64 2
 ; DAZ-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 3
-; DAZ-NEXT:    [[TMP5:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP1]])
-; DAZ-NEXT:    [[TMP6:%.*]] = fneg contract float [[TMP2]]
-; DAZ-NEXT:    [[TMP7:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP6]])
+; DAZ-NEXT:    [[TMP31:%.*]] = extractelement <4 x float> [[ARG]], i64 0
+; DAZ-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[ARG]], i64 1
+; DAZ-NEXT:    [[TMP32:%.*]] = extractelement <4 x float> [[ARG]], i64 2
+; DAZ-NEXT:    [[TMP33:%.*]] = extractelement <4 x float> [[ARG]], i64 3
+; DAZ-NEXT:    [[TMP5:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP31]])
+; DAZ-NEXT:    [[TMP34:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP6]])
+; DAZ-NEXT:    [[TMP7:%.*]] = fneg contract float [[TMP34]]
 ; DAZ-NEXT:    [[TMP8:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]])
 ; DAZ-NEXT:    [[TMP9:%.*]] = extractvalue { float, i32 } [[TMP8]], 0
 ; DAZ-NEXT:    [[TMP10:%.*]] = extractvalue { float, i32 } [[TMP8]], 1
diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f32-safe.ll b/llvm/test/CodeGen/AMDGPU/rsq.f32-safe.ll
index 7f822c135ffb4..d9fdfb38ef344 100644
--- a/llvm/test/CodeGen/AMDGPU/rsq.f32-safe.ll
+++ b/llvm/test/CodeGen/AMDGPU/rsq.f32-safe.ll
@@ -12,123 +12,48 @@ declare float @llvm.sqrt.f32(float) nounwind readnone
 declare <2 x float> @llvm.sqrt.v2f32(<2 x float>) nounwind readnone
 
 define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) {
-; GCN-DAZ-SAFE-LABEL: rsq_f32:
-; GCN-DAZ-SAFE:       ; %bb.0:
-; GCN-DAZ-SAFE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s6, -1
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s10, s6
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s11, s7
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s8, s2
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s9, s3
-; GCN-DAZ-SAFE-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s2, 0xf800000
-; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s4, s0
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s5, s1
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0)
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
-; GCN-DAZ-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v0
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v1, v0
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, v0, v1
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, 0.5, v1
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, -v1, v3, 0.5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v3, v3, v4, v3
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v1, v1, v4, v1
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, -v3, v3, v0
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v1, v4, v1, v3
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v1
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e32 v0, v0
-; GCN-DAZ-SAFE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; GCN-DAZ-SAFE-NEXT:    s_endpgm
-;
-; SI-IEEE-SAFE-LABEL: rsq_f32:
-; SI-IEEE-SAFE:       ; %bb.0:
-; SI-IEEE-SAFE-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s7, 0xf000
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s6, -1
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s2, s6
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s3, s7
-; SI-IEEE-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s0, s10
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s1, s11
-; SI-IEEE-SAFE-NEXT:    buffer_load_dword v0, off, s[0:3], 0
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s0, 0xf800000
-; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v1, 0x260
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s2, 0x7f800000
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, s8
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s5, s9
-; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0)
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e64 s[0:1], s0, v0
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v2, v0
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e32 v3, vcc, -1, v2
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v5, -v3, v2, v0
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v6, -v4, v2, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e32 vcc, 0, v5
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, 0, v6
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e32 v1, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s2
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
-; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
-; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
-; SI-IEEE-SAFE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; SI-IEEE-SAFE-NEXT:    s_endpgm
+; GCN-DAZ-LABEL: rsq_f32:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-DAZ-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-DAZ-NEXT:    s_mov_b32 s6, -1
+; GCN-DAZ-NEXT:    s_mov_b32 s10, s6
+; GCN-DAZ-NEXT:    s_mov_b32 s11, s7
+; GCN-DAZ-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-DAZ-NEXT:    s_mov_b32 s8, s2
+; GCN-DAZ-NEXT:    s_mov_b32 s9, s3
+; GCN-DAZ-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; GCN-DAZ-NEXT:    s_mov_b32 s4, s0
+; GCN-DAZ-NEXT:    s_mov_b32 s5, s1
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0)
+; GCN-DAZ-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-DAZ-NEXT:    s_endpgm
 ;
-; CI-IEEE-SAFE-LABEL: rsq_f32:
-; CI-IEEE-SAFE:       ; %bb.0:
-; CI-IEEE-SAFE-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s7, 0xf000
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s6, -1
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s2, s6
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s3, s7
-; CI-IEEE-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s0, s10
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s1, s11
-; CI-IEEE-SAFE-NEXT:    buffer_load_dword v0, off, s[0:3], 0
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s0, 0xf800000
-; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v1, 0x260
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s4, s8
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s5, s9
-; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0)
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e64 s[0:1], s0, v0
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v2, v0
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e32 v3, vcc, -1, v2
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v5, -v3, v2, v0
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v6, -v4, v2, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e32 vcc, 0, v5
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, 0, v6
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e32 v1, v0
-; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
-; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
-; CI-IEEE-SAFE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; CI-IEEE-SAFE-NEXT:    s_endpgm
+; GCN-IEEE-LABEL: rsq_f32:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-IEEE-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-IEEE-NEXT:    s_mov_b32 s6, -1
+; GCN-IEEE-NEXT:    s_mov_b32 s10, s6
+; GCN-IEEE-NEXT:    s_mov_b32 s11, s7
+; GCN-IEEE-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-IEEE-NEXT:    s_mov_b32 s8, s2
+; GCN-IEEE-NEXT:    s_mov_b32 s9, s3
+; GCN-IEEE-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; GCN-IEEE-NEXT:    s_mov_b32 s2, 0x800000
+; GCN-IEEE-NEXT:    s_mov_b32 s4, s0
+; GCN-IEEE-NEXT:    s_mov_b32 s5, s1
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0)
+; GCN-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v0
+; GCN-IEEE-NEXT:    s_and_b64 s[2:3], vcc, exec
+; GCN-IEEE-NEXT:    s_cselect_b32 s2, 24, 0
+; GCN-IEEE-NEXT:    v_ldexp_f32_e64 v0, v0, s2
+; GCN-IEEE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT:    s_cselect_b32 s0, 12, 0
+; GCN-IEEE-NEXT:    v_ldexp_f32_e64 v0, v0, s0
+; GCN-IEEE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-IEEE-NEXT:    s_endpgm
   %val = load float, ptr addrspace(1) %in, align 4
   %sqrt = call contract float @llvm.sqrt.f32(float %val) nounwind readnone
   %div = fdiv contract float 1.0, %sqrt, !fpmath !0
@@ -137,109 +62,35 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace(
 }
 
 define amdgpu_kernel void @rsq_f32_sgpr(ptr addrspace(1) noalias %out, float %val) {
-; GCN-DAZ-SAFE-LABEL: rsq_f32_sgpr:
-; GCN-DAZ-SAFE:       ; %bb.0:
-; GCN-DAZ-SAFE-NEXT:    s_load_dword s0, s[4:5], 0xb
-; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v0, 0xf800000
-; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s2, -1
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, s0, v1
-; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v2, s0
-; GCN-DAZ-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v0
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v1, v0
-; GCN-DAZ-SAFE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, v0, v1
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, 0.5, v1
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v3, -v1, v2, 0.5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v2, v3, v2
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v1, v1, v3, v1
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v3, -v2, v2, v0
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v1, v3, v1, v2
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
-; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e32 v0, v0
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; GCN-DAZ-SAFE-NEXT:    s_endpgm
-;
-; SI-IEEE-SAFE-LABEL: rsq_f32_sgpr:
-; SI-IEEE-SAFE:       ; %bb.0:
-; SI-IEEE-SAFE-NEXT:    s_load_dword s0, s[4:5], 0xb
-; SI-IEEE-SAFE-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
-; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v0, 0xf800000
-; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s7, 0xf000
-; SI-IEEE-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, s0, v1
-; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v2, s0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[0:1], s0, v0
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[0:1]
-; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v1, v0
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s6, -1
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e32 v2, vcc, -1, v1
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v3, -v2, v1, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e32 vcc, 0, v3
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v1, -v3, v1, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, 0, v1
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
-; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s0, 0x7f800000
-; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e32 v1, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s0
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
-; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
-; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
-; SI-IEEE-SAFE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; SI-IEEE-SAFE-NEXT:    s_endpgm
-;
-; CI-IEEE-SAFE-LABEL: rsq_f32_sgpr:
-; CI-IEEE-SAFE:       ; %bb.0:
-; CI-IEEE-SAFE-NEXT:    s_load_dword s0, s[4:5], 0xb
-; CI-IEEE-SAFE-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
-; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v0, 0xf800000
-; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s7, 0xf000
-; CI-IEEE-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, s0, v1
-; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v2, s0
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[0:1], s0, v0
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[0:1]
-; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v1, v0
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s6, -1
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e32 v2, vcc, -1, v1
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v3, -v2, v1, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e32 vcc, 0, v3
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v1, -v3, v1, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, 0, v1
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
-; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e32 v1, v0
-; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
-; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
-; CI-IEEE-SAFE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; CI-IEEE-SAFE-NEXT:    s_endpgm
+; GCN-DAZ-LABEL: rsq_f32_sgpr:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_load_dword s2, s[4:5], 0xb
+; GCN-DAZ-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GCN-DAZ-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-DAZ-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-DAZ-NEXT:    v_rsq_f32_e32 v0, s2
+; GCN-DAZ-NEXT:    s_mov_b32 s2, -1
+; GCN-DAZ-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-DAZ-NEXT:    s_endpgm
+;
+; GCN-IEEE-LABEL: rsq_f32_sgpr:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_load_dword s6, s[4:5], 0xb
+; GCN-IEEE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GCN-IEEE-NEXT:    v_mov_b32_e32 v0, 0x800000
+; GCN-IEEE-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
+; GCN-IEEE-NEXT:    s_and_b64 s[2:3], vcc, exec
+; GCN-IEEE-NEXT:    s_cselect_b32 s2, 24, 0
+; GCN-IEEE-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-IEEE-NEXT:    v_ldexp_f32_e32 v0, s6, v0
+; GCN-IEEE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT:    s_cselect_b32 s4, 12, 0
+; GCN-IEEE-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-IEEE-NEXT:    s_mov_b32 s2, -1
+; GCN-IEEE-NEXT:    v_ldexp_f32_e64 v0, v0, s4
+; GCN-IEEE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-IEEE-NEXT:    s_endpgm
   %sqrt = call contract float @llvm.sqrt.f32(float %val) nounwind readnone
   %div = fdiv contract float 1.0, %sqrt, !fpmath !0
   store float %div, ptr addrspace(1) %out, align 4
@@ -251,106 +102,106 @@ define amdgpu_kernel void @rsq_f32_sgpr(ptr addrspace(1) noalias %out, float %va
 
 ; NOTE: c * rcp( sqrt(a) * b ) is generated when we move rcp generation to AMGGPUCogenPrepare.
 define amdgpu_kernel void @rsqrt_fmul(ptr addrspace(1) %out, ptr addrspace(1) %in) {
-; GCN-DAZ-SAFE-LABEL: rsqrt_fmul:
-; GCN-DAZ-SAFE:       ; %bb.0:
-; GCN-DAZ-SAFE-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s2, 0
-; GCN-DAZ-SAFE-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT:    s_mov_b64 s[8:9], s[6:7]
-; GCN-DAZ-SAFE-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GCN-DAZ-SAFE-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0)
-; GCN-DAZ-SAFE-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:4 glc
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0)
-; GCN-DAZ-SAFE-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:8 glc
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0)
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s0, 0xf800000
-; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v6, 0x260
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v5, 0x4f800000, v2
-; GCN-DAZ-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v2
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v5, v2
-; GCN-DAZ-SAFE-NEXT:    s_mov_b64 s[0:1], s[4:5]
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v7, v2, v5
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v5, 0.5, v5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v8, -v5, v7, 0.5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v7, v7, v8, v7
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v5, v5, v8, v5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v8, -v7, v7, v2
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v5, v8, v5, v7
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v7, 0x37800000, v5
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
-; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v2, v6
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, v2, v3
-; GCN-DAZ-SAFE-NEXT:    v_div_scale_f32 v3, s[4:5], v2, v2, v4
-; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e32 v5, v3
-; GCN-DAZ-SAFE-NEXT:    v_div_scale_f32 v6, vcc, v4, v2, v4
-; GCN-DAZ-SAFE-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v7, -v3, v5, 1.0
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v5, v7, v5, v5
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v7, v6, v5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v8, -v3, v7, v6
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v7, v8, v5, v7
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v3, -v3, v7, v6
-; GCN-DAZ-SAFE-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GCN-DAZ-SAFE-NEXT:    v_div_fmas_f32 v3, v3, v5, v7
-; GCN-DAZ-SAFE-NEXT:    v_div_fixup_f32 v2, v3, v2, v4
-; GCN-DAZ-SAFE-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; GCN-DAZ-SAFE-NEXT:    s_endpgm
-;
-; GCN-IEEE-SAFE-LABEL: rsqrt_fmul:
-; GCN-IEEE-SAFE:       ; %bb.0:
-; GCN-IEEE-SAFE-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
-; GCN-IEEE-SAFE-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-IEEE-SAFE-NEXT:    s_mov_b32 s2, 0
-; GCN-IEEE-SAFE-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GCN-IEEE-SAFE-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-IEEE-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IEEE-SAFE-NEXT:    s_mov_b64 s[8:9], s[6:7]
-; GCN-IEEE-SAFE-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GCN-IEEE-SAFE-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; GCN-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0)
-; GCN-IEEE-SAFE-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:4 glc
-; GCN-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0)
-; GCN-IEEE-SAFE-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:8 glc
-; GCN-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0)
-; GCN-IEEE-SAFE-NEXT:    s_mov_b32 s0, 0xf800000
-; GCN-IEEE-SAFE-NEXT:    v_mov_b32_e32 v6, 0x260
-; GCN-IEEE-SAFE-NEXT:    v_mul_f32_e32 v5, 0x4f800000, v2
-; GCN-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v2
-; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GCN-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v5, v2
-; GCN-IEEE-SAFE-NEXT:    v_add_i32_e64 v7, s[0:1], -1, v5
-; GCN-IEEE-SAFE-NEXT:    v_add_i32_e64 v8, s[0:1], 1, v5
-; GCN-IEEE-SAFE-NEXT:    v_fma_f32 v9, -v7, v5, v2
-; GCN-IEEE-SAFE-NEXT:    v_fma_f32 v10, -v8, v5, v2
-; GCN-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[0:1], 0, v9
-; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v5, v5, v7, s[0:1]
-; GCN-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[0:1], 0, v10
-; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v5, v5, v8, s[0:1]
-; GCN-IEEE-SAFE-NEXT:    v_mul_f32_e32 v7, 0x37800000, v5
-; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
-; GCN-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v2, v6
-; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
-; GCN-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, v2, v3
-; GCN-IEEE-SAFE-NEXT:    v_div_scale_f32 v3, s[0:1], v2, v2, v4
-; GCN-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v5, v3
-; GCN-IEEE-SAFE-NEXT:    v_div_scale_f32 v6, vcc, v4, v2, v4
-; GCN-IEEE-SAFE-NEXT:    s_mov_b64 s[0:1], s[4:5]
-; GCN-IEEE-SAFE-NEXT:    v_fma_f32 v7, -v3, v5, 1.0
-; GCN-IEEE-SAFE-NEXT:    v_fma_f32 v5, v7, v5, v5
-; GCN-IEEE-SAFE-NEXT:    v_mul_f32_e32 v7, v6, v5
-; GCN-IEEE-SAFE-NEXT:    v_fma_f32 v8, -v3, v7, v6
-; GCN-IEEE-SAFE-NEXT:    v_fma_f32 v7, v8, v5, v7
-; GCN-IEEE-SAFE-NEXT:    v_fma_f32 v3, -v3, v7, v6
-; GCN-IEEE-SAFE-NEXT:    v_div_fmas_f32 v3, v3, v5, v7
-; GCN-IEEE-SAFE-NEXT:    v_div_fixup_f32 v2, v3, v2, v4
-; GCN-IEEE-SAFE-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; GCN-IEEE-SAFE-NEXT:    s_endpgm
+; GCN-DAZ-LABEL: rsqrt_fmul:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
+; GCN-DAZ-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-DAZ-NEXT:    s_mov_b32 s2, 0
+; GCN-DAZ-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GCN-DAZ-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-DAZ-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-DAZ-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GCN-DAZ-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GCN-DAZ-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0)
+; GCN-DAZ-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:4 glc
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0)
+; GCN-DAZ-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:8 glc
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0)
+; GCN-DAZ-NEXT:    s_mov_b32 s0, 0xf800000
+; GCN-DAZ-NEXT:    v_mov_b32_e32 v6, 0x260
+; GCN-DAZ-NEXT:    v_mul_f32_e32 v5, 0x4f800000, v2
+; GCN-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v2
+; GCN-DAZ-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; GCN-DAZ-NEXT:    v_rsq_f32_e32 v5, v2
+; GCN-DAZ-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; GCN-DAZ-NEXT:    v_mul_f32_e32 v7, v2, v5
+; GCN-DAZ-NEXT:    v_mul_f32_e32 v5, 0.5, v5
+; GCN-DAZ-NEXT:    v_fma_f32 v8, -v5, v7, 0.5
+; GCN-DAZ-NEXT:    v_fma_f32 v7, v7, v8, v7
+; GCN-DAZ-NEXT:    v_fma_f32 v5, v5, v8, v5
+; GCN-DAZ-NEXT:    v_fma_f32 v8, -v7, v7, v2
+; GCN-DAZ-NEXT:    v_fma_f32 v5, v8, v5, v7
+; GCN-DAZ-NEXT:    v_mul_f32_e32 v7, 0x37800000, v5
+; GCN-DAZ-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; GCN-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v2, v6
+; GCN-DAZ-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GCN-DAZ-NEXT:    v_mul_f32_e32 v2, v2, v3
+; GCN-DAZ-NEXT:    v_div_scale_f32 v3, s[4:5], v2, v2, v4
+; GCN-DAZ-NEXT:    v_rcp_f32_e32 v5, v3
+; GCN-DAZ-NEXT:    v_div_scale_f32 v6, vcc, v4, v2, v4
+; GCN-DAZ-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GCN-DAZ-NEXT:    v_fma_f32 v7, -v3, v5, 1.0
+; GCN-DAZ-NEXT:    v_fma_f32 v5, v7, v5, v5
+; GCN-DAZ-NEXT:    v_mul_f32_e32 v7, v6, v5
+; GCN-DAZ-NEXT:    v_fma_f32 v8, -v3, v7, v6
+; GCN-DAZ-NEXT:    v_fma_f32 v7, v8, v5, v7
+; GCN-DAZ-NEXT:    v_fma_f32 v3, -v3, v7, v6
+; GCN-DAZ-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GCN-DAZ-NEXT:    v_div_fmas_f32 v3, v3, v5, v7
+; GCN-DAZ-NEXT:    v_div_fixup_f32 v2, v3, v2, v4
+; GCN-DAZ-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-DAZ-NEXT:    s_endpgm
+;
+; GCN-IEEE-LABEL: rsqrt_fmul:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
+; GCN-IEEE-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-IEEE-NEXT:    s_mov_b32 s2, 0
+; GCN-IEEE-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GCN-IEEE-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-IEEE-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-IEEE-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GCN-IEEE-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GCN-IEEE-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0)
+; GCN-IEEE-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:4 glc
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0)
+; GCN-IEEE-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:8 glc
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0)
+; GCN-IEEE-NEXT:    s_mov_b32 s0, 0xf800000
+; GCN-IEEE-NEXT:    v_mov_b32_e32 v6, 0x260
+; GCN-IEEE-NEXT:    v_mul_f32_e32 v5, 0x4f800000, v2
+; GCN-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v2
+; GCN-IEEE-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; GCN-IEEE-NEXT:    v_sqrt_f32_e32 v5, v2
+; GCN-IEEE-NEXT:    v_add_i32_e64 v7, s[0:1], -1, v5
+; GCN-IEEE-NEXT:    v_add_i32_e64 v8, s[0:1], 1, v5
+; GCN-IEEE-NEXT:    v_fma_f32 v9, -v7, v5, v2
+; GCN-IEEE-NEXT:    v_fma_f32 v10, -v8, v5, v2
+; GCN-IEEE-NEXT:    v_cmp_ge_f32_e64 s[0:1], 0, v9
+; GCN-IEEE-NEXT:    v_cndmask_b32_e64 v5, v5, v7, s[0:1]
+; GCN-IEEE-NEXT:    v_cmp_lt_f32_e64 s[0:1], 0, v10
+; GCN-IEEE-NEXT:    v_cndmask_b32_e64 v5, v5, v8, s[0:1]
+; GCN-IEEE-NEXT:    v_mul_f32_e32 v7, 0x37800000, v5
+; GCN-IEEE-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; GCN-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v2, v6
+; GCN-IEEE-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GCN-IEEE-NEXT:    v_mul_f32_e32 v2, v2, v3
+; GCN-IEEE-NEXT:    v_div_scale_f32 v3, s[0:1], v2, v2, v4
+; GCN-IEEE-NEXT:    v_rcp_f32_e32 v5, v3
+; GCN-IEEE-NEXT:    v_div_scale_f32 v6, vcc, v4, v2, v4
+; GCN-IEEE-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; GCN-IEEE-NEXT:    v_fma_f32 v7, -v3, v5, 1.0
+; GCN-IEEE-NEXT:    v_fma_f32 v5, v7, v5, v5
+; GCN-IEEE-NEXT:    v_mul_f32_e32 v7, v6, v5
+; GCN-IEEE-NEXT:    v_fma_f32 v8, -v3, v7, v6
+; GCN-IEEE-NEXT:    v_fma_f32 v7, v8, v5, v7
+; GCN-IEEE-NEXT:    v_fma_f32 v3, -v3, v7, v6
+; GCN-IEEE-NEXT:    v_div_fmas_f32 v3, v3, v5, v7
+; GCN-IEEE-NEXT:    v_div_fixup_f32 v2, v3, v2, v4
+; GCN-IEEE-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-IEEE-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
   %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
@@ -369,123 +220,49 @@ define amdgpu_kernel void @rsqrt_fmul(ptr addrspace(1) %out, ptr addrspace(1) %i
 }
 
 define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) {
-; GCN-DAZ-SAFE-LABEL: neg_rsq_f32:
-; GCN-DAZ-SAFE:       ; %bb.0:
-; GCN-DAZ-SAFE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s6, -1
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s10, s6
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s11, s7
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s8, s2
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s9, s3
-; GCN-DAZ-SAFE-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s2, 0xf800000
-; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s4, s0
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s5, s1
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0)
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
-; GCN-DAZ-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v0
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v1, v0
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, v0, v1
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, 0.5, v1
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, -v1, v3, 0.5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v3, v3, v4, v3
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v1, v1, v4, v1
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, -v3, v3, v0
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v1, v4, v1, v3
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v1
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v0, -v0
-; GCN-DAZ-SAFE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; GCN-DAZ-SAFE-NEXT:    s_endpgm
-;
-; SI-IEEE-SAFE-LABEL: neg_rsq_f32:
-; SI-IEEE-SAFE:       ; %bb.0:
-; SI-IEEE-SAFE-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s7, 0xf000
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s6, -1
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s2, s6
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s3, s7
-; SI-IEEE-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s0, s10
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s1, s11
-; SI-IEEE-SAFE-NEXT:    buffer_load_dword v0, off, s[0:3], 0
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s0, 0xf800000
-; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v1, 0x260
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s2, 0x7f800000
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, s8
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s5, s9
-; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0)
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e64 s[0:1], s0, v0
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v2, v0
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e32 v3, vcc, -1, v2
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v5, -v3, v2, v0
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v6, -v4, v2, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e32 vcc, 0, v5
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, 0, v6
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v1, -v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, s2
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, -v0, v1, s[0:1]
-; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
-; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
-; SI-IEEE-SAFE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; SI-IEEE-SAFE-NEXT:    s_endpgm
+; GCN-DAZ-LABEL: neg_rsq_f32:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-DAZ-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-DAZ-NEXT:    s_mov_b32 s6, -1
+; GCN-DAZ-NEXT:    s_mov_b32 s10, s6
+; GCN-DAZ-NEXT:    s_mov_b32 s11, s7
+; GCN-DAZ-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-DAZ-NEXT:    s_mov_b32 s8, s2
+; GCN-DAZ-NEXT:    s_mov_b32 s9, s3
+; GCN-DAZ-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; GCN-DAZ-NEXT:    s_mov_b32 s4, s0
+; GCN-DAZ-NEXT:    s_mov_b32 s5, s1
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0)
+; GCN-DAZ-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-DAZ-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-DAZ-NEXT:    s_endpgm
 ;
-; CI-IEEE-SAFE-LABEL: neg_rsq_f32:
-; CI-IEEE-SAFE:       ; %bb.0:
-; CI-IEEE-SAFE-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s7, 0xf000
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s6, -1
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s2, s6
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s3, s7
-; CI-IEEE-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s0, s10
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s1, s11
-; CI-IEEE-SAFE-NEXT:    buffer_load_dword v0, off, s[0:3], 0
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s0, 0xf800000
-; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v1, 0x260
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s4, s8
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s5, s9
-; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0)
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e64 s[0:1], s0, v0
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v2, v0
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e32 v3, vcc, -1, v2
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v5, -v3, v2, v0
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v6, -v4, v2, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e32 vcc, 0, v5
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, 0, v6
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v1, -v0
-; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
-; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
-; CI-IEEE-SAFE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; CI-IEEE-SAFE-NEXT:    s_endpgm
+; GCN-IEEE-LABEL: neg_rsq_f32:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-IEEE-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-IEEE-NEXT:    s_mov_b32 s6, -1
+; GCN-IEEE-NEXT:    s_mov_b32 s10, s6
+; GCN-IEEE-NEXT:    s_mov_b32 s11, s7
+; GCN-IEEE-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-IEEE-NEXT:    s_mov_b32 s8, s2
+; GCN-IEEE-NEXT:    s_mov_b32 s9, s3
+; GCN-IEEE-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; GCN-IEEE-NEXT:    s_mov_b32 s2, 0x800000
+; GCN-IEEE-NEXT:    s_mov_b32 s4, s0
+; GCN-IEEE-NEXT:    s_mov_b32 s5, s1
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0)
+; GCN-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v0
+; GCN-IEEE-NEXT:    s_and_b64 s[2:3], vcc, exec
+; GCN-IEEE-NEXT:    s_cselect_b32 s2, 24, 0
+; GCN-IEEE-NEXT:    v_ldexp_f32_e64 v0, v0, s2
+; GCN-IEEE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT:    s_cselect_b32 s0, 12, 0
+; GCN-IEEE-NEXT:    v_ldexp_f32_e64 v0, -v0, s0
+; GCN-IEEE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-IEEE-NEXT:    s_endpgm
   %val = load float, ptr addrspace(1) %in, align 4
   %sqrt = call contract float @llvm.sqrt.f32(float %val)
   %div = fdiv contract float -1.0, %sqrt, !fpmath !0
@@ -494,123 +271,49 @@ define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrsp
 }
 
 define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) {
-; GCN-DAZ-SAFE-LABEL: neg_rsq_neg_f32:
-; GCN-DAZ-SAFE:       ; %bb.0:
-; GCN-DAZ-SAFE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s6, -1
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s10, s6
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s11, s7
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s8, s2
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s9, s3
-; GCN-DAZ-SAFE-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s2, 0x8f800000
-; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s4, s0
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s5, s1
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0)
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, 0xcf800000, v0
-; GCN-DAZ-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v0
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v1, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v1, v0
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, v0, v1
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, 0.5, v1
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, -v1, v3, 0.5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v3, v3, v4, v3
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v1, v1, v4, v1
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, -v3, v3, v0
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v1, v4, v1, v3
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v1
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v0, -v0
-; GCN-DAZ-SAFE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; GCN-DAZ-SAFE-NEXT:    s_endpgm
-;
-; SI-IEEE-SAFE-LABEL: neg_rsq_neg_f32:
-; SI-IEEE-SAFE:       ; %bb.0:
-; SI-IEEE-SAFE-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s7, 0xf000
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s6, -1
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s2, s6
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s3, s7
-; SI-IEEE-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s0, s10
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s1, s11
-; SI-IEEE-SAFE-NEXT:    buffer_load_dword v0, off, s[0:3], 0
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s0, 0x8f800000
-; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v1, 0x260
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s2, 0x7f800000
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, s8
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s5, s9
-; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0)
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[0:1], s0, v0
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v2, s[0:1]
-; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v2, v0
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e32 v3, vcc, -1, v2
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v5, -v3, v2, v0
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v6, -v4, v2, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e32 vcc, 0, v5
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, 0, v6
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v1, -v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, s2
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, -v0, v1, s[0:1]
-; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
-; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
-; SI-IEEE-SAFE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; SI-IEEE-SAFE-NEXT:    s_endpgm
-;
-; CI-IEEE-SAFE-LABEL: neg_rsq_neg_f32:
-; CI-IEEE-SAFE:       ; %bb.0:
-; CI-IEEE-SAFE-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s7, 0xf000
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s6, -1
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s2, s6
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s3, s7
-; CI-IEEE-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s0, s10
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s1, s11
-; CI-IEEE-SAFE-NEXT:    buffer_load_dword v0, off, s[0:3], 0
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s0, 0x8f800000
-; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v1, 0x260
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s4, s8
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s5, s9
-; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0)
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[0:1], s0, v0
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v2, s[0:1]
-; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v2, v0
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e32 v3, vcc, -1, v2
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v5, -v3, v2, v0
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v6, -v4, v2, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e32 vcc, 0, v5
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, 0, v6
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v1, -v0
-; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
-; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
-; CI-IEEE-SAFE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; CI-IEEE-SAFE-NEXT:    s_endpgm
+; GCN-DAZ-LABEL: neg_rsq_neg_f32:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-DAZ-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-DAZ-NEXT:    s_mov_b32 s6, -1
+; GCN-DAZ-NEXT:    s_mov_b32 s10, s6
+; GCN-DAZ-NEXT:    s_mov_b32 s11, s7
+; GCN-DAZ-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-DAZ-NEXT:    s_mov_b32 s8, s2
+; GCN-DAZ-NEXT:    s_mov_b32 s9, s3
+; GCN-DAZ-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; GCN-DAZ-NEXT:    s_mov_b32 s4, s0
+; GCN-DAZ-NEXT:    s_mov_b32 s5, s1
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0)
+; GCN-DAZ-NEXT:    v_rsq_f32_e64 v0, -v0
+; GCN-DAZ-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-DAZ-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-DAZ-NEXT:    s_endpgm
+;
+; GCN-IEEE-LABEL: neg_rsq_neg_f32:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-IEEE-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-IEEE-NEXT:    s_mov_b32 s6, -1
+; GCN-IEEE-NEXT:    s_mov_b32 s10, s6
+; GCN-IEEE-NEXT:    s_mov_b32 s11, s7
+; GCN-IEEE-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-IEEE-NEXT:    s_mov_b32 s8, s2
+; GCN-IEEE-NEXT:    s_mov_b32 s9, s3
+; GCN-IEEE-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; GCN-IEEE-NEXT:    s_mov_b32 s2, 0x80800000
+; GCN-IEEE-NEXT:    s_mov_b32 s4, s0
+; GCN-IEEE-NEXT:    s_mov_b32 s5, s1
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0)
+; GCN-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v0
+; GCN-IEEE-NEXT:    s_and_b64 s[2:3], vcc, exec
+; GCN-IEEE-NEXT:    s_cselect_b32 s2, 24, 0
+; GCN-IEEE-NEXT:    v_ldexp_f32_e64 v0, -v0, s2
+; GCN-IEEE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT:    s_cselect_b32 s0, 12, 0
+; GCN-IEEE-NEXT:    v_ldexp_f32_e64 v0, -v0, s0
+; GCN-IEEE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-IEEE-NEXT:    s_endpgm
   %val = load float, ptr addrspace(1) %in, align 4
   %val.fneg = fneg float %val
   %sqrt = call contract float @llvm.sqrt.f32(float %val.fneg)
@@ -620,87 +323,24 @@ define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr ad
 }
 
 define float @v_neg_rsq_neg_f32(float %val) {
-; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_f32:
-; GCN-DAZ-SAFE:       ; %bb.0:
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s4, 0x8f800000
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, 0xcf800000, v0
-; GCN-DAZ-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v1, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v1, v0
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, v0, v1
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, 0.5, v1
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v3, -v1, v2, 0.5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v2, v3, v2
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, -v2, v2, v0
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v1, v1, v3, v1
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v1, v4, v1, v2
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
-; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v0, -v0
-; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
-;
-; SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32:
-; SI-IEEE-SAFE:       ; %bb.0:
-; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x8f800000
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, 0xcf800000, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v1, vcc
-; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v1, v0
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v2, s[4:5], -1, v1
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v3, -v2, v1, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v3
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], 1, v1
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v1, -v3, v1, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v1
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v2, v3, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
-; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x7f800000
-; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v1, -v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s4
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, -v0, v1, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
-; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
-; SI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-DAZ-LABEL: v_neg_rsq_neg_f32:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT:    v_rsq_f32_e64 v0, -v0
+; GCN-DAZ-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
-; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32:
-; CI-IEEE-SAFE:       ; %bb.0:
-; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x8f800000
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, 0xcf800000, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v1, vcc
-; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v1, v0
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v2, s[4:5], -1, v1
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v3, -v2, v1, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v3
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], 1, v1
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v1, -v3, v1, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v1
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v2, v3, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
-; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v1, -v0
-; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
-; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
-; CI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-IEEE-LABEL: v_neg_rsq_neg_f32:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT:    s_mov_b32 s4, 0x80800000
+; GCN-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
+; GCN-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 24, vcc
+; GCN-IEEE-NEXT:    v_ldexp_f32_e64 v0, -v0, v1
+; GCN-IEEE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 12, vcc
+; GCN-IEEE-NEXT:    v_ldexp_f32_e64 v0, -v0, v1
+; GCN-IEEE-NEXT:    s_setpc_b64 s[30:31]
   %val.fneg = fneg float %val
   %sqrt = call contract float @llvm.sqrt.f32(float %val.fneg)
   %div = fdiv contract float -1.0, %sqrt, !fpmath !0
@@ -708,150 +348,32 @@ define float @v_neg_rsq_neg_f32(float %val) {
 }
 
 define <2 x float> @v_neg_rsq_neg_v2f32(<2 x float> %val) {
-; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_v2f32:
-; GCN-DAZ-SAFE:       ; %bb.0:
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s4, 0x8f800000
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s5, 0x4f800000
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e64 v2, -v1, s5
-; GCN-DAZ-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v1
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e64 v1, -v1, v2, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v2, v1
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, v1, v2
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, 0.5, v2
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, -v2, v3, 0.5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v3, v3, v4, v3
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v5, -v3, v3, v1
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v2, v4, v2
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v5, v2, v3
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e64 v3, -v0, s5
-; GCN-DAZ-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v3, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v3, v0
-; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v4, 0x260
-; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, v4
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[4:5]
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, v0, v3
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, 0.5, v3
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v5, -v3, v2, 0.5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v2, v5, v2
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v6, -v2, v2, v0
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v3, v3, v5, v3
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v6, v3, v2
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v4
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v0, -v0
-; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v1, -v1
-; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
-;
-; SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32:
-; SI-IEEE-SAFE:       ; %bb.0:
-; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s6, 0x8f800000
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s7, 0x4f800000
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e64 v2, -v1, s7
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v1
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, -v1, v2, vcc
-; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v2, v1
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], -1, v2
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v3, v2, v1
-; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v4
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v3, v2, v3, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v4, s[4:5], 1, v2
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v2, -v4, v2, v1
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v2
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e64 v4, -v0, s7
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v4, vcc
-; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v4, v0
-; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v3, 0x260
-; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, v3
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v2, s[4:5], -1, v4
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v5, -v2, v4, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v5
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v5, s[4:5], 1, v4
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v5, v4, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v4
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v4, 0x37800000, v2
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v3
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s6, 0x7f800000
-; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v2, -v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s6
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, -v0, v2, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v2, v2
-; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
-; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v2, -v1
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v1|, s6
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, -v1, v2, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v2, v2
-; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
-; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v1, vcc, 0, v1
-; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v1, v2, v1
-; SI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-DAZ-LABEL: v_neg_rsq_neg_v2f32:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT:    v_rsq_f32_e64 v0, -v0
+; GCN-DAZ-NEXT:    v_rsq_f32_e64 v1, -v1
+; GCN-DAZ-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-DAZ-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
-; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32:
-; CI-IEEE-SAFE:       ; %bb.0:
-; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s6, 0x8f800000
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s7, 0x4f800000
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e64 v2, -v1, s7
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v1
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, -v1, v2, vcc
-; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v2, v1
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], -1, v2
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v3, v2, v1
-; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v4
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v3, v2, v3, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v4, s[4:5], 1, v2
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v2, -v4, v2, v1
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v2
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e64 v4, -v0, s7
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v4, vcc
-; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v4, v0
-; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v3, 0x260
-; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, v3
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v2, s[4:5], -1, v4
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v5, -v2, v4, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v5
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v5, s[4:5], 1, v4
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v5, v4, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v4
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v4, 0x37800000, v2
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v3
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v2, -v0
-; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v2, v2
-; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
-; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v2, -v1
-; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v2, v2
-; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
-; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v1, vcc, 0, v1
-; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v1, v2, v1
-; CI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-IEEE-LABEL: v_neg_rsq_neg_v2f32:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT:    s_mov_b32 s4, 0x80800000
+; GCN-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
+; GCN-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 24, vcc
+; GCN-IEEE-NEXT:    v_cmp_lt_f32_e64 s[4:5], s4, v1
+; GCN-IEEE-NEXT:    v_ldexp_f32_e64 v0, -v0, v2
+; GCN-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 24, s[4:5]
+; GCN-IEEE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT:    v_ldexp_f32_e64 v1, -v1, v2
+; GCN-IEEE-NEXT:    v_rsq_f32_e32 v1, v1
+; GCN-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 12, vcc
+; GCN-IEEE-NEXT:    v_ldexp_f32_e64 v0, -v0, v2
+; GCN-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 12, s[4:5]
+; GCN-IEEE-NEXT:    v_ldexp_f32_e64 v1, -v1, v2
+; GCN-IEEE-NEXT:    s_setpc_b64 s[30:31]
   %val.fneg = fneg <2 x float> %val
   %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val.fneg)
   %div = fdiv contract <2 x float> <float -1.0, float -1.0>, %sqrt, !fpmath !0
@@ -859,90 +381,25 @@ define <2 x float> @v_neg_rsq_neg_v2f32(<2 x float> %val) {
 }
 
 define float @v_neg_rsq_neg_f32_foldable_user(float %val0, float %val1) {
-; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_f32_foldable_user:
-; GCN-DAZ-SAFE:       ; %bb.0:
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s4, 0x8f800000
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v0
-; GCN-DAZ-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v2, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v2, v0
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, v0, v2
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, 0.5, v2
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, -v2, v3, 0.5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v3, v3, v4, v3
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v5, -v3, v3, v0
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v2, v4, v2
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v5, v2, v3
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v3, 0x260
-; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v3
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v0, -v0
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
-;
-; SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32_foldable_user:
-; SI-IEEE-SAFE:       ; %bb.0:
-; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x8f800000
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v2, vcc
-; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v2, v0
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], -1, v2
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v3, v2, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v4
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v3, v2, v3, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v4, s[4:5], 1, v2
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v2, -v4, v2, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v2
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v3, 0x260
-; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v3
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x7f800000
-; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v2, -v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s4
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, -v0, v2, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v2, v2
-; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v1
-; SI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-DAZ-LABEL: v_neg_rsq_neg_f32_foldable_user:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT:    v_rsq_f32_e64 v0, -v0
+; GCN-DAZ-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
-; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32_foldable_user:
-; CI-IEEE-SAFE:       ; %bb.0:
-; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x8f800000
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v2, vcc
-; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v2, v0
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], -1, v2
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v3, v2, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v4
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v3, v2, v3, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v4, s[4:5], 1, v2
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v2, -v4, v2, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v2
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v3, 0x260
-; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v3
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v2, -v0
-; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v2, v2
-; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v1
-; CI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-IEEE-LABEL: v_neg_rsq_neg_f32_foldable_user:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT:    s_mov_b32 s4, 0x80800000
+; GCN-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
+; GCN-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 24, vcc
+; GCN-IEEE-NEXT:    v_ldexp_f32_e64 v0, -v0, v2
+; GCN-IEEE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 12, vcc
+; GCN-IEEE-NEXT:    v_ldexp_f32_e64 v0, -v0, v2
+; GCN-IEEE-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-IEEE-NEXT:    s_setpc_b64 s[30:31]
   %val0.neg = fneg float %val0
   %sqrt = call contract float @llvm.sqrt.f32(float %val0.neg)
   %div = fdiv contract float -1.0, %sqrt, !fpmath !0
@@ -951,156 +408,34 @@ define float @v_neg_rsq_neg_f32_foldable_user(float %val0, float %val1) {
 }
 
 define <2 x float> @v_neg_rsq_neg_v2f32_foldable_user(<2 x float> %val0, <2 x float> %val1) {
-; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user:
-; GCN-DAZ-SAFE:       ; %bb.0:
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s4, 0x8f800000
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s5, 0x4f800000
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e64 v4, -v1, s5
-; GCN-DAZ-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v1
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e64 v1, -v1, v4, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v4, v1
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v5, v1, v4
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v4, 0.5, v4
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v6, -v4, v5, 0.5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v5, v5, v6, v5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v7, -v5, v5, v1
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, v4, v6, v4
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, v7, v4, v5
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v5, 0x37800000, v4
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e64 v5, -v0, s5
-; GCN-DAZ-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v5, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v5, v0
-; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v6, 0x260
-; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, v6
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[4:5]
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v4, v0, v5
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v5, 0.5, v5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v7, -v5, v4, 0.5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, v4, v7, v4
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v8, -v4, v4, v0
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v5, v5, v7, v5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, v8, v5, v4
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v5, 0x37800000, v4
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v6
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v0, -v0
-; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v1, -v1
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
-;
-; SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user:
-; SI-IEEE-SAFE:       ; %bb.0:
-; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s6, 0x8f800000
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s7, 0x4f800000
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e64 v4, -v1, s7
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v1
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, -v1, v4, vcc
-; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v4, v1
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v5, s[4:5], -1, v4
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v6, -v5, v4, v1
-; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v6
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v5, v4, v5, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v6, s[4:5], 1, v4
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v6, v4, v1
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v4
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v4, v5, v6, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v5, 0x37800000, v4
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e64 v6, -v0, s7
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v6, vcc
-; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v6, v0
-; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v5, 0x260
-; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, v5
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v4, s[4:5], -1, v6
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v7, -v4, v6, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v7
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v7, s[4:5], 1, v6
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v6, -v7, v6, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v6
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v6, 0x37800000, v4
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v5
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s6, 0x7f800000
-; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v4, -v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s6
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v4, -v0, v4, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v4, v4
-; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v4, v0
-; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v4, -v1
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v1|, s6
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v4, -v1, v4, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v4, v4
-; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
-; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v1, vcc, 0, v1
-; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v1, v4, v1
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v2
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, v1, v3
-; SI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-DAZ-LABEL: v_neg_rsq_neg_v2f32_foldable_user:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT:    v_rsq_f32_e64 v0, -v0
+; GCN-DAZ-NEXT:    v_rsq_f32_e64 v1, -v1
+; GCN-DAZ-NEXT:    v_mul_f32_e64 v0, -v0, v2
+; GCN-DAZ-NEXT:    v_mul_f32_e64 v1, -v1, v3
+; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
-; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user:
-; CI-IEEE-SAFE:       ; %bb.0:
-; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s6, 0x8f800000
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s7, 0x4f800000
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e64 v4, -v1, s7
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v1
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, -v1, v4, vcc
-; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v4, v1
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v5, s[4:5], -1, v4
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v6, -v5, v4, v1
-; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v6
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v5, v4, v5, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v6, s[4:5], 1, v4
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v6, v4, v1
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v4
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v4, v5, v6, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v5, 0x37800000, v4
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e64 v6, -v0, s7
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v6, vcc
-; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v6, v0
-; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v5, 0x260
-; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, v5
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v4, s[4:5], -1, v6
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v7, -v4, v6, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v7
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v7, s[4:5], 1, v6
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v6, -v7, v6, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v6
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v6, 0x37800000, v4
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v5
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v4, -v0
-; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v4, v4
-; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v4, v0
-; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v4, -v1
-; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v4, v4
-; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
-; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v1, vcc, 0, v1
-; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v1, v4, v1
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v2
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, v1, v3
-; CI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-IEEE-LABEL: v_neg_rsq_neg_v2f32_foldable_user:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT:    s_mov_b32 s4, 0x80800000
+; GCN-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
+; GCN-IEEE-NEXT:    v_cndmask_b32_e64 v4, 0, 24, vcc
+; GCN-IEEE-NEXT:    v_cmp_lt_f32_e64 s[4:5], s4, v1
+; GCN-IEEE-NEXT:    v_ldexp_f32_e64 v0, -v0, v4
+; GCN-IEEE-NEXT:    v_cndmask_b32_e64 v4, 0, 24, s[4:5]
+; GCN-IEEE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT:    v_ldexp_f32_e64 v1, -v1, v4
+; GCN-IEEE-NEXT:    v_rsq_f32_e32 v1, v1
+; GCN-IEEE-NEXT:    v_cndmask_b32_e64 v4, 0, 12, vcc
+; GCN-IEEE-NEXT:    v_ldexp_f32_e64 v0, -v0, v4
+; GCN-IEEE-NEXT:    v_cndmask_b32_e64 v4, 0, 12, s[4:5]
+; GCN-IEEE-NEXT:    v_ldexp_f32_e64 v1, -v1, v4
+; GCN-IEEE-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GCN-IEEE-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GCN-IEEE-NEXT:    s_setpc_b64 s[30:31]
   %val0.fneg = fneg <2 x float> %val0
   %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val0.fneg)
   %div = fdiv contract <2 x float> <float -1.0, float -1.0>, %sqrt, !fpmath !0
@@ -1109,324 +444,81 @@ define <2 x float> @v_neg_rsq_neg_v2f32_foldable_user(<2 x float> %val0, <2 x fl
 }
 
 define float @v_neg_rsq_f32(float %val) {
-; GCN-DAZ-SAFE-LABEL: v_neg_rsq_f32:
-; GCN-DAZ-SAFE:       ; %bb.0:
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
-; GCN-DAZ-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v1, v0
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, v0, v1
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, 0.5, v1
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v3, -v1, v2, 0.5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v2, v3, v2
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, -v2, v2, v0
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v1, v1, v3, v1
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v1, v4, v1, v2
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
-; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v0, -v0
-; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
-;
-; SI-IEEE-SAFE-LABEL: v_neg_rsq_f32:
-; SI-IEEE-SAFE:       ; %bb.0:
-; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v1, v0
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v2, s[4:5], -1, v1
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v3, -v2, v1, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v3
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], 1, v1
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v1, -v3, v1, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v1
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v2, v3, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
-; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x7f800000
-; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v1, -v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s4
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, -v0, v1, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
-; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
-; SI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-DAZ-LABEL: v_neg_rsq_f32:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
-; CI-IEEE-SAFE-LABEL: v_neg_rsq_f32:
-; CI-IEEE-SAFE:       ; %bb.0:
-; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v1, v0
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v2, s[4:5], -1, v1
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v3, -v2, v1, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v3
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], 1, v1
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v1, -v3, v1, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v1
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v2, v3, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
-; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v1, -v0
-; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
-; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
-; CI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-IEEE-LABEL: v_neg_rsq_f32:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT:    s_mov_b32 s4, 0x800000
+; GCN-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GCN-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 24, vcc
+; GCN-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GCN-IEEE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 12, vcc
+; GCN-IEEE-NEXT:    v_ldexp_f32_e64 v0, -v0, v1
+; GCN-IEEE-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract float @llvm.sqrt.f32(float %val)
   %div = fdiv contract float -1.0, %sqrt, !fpmath !0
   ret float %div
 }
 
 define <2 x float> @v_neg_rsq_v2f32(<2 x float> %val) {
-; GCN-DAZ-SAFE-LABEL: v_neg_rsq_v2f32:
-; GCN-DAZ-SAFE:       ; %bb.0:
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v1
-; GCN-DAZ-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v1
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v2, v1
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, v1, v2
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, 0.5, v2
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, -v2, v3, 0.5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v3, v3, v4, v3
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v5, -v3, v3, v1
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v2, v4, v2
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v5, v2, v3
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, 0x4f800000, v0
-; GCN-DAZ-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v3, v0
-; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v4, 0x260
-; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, v4
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[4:5]
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, v0, v3
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, 0.5, v3
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v5, -v3, v2, 0.5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v2, v5, v2
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v6, -v2, v2, v0
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v3, v3, v5, v3
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v6, v3, v2
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v4
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v0, -v0
-; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v1, -v1
-; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
-;
-; SI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32:
-; SI-IEEE-SAFE:       ; %bb.0:
-; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s6, 0xf800000
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v1
-; SI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v1
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v2, v1
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], -1, v2
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v3, v2, v1
-; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v4
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v3, v2, v3, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v4, s[4:5], 1, v2
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v2, -v4, v2, v1
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v2
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v0
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v4, v0
-; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v3, 0x260
-; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, v3
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v2, s[4:5], -1, v4
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v5, -v2, v4, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v5
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v5, s[4:5], 1, v4
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v5, v4, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v4
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v4, 0x37800000, v2
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v3
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s6, 0x7f800000
-; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v2, -v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s6
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, -v0, v2, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v2, v2
-; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
-; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v2, -v1
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v1|, s6
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, -v1, v2, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v2, v2
-; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
-; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v1, vcc, 0, v1
-; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v1, v2, v1
-; SI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-DAZ-LABEL: v_neg_rsq_v2f32:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    v_rsq_f32_e32 v1, v1
+; GCN-DAZ-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-DAZ-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
-; CI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32:
-; CI-IEEE-SAFE:       ; %bb.0:
-; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s6, 0xf800000
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v1
-; CI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v1
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v2, v1
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], -1, v2
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v3, v2, v1
-; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v4
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v3, v2, v3, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v4, s[4:5], 1, v2
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v2, -v4, v2, v1
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v2
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v0
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v4, v0
-; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v3, 0x260
-; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, v3
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v2, s[4:5], -1, v4
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v5, -v2, v4, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v5
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v5, s[4:5], 1, v4
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v5, v4, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v4
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v4, 0x37800000, v2
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v3
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v2, -v0
-; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v2, v2
-; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
-; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v2, -v1
-; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v2, v2
-; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
-; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v1, vcc, 0, v1
-; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v1, v2, v1
-; CI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-IEEE-LABEL: v_neg_rsq_v2f32:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT:    s_mov_b32 s4, 0x800000
+; GCN-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GCN-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 24, vcc
+; GCN-IEEE-NEXT:    v_cmp_gt_f32_e64 s[4:5], s4, v1
+; GCN-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
+; GCN-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 24, s[4:5]
+; GCN-IEEE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v2
+; GCN-IEEE-NEXT:    v_rsq_f32_e32 v1, v1
+; GCN-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 12, vcc
+; GCN-IEEE-NEXT:    v_ldexp_f32_e64 v0, -v0, v2
+; GCN-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 12, s[4:5]
+; GCN-IEEE-NEXT:    v_ldexp_f32_e64 v1, -v1, v2
+; GCN-IEEE-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val)
   %div = fdiv contract <2 x float> <float -1.0, float -1.0>, %sqrt, !fpmath !0
   ret <2 x float> %div
 }
 
 define float @v_neg_rsq_f32_foldable_user(float %val0, float %val1) {
-; GCN-DAZ-SAFE-LABEL: v_neg_rsq_f32_foldable_user:
-; GCN-DAZ-SAFE:       ; %bb.0:
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
-; GCN-DAZ-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v2, v0
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, v0, v2
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, 0.5, v2
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, -v2, v3, 0.5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v3, v3, v4, v3
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v5, -v3, v3, v0
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v2, v4, v2
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v5, v2, v3
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v3, 0x260
-; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v3
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v0, -v0
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
-;
-; SI-IEEE-SAFE-LABEL: v_neg_rsq_f32_foldable_user:
-; SI-IEEE-SAFE:       ; %bb.0:
-; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v2, v0
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], -1, v2
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v3, v2, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v4
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v3, v2, v3, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v4, s[4:5], 1, v2
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v2, -v4, v2, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v2
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v3, 0x260
-; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v3
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x7f800000
-; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v2, -v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s4
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, -v0, v2, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v2, v2
-; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v1
-; SI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-DAZ-LABEL: v_neg_rsq_f32_foldable_user:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
-; CI-IEEE-SAFE-LABEL: v_neg_rsq_f32_foldable_user:
-; CI-IEEE-SAFE:       ; %bb.0:
-; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v2, v0
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], -1, v2
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v3, v2, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v4
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v3, v2, v3, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v4, s[4:5], 1, v2
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v2, -v4, v2, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v2
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v3, 0x260
-; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v3
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v2, -v0
-; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v2, v2
-; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v1
-; CI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-IEEE-LABEL: v_neg_rsq_f32_foldable_user:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT:    s_mov_b32 s4, 0x800000
+; GCN-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GCN-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 24, vcc
+; GCN-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
+; GCN-IEEE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 12, vcc
+; GCN-IEEE-NEXT:    v_ldexp_f32_e64 v0, -v0, v2
+; GCN-IEEE-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-IEEE-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract float @llvm.sqrt.f32(float %val0)
   %div = fdiv contract float -1.0, %sqrt, !fpmath !0
   %user = fmul contract float %div, %val1
@@ -1434,153 +526,34 @@ define float @v_neg_rsq_f32_foldable_user(float %val0, float %val1) {
 }
 
 define <2 x float> @v_neg_rsq_v2f32_foldable_user(<2 x float> %val0, <2 x float> %val1) {
-; GCN-DAZ-SAFE-LABEL: v_neg_rsq_v2f32_foldable_user:
-; GCN-DAZ-SAFE:       ; %bb.0:
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v1
-; GCN-DAZ-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v1
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v4, v1
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v5, v1, v4
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v4, 0.5, v4
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v6, -v4, v5, 0.5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v5, v5, v6, v5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v7, -v5, v5, v1
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, v4, v6, v4
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, v7, v4, v5
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v5, 0x37800000, v4
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v5, 0x4f800000, v0
-; GCN-DAZ-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v5, v0
-; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v6, 0x260
-; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, v6
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[4:5]
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v4, v0, v5
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v5, 0.5, v5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v7, -v5, v4, 0.5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, v4, v7, v4
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v8, -v4, v4, v0
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v5, v5, v7, v5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, v8, v5, v4
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v5, 0x37800000, v4
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v6
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v0, -v0
-; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v1, -v1
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
-;
-; SI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32_foldable_user:
-; SI-IEEE-SAFE:       ; %bb.0:
-; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s6, 0xf800000
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v1
-; SI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v1
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v4, v1
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v5, s[4:5], -1, v4
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v6, -v5, v4, v1
-; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v6
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v5, v4, v5, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v6, s[4:5], 1, v4
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v6, v4, v1
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v4
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v4, v5, v6, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v5, 0x37800000, v4
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v6, 0x4f800000, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v0
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
-; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v6, v0
-; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v5, 0x260
-; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, v5
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v4, s[4:5], -1, v6
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v7, -v4, v6, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v7
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v7, s[4:5], 1, v6
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v6, -v7, v6, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v6
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v6, 0x37800000, v4
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v5
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s6, 0x7f800000
-; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v4, -v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s6
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v4, -v0, v4, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v4, v4
-; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v4, v0
-; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v4, -v1
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v1|, s6
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v4, -v1, v4, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v4, v4
-; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
-; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v1, vcc, 0, v1
-; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v1, v4, v1
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v2
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, v1, v3
-; SI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-DAZ-LABEL: v_neg_rsq_v2f32_foldable_user:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    v_rsq_f32_e32 v1, v1
+; GCN-DAZ-NEXT:    v_mul_f32_e64 v0, -v0, v2
+; GCN-DAZ-NEXT:    v_mul_f32_e64 v1, -v1, v3
+; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
-; CI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32_foldable_user:
-; CI-IEEE-SAFE:       ; %bb.0:
-; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s6, 0xf800000
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v1
-; CI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v1
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v4, v1
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v5, s[4:5], -1, v4
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v6, -v5, v4, v1
-; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v6
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v5, v4, v5, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v6, s[4:5], 1, v4
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v6, v4, v1
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v4
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v4, v5, v6, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v5, 0x37800000, v4
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v6, 0x4f800000, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v0
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
-; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v6, v0
-; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v5, 0x260
-; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, v5
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v4, s[4:5], -1, v6
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v7, -v4, v6, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v7
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v7, s[4:5], 1, v6
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v6, -v7, v6, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v6
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v6, 0x37800000, v4
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v5
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v4, -v0
-; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v4, v4
-; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v4, v0
-; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v4, -v1
-; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v4, v4
-; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
-; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v1, vcc, 0, v1
-; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v1, v4, v1
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v2
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, v1, v3
-; CI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-IEEE-LABEL: v_neg_rsq_v2f32_foldable_user:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT:    s_mov_b32 s4, 0x800000
+; GCN-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GCN-IEEE-NEXT:    v_cndmask_b32_e64 v4, 0, 24, vcc
+; GCN-IEEE-NEXT:    v_cmp_gt_f32_e64 s[4:5], s4, v1
+; GCN-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v4
+; GCN-IEEE-NEXT:    v_cndmask_b32_e64 v4, 0, 24, s[4:5]
+; GCN-IEEE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v4
+; GCN-IEEE-NEXT:    v_rsq_f32_e32 v1, v1
+; GCN-IEEE-NEXT:    v_cndmask_b32_e64 v4, 0, 12, vcc
+; GCN-IEEE-NEXT:    v_ldexp_f32_e64 v0, -v0, v4
+; GCN-IEEE-NEXT:    v_cndmask_b32_e64 v4, 0, 12, s[4:5]
+; GCN-IEEE-NEXT:    v_ldexp_f32_e64 v1, -v1, v4
+; GCN-IEEE-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GCN-IEEE-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GCN-IEEE-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val0)
   %div = fdiv contract <2 x float> <float -1.0, float -1.0>, %sqrt, !fpmath !0
   %user = fmul contract <2 x float> %div, %val1
@@ -1594,29 +567,29 @@ define float @v_rsq_f32(float %val) {
 ; GCN-DAZ-NEXT:    v_rsq_f32_e32 v0, v0
 ; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-IEEE-SAFE-LABEL: v_rsq_f32:
-; GCN-IEEE-SAFE:       ; %bb.0:
-; GCN-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x800000
-; GCN-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, 0, 24, vcc
-; GCN-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
-; GCN-IEEE-SAFE-NEXT:    v_rsq_f32_e32 v0, v0
-; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, 0, 12, vcc
-; GCN-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
-; GCN-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-IEEE-LABEL: v_rsq_f32:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT:    s_mov_b32 s4, 0x800000
+; GCN-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GCN-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 24, vcc
+; GCN-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GCN-IEEE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 12, vcc
+; GCN-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GCN-IEEE-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1
   %div = fdiv contract float 1.0, %sqrt, !fpmath !1
   ret float %div
 }
 
 define { float, float } @v_rsq_f32_multi_use(float %val) {
-; GCN-DAZ-SAFE-LABEL: v_rsq_f32_multi_use:
-; GCN-DAZ-SAFE:       ; %bb.0:
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT:    v_sqrt_f32_e32 v0, v0
-; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e32 v1, v0
-; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-DAZ-LABEL: v_rsq_f32_multi_use:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    v_rcp_f32_e32 v1, v0
+; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-IEEE-SAFE-LABEL: v_rsq_f32_multi_use:
 ; SI-IEEE-SAFE:       ; %bb.0:
@@ -1684,12 +657,12 @@ define { float, float } @v_rsq_f32_multi_use(float %val) {
 }
 
 define float @v_rsq_f32_missing_contract0(float %val) {
-; GCN-DAZ-SAFE-LABEL: v_rsq_f32_missing_contract0:
-; GCN-DAZ-SAFE:       ; %bb.0:
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT:    v_sqrt_f32_e32 v0, v0
-; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e32 v0, v0
-; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-DAZ-LABEL: v_rsq_f32_missing_contract0:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    v_rcp_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-IEEE-SAFE-LABEL: v_rsq_f32_missing_contract0:
 ; SI-IEEE-SAFE:       ; %bb.0:
@@ -1755,12 +728,12 @@ define float @v_rsq_f32_missing_contract0(float %val) {
 }
 
 define float @v_rsq_f32_missing_contract1(float %val) {
-; GCN-DAZ-SAFE-LABEL: v_rsq_f32_missing_contract1:
-; GCN-DAZ-SAFE:       ; %bb.0:
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT:    v_sqrt_f32_e32 v0, v0
-; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e32 v0, v0
-; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-DAZ-LABEL: v_rsq_f32_missing_contract1:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    v_rcp_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-IEEE-SAFE-LABEL: v_rsq_f32_missing_contract1:
 ; SI-IEEE-SAFE:       ; %bb.0:
@@ -1835,18 +808,18 @@ define float @v_rsq_f32_contractable_user(float %val0, float %val1) {
 ; GCN-DAZ-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-IEEE-SAFE-LABEL: v_rsq_f32_contractable_user:
-; GCN-IEEE-SAFE:       ; %bb.0:
-; GCN-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x800000
-; GCN-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, 0, 24, vcc
-; GCN-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
-; GCN-IEEE-SAFE-NEXT:    v_rsq_f32_e32 v0, v0
-; GCN-IEEE-SAFE-NEXT:    v_mov_b32_e32 v2, 0x45800000
-; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
-; GCN-IEEE-SAFE-NEXT:    v_fma_f32 v0, v0, v2, v1
-; GCN-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-IEEE-LABEL: v_rsq_f32_contractable_user:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT:    s_mov_b32 s4, 0x800000
+; GCN-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GCN-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 24, vcc
+; GCN-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
+; GCN-IEEE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT:    v_mov_b32_e32 v2, 0x45800000
+; GCN-IEEE-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
+; GCN-IEEE-NEXT:    v_fma_f32 v0, v0, v2, v1
+; GCN-IEEE-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract float @llvm.sqrt.f32(float %val0), !fpmath !1
   %div = fdiv contract float 1.0, %sqrt, !fpmath !1
   %add = fadd contract float %div, %val1
@@ -1862,18 +835,18 @@ define float @v_rsq_f32_contractable_user_missing_contract0(float %val0, float %
 ; GCN-DAZ-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-IEEE-SAFE-LABEL: v_rsq_f32_contractable_user_missing_contract0:
-; GCN-IEEE-SAFE:       ; %bb.0:
-; GCN-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x800000
-; GCN-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, 0, 24, vcc
-; GCN-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
-; GCN-IEEE-SAFE-NEXT:    v_rsq_f32_e32 v0, v0
-; GCN-IEEE-SAFE-NEXT:    v_mov_b32_e32 v2, 0x45800000
-; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
-; GCN-IEEE-SAFE-NEXT:    v_fma_f32 v0, v0, v2, v1
-; GCN-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-IEEE-LABEL: v_rsq_f32_contractable_user_missing_contract0:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT:    s_mov_b32 s4, 0x800000
+; GCN-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GCN-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 24, vcc
+; GCN-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
+; GCN-IEEE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT:    v_mov_b32_e32 v2, 0x45800000
+; GCN-IEEE-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
+; GCN-IEEE-NEXT:    v_fma_f32 v0, v0, v2, v1
+; GCN-IEEE-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract float @llvm.sqrt.f32(float %val0), !fpmath !1
   %div = fdiv contract float 1.0, %sqrt, !fpmath !1
   %add = fadd contract float %div, %val1
@@ -1889,18 +862,18 @@ define float @v_rsq_f32_contractable_user_missing_contract1(float %val0, float %
 ; GCN-DAZ-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-IEEE-SAFE-LABEL: v_rsq_f32_contractable_user_missing_contract1:
-; GCN-IEEE-SAFE:       ; %bb.0:
-; GCN-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x800000
-; GCN-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, 0, 24, vcc
-; GCN-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
-; GCN-IEEE-SAFE-NEXT:    v_rsq_f32_e32 v0, v0
-; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, 0, 12, vcc
-; GCN-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
-; GCN-IEEE-SAFE-NEXT:    v_add_f32_e32 v0, v0, v1
-; GCN-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-IEEE-LABEL: v_rsq_f32_contractable_user_missing_contract1:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT:    s_mov_b32 s4, 0x800000
+; GCN-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GCN-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 24, vcc
+; GCN-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
+; GCN-IEEE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 12, vcc
+; GCN-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
+; GCN-IEEE-NEXT:    v_add_f32_e32 v0, v0, v1
+; GCN-IEEE-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract float @llvm.sqrt.f32(float %val0), !fpmath !1
   %div = fdiv contract float 1.0, %sqrt, !fpmath !1
   %add = fadd float %div, %val1
@@ -1931,17 +904,17 @@ define float @v_rsq_f32_known_never_posdenormal(float nofpclass(psub) %val) {
 ; GCN-DAZ-NEXT:    v_rsq_f32_e32 v0, v0
 ; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-IEEE-SAFE-LABEL: v_rsq_f32_known_never_posdenormal:
-; GCN-IEEE-SAFE:       ; %bb.0:
-; GCN-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x800000
-; GCN-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, 0, 24, vcc
-; GCN-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
-; GCN-IEEE-SAFE-NEXT:    v_rsq_f32_e32 v0, v0
-; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, 0, 12, vcc
-; GCN-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
-; GCN-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-IEEE-LABEL: v_rsq_f32_known_never_posdenormal:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT:    s_mov_b32 s4, 0x800000
+; GCN-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GCN-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 24, vcc
+; GCN-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GCN-IEEE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT:    v_cndmask_b32_e64 v1, 0, 12, vcc
+; GCN-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GCN-IEEE-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1
   %div = fdiv contract float 1.0, %sqrt, !fpmath !1
   ret float %div
@@ -1953,4 +926,6 @@ define float @v_rsq_f32_known_never_posdenormal(float nofpclass(psub) %val) {
 attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; CI-DAZ-SAFE: {{.*}}
+; GCN-DAZ-SAFE: {{.*}}
+; GCN-IEEE-SAFE: {{.*}}
 ; SI-DAZ-SAFE: {{.*}}



More information about the llvm-branch-commits mailing list