[llvm] [DAGCombiner] Remove `UnsafeFPMath` usage in `visitFSUBForFMACombine` (PR #145637)

via llvm-commits llvm-commits at lists.llvm.org
Thu Jun 26 00:33:01 PDT 2025


https://github.com/paperchalice updated https://github.com/llvm/llvm-project/pull/145637

>From 01389e1d88e6b43b9478b879456948ad5dc2fca1 Mon Sep 17 00:00:00 2001
From: PaperChalice <liujunchang97 at outlook.com>
Date: Thu, 26 Jun 2025 11:00:24 +0800
Subject: [PATCH 1/4] [DAGCombiner] Remove UnsafeFPMath usage in
 visitFSUBForFMACombine

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |   8 +-
 llvm/test/CodeGen/AMDGPU/fma-combine.ll       | 342 ++++++++++--------
 .../AMDGPU/fmul-2-combine-multi-use.ll        |  10 +-
 llvm/test/CodeGen/AMDGPU/mad-combine.ll       | 190 +++++++---
 llvm/test/CodeGen/PowerPC/fma-combine.ll      |  41 +--
 5 files changed, 362 insertions(+), 229 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 91f696e8fe88e..9e6e81e2c0dee 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17010,8 +17010,8 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
     return SDValue();
 
   const SDNodeFlags Flags = N->getFlags();
-  bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast ||
-                              Options.UnsafeFPMath || HasFMAD);
+  bool AllowFusionGlobally =
+      (Options.AllowFPOpFusion == FPOpFusion::Fast || HasFMAD);
 
   // If the subtraction is not contractable, do not combine.
   if (!AllowFusionGlobally && !N->getFlags().hasAllowContract())
@@ -17167,7 +17167,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
   }
 
   auto isReassociable = [&Options](SDNode *N) {
-    return Options.UnsafeFPMath || N->getFlags().hasAllowReassociation();
+    return N->getFlags().hasAllowReassociation();
   };
 
   auto isContractableAndReassociableFMUL = [&isContractableFMUL,
@@ -17181,7 +17181,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
 
   // More folding opportunities when target permits.
   if (Aggressive && isReassociable(N)) {
-    bool CanFuse = Options.UnsafeFPMath || N->getFlags().hasAllowContract();
+    bool CanFuse = N->getFlags().hasAllowContract();
     // fold (fsub (fma x, y, (fmul u, v)), z)
     //   -> (fma x, y (fma u, v, (fneg z)))
     if (CanFuse && isFusedOp(N0) &&
diff --git a/llvm/test/CodeGen/AMDGPU/fma-combine.ll b/llvm/test/CodeGen/AMDGPU/fma-combine.ll
index a96d022b66f12..c79cf87712dc0 100644
--- a/llvm/test/CodeGen/AMDGPU/fma-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma-combine.ll
@@ -1,11 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-NOFMA %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-NOFMA %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-FMA %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-NOFMA %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=ieee -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FMA %s
-
-; FIXME: Remove enable-unsafe-fp-math in RUN line and add flags to IR instrs
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-NOFMA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-NOFMA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee -verify-machineinstrs -enable-no-infs-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=SI,SI-FMA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-NOFMA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math-f32=ieee -verify-machineinstrs -enable-no-infs-fp-math < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FMA %s
 
 ; Note: The SI-FMA conversions of type x * (y + 1) --> x * y + x would be
 ; beneficial even without fp32 denormals, but they do require no-infs-fp-math
@@ -65,8 +63,8 @@ define amdgpu_kernel void @combine_to_fma_f64_0(ptr addrspace(1) noalias %out, p
   %b = load volatile double, ptr addrspace(1) %gep.1
   %c = load volatile double, ptr addrspace(1) %gep.2
 
-  %mul = fmul double %a, %b
-  %fma = fadd double %mul, %c
+  %mul = fmul contract double %a, %b
+  %fma = fadd contract double %mul, %c
   store double %fma, ptr addrspace(1) %gep.out
   ret void
 }
@@ -134,9 +132,9 @@ define amdgpu_kernel void @combine_to_fma_f64_0_2use(ptr addrspace(1) noalias %o
   %c = load volatile double, ptr addrspace(1) %gep.2
   %d = load volatile double, ptr addrspace(1) %gep.3
 
-  %mul = fmul double %a, %b
-  %fma0 = fadd double %mul, %c
-  %fma1 = fadd double %mul, %d
+  %mul = fmul contract double %a, %b
+  %fma0 = fadd contract double %mul, %c
+  %fma1 = fadd contract double %mul, %d
   store volatile double %fma0, ptr addrspace(1) %gep.out.0
   store volatile double %fma1, ptr addrspace(1) %gep.out.1
   ret void
@@ -190,8 +188,8 @@ define amdgpu_kernel void @combine_to_fma_f64_1(ptr addrspace(1) noalias %out, p
   %b = load volatile double, ptr addrspace(1) %gep.1
   %c = load volatile double, ptr addrspace(1) %gep.2
 
-  %mul = fmul double %a, %b
-  %fma = fadd double %c, %mul
+  %mul = fmul contract double %a, %b
+  %fma = fadd contract double %c, %mul
   store double %fma, ptr addrspace(1) %gep.out
   ret void
 }
@@ -244,8 +242,8 @@ define amdgpu_kernel void @combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %o
   %b = load volatile double, ptr addrspace(1) %gep.1
   %c = load volatile double, ptr addrspace(1) %gep.2
 
-  %mul = fmul double %a, %b
-  %fma = fsub double %mul, %c
+  %mul = fmul contract double %a, %b
+  %fma = fsub contract double %mul, %c
   store double %fma, ptr addrspace(1) %gep.out
   ret void
 }
@@ -313,9 +311,9 @@ define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(ptr addrspace(1) noali
   %c = load volatile double, ptr addrspace(1) %gep.2
   %d = load volatile double, ptr addrspace(1) %gep.3
 
-  %mul = fmul double %a, %b
-  %fma0 = fsub double %mul, %c
-  %fma1 = fsub double %mul, %d
+  %mul = fmul contract double %a, %b
+  %fma0 = fsub contract double %mul, %c
+  %fma1 = fsub contract double %mul, %d
   store volatile double %fma0, ptr addrspace(1) %gep.out.0
   store volatile double %fma1, ptr addrspace(1) %gep.out.1
   ret void
@@ -369,8 +367,8 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %o
   %b = load volatile double, ptr addrspace(1) %gep.1
   %c = load volatile double, ptr addrspace(1) %gep.2
 
-  %mul = fmul double %a, %b
-  %fma = fsub double %c, %mul
+  %mul = fmul contract double %a, %b
+  %fma = fsub contract double %c, %mul
   store double %fma, ptr addrspace(1) %gep.out
   ret void
 }
@@ -438,9 +436,9 @@ define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(ptr addrspace(1) noali
   %c = load volatile double, ptr addrspace(1) %gep.2
   %d = load volatile double, ptr addrspace(1) %gep.3
 
-  %mul = fmul double %a, %b
-  %fma0 = fsub double %c, %mul
-  %fma1 = fsub double %d, %mul
+  %mul = fmul contract double %a, %b
+  %fma0 = fsub contract double %c, %mul
+  %fma1 = fsub contract double %d, %mul
   store volatile double %fma0, ptr addrspace(1) %gep.out.0
   store volatile double %fma1, ptr addrspace(1) %gep.out.1
   ret void
@@ -494,9 +492,9 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64(ptr addrspace(1) noalias %o
   %b = load volatile double, ptr addrspace(1) %gep.1
   %c = load volatile double, ptr addrspace(1) %gep.2
 
-  %mul = fmul double %a, %b
-  %mul.neg = fsub double -0.0, %mul
-  %fma = fsub double %mul.neg, %c
+  %mul = fmul contract double %a, %b
+  %mul.neg = fsub contract double -0.0, %mul
+  %fma = fsub contract double %mul.neg, %c
 
   store double %fma, ptr addrspace(1) %gep.out
   ret void
@@ -565,10 +563,10 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(ptr addrspace(1)
   %c = load volatile double, ptr addrspace(1) %gep.2
   %d = load volatile double, ptr addrspace(1) %gep.3
 
-  %mul = fmul double %a, %b
-  %mul.neg = fsub double -0.0, %mul
-  %fma0 = fsub double %mul.neg, %c
-  %fma1 = fsub double %mul.neg, %d
+  %mul = fmul contract double %a, %b
+  %mul.neg = fsub contract double -0.0, %mul
+  %fma0 = fsub contract double %mul.neg, %c
+  %fma1 = fsub contract double %mul.neg, %d
 
   store volatile double %fma0, ptr addrspace(1) %gep.out.0
   store volatile double %fma1, ptr addrspace(1) %gep.out.1
@@ -638,10 +636,10 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(ptr addrspace(1)
   %c = load volatile double, ptr addrspace(1) %gep.2
   %d = load volatile double, ptr addrspace(1) %gep.3
 
-  %mul = fmul double %a, %b
-  %mul.neg = fsub double -0.0, %mul
-  %fma0 = fsub double %mul.neg, %c
-  %fma1 = fsub double %mul, %d
+  %mul = fmul contract double %a, %b
+  %mul.neg = fsub contract double -0.0, %mul
+  %fma0 = fsub contract double %mul.neg, %c
+  %fma1 = fsub contract double %mul, %d
 
   store volatile double %fma0, ptr addrspace(1) %gep.out.0
   store volatile double %fma1, ptr addrspace(1) %gep.out.1
@@ -650,32 +648,6 @@ define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(ptr addrspace(1)
 
 ; fold (fsub (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, (fneg z)))
 define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
-; SI-NOFMA-LABEL: aggressive_combine_to_fma_fsub_0_f64:
-; SI-NOFMA:       ; %bb.0:
-; SI-NOFMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; SI-NOFMA-NEXT:    s_mov_b32 s7, 0xf000
-; SI-NOFMA-NEXT:    s_mov_b32 s6, 0
-; SI-NOFMA-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; SI-NOFMA-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NOFMA-NEXT:    s_mov_b64 s[4:5], s[2:3]
-; SI-NOFMA-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
-; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
-; SI-NOFMA-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
-; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
-; SI-NOFMA-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
-; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
-; SI-NOFMA-NEXT:    buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
-; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
-; SI-NOFMA-NEXT:    buffer_load_dwordx2 v[10:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
-; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
-; SI-NOFMA-NEXT:    s_mov_b64 s[2:3], s[6:7]
-; SI-NOFMA-NEXT:    v_mul_f64 v[8:9], v[8:9], v[10:11]
-; SI-NOFMA-NEXT:    v_fma_f64 v[2:3], v[2:3], v[4:5], v[8:9]
-; SI-NOFMA-NEXT:    v_add_f64 v[2:3], v[2:3], -v[6:7]
-; SI-NOFMA-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
-; SI-NOFMA-NEXT:    s_endpgm
-;
 ; SI-FMA-LABEL: aggressive_combine_to_fma_fsub_0_f64:
 ; SI-FMA:       ; %bb.0:
 ; SI-FMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -701,30 +673,6 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1)
 ; SI-FMA-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
 ; SI-FMA-NEXT:    s_endpgm
 ;
-; GFX11-NOFMA-LABEL: aggressive_combine_to_fma_fsub_0_f64:
-; GFX11-NOFMA:       ; %bb.0:
-; GFX11-NOFMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NOFMA-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NOFMA-NEXT:    v_lshlrev_b32_e32 v10, 3, v0
-; GFX11-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NOFMA-NEXT:    global_load_b64 v[0:1], v10, s[2:3] glc dlc
-; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT:    global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc
-; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT:    global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc
-; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT:    global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc
-; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT:    global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc
-; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT:    v_mul_f64 v[6:7], v[6:7], v[8:9]
-; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NOFMA-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[6:7]
-; GFX11-NOFMA-NEXT:    v_add_f64 v[0:1], v[0:1], -v[4:5]
-; GFX11-NOFMA-NEXT:    global_store_b64 v10, v[0:1], s[0:1]
-; GFX11-NOFMA-NEXT:    s_endpgm
-;
 ; GFX11-FMA-LABEL: aggressive_combine_to_fma_fsub_0_f64:
 ; GFX11-FMA:       ; %bb.0:
 ; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -761,18 +709,16 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1)
   %u = load volatile double, ptr addrspace(1) %gep.3
   %v = load volatile double, ptr addrspace(1) %gep.4
 
-  %tmp0 = fmul double %u, %v
-  %tmp1 = call double @llvm.fma.f64(double %x, double %y, double %tmp0) #0
-  %tmp2 = fsub double %tmp1, %z
+  %tmp0 = fmul contract fast double %u, %v
+  %tmp1 = call contract fast double @llvm.fma.f64(double %x, double %y, double %tmp0) #0
+  %tmp2 = fsub contract fast double %tmp1, %z
 
   store double %tmp2, ptr addrspace(1) %gep.out
   ret void
 }
 
-; fold (fsub x, (fma y, z, (fmul u, v)))
-;   -> (fma (fneg y), z, (fma (fneg u), v, x))
-define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
-; SI-NOFMA-LABEL: aggressive_combine_to_fma_fsub_1_f64:
+define amdgpu_kernel void @no_aggressive_combine_to_fma_fsub_0_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
+; SI-NOFMA-LABEL: no_aggressive_combine_to_fma_fsub_0_f64:
 ; SI-NOFMA:       ; %bb.0:
 ; SI-NOFMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
 ; SI-NOFMA-NEXT:    s_mov_b32 s7, 0xf000
@@ -793,11 +739,59 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1)
 ; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NOFMA-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-NOFMA-NEXT:    v_mul_f64 v[8:9], v[8:9], v[10:11]
-; SI-NOFMA-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9]
-; SI-NOFMA-NEXT:    v_add_f64 v[2:3], v[2:3], -v[4:5]
+; SI-NOFMA-NEXT:    v_fma_f64 v[2:3], v[2:3], v[4:5], v[8:9]
+; SI-NOFMA-NEXT:    v_add_f64 v[2:3], v[2:3], -v[6:7]
 ; SI-NOFMA-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
 ; SI-NOFMA-NEXT:    s_endpgm
 ;
+; GFX11-NOFMA-LABEL: no_aggressive_combine_to_fma_fsub_0_f64:
+; GFX11-NOFMA:       ; %bb.0:
+; GFX11-NOFMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NOFMA-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NOFMA-NEXT:    v_lshlrev_b32_e32 v10, 3, v0
+; GFX11-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NOFMA-NEXT:    global_load_b64 v[0:1], v10, s[2:3] glc dlc
+; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT:    global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc
+; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT:    global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc
+; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT:    global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc
+; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT:    global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc
+; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT:    v_mul_f64 v[6:7], v[6:7], v[8:9]
+; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NOFMA-NEXT:    v_fma_f64 v[0:1], v[0:1], v[2:3], v[6:7]
+; GFX11-NOFMA-NEXT:    v_add_f64 v[0:1], v[0:1], -v[4:5]
+; GFX11-NOFMA-NEXT:    global_store_b64 v10, v[0:1], s[0:1]
+; GFX11-NOFMA-NEXT:    s_endpgm
+  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
+  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
+  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
+  %gep.2 = getelementptr double, ptr addrspace(1) %gep.0, i32 2
+  %gep.3 = getelementptr double, ptr addrspace(1) %gep.0, i32 3
+  %gep.4 = getelementptr double, ptr addrspace(1) %gep.0, i32 4
+  %gep.out = getelementptr double, ptr addrspace(1) %out, i32 %tid
+
+  %x = load volatile double, ptr addrspace(1) %gep.0
+  %y = load volatile double, ptr addrspace(1) %gep.1
+  %z = load volatile double, ptr addrspace(1) %gep.2
+  %u = load volatile double, ptr addrspace(1) %gep.3
+  %v = load volatile double, ptr addrspace(1) %gep.4
+
+  %tmp0 = fmul double %u, %v
+  %tmp1 = call double @llvm.fma.f64(double %x, double %y, double %tmp0) #0
+  %tmp2 = fsub double %tmp1, %z
+
+  store double %tmp2, ptr addrspace(1) %gep.out
+  ret void
+}
+
+; fold (fsub x, (fma y, z, (fmul u, v)))
+;   -> (fma (fneg y), z, (fma (fneg u), v, x))
+define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
 ; SI-FMA-LABEL: aggressive_combine_to_fma_fsub_1_f64:
 ; SI-FMA:       ; %bb.0:
 ; SI-FMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -823,30 +817,6 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1)
 ; SI-FMA-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
 ; SI-FMA-NEXT:    s_endpgm
 ;
-; GFX11-NOFMA-LABEL: aggressive_combine_to_fma_fsub_1_f64:
-; GFX11-NOFMA:       ; %bb.0:
-; GFX11-NOFMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-NOFMA-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
-; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NOFMA-NEXT:    v_lshlrev_b32_e32 v10, 3, v0
-; GFX11-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NOFMA-NEXT:    global_load_b64 v[0:1], v10, s[2:3] glc dlc
-; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT:    global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc
-; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT:    global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc
-; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT:    global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc
-; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT:    global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc
-; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NOFMA-NEXT:    v_mul_f64 v[6:7], v[6:7], v[8:9]
-; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NOFMA-NEXT:    v_fma_f64 v[2:3], v[2:3], v[4:5], v[6:7]
-; GFX11-NOFMA-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
-; GFX11-NOFMA-NEXT:    global_store_b64 v10, v[0:1], s[0:1]
-; GFX11-NOFMA-NEXT:    s_endpgm
-;
 ; GFX11-FMA-LABEL: aggressive_combine_to_fma_fsub_1_f64:
 ; GFX11-FMA:       ; %bb.0:
 ; GFX11-FMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -883,6 +853,78 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1)
   %u = load volatile double, ptr addrspace(1) %gep.3
   %v = load volatile double, ptr addrspace(1) %gep.4
 
+  ; nsz flag is needed since this combine may change sign of zero
+  %tmp0 = fmul contract reassoc nsz double %u, %v
+  %tmp1 = call contract reassoc nsz double @llvm.fma.f64(double %y, double %z, double %tmp0) #0
+  %tmp2 = fsub contract reassoc nsz double %x, %tmp1
+
+  store double %tmp2, ptr addrspace(1) %gep.out
+  ret void
+}
+define amdgpu_kernel void @no_aggressive_combine_to_fma_fsub_1_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
+; SI-NOFMA-LABEL: no_aggressive_combine_to_fma_fsub_1_f64:
+; SI-NOFMA:       ; %bb.0:
+; SI-NOFMA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; SI-NOFMA-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NOFMA-NEXT:    s_mov_b32 s6, 0
+; SI-NOFMA-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; SI-NOFMA-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NOFMA-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-NOFMA-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
+; SI-NOFMA-NEXT:    buffer_load_dwordx2 v[4:5], v[0:1], s[4:7], 0 addr64 offset:8 glc
+; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
+; SI-NOFMA-NEXT:    buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:16 glc
+; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
+; SI-NOFMA-NEXT:    buffer_load_dwordx2 v[8:9], v[0:1], s[4:7], 0 addr64 offset:24 glc
+; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
+; SI-NOFMA-NEXT:    buffer_load_dwordx2 v[10:11], v[0:1], s[4:7], 0 addr64 offset:32 glc
+; SI-NOFMA-NEXT:    s_waitcnt vmcnt(0)
+; SI-NOFMA-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-NOFMA-NEXT:    v_mul_f64 v[8:9], v[8:9], v[10:11]
+; SI-NOFMA-NEXT:    v_fma_f64 v[4:5], v[4:5], v[6:7], v[8:9]
+; SI-NOFMA-NEXT:    v_add_f64 v[2:3], v[2:3], -v[4:5]
+; SI-NOFMA-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
+; SI-NOFMA-NEXT:    s_endpgm
+;
+; GFX11-NOFMA-LABEL: no_aggressive_combine_to_fma_fsub_1_f64:
+; GFX11-NOFMA:       ; %bb.0:
+; GFX11-NOFMA-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-NOFMA-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NOFMA-NEXT:    v_lshlrev_b32_e32 v10, 3, v0
+; GFX11-NOFMA-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NOFMA-NEXT:    global_load_b64 v[0:1], v10, s[2:3] glc dlc
+; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT:    global_load_b64 v[2:3], v10, s[2:3] offset:8 glc dlc
+; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT:    global_load_b64 v[4:5], v10, s[2:3] offset:16 glc dlc
+; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT:    global_load_b64 v[6:7], v10, s[2:3] offset:24 glc dlc
+; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT:    global_load_b64 v[8:9], v10, s[2:3] offset:32 glc dlc
+; GFX11-NOFMA-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NOFMA-NEXT:    v_mul_f64 v[6:7], v[6:7], v[8:9]
+; GFX11-NOFMA-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NOFMA-NEXT:    v_fma_f64 v[2:3], v[2:3], v[4:5], v[6:7]
+; GFX11-NOFMA-NEXT:    v_add_f64 v[0:1], v[0:1], -v[2:3]
+; GFX11-NOFMA-NEXT:    global_store_b64 v10, v[0:1], s[0:1]
+; GFX11-NOFMA-NEXT:    s_endpgm
+  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
+  %gep.0 = getelementptr double, ptr addrspace(1) %in, i32 %tid
+  %gep.1 = getelementptr double, ptr addrspace(1) %gep.0, i32 1
+  %gep.2 = getelementptr double, ptr addrspace(1) %gep.0, i32 2
+  %gep.3 = getelementptr double, ptr addrspace(1) %gep.0, i32 3
+  %gep.4 = getelementptr double, ptr addrspace(1) %gep.0, i32 4
+  %gep.out = getelementptr double, ptr addrspace(1) %out, i32 %tid
+
+  %x = load volatile double, ptr addrspace(1) %gep.0
+  %y = load volatile double, ptr addrspace(1) %gep.1
+  %z = load volatile double, ptr addrspace(1) %gep.2
+  %u = load volatile double, ptr addrspace(1) %gep.3
+  %v = load volatile double, ptr addrspace(1) %gep.4
+
   ; nsz flag is needed since this combine may change sign of zero
   %tmp0 = fmul nsz double %u, %v
   %tmp1 = call nsz double @llvm.fma.f64(double %y, double %z, double %tmp0) #0
@@ -979,8 +1021,8 @@ define amdgpu_kernel void @test_f32_mul_add_x_one_y(ptr addrspace(1) %out,
                                         ptr addrspace(1) %in2) {
   %x = load volatile float, ptr addrspace(1) %in1
   %y = load volatile float, ptr addrspace(1) %in2
-  %a = fadd float %x, 1.0
-  %m = fmul float %a, %y
+  %a = fadd contract float %x, 1.0
+  %m = fmul contract float %a, %y
   store float %m, ptr addrspace(1) %out
   ret void
 }
@@ -1068,8 +1110,8 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_one(ptr addrspace(1) %out,
                                         ptr addrspace(1) %in2) {
   %x = load volatile float, ptr addrspace(1) %in1
   %y = load volatile float, ptr addrspace(1) %in2
-  %a = fadd float %x, 1.0
-  %m = fmul float %y, %a
+  %a = fadd contract float %x, 1.0
+  %m = fmul contract float %y, %a
   store float %m, ptr addrspace(1) %out
   ret void
 }
@@ -1157,8 +1199,8 @@ define amdgpu_kernel void @test_f32_mul_add_x_negone_y(ptr addrspace(1) %out,
                                            ptr addrspace(1) %in2) {
   %x = load float, ptr addrspace(1) %in1
   %y = load float, ptr addrspace(1) %in2
-  %a = fadd float %x, -1.0
-  %m = fmul float %a, %y
+  %a = fadd contract float %x, -1.0
+  %m = fmul contract float %a, %y
   store float %m, ptr addrspace(1) %out
   ret void
 }
@@ -1246,8 +1288,8 @@ define amdgpu_kernel void @test_f32_mul_y_add_x_negone(ptr addrspace(1) %out,
                                            ptr addrspace(1) %in2) {
   %x = load float, ptr addrspace(1) %in1
   %y = load float, ptr addrspace(1) %in2
-  %a = fadd float %x, -1.0
-  %m = fmul float %y, %a
+  %a = fadd contract float %x, -1.0
+  %m = fmul contract float %y, %a
   store float %m, ptr addrspace(1) %out
   ret void
 }
@@ -1335,8 +1377,8 @@ define amdgpu_kernel void @test_f32_mul_sub_one_x_y(ptr addrspace(1) %out,
                                         ptr addrspace(1) %in2) {
   %x = load float, ptr addrspace(1) %in1
   %y = load float, ptr addrspace(1) %in2
-  %s = fsub float 1.0, %x
-  %m = fmul float %s, %y
+  %s = fsub contract float 1.0, %x
+  %m = fmul contract float %s, %y
   store float %m, ptr addrspace(1) %out
   ret void
 }
@@ -1424,8 +1466,8 @@ define amdgpu_kernel void @test_f32_mul_y_sub_one_x(ptr addrspace(1) %out,
                                         ptr addrspace(1) %in2) {
   %x = load float, ptr addrspace(1) %in1
   %y = load float, ptr addrspace(1) %in2
-  %s = fsub float 1.0, %x
-  %m = fmul float %y, %s
+  %s = fsub contract float 1.0, %x
+  %m = fmul contract float %y, %s
   store float %m, ptr addrspace(1) %out
   ret void
 }
@@ -1513,8 +1555,8 @@ define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(ptr addrspace(1) %out,
                                            ptr addrspace(1) %in2) {
   %x = load float, ptr addrspace(1) %in1
   %y = load float, ptr addrspace(1) %in2
-  %s = fsub float -1.0, %x
-  %m = fmul float %s, %y
+  %s = fsub contract float -1.0, %x
+  %m = fmul contract float %s, %y
   store float %m, ptr addrspace(1) %out
   ret void
 }
@@ -1602,8 +1644,8 @@ define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(ptr addrspace(1) %out,
                                          ptr addrspace(1) %in2) {
   %x = load float, ptr addrspace(1) %in1
   %y = load float, ptr addrspace(1) %in2
-  %s = fsub float -1.0, %x
-  %m = fmul float %y, %s
+  %s = fsub contract float -1.0, %x
+  %m = fmul contract float %y, %s
   store float %m, ptr addrspace(1) %out
   ret void
 }
@@ -1691,8 +1733,8 @@ define amdgpu_kernel void @test_f32_mul_sub_x_one_y(ptr addrspace(1) %out,
                                         ptr addrspace(1) %in2) {
   %x = load float, ptr addrspace(1) %in1
   %y = load float, ptr addrspace(1) %in2
-  %s = fsub float %x, 1.0
-  %m = fmul float %s, %y
+  %s = fsub contract float %x, 1.0
+  %m = fmul contract float %s, %y
   store float %m, ptr addrspace(1) %out
   ret void
 }
@@ -1780,8 +1822,8 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_one(ptr addrspace(1) %out,
                                       ptr addrspace(1) %in2) {
   %x = load float, ptr addrspace(1) %in1
   %y = load float, ptr addrspace(1) %in2
-  %s = fsub float %x, 1.0
-  %m = fmul float %y, %s
+  %s = fsub contract float %x, 1.0
+  %m = fmul contract float %y, %s
   store float %m, ptr addrspace(1) %out
   ret void
 }
@@ -1869,8 +1911,8 @@ define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(ptr addrspace(1) %out,
                                          ptr addrspace(1) %in2) {
   %x = load float, ptr addrspace(1) %in1
   %y = load float, ptr addrspace(1) %in2
-  %s = fsub float %x, -1.0
-  %m = fmul float %s, %y
+  %s = fsub contract float %x, -1.0
+  %m = fmul contract float %s, %y
   store float %m, ptr addrspace(1) %out
   ret void
 }
@@ -1958,8 +2000,8 @@ define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(ptr addrspace(1) %out,
                                          ptr addrspace(1) %in2) {
   %x = load float, ptr addrspace(1) %in1
   %y = load float, ptr addrspace(1) %in2
-  %s = fsub float %x, -1.0
-  %m = fmul float %y, %s
+  %s = fsub contract float %x, -1.0
+  %m = fmul contract float %y, %s
   store float %m, ptr addrspace(1) %out
   ret void
 }
@@ -2072,10 +2114,10 @@ define amdgpu_kernel void @test_f32_interp(ptr addrspace(1) %out,
   %x = load float, ptr addrspace(1) %in1
   %y = load float, ptr addrspace(1) %in2
   %t = load float, ptr addrspace(1) %in3
-  %t1 = fsub float 1.0, %t
-  %tx = fmul float %x, %t
-  %ty = fmul float %y, %t1
-  %r = fadd float %tx, %ty
+  %t1 = fsub contract float 1.0, %t
+  %tx = fmul contract float %x, %t
+  %ty = fmul contract float %y, %t1
+  %r = fadd contract float %tx, %ty
   store float %r, ptr addrspace(1) %out
   ret void
 }
@@ -2152,10 +2194,10 @@ define amdgpu_kernel void @test_f64_interp(ptr addrspace(1) %out,
   %x = load double, ptr addrspace(1) %in1
   %y = load double, ptr addrspace(1) %in2
   %t = load double, ptr addrspace(1) %in3
-  %t1 = fsub double 1.0, %t
-  %tx = fmul double %x, %t
-  %ty = fmul double %y, %t1
-  %r = fadd double %tx, %ty
+  %t1 = fsub contract double 1.0, %t
+  %tx = fmul contract double %x, %t
+  %ty = fmul contract double %y, %t1
+  %r = fadd contract double %tx, %ty
   store double %r, ptr addrspace(1) %out
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
index 7c0d3692242a4..d4471c85c467c 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
@@ -73,8 +73,8 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, flo
   %a16 = select i1 %a15, float %a12, float %a14
   %a17 = fmul float %a16, 2.0
   %a18 = fmul float %a17, %a17
-  %a19 = fmul float %a18, %a17
-  %a20 = fsub float 1.0, %a19
+  %a19 = fmul contract float %a18, %a17
+  %a20 = fsub contract float 1.0, %a19
   store float %a20, ptr addrspace(1) %out
   ret void
 }
@@ -540,8 +540,8 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16
   %a16 = select i1 %a15, half %a12, half %a14
   %a17 = fmul half %a16, 2.0
   %a18 = fmul half %a17, %a17
-  %a19 = fmul half %a18, %a17
-  %a20 = fsub half 1.0, %a19
+  %a19 = fmul contract half %a18, %a17
+  %a20 = fsub contract half 1.0, %a19
   store half %a20, ptr addrspace(1) %out
   ret void
 }
@@ -1166,7 +1166,7 @@ define amdgpu_kernel void @fmul_x2_xn3_f16(ptr addrspace(1) %out, i16 zeroext %x
   ret void
 }
 
-attributes #0 = { nounwind "unsafe-fp-math"="true" }
+attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GFX11-DENORM: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/mad-combine.ll b/llvm/test/CodeGen/AMDGPU/mad-combine.ll
index e94aa4b8ce3d1..2ac181b06a350 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-combine.ll
@@ -1,14 +1,11 @@
 ; Make sure we still form mad even when unsafe math or fp-contract is allowed instead of fma.
 
 ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD  -check-prefix=SI-STD-SAFE -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-SAFE -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-UNSAFE -check-prefix=FUNC %s
-
-; FIXME: Remove enable-unsafe-fp-math in RUN line and add flags to IR instrs
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-UNSAFE -check-prefix=FUNC %s
 
 ; Make sure we don't form mad with denormals
-; RUN: llc -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=SI-DENORM-FASTFMAF -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=verde -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=SI-DENORM-SLOWFMAF -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=SI-DENORM-FASTFMAF -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=verde -denormal-fp-math-f32=ieee -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=SI-DENORM-SLOWFMAF -check-prefix=FUNC %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 declare float @llvm.fabs.f32(float) #0
@@ -25,15 +23,41 @@ declare float @llvm.fmuladd.f32(float, float, float) #0
 
 ; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
 
+; SI-DENORM-FASTFMAF: buffer_store_dword [[RESULT]]
+; SI-STD: buffer_store_dword [[C]]
+define amdgpu_kernel void @combine_to_mad_f32_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
+  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
+  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
+  %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2
+  %gep.out = getelementptr float, ptr addrspace(1) %out, i32 %tid
+
+  %a = load volatile float, ptr addrspace(1) %gep.0
+  %b = load volatile float, ptr addrspace(1) %gep.1
+  %c = load volatile float, ptr addrspace(1) %gep.2
+
+
+  %mul = fmul contract float %a, %b
+  %fma = fadd contract float %mul, %c
+  store float %fma, ptr addrspace(1) %gep.out
+  ret void
+}
+; FUNC-LABEL: {{^}}no_combine_to_mad_f32_0:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
+
+; SI-STD: v_mac_f32_e32 [[C]], [[A]], [[B]]
+
 ; SI-DENORM-SLOWFMAF-NOT: v_fma
 ; SI-DENORM-SLOWFMAF-NOT: v_mad
 
 ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
 ; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]],  [[TMP]], [[C]]
 
-; SI-DENORM: buffer_store_dword [[RESULT]]
+; SI-DENORM-SLOWFMAF: buffer_store_dword [[RESULT]]
 ; SI-STD: buffer_store_dword [[C]]
-define amdgpu_kernel void @combine_to_mad_f32_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
+define amdgpu_kernel void @no_combine_to_mad_f32_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
   %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
@@ -72,7 +96,46 @@ define amdgpu_kernel void @combine_to_mad_f32_0(ptr addrspace(1) noalias %out, p
 ; SI-STD-DAG: buffer_store_dword [[C]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-STD-DAG: buffer_store_dword [[D]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
 ; SI: s_endpgm
-define amdgpu_kernel void @combine_to_mad_f32_0_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
+define amdgpu_kernel void @combine_to_mad_f32_0_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
+  %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
+  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
+  %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2
+  %gep.3 = getelementptr float, ptr addrspace(1) %gep.0, i32 3
+  %gep.out.0 = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %gep.out.1 = getelementptr float, ptr addrspace(1) %gep.out.0, i32 1
+
+  %a = load volatile float, ptr addrspace(1) %gep.0
+  %b = load volatile float, ptr addrspace(1) %gep.1
+  %c = load volatile float, ptr addrspace(1) %gep.2
+  %d = load volatile float, ptr addrspace(1) %gep.3
+
+  %mul = fmul contract fast float %a, %b
+  %fma0 = fadd contract fast float %mul, %c
+  %fma1 = fadd contract fast float %mul, %d
+  store volatile float %fma0, ptr addrspace(1) %gep.out.0
+  store volatile float %fma1, ptr addrspace(1) %gep.out.1
+  ret void
+}
+; FUNC-LABEL: {{^}}no_combine_to_mad_f32_0_2use:
+; SI-DAG: buffer_load_dword [[A:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 glc{{$}}
+; SI-DAG: buffer_load_dword [[B:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 glc{{$}}
+; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 glc{{$}}
+; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12 glc{{$}}
+
+; SI-STD-DAG: v_mac_f32_e32 [[C]], [[A]], [[B]]
+; SI-STD-DAG: v_mac_f32_e32 [[D]], [[A]], [[B]]
+
+; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[A]], [[B]]
+; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
+; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT1:v[0-9]+]], [[TMP]], [[D]]
+
+; SI-DENORM-SLOWFMAF-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-DENORM-SLOWFMAF-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI-STD-DAG: buffer_store_dword [[C]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI-STD-DAG: buffer_store_dword [[D]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; SI: s_endpgm
+define amdgpu_kernel void @no_combine_to_mad_f32_0_2use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
   %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
@@ -89,7 +152,6 @@ define amdgpu_kernel void @combine_to_mad_f32_0_2use(ptr addrspace(1) noalias %o
   %mul = fmul float %a, %b
   %fma0 = fadd float %mul, %c
   %fma1 = fadd float %mul, %d
-
   store volatile float %fma0, ptr addrspace(1) %gep.out.0
   store volatile float %fma1, ptr addrspace(1) %gep.out.1
   ret void
@@ -120,8 +182,8 @@ define amdgpu_kernel void @combine_to_mad_f32_1(ptr addrspace(1) noalias %out, p
   %b = load volatile float, ptr addrspace(1) %gep.1
   %c = load volatile float, ptr addrspace(1) %gep.2
 
-  %mul = fmul float %a, %b
-  %fma = fadd float %c, %mul
+  %mul = fmul contract float %a, %b
+  %fma = fadd contract float %c, %mul
   store float %fma, ptr addrspace(1) %gep.out
   ret void
 }
@@ -150,8 +212,8 @@ define amdgpu_kernel void @combine_to_mad_fsub_0_f32(ptr addrspace(1) noalias %o
   %b = load volatile float, ptr addrspace(1) %gep.1
   %c = load volatile float, ptr addrspace(1) %gep.2
 
-  %mul = fmul float %a, %b
-  %fma = fsub float %mul, %c
+  %mul = fmul contract float %a, %b
+  %fma = fsub contract float %mul, %c
   store float %fma, ptr addrspace(1) %gep.out
   ret void
 }
@@ -190,9 +252,9 @@ define amdgpu_kernel void @combine_to_mad_fsub_0_f32_2use(ptr addrspace(1) noali
   %c = load volatile float, ptr addrspace(1) %gep.2
   %d = load volatile float, ptr addrspace(1) %gep.3
 
-  %mul = fmul float %a, %b
-  %fma0 = fsub float %mul, %c
-  %fma1 = fsub float %mul, %d
+  %mul = fmul contract float %a, %b
+  %fma0 = fsub contract float %mul, %c
+  %fma1 = fsub contract float %mul, %d
   store volatile float %fma0, ptr addrspace(1) %gep.out.0
   store volatile float %fma1, ptr addrspace(1) %gep.out.1
   ret void
@@ -222,8 +284,8 @@ define amdgpu_kernel void @combine_to_mad_fsub_1_f32(ptr addrspace(1) noalias %o
   %b = load volatile float, ptr addrspace(1) %gep.1
   %c = load volatile float, ptr addrspace(1) %gep.2
 
-  %mul = fmul float %a, %b
-  %fma = fsub float %c, %mul
+  %mul = fmul contract float %a, %b
+  %fma = fsub contract float %c, %mul
   store float %fma, ptr addrspace(1) %gep.out
   ret void
 }
@@ -262,9 +324,9 @@ define amdgpu_kernel void @combine_to_mad_fsub_1_f32_2use(ptr addrspace(1) noali
   %c = load volatile float, ptr addrspace(1) %gep.2
   %d = load volatile float, ptr addrspace(1) %gep.3
 
-  %mul = fmul float %a, %b
-  %fma0 = fsub float %c, %mul
-  %fma1 = fsub float %d, %mul
+  %mul = fmul contract float %a, %b
+  %fma0 = fsub contract float %c, %mul
+  %fma1 = fsub contract float %d, %mul
   store volatile float %fma0, ptr addrspace(1) %gep.out.0
   store volatile float %fma1, ptr addrspace(1) %gep.out.1
   ret void
@@ -295,9 +357,9 @@ define amdgpu_kernel void @combine_to_mad_fsub_2_f32(ptr addrspace(1) noalias %o
   %b = load volatile float, ptr addrspace(1) %gep.1
   %c = load volatile float, ptr addrspace(1) %gep.2
 
-  %mul = fmul float %a, %b
-  %mul.neg = fneg float %mul
-  %fma = fsub float %mul.neg, %c
+  %mul = fmul contract float %a, %b
+  %mul.neg = fneg contract float %mul
+  %fma = fsub contract float %mul.neg, %c
 
   store float %fma, ptr addrspace(1) %gep.out
   ret void
@@ -337,10 +399,10 @@ define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_neg(ptr addrspace(1)
   %c = load volatile float, ptr addrspace(1) %gep.2
   %d = load volatile float, ptr addrspace(1) %gep.3
 
-  %mul = fmul float %a, %b
-  %mul.neg = fneg float %mul
-  %fma0 = fsub float %mul.neg, %c
-  %fma1 = fsub float %mul.neg, %d
+  %mul = fmul contract float %a, %b
+  %mul.neg = fneg contract float %mul
+  %fma0 = fsub contract float %mul.neg, %c
+  %fma1 = fsub contract float %mul.neg, %d
 
   store volatile float %fma0, ptr addrspace(1) %gep.out.0
   store volatile float %fma1, ptr addrspace(1) %gep.out.1
@@ -381,10 +443,10 @@ define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_mul(ptr addrspace(1)
   %c = load volatile float, ptr addrspace(1) %gep.2
   %d = load volatile float, ptr addrspace(1) %gep.3
 
-  %mul = fmul float %a, %b
-  %mul.neg = fneg float %mul
-  %fma0 = fsub float %mul.neg, %c
-  %fma1 = fsub float %mul, %d
+  %mul = fmul contract float %a, %b
+  %mul.neg = fneg contract float %mul
+  %fma0 = fsub contract float %mul.neg, %c
+  %fma1 = fsub contract float %mul, %d
 
   store volatile float %fma0, ptr addrspace(1) %gep.out.0
   store volatile float %fma1, ptr addrspace(1) %gep.out.1
@@ -412,7 +474,7 @@ define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_mul(ptr addrspace(1)
 ; SI-DENORM: v_sub_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[C]]
 
 ; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
+define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i1 %is_aggressive) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
   %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
@@ -427,10 +489,22 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(ptr addrspace(1)
   %u = load volatile float, ptr addrspace(1) %gep.3
   %v = load volatile float, ptr addrspace(1) %gep.4
 
-  %tmp0 = fmul float %u, %v
-  %tmp1 = call float @llvm.fma.f32(float %x, float %y, float %tmp0) #0
-  %tmp2 = fsub float %tmp1, %z
+  br i1 %is_aggressive, label %aggressive, label %normal
+
+normal:
+  %tmp0_normal = fmul float %u, %v
+  %tmp1_normal = call float @llvm.fma.f32(float %x, float %y, float %tmp0_normal) #0
+  %tmp2_normal = fsub float %tmp1_normal, %z
+  br label %exit
 
+aggressive:
+  %tmp0_aggressive = fmul contract reassoc float %u, %v
+  %tmp1_aggressive = call contract reassoc float @llvm.fma.f32(float %x, float %y, float %tmp0_aggressive) #0
+  %tmp2_aggressive = fsub contract reassoc float %tmp1_aggressive, %z
+  br label %exit
+
+exit:
+  %tmp2 = phi float [%tmp2_normal, %normal], [%tmp2_aggressive, %aggressive]
   store float %tmp2, ptr addrspace(1) %gep.out
   ret void
 }
@@ -505,7 +579,7 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_1_f32(ptr addrspace(1)
 
 ; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI: s_endpgm
-define amdgpu_kernel void @aggressive_combine_to_mad_fsub_2_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
+define amdgpu_kernel void @aggressive_combine_to_mad_fsub_2_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i1 %is_aggressive) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
   %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
@@ -520,10 +594,22 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_2_f32(ptr addrspace(1)
   %u = load volatile float, ptr addrspace(1) %gep.3
   %v = load volatile float, ptr addrspace(1) %gep.4
 
-  %tmp0 = fmul float %u, %v
-  %tmp1 = call float @llvm.fmuladd.f32(float %x, float %y, float %tmp0) #0
-  %tmp2 = fsub float %tmp1, %z
+  br i1 %is_aggressive, label %aggressive, label %normal
 
+normal:
+  %tmp0_normal = fmul float %u, %v
+  %tmp1_normal = call float @llvm.fmuladd.f32(float %x, float %y, float %tmp0_normal) #0
+  %tmp2_normal = fsub float %tmp1_normal, %z
+  br label %exit
+
+aggressive:
+  %tmp0_aggressive = fmul contract reassoc float %u, %v
+  %tmp1_aggressive = call contract reassoc float @llvm.fmuladd.f32(float %x, float %y, float %tmp0_aggressive) #0
+  %tmp2_aggressive = fsub contract reassoc float %tmp1_aggressive, %z
+  br label %exit
+
+exit:
+  %tmp2 = phi float [%tmp2_normal, %normal], [%tmp2_aggressive, %aggressive]
   store float %tmp2, ptr addrspace(1) %gep.out
   ret void
 }
@@ -556,7 +642,7 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_2_f32(ptr addrspace(1)
 
 ; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI: s_endpgm
-define amdgpu_kernel void @aggressive_combine_to_mad_fsub_3_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #1 {
+define amdgpu_kernel void @aggressive_combine_to_mad_fsub_3_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i1 %is_aggressive) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
   %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
@@ -571,11 +657,23 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_3_f32(ptr addrspace(1)
   %u = load volatile float, ptr addrspace(1) %gep.3
   %v = load volatile float, ptr addrspace(1) %gep.4
 
-  ; nsz flag is needed since this combine may change sign of zero
-  %tmp0 = fmul nsz float %u, %v
-  %tmp1 = call nsz float @llvm.fmuladd.f32(float %y, float %z, float %tmp0) #0
-  %tmp2 = fsub nsz float %x, %tmp1
+  br i1 %is_aggressive, label %aggressive, label %normal
 
+normal:
+  ; nsz flag is needed since this combine may change sign of zero
+  %tmp0_normal = fmul nsz float %u, %v
+  %tmp1_normal = call nsz float @llvm.fmuladd.f32(float %y, float %z, float %tmp0_normal) #0
+  %tmp2_normal = fsub nsz float %x, %tmp1_normal
+  br label %exit
+
+aggressive:
+  %tmp0_aggressive = fmul contract reassoc nsz float %u, %v
+  %tmp1_aggressive = call contract reassoc nsz float @llvm.fmuladd.f32(float %y, float %z, float %tmp0_aggressive) #0
+  %tmp2_aggressive = fsub contract reassoc nsz float %x, %tmp1_aggressive
+  br label %exit
+
+exit:
+  %tmp2 = phi float [%tmp2_normal, %normal], [%tmp2_aggressive, %aggressive]
   store float %tmp2, ptr addrspace(1) %gep.out
   ret void
 }
diff --git a/llvm/test/CodeGen/PowerPC/fma-combine.ll b/llvm/test/CodeGen/PowerPC/fma-combine.ll
index 3d45e9a3a509c..456f85ad3eefd 100644
--- a/llvm/test/CodeGen/PowerPC/fma-combine.ll
+++ b/llvm/test/CodeGen/PowerPC/fma-combine.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -enable-no-signed-zeros-fp-math \
-; RUN:     -enable-unsafe-fp-math  < %s | FileCheck -check-prefix=CHECK-FAST %s
+; RUN:       < %s | FileCheck -check-prefix=CHECK-FAST %s
 ; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu -enable-no-signed-zeros-fp-math \
-; RUN:     -enable-unsafe-fp-math -mattr=-vsx < %s | FileCheck -check-prefix=CHECK-FAST-NOVSX %s
+; RUN:      -mattr=-vsx < %s | FileCheck -check-prefix=CHECK-FAST-NOVSX %s
 ; RUN: llc -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s
 
 define dso_local double @fma_combine1(double %a, double %b, double %c) {
@@ -19,13 +19,12 @@ define dso_local double @fma_combine1(double %a, double %b, double %c) {
 ; CHECK-LABEL: fma_combine1:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xsnegdp 0, 3
-; CHECK-NEXT:    xsmuldp 0, 0, 2
-; CHECK-NEXT:    xssubdp 1, 0, 1
+; CHECK-NEXT:    xsmsubadp 1, 0, 2
 ; CHECK-NEXT:    blr
 entry:
   %fneg1 = fneg double %c
-  %mul = fmul double %fneg1, %b
-  %add = fsub double %mul, %a
+  %mul = fmul contract double %fneg1, %b
+  %add = fsub contract double %mul, %a
   ret double %add
 }
 
@@ -43,13 +42,12 @@ define dso_local double @fma_combine2(double %a, double %b, double %c) {
 ; CHECK-LABEL: fma_combine2:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    xsnegdp 0, 3
-; CHECK-NEXT:    xsmuldp 0, 2, 0
-; CHECK-NEXT:    xssubdp 1, 0, 1
+; CHECK-NEXT:    xsmsubadp 1, 2, 0
 ; CHECK-NEXT:    blr
 entry:
   %fneg1 = fneg double %c
-  %mul = fmul double %b, %fneg1
-  %add = fsub double %mul, %a
+  %mul = fmul contract double %b, %fneg1
+  %add = fsub contract double %mul, %a
   ret double %add
 }
 
@@ -85,17 +83,16 @@ define dso_local double @fma_combine_two_uses(double %a, double %b, double %c) {
 ; CHECK-NEXT:    stfd 0, v at toc@l(3)
 ; CHECK-NEXT:    xsnegdp 0, 3
 ; CHECK-NEXT:    addis 3, 2, z at toc@ha
+; CHECK-NEXT:    xsmsubadp 1, 0, 2
 ; CHECK-NEXT:    stfd 0, z at toc@l(3)
-; CHECK-NEXT:    xsmuldp 0, 0, 2
-; CHECK-NEXT:    xssubdp 1, 0, 1
 ; CHECK-NEXT:    blr
 entry:
   %fneg = fneg double %a
   store double %fneg, ptr @v, align 8
   %fneg1 = fneg double %c
   store double %fneg1, ptr @z, align 8
-  %mul = fmul double %fneg1, %b
-  %add = fsub double %mul, %a
+  %mul = fmul contract double %fneg1, %b
+  %add = fsub contract double %mul, %a
   ret double %add
 }
 
@@ -122,15 +119,14 @@ define dso_local double @fma_combine_one_use(double %a, double %b, double %c) {
 ; CHECK-NEXT:    addis 3, 2, v at toc@ha
 ; CHECK-NEXT:    stfd 0, v at toc@l(3)
 ; CHECK-NEXT:    xsnegdp 0, 3
-; CHECK-NEXT:    xsmuldp 0, 0, 2
-; CHECK-NEXT:    xssubdp 1, 0, 1
+; CHECK-NEXT:    xsmsubadp 1, 0, 2
 ; CHECK-NEXT:    blr
 entry:
   %fneg = fneg double %a
   store double %fneg, ptr @v, align 8
   %fneg1 = fneg double %c
-  %mul = fmul double %fneg1, %b
-  %add = fsub double %mul, %a
+  %mul = fmul contract double %fneg1, %b
+  %add = fsub contract double %mul, %a
   ret double %add
 }
 
@@ -327,15 +323,12 @@ define dso_local double @fma_combine_const(double %a, double %b) {
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    addis 3, 2, .LCPI9_0 at toc@ha
 ; CHECK-NEXT:    lfd 0, .LCPI9_0 at toc@l(3)
-; CHECK-NEXT:    addis 3, 2, .LCPI9_1 at toc@ha
-; CHECK-NEXT:    xsmuldp 0, 1, 0
-; CHECK-NEXT:    lfd 1, .LCPI9_1 at toc@l(3)
-; CHECK-NEXT:    xsmaddadp 2, 0, 1
+; CHECK-NEXT:    xsmaddadp 2, 1, 0
 ; CHECK-NEXT:    fmr 1, 2
 ; CHECK-NEXT:    blr
 entry:
-  %0 = fmul double %a, 1.1
-  %1 = call contract double @llvm.fma.f64(double %0, double 2.1, double %b)
+  %0 = fmul reassoc double %a, 1.1
+  %1 = call contract reassoc double @llvm.fma.f64(double %0, double 2.1, double %b)
   ret double %1
 }
 

>From 8aa53e4470f3b7c79f7a2acb04526e4bb9d22f98 Mon Sep 17 00:00:00 2001
From: PaperChalice <liujunchang97 at outlook.com>
Date: Thu, 26 Jun 2025 12:24:00 +0800
Subject: [PATCH 2/4] remove `UnsafeFPMath` usage in
 `visitFMULForFMADistributiveCombine`

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 9e6e81e2c0dee..7cf3a8f7a07b7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16737,7 +16737,7 @@ ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) {
 static bool isContractableFMUL(const TargetOptions &Options, SDValue N) {
   assert(N.getOpcode() == ISD::FMUL);
 
-  return Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
+  return Options.AllowFPOpFusion == FPOpFusion::Fast ||
          N->getFlags().hasAllowContract();
 }
 
@@ -17338,8 +17338,7 @@ SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) {
 
   // Floating-point multiply-add with intermediate rounding. This can result
   // in a less precise result due to the changed rounding order.
-  bool HasFMAD = Options.UnsafeFPMath &&
-                 (LegalOperations && TLI.isFMADLegal(DAG, N));
+  bool HasFMAD = LegalOperations && TLI.isFMADLegal(DAG, N);
 
   // No valid opcode, do not combine.
   if (!HasFMAD && !HasFMA)

>From b2778411cfa48a91c5fb68ac17b8253e3b52f2e7 Mon Sep 17 00:00:00 2001
From: PaperChalice <liujunchang97 at outlook.com>
Date: Thu, 26 Jun 2025 15:32:15 +0800
Subject: [PATCH 3/4] remove `UnsafeFPMath` usage in visitFDIV

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |    7 +-
 llvm/test/CodeGen/AMDGPU/rsq.f32-safe.ll      | 1956 +++++++++++++++
 llvm/test/CodeGen/AMDGPU/rsq.f32.ll           | 2194 +++--------------
 llvm/test/CodeGen/NVPTX/sqrt-approx.ll        |   16 +-
 .../test/CodeGen/X86/change-unsafe-fp-math.ll |   34 +-
 llvm/test/CodeGen/X86/fdiv.ll                 |    4 +-
 6 files changed, 2358 insertions(+), 1853 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/rsq.f32-safe.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 7cf3a8f7a07b7..6c7b1499664b7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18237,8 +18237,7 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
     // Only do the transform if the reciprocal is a legal fp immediate that
     // isn't too nasty (eg NaN, denormal, ...).
     if (((st == APFloat::opOK && !Recip.isDenormal()) ||
-         (st == APFloat::opInexact &&
-          (Options.UnsafeFPMath || Flags.hasAllowReciprocal()))) &&
+         (st == APFloat::opInexact && Flags.hasAllowReciprocal())) &&
         (!LegalOperations ||
          // FIXME: custom lowering of ConstantFP might fail (see e.g. ARM
          // backend)... we should handle this gracefully after Legalize.
@@ -18249,7 +18248,7 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
                          DAG.getConstantFP(Recip, DL, VT));
   }
 
-  if (Options.UnsafeFPMath || Flags.hasAllowReciprocal()) {
+  if (Flags.hasAllowReciprocal()) {
     // If this FDIV is part of a reciprocal square root, it may be folded
     // into a target-specific square root estimate instruction.
     if (N1.getOpcode() == ISD::FSQRT) {
@@ -18324,7 +18323,7 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
 
   // Fold X/Sqrt(X) -> Sqrt(X)
   if ((Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros()) &&
-      (Options.UnsafeFPMath || Flags.hasAllowReassociation()))
+      Flags.hasAllowReassociation())
     if (N1.getOpcode() == ISD::FSQRT && N0 == N1.getOperand(0))
       return N1;
 
diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f32-safe.ll b/llvm/test/CodeGen/AMDGPU/rsq.f32-safe.ll
new file mode 100644
index 0000000000000..7f822c135ffb4
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/rsq.f32-safe.ll
@@ -0,0 +1,1956 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GCN-DAZ,GCN-DAZ-SAFE,SI-DAZ-SAFE %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee < %s          | FileCheck -check-prefixes=GCN-IEEE,GCN-IEEE-SAFE,SI-IEEE-SAFE %s
+
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=hawaii -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GCN-DAZ,GCN-DAZ-SAFE,CI-DAZ-SAFE %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=hawaii -denormal-fp-math-f32=ieee < %s          | FileCheck -check-prefixes=GCN-IEEE,GCN-IEEE-SAFE,CI-IEEE-SAFE %s
+
+
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+declare float @llvm.sqrt.f32(float) nounwind readnone
+declare <2 x float> @llvm.sqrt.v2f32(<2 x float>) nounwind readnone
+
+define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) {
+; GCN-DAZ-SAFE-LABEL: rsq_f32:
+; GCN-DAZ-SAFE:       ; %bb.0:
+; GCN-DAZ-SAFE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s6, -1
+; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s10, s6
+; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s11, s7
+; GCN-DAZ-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s8, s2
+; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s9, s3
+; GCN-DAZ-SAFE-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s2, 0xf800000
+; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
+; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s4, s0
+; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s5, s1
+; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0)
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
+; GCN-DAZ-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v0
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v1, v0
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, v0, v1
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, 0.5, v1
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, -v1, v3, 0.5
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v3, v3, v4, v3
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v1, v1, v4, v1
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, -v3, v3, v0
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v1, v4, v1, v3
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v1
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e32 v0, v0
+; GCN-DAZ-SAFE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-DAZ-SAFE-NEXT:    s_endpgm
+;
+; SI-IEEE-SAFE-LABEL: rsq_f32:
+; SI-IEEE-SAFE:       ; %bb.0:
+; SI-IEEE-SAFE-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s7, 0xf000
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s6, -1
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s2, s6
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s3, s7
+; SI-IEEE-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s0, s10
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s1, s11
+; SI-IEEE-SAFE-NEXT:    buffer_load_dword v0, off, s[0:3], 0
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s0, 0xf800000
+; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v1, 0x260
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s2, 0x7f800000
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, s8
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s5, s9
+; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0)
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e64 s[0:1], s0, v0
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v2, v0
+; SI-IEEE-SAFE-NEXT:    v_add_i32_e32 v3, vcc, -1, v2
+; SI-IEEE-SAFE-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
+; SI-IEEE-SAFE-NEXT:    v_fma_f32 v5, -v3, v2, v0
+; SI-IEEE-SAFE-NEXT:    v_fma_f32 v6, -v4, v2, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e32 vcc, 0, v5
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, 0, v6
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
+; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e32 v1, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s2
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
+; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
+; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
+; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; SI-IEEE-SAFE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SI-IEEE-SAFE-NEXT:    s_endpgm
+;
+; CI-IEEE-SAFE-LABEL: rsq_f32:
+; CI-IEEE-SAFE:       ; %bb.0:
+; CI-IEEE-SAFE-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; CI-IEEE-SAFE-NEXT:    s_mov_b32 s7, 0xf000
+; CI-IEEE-SAFE-NEXT:    s_mov_b32 s6, -1
+; CI-IEEE-SAFE-NEXT:    s_mov_b32 s2, s6
+; CI-IEEE-SAFE-NEXT:    s_mov_b32 s3, s7
+; CI-IEEE-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-IEEE-SAFE-NEXT:    s_mov_b32 s0, s10
+; CI-IEEE-SAFE-NEXT:    s_mov_b32 s1, s11
+; CI-IEEE-SAFE-NEXT:    buffer_load_dword v0, off, s[0:3], 0
+; CI-IEEE-SAFE-NEXT:    s_mov_b32 s0, 0xf800000
+; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v1, 0x260
+; CI-IEEE-SAFE-NEXT:    s_mov_b32 s4, s8
+; CI-IEEE-SAFE-NEXT:    s_mov_b32 s5, s9
+; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0)
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
+; CI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e64 s[0:1], s0, v0
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v2, v0
+; CI-IEEE-SAFE-NEXT:    v_add_i32_e32 v3, vcc, -1, v2
+; CI-IEEE-SAFE-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
+; CI-IEEE-SAFE-NEXT:    v_fma_f32 v5, -v3, v2, v0
+; CI-IEEE-SAFE-NEXT:    v_fma_f32 v6, -v4, v2, v0
+; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e32 vcc, 0, v5
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, 0, v6
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
+; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e32 v1, v0
+; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
+; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
+; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; CI-IEEE-SAFE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; CI-IEEE-SAFE-NEXT:    s_endpgm
+  %val = load float, ptr addrspace(1) %in, align 4
+  %sqrt = call contract float @llvm.sqrt.f32(float %val) nounwind readnone
+  %div = fdiv contract float 1.0, %sqrt, !fpmath !0
+  store float %div, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @rsq_f32_sgpr(ptr addrspace(1) noalias %out, float %val) {
+; GCN-DAZ-SAFE-LABEL: rsq_f32_sgpr:
+; GCN-DAZ-SAFE:       ; %bb.0:
+; GCN-DAZ-SAFE-NEXT:    s_load_dword s0, s[4:5], 0xb
+; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v0, 0xf800000
+; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v1, 0x4f800000
+; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s2, -1
+; GCN-DAZ-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, s0, v1
+; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-DAZ-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v0
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v1, v0
+; GCN-DAZ-SAFE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, v0, v1
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, 0.5, v1
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v3, -v1, v2, 0.5
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v2, v3, v2
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v1, v1, v3, v1
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v3, -v2, v2, v0
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v1, v3, v1, v2
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
+; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e32 v0, v0
+; GCN-DAZ-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-DAZ-SAFE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-DAZ-SAFE-NEXT:    s_endpgm
+;
+; SI-IEEE-SAFE-LABEL: rsq_f32_sgpr:
+; SI-IEEE-SAFE:       ; %bb.0:
+; SI-IEEE-SAFE-NEXT:    s_load_dword s0, s[4:5], 0xb
+; SI-IEEE-SAFE-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v0, 0xf800000
+; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v1, 0x4f800000
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s7, 0xf000
+; SI-IEEE-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, s0, v1
+; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v2, s0
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[0:1], s0, v0
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[0:1]
+; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v1, v0
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s6, -1
+; SI-IEEE-SAFE-NEXT:    v_add_i32_e32 v2, vcc, -1, v1
+; SI-IEEE-SAFE-NEXT:    v_fma_f32 v3, -v2, v1, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e32 vcc, 0, v3
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
+; SI-IEEE-SAFE-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
+; SI-IEEE-SAFE-NEXT:    v_fma_f32 v1, -v3, v1, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, 0, v1
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
+; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
+; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s0, 0x7f800000
+; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e32 v1, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s0
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
+; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
+; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
+; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; SI-IEEE-SAFE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SI-IEEE-SAFE-NEXT:    s_endpgm
+;
+; CI-IEEE-SAFE-LABEL: rsq_f32_sgpr:
+; CI-IEEE-SAFE:       ; %bb.0:
+; CI-IEEE-SAFE-NEXT:    s_load_dword s0, s[4:5], 0xb
+; CI-IEEE-SAFE-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
+; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v0, 0xf800000
+; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v1, 0x4f800000
+; CI-IEEE-SAFE-NEXT:    s_mov_b32 s7, 0xf000
+; CI-IEEE-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, s0, v1
+; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v2, s0
+; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[0:1], s0, v0
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[0:1]
+; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v1, v0
+; CI-IEEE-SAFE-NEXT:    s_mov_b32 s6, -1
+; CI-IEEE-SAFE-NEXT:    v_add_i32_e32 v2, vcc, -1, v1
+; CI-IEEE-SAFE-NEXT:    v_fma_f32 v3, -v2, v1, v0
+; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e32 vcc, 0, v3
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
+; CI-IEEE-SAFE-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
+; CI-IEEE-SAFE-NEXT:    v_fma_f32 v1, -v3, v1, v0
+; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, 0, v1
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
+; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
+; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e32 v1, v0
+; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
+; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
+; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; CI-IEEE-SAFE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; CI-IEEE-SAFE-NEXT:    s_endpgm
+  %sqrt = call contract float @llvm.sqrt.f32(float %val) nounwind readnone
+  %div = fdiv contract float 1.0, %sqrt, !fpmath !0
+  store float %div, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+; Recognize that this is rsqrt(a) * rcp(b) * c,
+; not 1 / ( 1 / sqrt(a)) * rcp(b) * c.
+
+; NOTE: c * rcp( sqrt(a) * b ) is generated when we move rcp generation to AMDGPUCodeGenPrepare.
+define amdgpu_kernel void @rsqrt_fmul(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GCN-DAZ-SAFE-LABEL: rsqrt_fmul:
+; GCN-DAZ-SAFE:       ; %bb.0:
+; GCN-DAZ-SAFE-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
+; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s2, 0
+; GCN-DAZ-SAFE-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-DAZ-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-DAZ-SAFE-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GCN-DAZ-SAFE-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GCN-DAZ-SAFE-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0)
+; GCN-DAZ-SAFE-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:4 glc
+; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0)
+; GCN-DAZ-SAFE-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:8 glc
+; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0)
+; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s0, 0xf800000
+; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v6, 0x260
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v5, 0x4f800000, v2
+; GCN-DAZ-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v2
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v5, v2
+; GCN-DAZ-SAFE-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v7, v2, v5
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v5, 0.5, v5
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v8, -v5, v7, 0.5
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v7, v7, v8, v7
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v5, v5, v8, v5
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v8, -v7, v7, v2
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v5, v8, v5, v7
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v7, 0x37800000, v5
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v2, v6
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, v2, v3
+; GCN-DAZ-SAFE-NEXT:    v_div_scale_f32 v3, s[4:5], v2, v2, v4
+; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e32 v5, v3
+; GCN-DAZ-SAFE-NEXT:    v_div_scale_f32 v6, vcc, v4, v2, v4
+; GCN-DAZ-SAFE-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v7, -v3, v5, 1.0
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v5, v7, v5, v5
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v7, v6, v5
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v8, -v3, v7, v6
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v7, v8, v5, v7
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v3, -v3, v7, v6
+; GCN-DAZ-SAFE-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GCN-DAZ-SAFE-NEXT:    v_div_fmas_f32 v3, v3, v5, v7
+; GCN-DAZ-SAFE-NEXT:    v_div_fixup_f32 v2, v3, v2, v4
+; GCN-DAZ-SAFE-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-DAZ-SAFE-NEXT:    s_endpgm
+;
+; GCN-IEEE-SAFE-LABEL: rsqrt_fmul:
+; GCN-IEEE-SAFE:       ; %bb.0:
+; GCN-IEEE-SAFE-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
+; GCN-IEEE-SAFE-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-IEEE-SAFE-NEXT:    s_mov_b32 s2, 0
+; GCN-IEEE-SAFE-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GCN-IEEE-SAFE-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-IEEE-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-IEEE-SAFE-NEXT:    s_mov_b64 s[8:9], s[6:7]
+; GCN-IEEE-SAFE-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; GCN-IEEE-SAFE-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; GCN-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0)
+; GCN-IEEE-SAFE-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:4 glc
+; GCN-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0)
+; GCN-IEEE-SAFE-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:8 glc
+; GCN-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0)
+; GCN-IEEE-SAFE-NEXT:    s_mov_b32 s0, 0xf800000
+; GCN-IEEE-SAFE-NEXT:    v_mov_b32_e32 v6, 0x260
+; GCN-IEEE-SAFE-NEXT:    v_mul_f32_e32 v5, 0x4f800000, v2
+; GCN-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v2
+; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; GCN-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v5, v2
+; GCN-IEEE-SAFE-NEXT:    v_add_i32_e64 v7, s[0:1], -1, v5
+; GCN-IEEE-SAFE-NEXT:    v_add_i32_e64 v8, s[0:1], 1, v5
+; GCN-IEEE-SAFE-NEXT:    v_fma_f32 v9, -v7, v5, v2
+; GCN-IEEE-SAFE-NEXT:    v_fma_f32 v10, -v8, v5, v2
+; GCN-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[0:1], 0, v9
+; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v5, v5, v7, s[0:1]
+; GCN-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[0:1], 0, v10
+; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v5, v5, v8, s[0:1]
+; GCN-IEEE-SAFE-NEXT:    v_mul_f32_e32 v7, 0x37800000, v5
+; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; GCN-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v2, v6
+; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GCN-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, v2, v3
+; GCN-IEEE-SAFE-NEXT:    v_div_scale_f32 v3, s[0:1], v2, v2, v4
+; GCN-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v5, v3
+; GCN-IEEE-SAFE-NEXT:    v_div_scale_f32 v6, vcc, v4, v2, v4
+; GCN-IEEE-SAFE-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; GCN-IEEE-SAFE-NEXT:    v_fma_f32 v7, -v3, v5, 1.0
+; GCN-IEEE-SAFE-NEXT:    v_fma_f32 v5, v7, v5, v5
+; GCN-IEEE-SAFE-NEXT:    v_mul_f32_e32 v7, v6, v5
+; GCN-IEEE-SAFE-NEXT:    v_fma_f32 v8, -v3, v7, v6
+; GCN-IEEE-SAFE-NEXT:    v_fma_f32 v7, v8, v5, v7
+; GCN-IEEE-SAFE-NEXT:    v_fma_f32 v3, -v3, v7, v6
+; GCN-IEEE-SAFE-NEXT:    v_div_fmas_f32 v3, v3, v5, v7
+; GCN-IEEE-SAFE-NEXT:    v_div_fixup_f32 v2, v3, v2, v4
+; GCN-IEEE-SAFE-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GCN-IEEE-SAFE-NEXT:    s_endpgm
+  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+  %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
+  %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
+  %gep.1 = getelementptr float, ptr addrspace(1) %gep.0, i32 1
+  %gep.2 = getelementptr float, ptr addrspace(1) %gep.0, i32 2
+
+  %a = load volatile float, ptr addrspace(1) %gep.0
+  %b = load volatile float, ptr addrspace(1) %gep.1
+  %c = load volatile float, ptr addrspace(1) %gep.2
+
+  %x = call contract float @llvm.sqrt.f32(float %a)
+  %y = fmul contract float %x, %b
+  %z = fdiv contract float %c, %y
+  store float %z, ptr addrspace(1) %out.gep
+  ret void
+}
+
+define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) {
+; GCN-DAZ-SAFE-LABEL: neg_rsq_f32:
+; GCN-DAZ-SAFE:       ; %bb.0:
+; GCN-DAZ-SAFE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s6, -1
+; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s10, s6
+; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s11, s7
+; GCN-DAZ-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s8, s2
+; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s9, s3
+; GCN-DAZ-SAFE-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s2, 0xf800000
+; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
+; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s4, s0
+; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s5, s1
+; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0)
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
+; GCN-DAZ-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v0
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v1, v0
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, v0, v1
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, 0.5, v1
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, -v1, v3, 0.5
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v3, v3, v4, v3
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v1, v1, v4, v1
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, -v3, v3, v0
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v1, v4, v1, v3
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v1
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v0, -v0
+; GCN-DAZ-SAFE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-DAZ-SAFE-NEXT:    s_endpgm
+;
+; SI-IEEE-SAFE-LABEL: neg_rsq_f32:
+; SI-IEEE-SAFE:       ; %bb.0:
+; SI-IEEE-SAFE-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s7, 0xf000
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s6, -1
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s2, s6
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s3, s7
+; SI-IEEE-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s0, s10
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s1, s11
+; SI-IEEE-SAFE-NEXT:    buffer_load_dword v0, off, s[0:3], 0
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s0, 0xf800000
+; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v1, 0x260
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s2, 0x7f800000
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, s8
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s5, s9
+; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0)
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e64 s[0:1], s0, v0
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v2, v0
+; SI-IEEE-SAFE-NEXT:    v_add_i32_e32 v3, vcc, -1, v2
+; SI-IEEE-SAFE-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
+; SI-IEEE-SAFE-NEXT:    v_fma_f32 v5, -v3, v2, v0
+; SI-IEEE-SAFE-NEXT:    v_fma_f32 v6, -v4, v2, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e32 vcc, 0, v5
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, 0, v6
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
+; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v1, -v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, s2
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, -v0, v1, s[0:1]
+; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
+; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
+; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; SI-IEEE-SAFE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SI-IEEE-SAFE-NEXT:    s_endpgm
+;
+; CI-IEEE-SAFE-LABEL: neg_rsq_f32:
+; CI-IEEE-SAFE:       ; %bb.0:
+; CI-IEEE-SAFE-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; CI-IEEE-SAFE-NEXT:    s_mov_b32 s7, 0xf000
+; CI-IEEE-SAFE-NEXT:    s_mov_b32 s6, -1
+; CI-IEEE-SAFE-NEXT:    s_mov_b32 s2, s6
+; CI-IEEE-SAFE-NEXT:    s_mov_b32 s3, s7
+; CI-IEEE-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-IEEE-SAFE-NEXT:    s_mov_b32 s0, s10
+; CI-IEEE-SAFE-NEXT:    s_mov_b32 s1, s11
+; CI-IEEE-SAFE-NEXT:    buffer_load_dword v0, off, s[0:3], 0
+; CI-IEEE-SAFE-NEXT:    s_mov_b32 s0, 0xf800000
+; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v1, 0x260
+; CI-IEEE-SAFE-NEXT:    s_mov_b32 s4, s8
+; CI-IEEE-SAFE-NEXT:    s_mov_b32 s5, s9
+; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0)
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
+; CI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e64 s[0:1], s0, v0
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
+; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v2, v0
+; CI-IEEE-SAFE-NEXT:    v_add_i32_e32 v3, vcc, -1, v2
+; CI-IEEE-SAFE-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
+; CI-IEEE-SAFE-NEXT:    v_fma_f32 v5, -v3, v2, v0
+; CI-IEEE-SAFE-NEXT:    v_fma_f32 v6, -v4, v2, v0
+; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e32 vcc, 0, v5
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, 0, v6
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
+; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v1, -v0
+; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
+; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
+; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; CI-IEEE-SAFE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; CI-IEEE-SAFE-NEXT:    s_endpgm
+  %val = load float, ptr addrspace(1) %in, align 4
+  %sqrt = call contract float @llvm.sqrt.f32(float %val)
+  %div = fdiv contract float -1.0, %sqrt, !fpmath !0
+  store float %div, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) {
+; GCN-DAZ-SAFE-LABEL: neg_rsq_neg_f32:
+; GCN-DAZ-SAFE:       ; %bb.0:
+; GCN-DAZ-SAFE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s6, -1
+; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s10, s6
+; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s11, s7
+; GCN-DAZ-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s8, s2
+; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s9, s3
+; GCN-DAZ-SAFE-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s2, 0x8f800000
+; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
+; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s4, s0
+; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s5, s1
+; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0)
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, 0xcf800000, v0
+; GCN-DAZ-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v0
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v1, vcc
+; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v1, v0
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, v0, v1
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, 0.5, v1
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, -v1, v3, 0.5
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v3, v3, v4, v3
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v1, v1, v4, v1
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, -v3, v3, v0
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v1, v4, v1, v3
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v1
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v0, -v0
+; GCN-DAZ-SAFE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-DAZ-SAFE-NEXT:    s_endpgm
+;
+; SI-IEEE-SAFE-LABEL: neg_rsq_neg_f32:
+; SI-IEEE-SAFE:       ; %bb.0:
+; SI-IEEE-SAFE-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s7, 0xf000
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s6, -1
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s2, s6
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s3, s7
+; SI-IEEE-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s0, s10
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s1, s11
+; SI-IEEE-SAFE-NEXT:    buffer_load_dword v0, off, s[0:3], 0
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s0, 0x8f800000
+; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v1, 0x260
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s2, 0x7f800000
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, s8
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s5, s9
+; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0)
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[0:1], s0, v0
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v2, s[0:1]
+; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v2, v0
+; SI-IEEE-SAFE-NEXT:    v_add_i32_e32 v3, vcc, -1, v2
+; SI-IEEE-SAFE-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
+; SI-IEEE-SAFE-NEXT:    v_fma_f32 v5, -v3, v2, v0
+; SI-IEEE-SAFE-NEXT:    v_fma_f32 v6, -v4, v2, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e32 vcc, 0, v5
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, 0, v6
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
+; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v1, -v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, s2
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, -v0, v1, s[0:1]
+; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
+; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
+; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; SI-IEEE-SAFE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; SI-IEEE-SAFE-NEXT:    s_endpgm
+;
+; CI-IEEE-SAFE-LABEL: neg_rsq_neg_f32:
+; CI-IEEE-SAFE:       ; %bb.0:
+; CI-IEEE-SAFE-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
+; CI-IEEE-SAFE-NEXT:    s_mov_b32 s7, 0xf000
+; CI-IEEE-SAFE-NEXT:    s_mov_b32 s6, -1
+; CI-IEEE-SAFE-NEXT:    s_mov_b32 s2, s6
+; CI-IEEE-SAFE-NEXT:    s_mov_b32 s3, s7
+; CI-IEEE-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-IEEE-SAFE-NEXT:    s_mov_b32 s0, s10
+; CI-IEEE-SAFE-NEXT:    s_mov_b32 s1, s11
+; CI-IEEE-SAFE-NEXT:    buffer_load_dword v0, off, s[0:3], 0
+; CI-IEEE-SAFE-NEXT:    s_mov_b32 s0, 0x8f800000
+; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v1, 0x260
+; CI-IEEE-SAFE-NEXT:    s_mov_b32 s4, s8
+; CI-IEEE-SAFE-NEXT:    s_mov_b32 s5, s9
+; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0)
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v0
+; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[0:1], s0, v0
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v2, s[0:1]
+; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v2, v0
+; CI-IEEE-SAFE-NEXT:    v_add_i32_e32 v3, vcc, -1, v2
+; CI-IEEE-SAFE-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
+; CI-IEEE-SAFE-NEXT:    v_fma_f32 v5, -v3, v2, v0
+; CI-IEEE-SAFE-NEXT:    v_fma_f32 v6, -v4, v2, v0
+; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e32 vcc, 0, v5
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, 0, v6
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
+; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v1, -v0
+; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
+; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
+; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; CI-IEEE-SAFE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; CI-IEEE-SAFE-NEXT:    s_endpgm
+  %val = load float, ptr addrspace(1) %in, align 4
+  %val.fneg = fneg float %val
+  %sqrt = call contract float @llvm.sqrt.f32(float %val.fneg)
+  %div = fdiv contract float -1.0, %sqrt, !fpmath !0
+  store float %div, ptr addrspace(1) %out, align 4
+  ret void
+}
+
+define float @v_neg_rsq_neg_f32(float %val) {
+; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_f32:
+; GCN-DAZ-SAFE:       ; %bb.0:
+; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s4, 0x8f800000
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, 0xcf800000, v0
+; GCN-DAZ-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v1, vcc
+; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v1, v0
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, v0, v1
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, 0.5, v1
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v3, -v1, v2, 0.5
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v2, v3, v2
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, -v2, v2, v0
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v1, v1, v3, v1
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v1, v4, v1, v2
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
+; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v0, -v0
+; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32:
+; SI-IEEE-SAFE:       ; %bb.0:
+; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x8f800000
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, 0xcf800000, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v1, vcc
+; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v1, v0
+; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v2, s[4:5], -1, v1
+; SI-IEEE-SAFE-NEXT:    v_fma_f32 v3, -v2, v1, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v3
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], 1, v1
+; SI-IEEE-SAFE-NEXT:    v_fma_f32 v1, -v3, v1, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v1
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v2, v3, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
+; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x7f800000
+; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v1, -v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s4
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, -v0, v1, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
+; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
+; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; SI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+;
+; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32:
+; CI-IEEE-SAFE:       ; %bb.0:
+; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x8f800000
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, 0xcf800000, v0
+; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v1, vcc
+; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v1, v0
+; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v2, s[4:5], -1, v1
+; CI-IEEE-SAFE-NEXT:    v_fma_f32 v3, -v2, v1, v0
+; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v3
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[4:5]
+; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], 1, v1
+; CI-IEEE-SAFE-NEXT:    v_fma_f32 v1, -v3, v1, v0
+; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v1
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v2, v3, s[4:5]
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
+; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v1, -v0
+; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
+; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
+; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; CI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+  %val.fneg = fneg float %val
+  %sqrt = call contract float @llvm.sqrt.f32(float %val.fneg)
+  %div = fdiv contract float -1.0, %sqrt, !fpmath !0
+  ret float %div
+}
+
+define <2 x float> @v_neg_rsq_neg_v2f32(<2 x float> %val) {
+; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_v2f32:
+; GCN-DAZ-SAFE:       ; %bb.0:
+; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s4, 0x8f800000
+; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s5, 0x4f800000
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e64 v2, -v1, s5
+; GCN-DAZ-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v1
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e64 v1, -v1, v2, vcc
+; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v2, v1
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, v1, v2
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, 0.5, v2
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, -v2, v3, 0.5
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v3, v3, v4, v3
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v5, -v3, v3, v1
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v2, v4, v2
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v5, v2, v3
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e64 v3, -v0, s5
+; GCN-DAZ-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v3, vcc
+; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v3, v0
+; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v4, 0x260
+; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, v4
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[4:5]
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, v0, v3
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, 0.5, v3
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v5, -v3, v2, 0.5
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v2, v5, v2
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v6, -v2, v2, v0
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v3, v3, v5, v3
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v6, v3, v2
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v4
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v0, -v0
+; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v1, -v1
+; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32:
+; SI-IEEE-SAFE:       ; %bb.0:
+; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s6, 0x8f800000
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s7, 0x4f800000
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e64 v2, -v1, s7
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v1
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, -v1, v2, vcc
+; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v2, v1
+; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], -1, v2
+; SI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v3, v2, v1
+; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v4
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v3, v2, v3, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v4, s[4:5], 1, v2
+; SI-IEEE-SAFE-NEXT:    v_fma_f32 v2, -v4, v2, v1
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v2
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e64 v4, -v0, s7
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v4, vcc
+; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v4, v0
+; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v3, 0x260
+; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, v3
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v2, s[4:5], -1, v4
+; SI-IEEE-SAFE-NEXT:    v_fma_f32 v5, -v2, v4, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v5
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v5, s[4:5], 1, v4
+; SI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v5, v4, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v4
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v4, 0x37800000, v2
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v3
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s6, 0x7f800000
+; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v2, -v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s6
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, -v0, v2, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v2, v2
+; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
+; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v2, -v1
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v1|, s6
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, -v1, v2, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v2, v2
+; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
+; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v1, vcc, 0, v1
+; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v1, v2, v1
+; SI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+;
+; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32:
+; CI-IEEE-SAFE:       ; %bb.0:
+; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-IEEE-SAFE-NEXT:    s_mov_b32 s6, 0x8f800000
+; CI-IEEE-SAFE-NEXT:    s_mov_b32 s7, 0x4f800000
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e64 v2, -v1, s7
+; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v1
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, -v1, v2, vcc
+; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v2, v1
+; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], -1, v2
+; CI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v3, v2, v1
+; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v4
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v3, v2, v3, s[4:5]
+; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v4, s[4:5], 1, v2
+; CI-IEEE-SAFE-NEXT:    v_fma_f32 v2, -v4, v2, v1
+; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v2
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[4:5]
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e64 v4, -v0, s7
+; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v4, vcc
+; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v4, v0
+; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v3, 0x260
+; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, v3
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[4:5]
+; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v2, s[4:5], -1, v4
+; CI-IEEE-SAFE-NEXT:    v_fma_f32 v5, -v2, v4, v0
+; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v5
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[4:5]
+; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v5, s[4:5], 1, v4
+; CI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v5, v4, v0
+; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v4
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[4:5]
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v4, 0x37800000, v2
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v3
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v2, -v0
+; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v2, v2
+; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
+; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v2, -v1
+; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v2, v2
+; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
+; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v1, vcc, 0, v1
+; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v1, v2, v1
+; CI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+  %val.fneg = fneg <2 x float> %val
+  %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val.fneg)
+  %div = fdiv contract <2 x float> <float -1.0, float -1.0>, %sqrt, !fpmath !0
+  ret <2 x float> %div
+}
+
+define float @v_neg_rsq_neg_f32_foldable_user(float %val0, float %val1) {
+; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_f32_foldable_user:
+; GCN-DAZ-SAFE:       ; %bb.0:
+; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s4, 0x8f800000
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v0
+; GCN-DAZ-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v2, vcc
+; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v2, v0
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, v0, v2
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, 0.5, v2
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, -v2, v3, 0.5
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v3, v3, v4, v3
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v5, -v3, v3, v0
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v2, v4, v2
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v5, v2, v3
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v3, 0x260
+; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v3
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v0, -v0
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32_foldable_user:
+; SI-IEEE-SAFE:       ; %bb.0:
+; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x8f800000
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v2, vcc
+; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v2, v0
+; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], -1, v2
+; SI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v3, v2, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v4
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v3, v2, v3, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v4, s[4:5], 1, v2
+; SI-IEEE-SAFE-NEXT:    v_fma_f32 v2, -v4, v2, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v2
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v3, 0x260
+; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v3
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x7f800000
+; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v2, -v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s4
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, -v0, v2, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v2, v2
+; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
+; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+;
+; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32_foldable_user:
+; CI-IEEE-SAFE:       ; %bb.0:
+; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x8f800000
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v0
+; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v2, vcc
+; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v2, v0
+; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], -1, v2
+; CI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v3, v2, v0
+; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v4
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v3, v2, v3, s[4:5]
+; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v4, s[4:5], 1, v2
+; CI-IEEE-SAFE-NEXT:    v_fma_f32 v2, -v4, v2, v0
+; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v2
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[4:5]
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v3, 0x260
+; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v3
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v2, -v0
+; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v2, v2
+; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
+; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+  %val0.neg = fneg float %val0
+  %sqrt = call contract float @llvm.sqrt.f32(float %val0.neg)
+  %div = fdiv contract float -1.0, %sqrt, !fpmath !0
+  %user = fmul contract float %div, %val1
+  ret float %user
+}
+
+define <2 x float> @v_neg_rsq_neg_v2f32_foldable_user(<2 x float> %val0, <2 x float> %val1) {
+; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user:
+; GCN-DAZ-SAFE:       ; %bb.0:
+; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s4, 0x8f800000
+; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s5, 0x4f800000
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e64 v4, -v1, s5
+; GCN-DAZ-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v1
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e64 v1, -v1, v4, vcc
+; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v4, v1
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v5, v1, v4
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v4, 0.5, v4
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v6, -v4, v5, 0.5
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v5, v5, v6, v5
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v7, -v5, v5, v1
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, v4, v6, v4
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, v7, v4, v5
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v5, 0x37800000, v4
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e64 v5, -v0, s5
+; GCN-DAZ-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v5, vcc
+; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v5, v0
+; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v6, 0x260
+; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, v6
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[4:5]
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v4, v0, v5
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v5, 0.5, v5
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v7, -v5, v4, 0.5
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, v4, v7, v4
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v8, -v4, v4, v0
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v5, v5, v7, v5
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, v8, v5, v4
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v5, 0x37800000, v4
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v6
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v0, -v0
+; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v1, -v1
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user:
+; SI-IEEE-SAFE:       ; %bb.0:
+; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s6, 0x8f800000
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s7, 0x4f800000
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e64 v4, -v1, s7
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v1
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, -v1, v4, vcc
+; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v4, v1
+; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v5, s[4:5], -1, v4
+; SI-IEEE-SAFE-NEXT:    v_fma_f32 v6, -v5, v4, v1
+; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v6
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v5, v4, v5, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v6, s[4:5], 1, v4
+; SI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v6, v4, v1
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v4
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v4, v5, v6, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v5, 0x37800000, v4
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e64 v6, -v0, s7
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v6, vcc
+; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v6, v0
+; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v5, 0x260
+; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, v5
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v4, s[4:5], -1, v6
+; SI-IEEE-SAFE-NEXT:    v_fma_f32 v7, -v4, v6, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v7
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v7, s[4:5], 1, v6
+; SI-IEEE-SAFE-NEXT:    v_fma_f32 v6, -v7, v6, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v6
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v6, 0x37800000, v4
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v5
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s6, 0x7f800000
+; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v4, -v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s6
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v4, -v0, v4, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v4, v4
+; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
+; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v4, v0
+; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v4, -v1
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v1|, s6
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v4, -v1, v4, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v4, v4
+; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
+; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v1, vcc, 0, v1
+; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v1, v4, v1
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v2
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, v1, v3
+; SI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+;
+; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user:
+; CI-IEEE-SAFE:       ; %bb.0:
+; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-IEEE-SAFE-NEXT:    s_mov_b32 s6, 0x8f800000
+; CI-IEEE-SAFE-NEXT:    s_mov_b32 s7, 0x4f800000
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e64 v4, -v1, s7
+; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v1
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, -v1, v4, vcc
+; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v4, v1
+; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v5, s[4:5], -1, v4
+; CI-IEEE-SAFE-NEXT:    v_fma_f32 v6, -v5, v4, v1
+; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v6
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v5, v4, v5, s[4:5]
+; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v6, s[4:5], 1, v4
+; CI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v6, v4, v1
+; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v4
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v4, v5, v6, s[4:5]
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v5, 0x37800000, v4
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e64 v6, -v0, s7
+; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v6, vcc
+; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v6, v0
+; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v5, 0x260
+; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, v5
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[4:5]
+; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v4, s[4:5], -1, v6
+; CI-IEEE-SAFE-NEXT:    v_fma_f32 v7, -v4, v6, v0
+; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v7
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[4:5]
+; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v7, s[4:5], 1, v6
+; CI-IEEE-SAFE-NEXT:    v_fma_f32 v6, -v7, v6, v0
+; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v6
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[4:5]
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v6, 0x37800000, v4
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v5
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v4, -v0
+; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v4, v4
+; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
+; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v4, v0
+; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v4, -v1
+; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v4, v4
+; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
+; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v1, vcc, 0, v1
+; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v1, v4, v1
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v2
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, v1, v3
+; CI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+  %val0.fneg = fneg <2 x float> %val0
+  %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val0.fneg)
+  %div = fdiv contract <2 x float> <float -1.0, float -1.0>, %sqrt, !fpmath !0
+  %user = fmul contract <2 x float> %div, %val1
+  ret <2 x float> %user
+}
+
+define float @v_neg_rsq_f32(float %val) {
+; GCN-DAZ-SAFE-LABEL: v_neg_rsq_f32:
+; GCN-DAZ-SAFE:       ; %bb.0:
+; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
+; GCN-DAZ-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v1, v0
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, v0, v1
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, 0.5, v1
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v3, -v1, v2, 0.5
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v2, v3, v2
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, -v2, v2, v0
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v1, v1, v3, v1
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v1, v4, v1, v2
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
+; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v0, -v0
+; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-IEEE-SAFE-LABEL: v_neg_rsq_f32:
+; SI-IEEE-SAFE:       ; %bb.0:
+; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v1, v0
+; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v2, s[4:5], -1, v1
+; SI-IEEE-SAFE-NEXT:    v_fma_f32 v3, -v2, v1, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v3
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], 1, v1
+; SI-IEEE-SAFE-NEXT:    v_fma_f32 v1, -v3, v1, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v1
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v2, v3, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
+; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x7f800000
+; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v1, -v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s4
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, -v0, v1, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
+; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
+; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; SI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+;
+; CI-IEEE-SAFE-LABEL: v_neg_rsq_f32:
+; CI-IEEE-SAFE:       ; %bb.0:
+; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
+; CI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v1, v0
+; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v2, s[4:5], -1, v1
+; CI-IEEE-SAFE-NEXT:    v_fma_f32 v3, -v2, v1, v0
+; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v3
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[4:5]
+; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], 1, v1
+; CI-IEEE-SAFE-NEXT:    v_fma_f32 v1, -v3, v1, v0
+; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v1
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v2, v3, s[4:5]
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
+; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v1, -v0
+; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
+; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
+; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; CI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call contract float @llvm.sqrt.f32(float %val)
+  %div = fdiv contract float -1.0, %sqrt, !fpmath !0
+  ret float %div
+}
+
+define <2 x float> @v_neg_rsq_v2f32(<2 x float> %val) {
+; GCN-DAZ-SAFE-LABEL: v_neg_rsq_v2f32:
+; GCN-DAZ-SAFE:       ; %bb.0:
+; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v1
+; GCN-DAZ-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v1
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v2, v1
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, v1, v2
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, 0.5, v2
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, -v2, v3, 0.5
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v3, v3, v4, v3
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v5, -v3, v3, v1
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v2, v4, v2
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v5, v2, v3
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, 0x4f800000, v0
+; GCN-DAZ-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v3, v0
+; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v4, 0x260
+; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, v4
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[4:5]
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, v0, v3
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, 0.5, v3
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v5, -v3, v2, 0.5
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v2, v5, v2
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v6, -v2, v2, v0
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v3, v3, v5, v3
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v6, v3, v2
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v4
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v0, -v0
+; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v1, -v1
+; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32:
+; SI-IEEE-SAFE:       ; %bb.0:
+; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s6, 0xf800000
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v1
+; SI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v1
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v2, v1
+; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], -1, v2
+; SI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v3, v2, v1
+; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v4
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v3, v2, v3, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v4, s[4:5], 1, v2
+; SI-IEEE-SAFE-NEXT:    v_fma_f32 v2, -v4, v2, v1
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v2
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v0
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v4, v0
+; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v3, 0x260
+; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, v3
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v2, s[4:5], -1, v4
+; SI-IEEE-SAFE-NEXT:    v_fma_f32 v5, -v2, v4, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v5
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v5, s[4:5], 1, v4
+; SI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v5, v4, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v4
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v4, 0x37800000, v2
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v3
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s6, 0x7f800000
+; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v2, -v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s6
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, -v0, v2, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v2, v2
+; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
+; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v2, -v1
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v1|, s6
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, -v1, v2, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v2, v2
+; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
+; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v1, vcc, 0, v1
+; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v1, v2, v1
+; SI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+;
+; CI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32:
+; CI-IEEE-SAFE:       ; %bb.0:
+; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-IEEE-SAFE-NEXT:    s_mov_b32 s6, 0xf800000
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v1
+; CI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v1
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v2, v1
+; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], -1, v2
+; CI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v3, v2, v1
+; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v4
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v3, v2, v3, s[4:5]
+; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v4, s[4:5], 1, v2
+; CI-IEEE-SAFE-NEXT:    v_fma_f32 v2, -v4, v2, v1
+; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v2
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[4:5]
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v0
+; CI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v0
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v4, v0
+; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v3, 0x260
+; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, v3
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[4:5]
+; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v2, s[4:5], -1, v4
+; CI-IEEE-SAFE-NEXT:    v_fma_f32 v5, -v2, v4, v0
+; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v5
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[4:5]
+; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v5, s[4:5], 1, v4
+; CI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v5, v4, v0
+; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v4
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[4:5]
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v4, 0x37800000, v2
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v3
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v2, -v0
+; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v2, v2
+; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
+; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v2, -v1
+; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v2, v2
+; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
+; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v1, vcc, 0, v1
+; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v1, v2, v1
+; CI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val)
+  %div = fdiv contract <2 x float> <float -1.0, float -1.0>, %sqrt, !fpmath !0
+  ret <2 x float> %div
+}
+
+define float @v_neg_rsq_f32_foldable_user(float %val0, float %val1) {
+; GCN-DAZ-SAFE-LABEL: v_neg_rsq_f32_foldable_user:
+; GCN-DAZ-SAFE:       ; %bb.0:
+; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
+; GCN-DAZ-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v2, v0
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, v0, v2
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, 0.5, v2
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, -v2, v3, 0.5
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v3, v3, v4, v3
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v5, -v3, v3, v0
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v2, v4, v2
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v5, v2, v3
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v3, 0x260
+; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v3
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v0, -v0
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-IEEE-SAFE-LABEL: v_neg_rsq_f32_foldable_user:
+; SI-IEEE-SAFE:       ; %bb.0:
+; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v2, v0
+; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], -1, v2
+; SI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v3, v2, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v4
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v3, v2, v3, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v4, s[4:5], 1, v2
+; SI-IEEE-SAFE-NEXT:    v_fma_f32 v2, -v4, v2, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v2
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v3, 0x260
+; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v3
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x7f800000
+; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v2, -v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s4
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, -v0, v2, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v2, v2
+; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
+; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v1
+; SI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+;
+; CI-IEEE-SAFE-LABEL: v_neg_rsq_f32_foldable_user:
+; CI-IEEE-SAFE:       ; %bb.0:
+; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
+; CI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v2, v0
+; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], -1, v2
+; CI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v3, v2, v0
+; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v4
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v3, v2, v3, s[4:5]
+; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v4, s[4:5], 1, v2
+; CI-IEEE-SAFE-NEXT:    v_fma_f32 v2, -v4, v2, v0
+; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v2
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[4:5]
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v3, 0x260
+; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v3
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v2, -v0
+; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v2, v2
+; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
+; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call contract float @llvm.sqrt.f32(float %val0)
+  %div = fdiv contract float -1.0, %sqrt, !fpmath !0
+  %user = fmul contract float %div, %val1
+  ret float %user
+}
+
+define <2 x float> @v_neg_rsq_v2f32_foldable_user(<2 x float> %val0, <2 x float> %val1) {
+; GCN-DAZ-SAFE-LABEL: v_neg_rsq_v2f32_foldable_user:
+; GCN-DAZ-SAFE:       ; %bb.0:
+; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v1
+; GCN-DAZ-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v1
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v4, v1
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v5, v1, v4
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v4, 0.5, v4
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v6, -v4, v5, 0.5
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v5, v5, v6, v5
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v7, -v5, v5, v1
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, v4, v6, v4
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, v7, v4, v5
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v5, 0x37800000, v4
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v5, 0x4f800000, v0
+; GCN-DAZ-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v5, v0
+; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v6, 0x260
+; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, v6
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[4:5]
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v4, v0, v5
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v5, 0.5, v5
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v7, -v5, v4, 0.5
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, v4, v7, v4
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v8, -v4, v4, v0
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v5, v5, v7, v5
+; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, v8, v5, v4
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v5, 0x37800000, v4
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v6
+; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v0, -v0
+; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v1, -v1
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, v1, v3
+; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32_foldable_user:
+; SI-IEEE-SAFE:       ; %bb.0:
+; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s6, 0xf800000
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v1
+; SI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v1
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v4, v1
+; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v5, s[4:5], -1, v4
+; SI-IEEE-SAFE-NEXT:    v_fma_f32 v6, -v5, v4, v1
+; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v6
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v5, v4, v5, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v6, s[4:5], 1, v4
+; SI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v6, v4, v1
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v4
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v4, v5, v6, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v5, 0x37800000, v4
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v6, 0x4f800000, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v0
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v6, v0
+; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v5, 0x260
+; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, v5
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v4, s[4:5], -1, v6
+; SI-IEEE-SAFE-NEXT:    v_fma_f32 v7, -v4, v6, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v7
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v7, s[4:5], 1, v6
+; SI-IEEE-SAFE-NEXT:    v_fma_f32 v6, -v7, v6, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v6
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v6, 0x37800000, v4
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v5
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s6, 0x7f800000
+; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v4, -v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s6
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v4, -v0, v4, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v4, v4
+; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
+; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v4, v0
+; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v4, -v1
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v1|, s6
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v4, -v1, v4, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v4, v4
+; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
+; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v1, vcc, 0, v1
+; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v1, v4, v1
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v2
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, v1, v3
+; SI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+;
+; CI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32_foldable_user:
+; CI-IEEE-SAFE:       ; %bb.0:
+; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-IEEE-SAFE-NEXT:    s_mov_b32 s6, 0xf800000
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v1
+; CI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v1
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v4, v1
+; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v5, s[4:5], -1, v4
+; CI-IEEE-SAFE-NEXT:    v_fma_f32 v6, -v5, v4, v1
+; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v6
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v5, v4, v5, s[4:5]
+; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v6, s[4:5], 1, v4
+; CI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v6, v4, v1
+; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v4
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v4, v5, v6, s[4:5]
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v5, 0x37800000, v4
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v6, 0x4f800000, v0
+; CI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v0
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v6, v0
+; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v5, 0x260
+; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, v5
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[4:5]
+; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v4, s[4:5], -1, v6
+; CI-IEEE-SAFE-NEXT:    v_fma_f32 v7, -v4, v6, v0
+; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v7
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[4:5]
+; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v7, s[4:5], 1, v6
+; CI-IEEE-SAFE-NEXT:    v_fma_f32 v6, -v7, v6, v0
+; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v6
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[4:5]
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v6, 0x37800000, v4
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v5
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v4, -v0
+; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v4, v4
+; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
+; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v4, v0
+; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v4, -v1
+; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v4, v4
+; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
+; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v1, vcc, 0, v1
+; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v1, v4, v1
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v2
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, v1, v3
+; CI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val0)
+  %div = fdiv contract <2 x float> <float -1.0, float -1.0>, %sqrt, !fpmath !0
+  %user = fmul contract <2 x float> %div, %val1
+  ret <2 x float> %user
+}
+
+define float @v_rsq_f32(float %val) {
+; GCN-DAZ-LABEL: v_rsq_f32:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-IEEE-SAFE-LABEL: v_rsq_f32:
+; GCN-IEEE-SAFE:       ; %bb.0:
+; GCN-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x800000
+; GCN-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, 0, 24, vcc
+; GCN-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GCN-IEEE-SAFE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, 0, 12, vcc
+; GCN-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GCN-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1
+  %div = fdiv contract float 1.0, %sqrt, !fpmath !1
+  ret float %div
+}
+
+define { float, float } @v_rsq_f32_multi_use(float %val) {
+; GCN-DAZ-SAFE-LABEL: v_rsq_f32_multi_use:
+; GCN-DAZ-SAFE:       ; %bb.0:
+; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-SAFE-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e32 v1, v0
+; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-IEEE-SAFE-LABEL: v_rsq_f32_multi_use:
+; SI-IEEE-SAFE:       ; %bb.0:
+; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v1, v0
+; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v2, s[4:5], -1, v1
+; SI-IEEE-SAFE-NEXT:    v_fma_f32 v3, -v2, v1, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v3
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], 1, v1
+; SI-IEEE-SAFE-NEXT:    v_fma_f32 v1, -v3, v1, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v1
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v2, v3, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
+; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x7f800000
+; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e32 v1, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
+; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
+; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
+; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v2, vcc, 0, v2
+; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v1, v1, v2
+; SI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+;
+; CI-IEEE-SAFE-LABEL: v_rsq_f32_multi_use:
+; CI-IEEE-SAFE:       ; %bb.0:
+; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
+; CI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v1, v0
+; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v2, s[4:5], -1, v1
+; CI-IEEE-SAFE-NEXT:    v_fma_f32 v3, -v2, v1, v0
+; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v3
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[4:5]
+; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], 1, v1
+; CI-IEEE-SAFE-NEXT:    v_fma_f32 v1, -v3, v1, v0
+; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v1
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v2, v3, s[4:5]
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
+; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e32 v1, v0
+; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
+; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
+; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v2, vcc, 0, v2
+; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v1, v1, v2
+; CI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1
+  %insert.0 = insertvalue { float, float } poison, float %sqrt, 0
+  %div = fdiv contract float 1.0, %sqrt, !fpmath !1
+  %insert.1 = insertvalue { float, float } %insert.0, float %div, 1
+  ret { float, float } %insert.1
+}
+
+define float @v_rsq_f32_missing_contract0(float %val) {
+; GCN-DAZ-SAFE-LABEL: v_rsq_f32_missing_contract0:
+; GCN-DAZ-SAFE:       ; %bb.0:
+; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-SAFE-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e32 v0, v0
+; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-IEEE-SAFE-LABEL: v_rsq_f32_missing_contract0:
+; SI-IEEE-SAFE:       ; %bb.0:
+; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v1, v0
+; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v2, s[4:5], -1, v1
+; SI-IEEE-SAFE-NEXT:    v_fma_f32 v3, -v2, v1, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v3
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], 1, v1
+; SI-IEEE-SAFE-NEXT:    v_fma_f32 v1, -v3, v1, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v1
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v2, v3, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
+; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x7f800000
+; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e32 v1, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
+; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
+; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
+; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; SI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+;
+; CI-IEEE-SAFE-LABEL: v_rsq_f32_missing_contract0:
+; CI-IEEE-SAFE:       ; %bb.0:
+; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
+; CI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v1, v0
+; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v2, s[4:5], -1, v1
+; CI-IEEE-SAFE-NEXT:    v_fma_f32 v3, -v2, v1, v0
+; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v3
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[4:5]
+; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], 1, v1
+; CI-IEEE-SAFE-NEXT:    v_fma_f32 v1, -v3, v1, v0
+; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v1
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v2, v3, s[4:5]
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
+; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e32 v1, v0
+; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
+; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
+; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; CI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call float @llvm.sqrt.f32(float %val), !fpmath !1
+  %div = fdiv contract float 1.0, %sqrt, !fpmath !1
+  ret float %div
+}
+
+define float @v_rsq_f32_missing_contract1(float %val) {
+; GCN-DAZ-SAFE-LABEL: v_rsq_f32_missing_contract1:
+; GCN-DAZ-SAFE:       ; %bb.0:
+; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-SAFE-NEXT:    v_sqrt_f32_e32 v0, v0
+; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e32 v0, v0
+; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
+;
+; SI-IEEE-SAFE-LABEL: v_rsq_f32_missing_contract1:
+; SI-IEEE-SAFE:       ; %bb.0:
+; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v1, v0
+; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v2, s[4:5], -1, v1
+; SI-IEEE-SAFE-NEXT:    v_fma_f32 v3, -v2, v1, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v3
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], 1, v1
+; SI-IEEE-SAFE-NEXT:    v_fma_f32 v1, -v3, v1, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v1
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v2, v3, s[4:5]
+; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
+; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x7f800000
+; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e32 v1, v0
+; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
+; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
+; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
+; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; SI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+;
+; CI-IEEE-SAFE-LABEL: v_rsq_f32_missing_contract1:
+; CI-IEEE-SAFE:       ; %bb.0:
+; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
+; CI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v1, v0
+; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v2, s[4:5], -1, v1
+; CI-IEEE-SAFE-NEXT:    v_fma_f32 v3, -v2, v1, v0
+; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v3
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[4:5]
+; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], 1, v1
+; CI-IEEE-SAFE-NEXT:    v_fma_f32 v1, -v3, v1, v0
+; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v1
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v2, v3, s[4:5]
+; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
+; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
+; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e32 v1, v0
+; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
+; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
+; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
+; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
+; CI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1
+  %div = fdiv float 1.0, %sqrt, !fpmath !1
+  ret float %div
+}
+
+; Test that we contract into FMA for an fadd user after introducing
+; the fmul.
+define float @v_rsq_f32_contractable_user(float %val0, float %val1) {
+; GCN-DAZ-LABEL: v_rsq_f32_contractable_user:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    v_add_f32_e32 v0, v0, v1
+; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-IEEE-SAFE-LABEL: v_rsq_f32_contractable_user:
+; GCN-IEEE-SAFE:       ; %bb.0:
+; GCN-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x800000
+; GCN-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, 0, 24, vcc
+; GCN-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
+; GCN-IEEE-SAFE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-IEEE-SAFE-NEXT:    v_mov_b32_e32 v2, 0x45800000
+; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
+; GCN-IEEE-SAFE-NEXT:    v_fma_f32 v0, v0, v2, v1
+; GCN-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call contract float @llvm.sqrt.f32(float %val0), !fpmath !1
+  %div = fdiv contract float 1.0, %sqrt, !fpmath !1
+  %add = fadd contract float %div, %val1
+  ret float %add
+}
+
+; Missing contract on the fdiv
+define float @v_rsq_f32_contractable_user_missing_contract0(float %val0, float %val1) {
+; GCN-DAZ-LABEL: v_rsq_f32_contractable_user_missing_contract0:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    v_add_f32_e32 v0, v0, v1
+; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-IEEE-SAFE-LABEL: v_rsq_f32_contractable_user_missing_contract0:
+; GCN-IEEE-SAFE:       ; %bb.0:
+; GCN-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x800000
+; GCN-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, 0, 24, vcc
+; GCN-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
+; GCN-IEEE-SAFE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-IEEE-SAFE-NEXT:    v_mov_b32_e32 v2, 0x45800000
+; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
+; GCN-IEEE-SAFE-NEXT:    v_fma_f32 v0, v0, v2, v1
+; GCN-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call contract float @llvm.sqrt.f32(float %val0), !fpmath !1
+  %div = fdiv contract float 1.0, %sqrt, !fpmath !1
+  %add = fadd contract float %div, %val1
+  ret float %add
+}
+
+; Missing contract on the fadd
+define float @v_rsq_f32_contractable_user_missing_contract1(float %val0, float %val1) {
+; GCN-DAZ-LABEL: v_rsq_f32_contractable_user_missing_contract1:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    v_add_f32_e32 v0, v0, v1
+; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-IEEE-SAFE-LABEL: v_rsq_f32_contractable_user_missing_contract1:
+; GCN-IEEE-SAFE:       ; %bb.0:
+; GCN-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x800000
+; GCN-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, 0, 24, vcc
+; GCN-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
+; GCN-IEEE-SAFE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, 0, 12, vcc
+; GCN-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
+; GCN-IEEE-SAFE-NEXT:    v_add_f32_e32 v0, v0, v1
+; GCN-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call contract float @llvm.sqrt.f32(float %val0), !fpmath !1
+  %div = fdiv contract float 1.0, %sqrt, !fpmath !1
+  %add = fadd float %div, %val1
+  ret float %add
+}
+
+define float @v_rsq_f32_known_never_denormal(float nofpclass(sub) %val) {
+; GCN-DAZ-LABEL: v_rsq_f32_known_never_denormal:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-IEEE-LABEL: v_rsq_f32_known_never_denormal:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1
+  %div = fdiv contract float 1.0, %sqrt, !fpmath !1
+  ret float %div
+}
+
+define float @v_rsq_f32_known_never_posdenormal(float nofpclass(psub) %val) {
+; GCN-DAZ-LABEL: v_rsq_f32_known_never_posdenormal:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-IEEE-SAFE-LABEL: v_rsq_f32_known_never_posdenormal:
+; GCN-IEEE-SAFE:       ; %bb.0:
+; GCN-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x800000
+; GCN-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, 0, 24, vcc
+; GCN-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GCN-IEEE-SAFE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, 0, 12, vcc
+; GCN-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
+; GCN-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1
+  %div = fdiv contract float 1.0, %sqrt, !fpmath !1
+  ret float %div
+}
+
+!0 = !{float 2.500000e+00}
+!1 = !{float 1.000000e+00}
+
+attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CI-DAZ-SAFE: {{.*}}
+; SI-DAZ-SAFE: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f32.ll b/llvm/test/CodeGen/AMDGPU/rsq.f32.ll
index f4b947ade8dac..f7e0388561104 100644
--- a/llvm/test/CodeGen/AMDGPU/rsq.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/rsq.f32.ll
@@ -2,175 +2,51 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GCN-DAZ,GCN-DAZ-UNSAFE,SI-DAZ-UNSAFE %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee -enable-unsafe-fp-math < %s          | FileCheck -check-prefixes=GCN-IEEE,GCN-IEEE-UNSAFE,SI-IEEE-UNSAFE %s
 
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GCN-DAZ,GCN-DAZ-SAFE,SI-DAZ-SAFE %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee < %s          | FileCheck -check-prefixes=GCN-IEEE,GCN-IEEE-SAFE,SI-IEEE-SAFE %s
-
 
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=hawaii -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GCN-DAZ,GCN-DAZ-UNSAFE,CI-DAZ-UNSAFE %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=hawaii -denormal-fp-math-f32=ieee -enable-unsafe-fp-math < %s          | FileCheck -check-prefixes=GCN-IEEE,GCN-IEEE-UNSAFE,CI-IEEE-UNSAFE %s
 
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=hawaii -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefixes=GCN-DAZ,GCN-DAZ-SAFE,CI-DAZ-SAFE %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=hawaii -denormal-fp-math-f32=ieee < %s          | FileCheck -check-prefixes=GCN-IEEE,GCN-IEEE-SAFE,CI-IEEE-SAFE %s
-
 
 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 declare float @llvm.sqrt.f32(float) nounwind readnone
 declare <2 x float> @llvm.sqrt.v2f32(<2 x float>) nounwind readnone
 
 define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) {
-; GCN-DAZ-UNSAFE-LABEL: rsq_f32:
-; GCN-DAZ-UNSAFE:       ; %bb.0:
-; GCN-DAZ-UNSAFE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-DAZ-UNSAFE-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-DAZ-UNSAFE-NEXT:    s_mov_b32 s6, -1
-; GCN-DAZ-UNSAFE-NEXT:    s_mov_b32 s10, s6
-; GCN-DAZ-UNSAFE-NEXT:    s_mov_b32 s11, s7
-; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-DAZ-UNSAFE-NEXT:    s_mov_b32 s8, s2
-; GCN-DAZ-UNSAFE-NEXT:    s_mov_b32 s9, s3
-; GCN-DAZ-UNSAFE-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GCN-DAZ-UNSAFE-NEXT:    s_mov_b32 s4, s0
-; GCN-DAZ-UNSAFE-NEXT:    s_mov_b32 s5, s1
-; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt vmcnt(0)
-; GCN-DAZ-UNSAFE-NEXT:    v_rsq_f32_e32 v0, v0
-; GCN-DAZ-UNSAFE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; GCN-DAZ-UNSAFE-NEXT:    s_endpgm
-;
-; GCN-IEEE-UNSAFE-LABEL: rsq_f32:
-; GCN-IEEE-UNSAFE:       ; %bb.0:
-; GCN-IEEE-UNSAFE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-IEEE-UNSAFE-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-IEEE-UNSAFE-NEXT:    s_mov_b32 s6, -1
-; GCN-IEEE-UNSAFE-NEXT:    s_mov_b32 s10, s6
-; GCN-IEEE-UNSAFE-NEXT:    s_mov_b32 s11, s7
-; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IEEE-UNSAFE-NEXT:    s_mov_b32 s8, s2
-; GCN-IEEE-UNSAFE-NEXT:    s_mov_b32 s9, s3
-; GCN-IEEE-UNSAFE-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GCN-IEEE-UNSAFE-NEXT:    s_mov_b32 s4, s0
-; GCN-IEEE-UNSAFE-NEXT:    s_mov_b32 s5, s1
-; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0)
-; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e32 v0, v0
-; GCN-IEEE-UNSAFE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; GCN-IEEE-UNSAFE-NEXT:    s_endpgm
-;
-; GCN-DAZ-SAFE-LABEL: rsq_f32:
-; GCN-DAZ-SAFE:       ; %bb.0:
-; GCN-DAZ-SAFE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s6, -1
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s10, s6
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s11, s7
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s8, s2
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s9, s3
-; GCN-DAZ-SAFE-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s2, 0xf800000
-; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s4, s0
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s5, s1
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0)
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
-; GCN-DAZ-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v0
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v1, v0
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, v0, v1
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, 0.5, v1
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, -v1, v3, 0.5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v3, v3, v4, v3
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v1, v1, v4, v1
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, -v3, v3, v0
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v1, v4, v1, v3
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v1
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e32 v0, v0
-; GCN-DAZ-SAFE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; GCN-DAZ-SAFE-NEXT:    s_endpgm
-;
-; SI-IEEE-SAFE-LABEL: rsq_f32:
-; SI-IEEE-SAFE:       ; %bb.0:
-; SI-IEEE-SAFE-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s7, 0xf000
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s6, -1
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s2, s6
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s3, s7
-; SI-IEEE-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s0, s10
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s1, s11
-; SI-IEEE-SAFE-NEXT:    buffer_load_dword v0, off, s[0:3], 0
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s0, 0xf800000
-; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v1, 0x260
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s2, 0x7f800000
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, s8
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s5, s9
-; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0)
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e64 s[0:1], s0, v0
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v2, v0
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e32 v3, vcc, -1, v2
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v5, -v3, v2, v0
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v6, -v4, v2, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e32 vcc, 0, v5
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, 0, v6
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e32 v1, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s2
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
-; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
-; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
-; SI-IEEE-SAFE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; SI-IEEE-SAFE-NEXT:    s_endpgm
+; GCN-DAZ-LABEL: rsq_f32:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-DAZ-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-DAZ-NEXT:    s_mov_b32 s6, -1
+; GCN-DAZ-NEXT:    s_mov_b32 s10, s6
+; GCN-DAZ-NEXT:    s_mov_b32 s11, s7
+; GCN-DAZ-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-DAZ-NEXT:    s_mov_b32 s8, s2
+; GCN-DAZ-NEXT:    s_mov_b32 s9, s3
+; GCN-DAZ-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; GCN-DAZ-NEXT:    s_mov_b32 s4, s0
+; GCN-DAZ-NEXT:    s_mov_b32 s5, s1
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0)
+; GCN-DAZ-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-DAZ-NEXT:    s_endpgm
 ;
-; CI-IEEE-SAFE-LABEL: rsq_f32:
-; CI-IEEE-SAFE:       ; %bb.0:
-; CI-IEEE-SAFE-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s7, 0xf000
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s6, -1
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s2, s6
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s3, s7
-; CI-IEEE-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s0, s10
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s1, s11
-; CI-IEEE-SAFE-NEXT:    buffer_load_dword v0, off, s[0:3], 0
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s0, 0xf800000
-; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v1, 0x260
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s4, s8
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s5, s9
-; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0)
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e64 s[0:1], s0, v0
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v2, v0
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e32 v3, vcc, -1, v2
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v5, -v3, v2, v0
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v6, -v4, v2, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e32 vcc, 0, v5
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, 0, v6
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e32 v1, v0
-; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
-; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
-; CI-IEEE-SAFE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; CI-IEEE-SAFE-NEXT:    s_endpgm
+; GCN-IEEE-LABEL: rsq_f32:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-IEEE-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-IEEE-NEXT:    s_mov_b32 s6, -1
+; GCN-IEEE-NEXT:    s_mov_b32 s10, s6
+; GCN-IEEE-NEXT:    s_mov_b32 s11, s7
+; GCN-IEEE-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-IEEE-NEXT:    s_mov_b32 s8, s2
+; GCN-IEEE-NEXT:    s_mov_b32 s9, s3
+; GCN-IEEE-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; GCN-IEEE-NEXT:    s_mov_b32 s4, s0
+; GCN-IEEE-NEXT:    s_mov_b32 s5, s1
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0)
+; GCN-IEEE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-IEEE-NEXT:    s_endpgm
 ; GCN-UNSAFE-LABEL: rsq_f32:
 ; GCN-UNSAFE:       ; %bb.0:
 ; GCN-UNSAFE-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -196,131 +72,27 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace(
 }
 
 define amdgpu_kernel void @rsq_f32_sgpr(ptr addrspace(1) noalias %out, float %val) {
-; GCN-DAZ-UNSAFE-LABEL: rsq_f32_sgpr:
-; GCN-DAZ-UNSAFE:       ; %bb.0:
-; GCN-DAZ-UNSAFE-NEXT:    s_load_dword s2, s[4:5], 0xb
-; GCN-DAZ-UNSAFE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GCN-DAZ-UNSAFE-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-DAZ-UNSAFE-NEXT:    v_rsq_f32_e32 v0, s2
-; GCN-DAZ-UNSAFE-NEXT:    s_mov_b32 s2, -1
-; GCN-DAZ-UNSAFE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; GCN-DAZ-UNSAFE-NEXT:    s_endpgm
-;
-; GCN-IEEE-UNSAFE-LABEL: rsq_f32_sgpr:
-; GCN-IEEE-UNSAFE:       ; %bb.0:
-; GCN-IEEE-UNSAFE-NEXT:    s_load_dword s2, s[4:5], 0xb
-; GCN-IEEE-UNSAFE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GCN-IEEE-UNSAFE-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e32 v0, s2
-; GCN-IEEE-UNSAFE-NEXT:    s_mov_b32 s2, -1
-; GCN-IEEE-UNSAFE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; GCN-IEEE-UNSAFE-NEXT:    s_endpgm
-;
-; GCN-DAZ-SAFE-LABEL: rsq_f32_sgpr:
-; GCN-DAZ-SAFE:       ; %bb.0:
-; GCN-DAZ-SAFE-NEXT:    s_load_dword s0, s[4:5], 0xb
-; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v0, 0xf800000
-; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s2, -1
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, s0, v1
-; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v2, s0
-; GCN-DAZ-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s0, v0
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v1, v0
-; GCN-DAZ-SAFE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, v0, v1
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, 0.5, v1
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v3, -v1, v2, 0.5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v2, v3, v2
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v1, v1, v3, v1
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v3, -v2, v2, v0
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v1, v3, v1, v2
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
-; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e32 v0, v0
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
-; GCN-DAZ-SAFE-NEXT:    s_endpgm
-;
-; SI-IEEE-SAFE-LABEL: rsq_f32_sgpr:
-; SI-IEEE-SAFE:       ; %bb.0:
-; SI-IEEE-SAFE-NEXT:    s_load_dword s0, s[4:5], 0xb
-; SI-IEEE-SAFE-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
-; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v0, 0xf800000
-; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s7, 0xf000
-; SI-IEEE-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, s0, v1
-; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v2, s0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[0:1], s0, v0
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[0:1]
-; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v1, v0
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s6, -1
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e32 v2, vcc, -1, v1
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v3, -v2, v1, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e32 vcc, 0, v3
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v1, -v3, v1, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, 0, v1
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
-; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s0, 0x7f800000
-; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e32 v1, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s0
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
-; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
-; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
-; SI-IEEE-SAFE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; SI-IEEE-SAFE-NEXT:    s_endpgm
-;
-; CI-IEEE-SAFE-LABEL: rsq_f32_sgpr:
-; CI-IEEE-SAFE:       ; %bb.0:
-; CI-IEEE-SAFE-NEXT:    s_load_dword s0, s[4:5], 0xb
-; CI-IEEE-SAFE-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
-; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v0, 0xf800000
-; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s7, 0xf000
-; CI-IEEE-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, s0, v1
-; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v2, s0
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[0:1], s0, v0
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, v2, v1, s[0:1]
-; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v1, v0
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s6, -1
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e32 v2, vcc, -1, v1
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v3, -v2, v1, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e32 vcc, 0, v3
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e32 v3, vcc, 1, v1
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v1, -v3, v1, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, 0, v1
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v1, v2, s[0:1]
-; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
-; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e32 v1, v0
-; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
-; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
-; CI-IEEE-SAFE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; CI-IEEE-SAFE-NEXT:    s_endpgm
+; GCN-DAZ-LABEL: rsq_f32_sgpr:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_load_dword s2, s[4:5], 0xb
+; GCN-DAZ-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GCN-DAZ-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-DAZ-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-DAZ-NEXT:    v_rsq_f32_e32 v0, s2
+; GCN-DAZ-NEXT:    s_mov_b32 s2, -1
+; GCN-DAZ-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-DAZ-NEXT:    s_endpgm
+;
+; GCN-IEEE-LABEL: rsq_f32_sgpr:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_load_dword s2, s[4:5], 0xb
+; GCN-IEEE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GCN-IEEE-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-IEEE-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-IEEE-NEXT:    v_rsq_f32_e32 v0, s2
+; GCN-IEEE-NEXT:    s_mov_b32 s2, -1
+; GCN-IEEE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-IEEE-NEXT:    s_endpgm
 ; GCN-UNSAFE-LABEL: rsq_f32_sgpr:
 ; GCN-UNSAFE:       ; %bb.0:
 ; GCN-UNSAFE-NEXT:    s_load_dword s2, s[0:1], 0xb
@@ -365,154 +137,53 @@ define amdgpu_kernel void @rsqrt_fmul(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-UNSAFE-NEXT:    v_mul_f32_e32 v2, v4, v2
 ; GCN-UNSAFE-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
 ; GCN-UNSAFE-NEXT:    s_endpgm
-; GCN-DAZ-UNSAFE-LABEL: rsqrt_fmul:
-; GCN-DAZ-UNSAFE:       ; %bb.0:
-; GCN-DAZ-UNSAFE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-DAZ-UNSAFE-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-DAZ-UNSAFE-NEXT:    s_mov_b32 s6, 0
-; GCN-DAZ-UNSAFE-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GCN-DAZ-UNSAFE-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-DAZ-UNSAFE-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; GCN-DAZ-UNSAFE-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GCN-DAZ-UNSAFE-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt vmcnt(0)
-; GCN-DAZ-UNSAFE-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:4 glc
-; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt vmcnt(0)
-; GCN-DAZ-UNSAFE-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:8 glc
-; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt vmcnt(0)
-; GCN-DAZ-UNSAFE-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GCN-DAZ-UNSAFE-NEXT:    v_rsq_f32_e32 v2, v2
-; GCN-DAZ-UNSAFE-NEXT:    v_rcp_f32_e32 v3, v3
-; GCN-DAZ-UNSAFE-NEXT:    v_mul_f32_e32 v2, v2, v3
-; GCN-DAZ-UNSAFE-NEXT:    v_mul_f32_e32 v2, v4, v2
-; GCN-DAZ-UNSAFE-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
-; GCN-DAZ-UNSAFE-NEXT:    s_endpgm
-;
-; GCN-IEEE-UNSAFE-LABEL: rsqrt_fmul:
-; GCN-IEEE-UNSAFE:       ; %bb.0:
-; GCN-IEEE-UNSAFE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-IEEE-UNSAFE-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-IEEE-UNSAFE-NEXT:    s_mov_b32 s6, 0
-; GCN-IEEE-UNSAFE-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GCN-IEEE-UNSAFE-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IEEE-UNSAFE-NEXT:    s_mov_b64 s[8:9], s[2:3]
-; GCN-IEEE-UNSAFE-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GCN-IEEE-UNSAFE-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0)
-; GCN-IEEE-UNSAFE-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:4 glc
-; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0)
-; GCN-IEEE-UNSAFE-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:8 glc
-; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0)
-; GCN-IEEE-UNSAFE-NEXT:    s_mov_b64 s[4:5], s[0:1]
-; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e32 v2, v2
-; GCN-IEEE-UNSAFE-NEXT:    v_rcp_f32_e32 v3, v3
-; GCN-IEEE-UNSAFE-NEXT:    v_mul_f32_e32 v2, v2, v3
-; GCN-IEEE-UNSAFE-NEXT:    v_mul_f32_e32 v2, v4, v2
-; GCN-IEEE-UNSAFE-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
-; GCN-IEEE-UNSAFE-NEXT:    s_endpgm
-;
-; GCN-DAZ-SAFE-LABEL: rsqrt_fmul:
-; GCN-DAZ-SAFE:       ; %bb.0:
-; GCN-DAZ-SAFE-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s2, 0
-; GCN-DAZ-SAFE-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT:    s_mov_b64 s[8:9], s[6:7]
-; GCN-DAZ-SAFE-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GCN-DAZ-SAFE-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0)
-; GCN-DAZ-SAFE-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:4 glc
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0)
-; GCN-DAZ-SAFE-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:8 glc
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0)
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s0, 0xf800000
-; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v6, 0x260
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v5, 0x4f800000, v2
-; GCN-DAZ-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v2
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v5, v2
-; GCN-DAZ-SAFE-NEXT:    s_mov_b64 s[0:1], s[4:5]
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v7, v2, v5
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v5, 0.5, v5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v8, -v5, v7, 0.5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v7, v7, v8, v7
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v5, v5, v8, v5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v8, -v7, v7, v2
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v5, v8, v5, v7
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v7, 0x37800000, v5
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
-; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v2, v6
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, v2, v3
-; GCN-DAZ-SAFE-NEXT:    v_div_scale_f32 v3, s[4:5], v2, v2, v4
-; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e32 v5, v3
-; GCN-DAZ-SAFE-NEXT:    v_div_scale_f32 v6, vcc, v4, v2, v4
-; GCN-DAZ-SAFE-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v7, -v3, v5, 1.0
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v5, v7, v5, v5
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v7, v6, v5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v8, -v3, v7, v6
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v7, v8, v5, v7
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v3, -v3, v7, v6
-; GCN-DAZ-SAFE-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GCN-DAZ-SAFE-NEXT:    v_div_fmas_f32 v3, v3, v5, v7
-; GCN-DAZ-SAFE-NEXT:    v_div_fixup_f32 v2, v3, v2, v4
-; GCN-DAZ-SAFE-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; GCN-DAZ-SAFE-NEXT:    s_endpgm
-;
-; GCN-IEEE-SAFE-LABEL: rsqrt_fmul:
-; GCN-IEEE-SAFE:       ; %bb.0:
-; GCN-IEEE-SAFE-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
-; GCN-IEEE-SAFE-NEXT:    s_mov_b32 s3, 0xf000
-; GCN-IEEE-SAFE-NEXT:    s_mov_b32 s2, 0
-; GCN-IEEE-SAFE-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GCN-IEEE-SAFE-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-IEEE-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IEEE-SAFE-NEXT:    s_mov_b64 s[8:9], s[6:7]
-; GCN-IEEE-SAFE-NEXT:    s_mov_b64 s[10:11], s[2:3]
-; GCN-IEEE-SAFE-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
-; GCN-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0)
-; GCN-IEEE-SAFE-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:4 glc
-; GCN-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0)
-; GCN-IEEE-SAFE-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:8 glc
-; GCN-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0)
-; GCN-IEEE-SAFE-NEXT:    s_mov_b32 s0, 0xf800000
-; GCN-IEEE-SAFE-NEXT:    v_mov_b32_e32 v6, 0x260
-; GCN-IEEE-SAFE-NEXT:    v_mul_f32_e32 v5, 0x4f800000, v2
-; GCN-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v2
-; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GCN-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v5, v2
-; GCN-IEEE-SAFE-NEXT:    v_add_i32_e64 v7, s[0:1], -1, v5
-; GCN-IEEE-SAFE-NEXT:    v_add_i32_e64 v8, s[0:1], 1, v5
-; GCN-IEEE-SAFE-NEXT:    v_fma_f32 v9, -v7, v5, v2
-; GCN-IEEE-SAFE-NEXT:    v_fma_f32 v10, -v8, v5, v2
-; GCN-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[0:1], 0, v9
-; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v5, v5, v7, s[0:1]
-; GCN-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[0:1], 0, v10
-; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v5, v5, v8, s[0:1]
-; GCN-IEEE-SAFE-NEXT:    v_mul_f32_e32 v7, 0x37800000, v5
-; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
-; GCN-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v2, v6
-; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
-; GCN-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, v2, v3
-; GCN-IEEE-SAFE-NEXT:    v_div_scale_f32 v3, s[0:1], v2, v2, v4
-; GCN-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v5, v3
-; GCN-IEEE-SAFE-NEXT:    v_div_scale_f32 v6, vcc, v4, v2, v4
-; GCN-IEEE-SAFE-NEXT:    s_mov_b64 s[0:1], s[4:5]
-; GCN-IEEE-SAFE-NEXT:    v_fma_f32 v7, -v3, v5, 1.0
-; GCN-IEEE-SAFE-NEXT:    v_fma_f32 v5, v7, v5, v5
-; GCN-IEEE-SAFE-NEXT:    v_mul_f32_e32 v7, v6, v5
-; GCN-IEEE-SAFE-NEXT:    v_fma_f32 v8, -v3, v7, v6
-; GCN-IEEE-SAFE-NEXT:    v_fma_f32 v7, v8, v5, v7
-; GCN-IEEE-SAFE-NEXT:    v_fma_f32 v3, -v3, v7, v6
-; GCN-IEEE-SAFE-NEXT:    v_div_fmas_f32 v3, v3, v5, v7
-; GCN-IEEE-SAFE-NEXT:    v_div_fixup_f32 v2, v3, v2, v4
-; GCN-IEEE-SAFE-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
-; GCN-IEEE-SAFE-NEXT:    s_endpgm
+; GCN-DAZ-LABEL: rsqrt_fmul:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-DAZ-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-DAZ-NEXT:    s_mov_b32 s6, 0
+; GCN-DAZ-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GCN-DAZ-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-DAZ-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-DAZ-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GCN-DAZ-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GCN-DAZ-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0)
+; GCN-DAZ-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:4 glc
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0)
+; GCN-DAZ-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:8 glc
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0)
+; GCN-DAZ-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GCN-DAZ-NEXT:    v_rsq_f32_e32 v2, v2
+; GCN-DAZ-NEXT:    v_rcp_f32_e32 v3, v3
+; GCN-DAZ-NEXT:    v_mul_f32_e32 v2, v2, v3
+; GCN-DAZ-NEXT:    v_mul_f32_e32 v2, v4, v2
+; GCN-DAZ-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-DAZ-NEXT:    s_endpgm
+;
+; GCN-IEEE-LABEL: rsqrt_fmul:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-IEEE-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-IEEE-NEXT:    s_mov_b32 s6, 0
+; GCN-IEEE-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GCN-IEEE-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-IEEE-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-IEEE-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GCN-IEEE-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GCN-IEEE-NEXT:    buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 glc
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0)
+; GCN-IEEE-NEXT:    buffer_load_dword v3, v[0:1], s[8:11], 0 addr64 offset:4 glc
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0)
+; GCN-IEEE-NEXT:    buffer_load_dword v4, v[0:1], s[8:11], 0 addr64 offset:8 glc
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0)
+; GCN-IEEE-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GCN-IEEE-NEXT:    v_rsq_f32_e32 v2, v2
+; GCN-IEEE-NEXT:    v_rcp_f32_e32 v3, v3
+; GCN-IEEE-NEXT:    v_mul_f32_e32 v2, v2, v3
+; GCN-IEEE-NEXT:    v_mul_f32_e32 v2, v4, v2
+; GCN-IEEE-NEXT:    buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
+; GCN-IEEE-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %out.gep = getelementptr float, ptr addrspace(1) %out, i32 %tid
   %gep.0 = getelementptr float, ptr addrspace(1) %in, i32 %tid
@@ -525,167 +196,49 @@ define amdgpu_kernel void @rsqrt_fmul(ptr addrspace(1) %out, ptr addrspace(1) %i
 
   %x = call contract float @llvm.sqrt.f32(float %a)
   %y = fmul contract float %x, %b
-  %z = fdiv contract float %c, %y
+  %z = fdiv arcp contract float %c, %y
   store float %z, ptr addrspace(1) %out.gep
   ret void
 }
 
 define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) {
-; GCN-DAZ-UNSAFE-LABEL: neg_rsq_f32:
-; GCN-DAZ-UNSAFE:       ; %bb.0:
-; GCN-DAZ-UNSAFE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-DAZ-UNSAFE-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-DAZ-UNSAFE-NEXT:    s_mov_b32 s6, -1
-; GCN-DAZ-UNSAFE-NEXT:    s_mov_b32 s10, s6
-; GCN-DAZ-UNSAFE-NEXT:    s_mov_b32 s11, s7
-; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-DAZ-UNSAFE-NEXT:    s_mov_b32 s8, s2
-; GCN-DAZ-UNSAFE-NEXT:    s_mov_b32 s9, s3
-; GCN-DAZ-UNSAFE-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GCN-DAZ-UNSAFE-NEXT:    s_mov_b32 s4, s0
-; GCN-DAZ-UNSAFE-NEXT:    s_mov_b32 s5, s1
-; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt vmcnt(0)
-; GCN-DAZ-UNSAFE-NEXT:    v_rsq_f32_e32 v0, v0
-; GCN-DAZ-UNSAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; GCN-DAZ-UNSAFE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; GCN-DAZ-UNSAFE-NEXT:    s_endpgm
-;
-; GCN-IEEE-UNSAFE-LABEL: neg_rsq_f32:
-; GCN-IEEE-UNSAFE:       ; %bb.0:
-; GCN-IEEE-UNSAFE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-IEEE-UNSAFE-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-IEEE-UNSAFE-NEXT:    s_mov_b32 s6, -1
-; GCN-IEEE-UNSAFE-NEXT:    s_mov_b32 s10, s6
-; GCN-IEEE-UNSAFE-NEXT:    s_mov_b32 s11, s7
-; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IEEE-UNSAFE-NEXT:    s_mov_b32 s8, s2
-; GCN-IEEE-UNSAFE-NEXT:    s_mov_b32 s9, s3
-; GCN-IEEE-UNSAFE-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GCN-IEEE-UNSAFE-NEXT:    s_mov_b32 s4, s0
-; GCN-IEEE-UNSAFE-NEXT:    s_mov_b32 s5, s1
-; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0)
-; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e32 v0, v0
-; GCN-IEEE-UNSAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; GCN-IEEE-UNSAFE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; GCN-IEEE-UNSAFE-NEXT:    s_endpgm
-;
-; GCN-DAZ-SAFE-LABEL: neg_rsq_f32:
-; GCN-DAZ-SAFE:       ; %bb.0:
-; GCN-DAZ-SAFE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s6, -1
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s10, s6
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s11, s7
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s8, s2
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s9, s3
-; GCN-DAZ-SAFE-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s2, 0xf800000
-; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s4, s0
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s5, s1
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0)
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
-; GCN-DAZ-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v0
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v1, v0
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, v0, v1
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, 0.5, v1
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, -v1, v3, 0.5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v3, v3, v4, v3
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v1, v1, v4, v1
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, -v3, v3, v0
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v1, v4, v1, v3
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v1
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v0, -v0
-; GCN-DAZ-SAFE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; GCN-DAZ-SAFE-NEXT:    s_endpgm
-;
-; SI-IEEE-SAFE-LABEL: neg_rsq_f32:
-; SI-IEEE-SAFE:       ; %bb.0:
-; SI-IEEE-SAFE-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s7, 0xf000
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s6, -1
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s2, s6
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s3, s7
-; SI-IEEE-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s0, s10
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s1, s11
-; SI-IEEE-SAFE-NEXT:    buffer_load_dword v0, off, s[0:3], 0
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s0, 0xf800000
-; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v1, 0x260
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s2, 0x7f800000
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, s8
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s5, s9
-; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0)
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e64 s[0:1], s0, v0
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v2, v0
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e32 v3, vcc, -1, v2
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v5, -v3, v2, v0
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v6, -v4, v2, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e32 vcc, 0, v5
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, 0, v6
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v1, -v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, s2
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, -v0, v1, s[0:1]
-; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
-; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
-; SI-IEEE-SAFE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; SI-IEEE-SAFE-NEXT:    s_endpgm
+; GCN-DAZ-LABEL: neg_rsq_f32:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-DAZ-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-DAZ-NEXT:    s_mov_b32 s6, -1
+; GCN-DAZ-NEXT:    s_mov_b32 s10, s6
+; GCN-DAZ-NEXT:    s_mov_b32 s11, s7
+; GCN-DAZ-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-DAZ-NEXT:    s_mov_b32 s8, s2
+; GCN-DAZ-NEXT:    s_mov_b32 s9, s3
+; GCN-DAZ-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; GCN-DAZ-NEXT:    s_mov_b32 s4, s0
+; GCN-DAZ-NEXT:    s_mov_b32 s5, s1
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0)
+; GCN-DAZ-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-DAZ-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-DAZ-NEXT:    s_endpgm
 ;
-; CI-IEEE-SAFE-LABEL: neg_rsq_f32:
-; CI-IEEE-SAFE:       ; %bb.0:
-; CI-IEEE-SAFE-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s7, 0xf000
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s6, -1
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s2, s6
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s3, s7
-; CI-IEEE-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s0, s10
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s1, s11
-; CI-IEEE-SAFE-NEXT:    buffer_load_dword v0, off, s[0:3], 0
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s0, 0xf800000
-; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v1, 0x260
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s4, s8
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s5, s9
-; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0)
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e64 s[0:1], s0, v0
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v2, v0
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e32 v3, vcc, -1, v2
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v5, -v3, v2, v0
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v6, -v4, v2, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e32 vcc, 0, v5
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, 0, v6
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v1, -v0
-; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
-; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
-; CI-IEEE-SAFE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; CI-IEEE-SAFE-NEXT:    s_endpgm
+; GCN-IEEE-LABEL: neg_rsq_f32:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-IEEE-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-IEEE-NEXT:    s_mov_b32 s6, -1
+; GCN-IEEE-NEXT:    s_mov_b32 s10, s6
+; GCN-IEEE-NEXT:    s_mov_b32 s11, s7
+; GCN-IEEE-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-IEEE-NEXT:    s_mov_b32 s8, s2
+; GCN-IEEE-NEXT:    s_mov_b32 s9, s3
+; GCN-IEEE-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; GCN-IEEE-NEXT:    s_mov_b32 s4, s0
+; GCN-IEEE-NEXT:    s_mov_b32 s5, s1
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0)
+; GCN-IEEE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-IEEE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-IEEE-NEXT:    s_endpgm
 ; GCN-UNSAFE-LABEL: neg_rsq_f32:
 ; GCN-UNSAFE:       ; %bb.0:
 ; GCN-UNSAFE-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -712,161 +265,43 @@ define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrsp
 }
 
 define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) {
-; GCN-DAZ-UNSAFE-LABEL: neg_rsq_neg_f32:
-; GCN-DAZ-UNSAFE:       ; %bb.0:
-; GCN-DAZ-UNSAFE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-DAZ-UNSAFE-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-DAZ-UNSAFE-NEXT:    s_mov_b32 s6, -1
-; GCN-DAZ-UNSAFE-NEXT:    s_mov_b32 s10, s6
-; GCN-DAZ-UNSAFE-NEXT:    s_mov_b32 s11, s7
-; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-DAZ-UNSAFE-NEXT:    s_mov_b32 s8, s2
-; GCN-DAZ-UNSAFE-NEXT:    s_mov_b32 s9, s3
-; GCN-DAZ-UNSAFE-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GCN-DAZ-UNSAFE-NEXT:    s_mov_b32 s4, s0
-; GCN-DAZ-UNSAFE-NEXT:    s_mov_b32 s5, s1
-; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt vmcnt(0)
-; GCN-DAZ-UNSAFE-NEXT:    v_rsq_f32_e64 v0, -v0
-; GCN-DAZ-UNSAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; GCN-DAZ-UNSAFE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; GCN-DAZ-UNSAFE-NEXT:    s_endpgm
-;
-; GCN-IEEE-UNSAFE-LABEL: neg_rsq_neg_f32:
-; GCN-IEEE-UNSAFE:       ; %bb.0:
-; GCN-IEEE-UNSAFE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-IEEE-UNSAFE-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-IEEE-UNSAFE-NEXT:    s_mov_b32 s6, -1
-; GCN-IEEE-UNSAFE-NEXT:    s_mov_b32 s10, s6
-; GCN-IEEE-UNSAFE-NEXT:    s_mov_b32 s11, s7
-; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-IEEE-UNSAFE-NEXT:    s_mov_b32 s8, s2
-; GCN-IEEE-UNSAFE-NEXT:    s_mov_b32 s9, s3
-; GCN-IEEE-UNSAFE-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GCN-IEEE-UNSAFE-NEXT:    s_mov_b32 s4, s0
-; GCN-IEEE-UNSAFE-NEXT:    s_mov_b32 s5, s1
-; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0)
-; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e64 v0, -v0
-; GCN-IEEE-UNSAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; GCN-IEEE-UNSAFE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; GCN-IEEE-UNSAFE-NEXT:    s_endpgm
-;
-; GCN-DAZ-SAFE-LABEL: neg_rsq_neg_f32:
-; GCN-DAZ-SAFE:       ; %bb.0:
-; GCN-DAZ-SAFE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s6, -1
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s10, s6
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s11, s7
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s8, s2
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s9, s3
-; GCN-DAZ-SAFE-NEXT:    buffer_load_dword v0, off, s[8:11], 0
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s2, 0x8f800000
-; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s4, s0
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s5, s1
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0)
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, 0xcf800000, v0
-; GCN-DAZ-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v0
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v1, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v1, v0
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, v0, v1
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, 0.5, v1
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, -v1, v3, 0.5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v3, v3, v4, v3
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v1, v1, v4, v1
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, -v3, v3, v0
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v1, v4, v1, v3
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v1
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v0, -v0
-; GCN-DAZ-SAFE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; GCN-DAZ-SAFE-NEXT:    s_endpgm
-;
-; SI-IEEE-SAFE-LABEL: neg_rsq_neg_f32:
-; SI-IEEE-SAFE:       ; %bb.0:
-; SI-IEEE-SAFE-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s7, 0xf000
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s6, -1
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s2, s6
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s3, s7
-; SI-IEEE-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s0, s10
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s1, s11
-; SI-IEEE-SAFE-NEXT:    buffer_load_dword v0, off, s[0:3], 0
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s0, 0x8f800000
-; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v1, 0x260
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s2, 0x7f800000
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, s8
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s5, s9
-; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0)
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[0:1], s0, v0
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v2, s[0:1]
-; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v2, v0
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e32 v3, vcc, -1, v2
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v5, -v3, v2, v0
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v6, -v4, v2, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e32 vcc, 0, v5
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, 0, v6
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v1, -v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, s2
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, -v0, v1, s[0:1]
-; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
-; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
-; SI-IEEE-SAFE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; SI-IEEE-SAFE-NEXT:    s_endpgm
-;
-; CI-IEEE-SAFE-LABEL: neg_rsq_neg_f32:
-; CI-IEEE-SAFE:       ; %bb.0:
-; CI-IEEE-SAFE-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x9
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s7, 0xf000
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s6, -1
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s2, s6
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s3, s7
-; CI-IEEE-SAFE-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s0, s10
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s1, s11
-; CI-IEEE-SAFE-NEXT:    buffer_load_dword v0, off, s[0:3], 0
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s0, 0x8f800000
-; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v1, 0x260
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s4, s8
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s5, s9
-; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0)
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[0:1], s0, v0
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v2, s[0:1]
-; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v2, v0
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e32 v3, vcc, -1, v2
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v5, -v3, v2, v0
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v6, -v4, v2, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e32 vcc, 0, v5
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, 0, v6
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v1
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v1, -v0
-; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
-; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
-; CI-IEEE-SAFE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
-; CI-IEEE-SAFE-NEXT:    s_endpgm
+; GCN-DAZ-LABEL: neg_rsq_neg_f32:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-DAZ-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-DAZ-NEXT:    s_mov_b32 s6, -1
+; GCN-DAZ-NEXT:    s_mov_b32 s10, s6
+; GCN-DAZ-NEXT:    s_mov_b32 s11, s7
+; GCN-DAZ-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-DAZ-NEXT:    s_mov_b32 s8, s2
+; GCN-DAZ-NEXT:    s_mov_b32 s9, s3
+; GCN-DAZ-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; GCN-DAZ-NEXT:    s_mov_b32 s4, s0
+; GCN-DAZ-NEXT:    s_mov_b32 s5, s1
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0)
+; GCN-DAZ-NEXT:    v_rsq_f32_e64 v0, -v0
+; GCN-DAZ-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-DAZ-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-DAZ-NEXT:    s_endpgm
+;
+; GCN-IEEE-LABEL: neg_rsq_neg_f32:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-IEEE-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-IEEE-NEXT:    s_mov_b32 s6, -1
+; GCN-IEEE-NEXT:    s_mov_b32 s10, s6
+; GCN-IEEE-NEXT:    s_mov_b32 s11, s7
+; GCN-IEEE-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-IEEE-NEXT:    s_mov_b32 s8, s2
+; GCN-IEEE-NEXT:    s_mov_b32 s9, s3
+; GCN-IEEE-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; GCN-IEEE-NEXT:    s_mov_b32 s4, s0
+; GCN-IEEE-NEXT:    s_mov_b32 s5, s1
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0)
+; GCN-IEEE-NEXT:    v_rsq_f32_e64 v0, -v0
+; GCN-IEEE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-IEEE-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-IEEE-NEXT:    s_endpgm
 ; GCN-UNSAFE-LABEL: neg_rsq_neg_f32:
 ; GCN-UNSAFE:       ; %bb.0:
 ; GCN-UNSAFE-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
@@ -894,101 +329,19 @@ define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr ad
 }
 
 define float @v_neg_rsq_neg_f32(float %val) {
-; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_neg_f32:
-; GCN-DAZ-UNSAFE:       ; %bb.0:
-; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAZ-UNSAFE-NEXT:    v_rsq_f32_e64 v0, -v0
-; GCN-DAZ-UNSAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; GCN-DAZ-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_neg_f32:
-; GCN-IEEE-UNSAFE:       ; %bb.0:
-; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e64 v0, -v0
-; GCN-IEEE-UNSAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; GCN-IEEE-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_f32:
-; GCN-DAZ-SAFE:       ; %bb.0:
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s4, 0x8f800000
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, 0xcf800000, v0
-; GCN-DAZ-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v1, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v1, v0
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, v0, v1
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, 0.5, v1
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v3, -v1, v2, 0.5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v2, v3, v2
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, -v2, v2, v0
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v1, v1, v3, v1
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v1, v4, v1, v2
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
-; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v0, -v0
-; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
-;
-; SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32:
-; SI-IEEE-SAFE:       ; %bb.0:
-; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x8f800000
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, 0xcf800000, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v1, vcc
-; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v1, v0
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v2, s[4:5], -1, v1
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v3, -v2, v1, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v3
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], 1, v1
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v1, -v3, v1, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v1
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v2, v3, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
-; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x7f800000
-; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v1, -v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s4
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, -v0, v1, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
-; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
-; SI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-DAZ-LABEL: v_neg_rsq_neg_f32:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT:    v_rsq_f32_e64 v0, -v0
+; GCN-DAZ-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
-; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32:
-; CI-IEEE-SAFE:       ; %bb.0:
-; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x8f800000
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, 0xcf800000, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v1, vcc
-; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v1, v0
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v2, s[4:5], -1, v1
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v3, -v2, v1, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v3
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], 1, v1
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v1, -v3, v1, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v1
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v2, v3, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
-; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v1, -v0
-; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
-; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
-; CI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-IEEE-LABEL: v_neg_rsq_neg_f32:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT:    v_rsq_f32_e64 v0, -v0
+; GCN-IEEE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-IEEE-NEXT:    s_setpc_b64 s[30:31]
   %val.fneg = fneg float %val
   %sqrt = call contract float @llvm.sqrt.f32(float %val.fneg)
   %div = fdiv contract float -1.0, %sqrt, !fpmath !0
@@ -996,168 +349,23 @@ define float @v_neg_rsq_neg_f32(float %val) {
 }
 
 define <2 x float> @v_neg_rsq_neg_v2f32(<2 x float> %val) {
-; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_neg_v2f32:
-; GCN-DAZ-UNSAFE:       ; %bb.0:
-; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAZ-UNSAFE-NEXT:    v_rsq_f32_e64 v0, -v0
-; GCN-DAZ-UNSAFE-NEXT:    v_rsq_f32_e64 v1, -v1
-; GCN-DAZ-UNSAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; GCN-DAZ-UNSAFE-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
-; GCN-DAZ-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_neg_v2f32:
-; GCN-IEEE-UNSAFE:       ; %bb.0:
-; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e64 v0, -v0
-; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e64 v1, -v1
-; GCN-IEEE-UNSAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; GCN-IEEE-UNSAFE-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
-; GCN-IEEE-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_v2f32:
-; GCN-DAZ-SAFE:       ; %bb.0:
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s4, 0x8f800000
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s5, 0x4f800000
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e64 v2, -v1, s5
-; GCN-DAZ-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v1
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e64 v1, -v1, v2, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v2, v1
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, v1, v2
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, 0.5, v2
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, -v2, v3, 0.5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v3, v3, v4, v3
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v5, -v3, v3, v1
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v2, v4, v2
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v5, v2, v3
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e64 v3, -v0, s5
-; GCN-DAZ-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v3, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v3, v0
-; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v4, 0x260
-; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, v4
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[4:5]
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, v0, v3
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, 0.5, v3
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v5, -v3, v2, 0.5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v2, v5, v2
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v6, -v2, v2, v0
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v3, v3, v5, v3
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v6, v3, v2
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v4
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v0, -v0
-; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v1, -v1
-; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
-;
-; SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32:
-; SI-IEEE-SAFE:       ; %bb.0:
-; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s6, 0x8f800000
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s7, 0x4f800000
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e64 v2, -v1, s7
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v1
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, -v1, v2, vcc
-; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v2, v1
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], -1, v2
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v3, v2, v1
-; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v4
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v3, v2, v3, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v4, s[4:5], 1, v2
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v2, -v4, v2, v1
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v2
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e64 v4, -v0, s7
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v4, vcc
-; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v4, v0
-; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v3, 0x260
-; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, v3
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v2, s[4:5], -1, v4
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v5, -v2, v4, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v5
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v5, s[4:5], 1, v4
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v5, v4, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v4
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v4, 0x37800000, v2
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v3
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s6, 0x7f800000
-; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v2, -v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s6
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, -v0, v2, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v2, v2
-; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
-; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v2, -v1
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v1|, s6
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, -v1, v2, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v2, v2
-; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
-; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v1, vcc, 0, v1
-; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v1, v2, v1
-; SI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-DAZ-LABEL: v_neg_rsq_neg_v2f32:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT:    v_rsq_f32_e64 v0, -v0
+; GCN-DAZ-NEXT:    v_rsq_f32_e64 v1, -v1
+; GCN-DAZ-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-DAZ-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
-; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32:
-; CI-IEEE-SAFE:       ; %bb.0:
-; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s6, 0x8f800000
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s7, 0x4f800000
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e64 v2, -v1, s7
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v1
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, -v1, v2, vcc
-; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v2, v1
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], -1, v2
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v3, v2, v1
-; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v4
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v3, v2, v3, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v4, s[4:5], 1, v2
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v2, -v4, v2, v1
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v2
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e64 v4, -v0, s7
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v4, vcc
-; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v4, v0
-; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v3, 0x260
-; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, v3
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v2, s[4:5], -1, v4
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v5, -v2, v4, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v5
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v5, s[4:5], 1, v4
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v5, v4, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v4
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v4, 0x37800000, v2
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v3
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v2, -v0
-; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v2, v2
-; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
-; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v2, -v1
-; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v2, v2
-; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
-; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v1, vcc, 0, v1
-; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v1, v2, v1
-; CI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-IEEE-LABEL: v_neg_rsq_neg_v2f32:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT:    v_rsq_f32_e64 v0, -v0
+; GCN-IEEE-NEXT:    v_rsq_f32_e64 v1, -v1
+; GCN-IEEE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-IEEE-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-IEEE-NEXT:    s_setpc_b64 s[30:31]
   %val.fneg = fneg <2 x float> %val
   %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val.fneg)
   %div = fdiv contract <2 x float> <float -1.0, float -1.0>, %sqrt, !fpmath !0
@@ -1165,104 +373,19 @@ define <2 x float> @v_neg_rsq_neg_v2f32(<2 x float> %val) {
 }
 
 define float @v_neg_rsq_neg_f32_foldable_user(float %val0, float %val1) {
-; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_neg_f32_foldable_user:
-; GCN-DAZ-UNSAFE:       ; %bb.0:
-; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAZ-UNSAFE-NEXT:    v_rsq_f32_e64 v0, -v0
-; GCN-DAZ-UNSAFE-NEXT:    v_mul_f32_e64 v0, -v0, v1
-; GCN-DAZ-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_neg_f32_foldable_user:
-; GCN-IEEE-UNSAFE:       ; %bb.0:
-; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e64 v0, -v0
-; GCN-IEEE-UNSAFE-NEXT:    v_mul_f32_e64 v0, -v0, v1
-; GCN-IEEE-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_f32_foldable_user:
-; GCN-DAZ-SAFE:       ; %bb.0:
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s4, 0x8f800000
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v0
-; GCN-DAZ-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v2, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v2, v0
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, v0, v2
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, 0.5, v2
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, -v2, v3, 0.5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v3, v3, v4, v3
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v5, -v3, v3, v0
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v2, v4, v2
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v5, v2, v3
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v3, 0x260
-; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v3
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v0, -v0
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
-;
-; SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32_foldable_user:
-; SI-IEEE-SAFE:       ; %bb.0:
-; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x8f800000
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v2, vcc
-; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v2, v0
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], -1, v2
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v3, v2, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v4
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v3, v2, v3, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v4, s[4:5], 1, v2
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v2, -v4, v2, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v2
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v3, 0x260
-; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v3
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x7f800000
-; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v2, -v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s4
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, -v0, v2, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v2, v2
-; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v1
-; SI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-DAZ-LABEL: v_neg_rsq_neg_f32_foldable_user:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT:    v_rsq_f32_e64 v0, -v0
+; GCN-DAZ-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
-; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_f32_foldable_user:
-; CI-IEEE-SAFE:       ; %bb.0:
-; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x8f800000
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v2, vcc
-; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v2, v0
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], -1, v2
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v3, v2, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v4
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v3, v2, v3, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v4, s[4:5], 1, v2
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v2, -v4, v2, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v2
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v3, 0x260
-; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v3
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v2, -v0
-; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v2, v2
-; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v1
-; CI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-IEEE-LABEL: v_neg_rsq_neg_f32_foldable_user:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT:    v_rsq_f32_e64 v0, -v0
+; GCN-IEEE-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; GCN-IEEE-NEXT:    s_setpc_b64 s[30:31]
   %val0.neg = fneg float %val0
   %sqrt = call contract float @llvm.sqrt.f32(float %val0.neg)
   %div = fdiv contract float -1.0, %sqrt, !fpmath !0
@@ -1271,546 +394,86 @@ define float @v_neg_rsq_neg_f32_foldable_user(float %val0, float %val1) {
 }
 
 define <2 x float> @v_neg_rsq_neg_v2f32_foldable_user(<2 x float> %val0, <2 x float> %val1) {
-; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user:
-; GCN-DAZ-UNSAFE:       ; %bb.0:
-; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAZ-UNSAFE-NEXT:    v_rsq_f32_e64 v0, -v0
-; GCN-DAZ-UNSAFE-NEXT:    v_rsq_f32_e64 v1, -v1
-; GCN-DAZ-UNSAFE-NEXT:    v_mul_f32_e64 v0, -v0, v2
-; GCN-DAZ-UNSAFE-NEXT:    v_mul_f32_e64 v1, -v1, v3
-; GCN-DAZ-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user:
-; GCN-IEEE-UNSAFE:       ; %bb.0:
-; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e64 v0, -v0
-; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e64 v1, -v1
-; GCN-IEEE-UNSAFE-NEXT:    v_mul_f32_e64 v0, -v0, v2
-; GCN-IEEE-UNSAFE-NEXT:    v_mul_f32_e64 v1, -v1, v3
-; GCN-IEEE-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN-DAZ-SAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user:
-; GCN-DAZ-SAFE:       ; %bb.0:
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s4, 0x8f800000
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s5, 0x4f800000
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e64 v4, -v1, s5
-; GCN-DAZ-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v1
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e64 v1, -v1, v4, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v4, v1
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v5, v1, v4
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v4, 0.5, v4
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v6, -v4, v5, 0.5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v5, v5, v6, v5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v7, -v5, v5, v1
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, v4, v6, v4
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, v7, v4, v5
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v5, 0x37800000, v4
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e64 v5, -v0, s5
-; GCN-DAZ-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v5, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v5, v0
-; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v6, 0x260
-; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, v6
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[4:5]
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v4, v0, v5
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v5, 0.5, v5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v7, -v5, v4, 0.5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, v4, v7, v4
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v8, -v4, v4, v0
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v5, v5, v7, v5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, v8, v5, v4
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v5, 0x37800000, v4
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v6
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v0, -v0
-; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v1, -v1
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
-;
-; SI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user:
-; SI-IEEE-SAFE:       ; %bb.0:
-; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s6, 0x8f800000
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s7, 0x4f800000
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e64 v4, -v1, s7
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v1
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, -v1, v4, vcc
-; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v4, v1
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v5, s[4:5], -1, v4
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v6, -v5, v4, v1
-; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v6
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v5, v4, v5, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v6, s[4:5], 1, v4
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v6, v4, v1
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v4
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v4, v5, v6, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v5, 0x37800000, v4
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e64 v6, -v0, s7
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v6, vcc
-; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v6, v0
-; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v5, 0x260
-; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, v5
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v4, s[4:5], -1, v6
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v7, -v4, v6, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v7
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v7, s[4:5], 1, v6
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v6, -v7, v6, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v6
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v6, 0x37800000, v4
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v5
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s6, 0x7f800000
-; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v4, -v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s6
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v4, -v0, v4, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v4, v4
-; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v4, v0
-; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v4, -v1
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v1|, s6
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v4, -v1, v4, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v4, v4
-; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
-; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v1, vcc, 0, v1
-; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v1, v4, v1
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v2
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, v1, v3
-; SI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
-;
-; CI-IEEE-SAFE-LABEL: v_neg_rsq_neg_v2f32_foldable_user:
-; CI-IEEE-SAFE:       ; %bb.0:
-; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s6, 0x8f800000
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s7, 0x4f800000
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e64 v4, -v1, s7
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v1
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, -v1, v4, vcc
-; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v4, v1
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v5, s[4:5], -1, v4
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v6, -v5, v4, v1
-; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v6
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v5, v4, v5, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v6, s[4:5], 1, v4
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v6, v4, v1
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v4
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v4, v5, v6, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v5, 0x37800000, v4
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e64 v6, -v0, s7
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v0, -v0, v6, vcc
-; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v6, v0
-; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v5, 0x260
-; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, v5
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v4, s[4:5], -1, v6
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v7, -v4, v6, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v7
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v7, s[4:5], 1, v6
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v6, -v7, v6, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v6
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v6, 0x37800000, v4
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v5
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v4, -v0
-; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v4, v4
-; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v4, v0
-; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v4, -v1
-; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v4, v4
-; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
-; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v1, vcc, 0, v1
-; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v1, v4, v1
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v2
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, v1, v3
-; CI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
-  %val0.fneg = fneg <2 x float> %val0
-  %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val0.fneg)
-  %div = fdiv contract <2 x float> <float -1.0, float -1.0>, %sqrt, !fpmath !0
-  %user = fmul contract <2 x float> %div, %val1
-  ret <2 x float> %user
-}
-
-define float @v_neg_rsq_f32(float %val) {
-; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_f32:
-; GCN-DAZ-UNSAFE:       ; %bb.0:
-; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAZ-UNSAFE-NEXT:    v_rsq_f32_e32 v0, v0
-; GCN-DAZ-UNSAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; GCN-DAZ-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_f32:
-; GCN-IEEE-UNSAFE:       ; %bb.0:
-; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e32 v0, v0
-; GCN-IEEE-UNSAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; GCN-IEEE-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN-DAZ-SAFE-LABEL: v_neg_rsq_f32:
-; GCN-DAZ-SAFE:       ; %bb.0:
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
-; GCN-DAZ-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v1, v0
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, v0, v1
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, 0.5, v1
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v3, -v1, v2, 0.5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v2, v3, v2
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, -v2, v2, v0
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v1, v1, v3, v1
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v1, v4, v1, v2
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
-; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v0, -v0
-; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
-;
-; SI-IEEE-SAFE-LABEL: v_neg_rsq_f32:
-; SI-IEEE-SAFE:       ; %bb.0:
-; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v1, v0
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v2, s[4:5], -1, v1
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v3, -v2, v1, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v3
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], 1, v1
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v1, -v3, v1, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v1
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v2, v3, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
-; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x7f800000
-; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v1, -v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s4
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, -v0, v1, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
-; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
-; SI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
-;
-; CI-IEEE-SAFE-LABEL: v_neg_rsq_f32:
-; CI-IEEE-SAFE:       ; %bb.0:
-; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v1, v0
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v2, s[4:5], -1, v1
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v3, -v2, v1, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v3
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], 1, v1
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v1, -v3, v1, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v1
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v2, v3, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
-; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v1, -v0
-; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
-; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
-; CI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-DAZ-LABEL: v_neg_rsq_neg_v2f32_foldable_user:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT:    v_rsq_f32_e64 v0, -v0
+; GCN-DAZ-NEXT:    v_rsq_f32_e64 v1, -v1
+; GCN-DAZ-NEXT:    v_mul_f32_e64 v0, -v0, v2
+; GCN-DAZ-NEXT:    v_mul_f32_e64 v1, -v1, v3
+; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-IEEE-LABEL: v_neg_rsq_neg_v2f32_foldable_user:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT:    v_rsq_f32_e64 v0, -v0
+; GCN-IEEE-NEXT:    v_rsq_f32_e64 v1, -v1
+; GCN-IEEE-NEXT:    v_mul_f32_e64 v0, -v0, v2
+; GCN-IEEE-NEXT:    v_mul_f32_e64 v1, -v1, v3
+; GCN-IEEE-NEXT:    s_setpc_b64 s[30:31]
+  %val0.fneg = fneg <2 x float> %val0
+  %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val0.fneg)
+  %div = fdiv contract <2 x float> <float -1.0, float -1.0>, %sqrt, !fpmath !0
+  %user = fmul contract <2 x float> %div, %val1
+  ret <2 x float> %user
+}
+
+define float @v_neg_rsq_f32(float %val) {
+; GCN-DAZ-LABEL: v_neg_rsq_f32:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
+;
+; GCN-IEEE-LABEL: v_neg_rsq_f32:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-IEEE-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract float @llvm.sqrt.f32(float %val)
   %div = fdiv contract float -1.0, %sqrt, !fpmath !0
   ret float %div
 }
 
 define <2 x float> @v_neg_rsq_v2f32(<2 x float> %val) {
-; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_v2f32:
-; GCN-DAZ-UNSAFE:       ; %bb.0:
-; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAZ-UNSAFE-NEXT:    v_rsq_f32_e32 v0, v0
-; GCN-DAZ-UNSAFE-NEXT:    v_rsq_f32_e32 v1, v1
-; GCN-DAZ-UNSAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; GCN-DAZ-UNSAFE-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
-; GCN-DAZ-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_v2f32:
-; GCN-IEEE-UNSAFE:       ; %bb.0:
-; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e32 v0, v0
-; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e32 v1, v1
-; GCN-IEEE-UNSAFE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
-; GCN-IEEE-UNSAFE-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
-; GCN-IEEE-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN-DAZ-SAFE-LABEL: v_neg_rsq_v2f32:
-; GCN-DAZ-SAFE:       ; %bb.0:
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v1
-; GCN-DAZ-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v1
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v2, v1
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, v1, v2
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, 0.5, v2
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, -v2, v3, 0.5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v3, v3, v4, v3
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v5, -v3, v3, v1
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v2, v4, v2
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v5, v2, v3
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, 0x4f800000, v0
-; GCN-DAZ-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v3, v0
-; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v4, 0x260
-; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, v4
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[4:5]
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, v0, v3
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, 0.5, v3
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v5, -v3, v2, 0.5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v2, v5, v2
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v6, -v2, v2, v0
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v3, v3, v5, v3
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v6, v3, v2
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v4
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v0, -v0
-; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v1, -v1
-; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
-;
-; SI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32:
-; SI-IEEE-SAFE:       ; %bb.0:
-; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s6, 0xf800000
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v1
-; SI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v1
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v2, v1
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], -1, v2
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v3, v2, v1
-; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v4
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v3, v2, v3, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v4, s[4:5], 1, v2
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v2, -v4, v2, v1
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v2
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v0
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v4, v0
-; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v3, 0x260
-; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, v3
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v2, s[4:5], -1, v4
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v5, -v2, v4, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v5
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v5, s[4:5], 1, v4
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v5, v4, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v4
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v4, 0x37800000, v2
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v3
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s6, 0x7f800000
-; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v2, -v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s6
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, -v0, v2, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v2, v2
-; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
-; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v2, -v1
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v1|, s6
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, -v1, v2, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v2, v2
-; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
-; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v1, vcc, 0, v1
-; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v1, v2, v1
-; SI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-DAZ-LABEL: v_neg_rsq_v2f32:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    v_rsq_f32_e32 v1, v1
+; GCN-DAZ-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-DAZ-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
-; CI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32:
-; CI-IEEE-SAFE:       ; %bb.0:
-; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s6, 0xf800000
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v1
-; CI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v1
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v2, v1
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], -1, v2
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v3, v2, v1
-; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v4
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v3, v2, v3, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v4, s[4:5], 1, v2
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v2, -v4, v2, v1
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v2
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v0
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v4, v0
-; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v3, 0x260
-; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, v3
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v2, s[4:5], -1, v4
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v5, -v2, v4, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v5
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v5, s[4:5], 1, v4
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v5, v4, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v4
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v4, 0x37800000, v2
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v3
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v2, -v0
-; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v2, v2
-; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
-; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v2, -v1
-; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v2, v2
-; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
-; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v1, vcc, 0, v1
-; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v1, v2, v1
-; CI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-IEEE-LABEL: v_neg_rsq_v2f32:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT:    v_rsq_f32_e32 v1, v1
+; GCN-IEEE-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-IEEE-NEXT:    v_xor_b32_e32 v1, 0x80000000, v1
+; GCN-IEEE-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %val)
   %div = fdiv contract <2 x float> <float -1.0, float -1.0>, %sqrt, !fpmath !0
   ret <2 x float> %div
 }
 
 define float @v_neg_rsq_f32_foldable_user(float %val0, float %val1) {
-; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_f32_foldable_user:
-; GCN-DAZ-UNSAFE:       ; %bb.0:
-; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAZ-UNSAFE-NEXT:    v_rsq_f32_e32 v0, v0
-; GCN-DAZ-UNSAFE-NEXT:    v_mul_f32_e64 v0, -v0, v1
-; GCN-DAZ-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_f32_foldable_user:
-; GCN-IEEE-UNSAFE:       ; %bb.0:
-; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e32 v0, v0
-; GCN-IEEE-UNSAFE-NEXT:    v_mul_f32_e64 v0, -v0, v1
-; GCN-IEEE-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN-DAZ-SAFE-LABEL: v_neg_rsq_f32_foldable_user:
-; GCN-DAZ-SAFE:       ; %bb.0:
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
-; GCN-DAZ-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rsq_f32_e32 v2, v0
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, v0, v2
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v2, 0.5, v2
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v4, -v2, v3, 0.5
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v3, v3, v4, v3
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v5, -v3, v3, v0
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v2, v4, v2
-; GCN-DAZ-SAFE-NEXT:    v_fma_f32 v2, v5, v2, v3
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GCN-DAZ-SAFE-NEXT:    v_mov_b32_e32 v3, 0x260
-; GCN-DAZ-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v3
-; GCN-DAZ-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e64 v0, -v0
-; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v1
-; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
-;
-; SI-IEEE-SAFE-LABEL: v_neg_rsq_f32_foldable_user:
-; SI-IEEE-SAFE:       ; %bb.0:
-; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v2, v0
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], -1, v2
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v3, v2, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v4
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v3, v2, v3, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v4, s[4:5], 1, v2
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v2, -v4, v2, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v2
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v3, 0x260
-; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v3
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x7f800000
-; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v2, -v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s4
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, -v0, v2, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v2, v2
-; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v1
-; SI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-DAZ-LABEL: v_neg_rsq_f32_foldable_user:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
-; CI-IEEE-SAFE-LABEL: v_neg_rsq_f32_foldable_user:
-; CI-IEEE-SAFE:       ; %bb.0:
-; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v2, v0
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], -1, v2
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v4, -v3, v2, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v4
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v3, v2, v3, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v4, s[4:5], 1, v2
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v2, -v4, v2, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v2
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v3, 0x260
-; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v3
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e64 v2, -v0
-; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v2, v2
-; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v1
-; CI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-IEEE-LABEL: v_neg_rsq_f32_foldable_user:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT:    v_mul_f32_e64 v0, -v0, v1
+; GCN-IEEE-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract float @llvm.sqrt.f32(float %val0)
   %div = fdiv contract float -1.0, %sqrt, !fpmath !0
   %user = fmul contract float %div, %val1
@@ -1818,24 +481,23 @@ define float @v_neg_rsq_f32_foldable_user(float %val0, float %val1) {
 }
 
 define <2 x float> @v_neg_rsq_v2f32_foldable_user(<2 x float> %val0, <2 x float> %val1) {
-; GCN-DAZ-UNSAFE-LABEL: v_neg_rsq_v2f32_foldable_user:
-; GCN-DAZ-UNSAFE:       ; %bb.0:
-; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAZ-UNSAFE-NEXT:    v_rsq_f32_e32 v0, v0
-; GCN-DAZ-UNSAFE-NEXT:    v_rsq_f32_e32 v1, v1
-; GCN-DAZ-UNSAFE-NEXT:    v_mul_f32_e64 v0, -v0, v2
-; GCN-DAZ-UNSAFE-NEXT:    v_mul_f32_e64 v1, -v1, v3
-; GCN-DAZ-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_v2f32_foldable_user:
-; GCN-IEEE-UNSAFE:       ; %bb.0:
-; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e32 v0, v0
-; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e32 v1, v1
-; GCN-IEEE-UNSAFE-NEXT:    v_mul_f32_e64 v0, -v0, v2
-; GCN-IEEE-UNSAFE-NEXT:    v_mul_f32_e64 v1, -v1, v3
-; GCN-IEEE-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-DAZ-LABEL: v_neg_rsq_v2f32_foldable_user:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    v_rsq_f32_e32 v1, v1
+; GCN-DAZ-NEXT:    v_mul_f32_e64 v0, -v0, v2
+; GCN-DAZ-NEXT:    v_mul_f32_e64 v1, -v1, v3
+; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GCN-IEEE-LABEL: v_neg_rsq_v2f32_foldable_user:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT:    v_rsq_f32_e32 v1, v1
+; GCN-IEEE-NEXT:    v_mul_f32_e64 v0, -v0, v2
+; GCN-IEEE-NEXT:    v_mul_f32_e64 v1, -v1, v3
+; GCN-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ; GCN-DAZ-SAFE-LABEL: v_neg_rsq_v2f32_foldable_user:
 ; GCN-DAZ-SAFE:       ; %bb.0:
 ; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1876,7 +538,6 @@ define <2 x float> @v_neg_rsq_v2f32_foldable_user(<2 x float> %val0, <2 x float>
 ; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GCN-DAZ-SAFE-NEXT:    v_mul_f32_e32 v1, v1, v3
 ; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
-;
 ; SI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32_foldable_user:
 ; SI-IEEE-SAFE:       ; %bb.0:
 ; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1932,7 +593,6 @@ define <2 x float> @v_neg_rsq_v2f32_foldable_user(<2 x float> %val0, <2 x float>
 ; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, v1, v3
 ; SI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
-;
 ; CI-IEEE-SAFE-LABEL: v_neg_rsq_v2f32_foldable_user:
 ; CI-IEEE-SAFE:       ; %bb.0:
 ; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1996,12 +656,11 @@ define float @v_rsq_f32(float %val) {
 ; GCN-DAZ-NEXT:    v_rsq_f32_e32 v0, v0
 ; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32:
-; GCN-IEEE-UNSAFE:       ; %bb.0:
-; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e32 v0, v0
-; GCN-IEEE-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
-;
+; GCN-IEEE-LABEL: v_rsq_f32:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ; GCN-IEEE-SAFE-LABEL: v_rsq_f32:
 ; GCN-IEEE-SAFE:       ; %bb.0:
 ; GCN-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2019,29 +678,27 @@ define float @v_rsq_f32(float %val) {
 }
 
 define { float, float } @v_rsq_f32_multi_use(float %val) {
-; GCN-DAZ-UNSAFE-LABEL: v_rsq_f32_multi_use:
-; GCN-DAZ-UNSAFE:       ; %bb.0:
-; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAZ-UNSAFE-NEXT:    v_sqrt_f32_e32 v2, v0
-; GCN-DAZ-UNSAFE-NEXT:    v_rsq_f32_e32 v1, v0
-; GCN-DAZ-UNSAFE-NEXT:    v_mov_b32_e32 v0, v2
-; GCN-DAZ-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_multi_use:
-; GCN-IEEE-UNSAFE:       ; %bb.0:
-; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IEEE-UNSAFE-NEXT:    v_sqrt_f32_e32 v2, v0
-; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e32 v1, v0
-; GCN-IEEE-UNSAFE-NEXT:    v_mov_b32_e32 v0, v2
-; GCN-IEEE-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-DAZ-LABEL: v_rsq_f32_multi_use:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT:    v_sqrt_f32_e32 v2, v0
+; GCN-DAZ-NEXT:    v_rsq_f32_e32 v1, v0
+; GCN-DAZ-NEXT:    v_mov_b32_e32 v0, v2
+; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GCN-IEEE-LABEL: v_rsq_f32_multi_use:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT:    v_sqrt_f32_e32 v2, v0
+; GCN-IEEE-NEXT:    v_rsq_f32_e32 v1, v0
+; GCN-IEEE-NEXT:    v_mov_b32_e32 v0, v2
+; GCN-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ; GCN-DAZ-SAFE-LABEL: v_rsq_f32_multi_use:
 ; GCN-DAZ-SAFE:       ; %bb.0:
 ; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-SAFE-NEXT:    v_sqrt_f32_e32 v0, v0
 ; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e32 v1, v0
 ; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
-;
 ; SI-IEEE-SAFE-LABEL: v_rsq_f32_multi_use:
 ; SI-IEEE-SAFE:       ; %bb.0:
 ; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2072,7 +729,6 @@ define { float, float } @v_rsq_f32_multi_use(float %val) {
 ; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v2, vcc, 0, v2
 ; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v1, v1, v2
 ; SI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
-;
 ; CI-IEEE-SAFE-LABEL: v_rsq_f32_multi_use:
 ; CI-IEEE-SAFE:       ; %bb.0:
 ; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2102,31 +758,29 @@ define { float, float } @v_rsq_f32_multi_use(float %val) {
 ; CI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1
   %insert.0 = insertvalue { float, float } poison, float %sqrt, 0
-  %div = fdiv contract float 1.0, %sqrt, !fpmath !1
+  %div = fdiv arcp contract float 1.0, %sqrt, !fpmath !1
   %insert.1 = insertvalue { float, float } %insert.0, float %div, 1
   ret { float, float } %insert.1
 }
 
 define float @v_rsq_f32_missing_contract0(float %val) {
-; GCN-DAZ-UNSAFE-LABEL: v_rsq_f32_missing_contract0:
-; GCN-DAZ-UNSAFE:       ; %bb.0:
-; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAZ-UNSAFE-NEXT:    v_rsq_f32_e32 v0, v0
-; GCN-DAZ-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_missing_contract0:
-; GCN-IEEE-UNSAFE:       ; %bb.0:
-; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e32 v0, v0
-; GCN-IEEE-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-DAZ-LABEL: v_rsq_f32_missing_contract0:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GCN-IEEE-LABEL: v_rsq_f32_missing_contract0:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ; GCN-DAZ-SAFE-LABEL: v_rsq_f32_missing_contract0:
 ; GCN-DAZ-SAFE:       ; %bb.0:
 ; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-DAZ-SAFE-NEXT:    v_sqrt_f32_e32 v0, v0
 ; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
-;
 ; SI-IEEE-SAFE-LABEL: v_rsq_f32_missing_contract0:
 ; SI-IEEE-SAFE:       ; %bb.0:
 ; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2157,7 +811,6 @@ define float @v_rsq_f32_missing_contract0(float %val) {
 ; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
 ; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
 ; SI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
-;
 ; CI-IEEE-SAFE-LABEL: v_rsq_f32_missing_contract0:
 ; CI-IEEE-SAFE:       ; %bb.0:
 ; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2186,90 +839,24 @@ define float @v_rsq_f32_missing_contract0(float %val) {
 ; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
 ; CI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call float @llvm.sqrt.f32(float %val), !fpmath !1
-  %div = fdiv contract float 1.0, %sqrt, !fpmath !1
+  %div = fdiv arcp contract float 1.0, %sqrt, !fpmath !1
   ret float %div
 }
 
 define float @v_rsq_f32_missing_contract1(float %val) {
-; GCN-DAZ-UNSAFE-LABEL: v_rsq_f32_missing_contract1:
-; GCN-DAZ-UNSAFE:       ; %bb.0:
-; GCN-DAZ-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAZ-UNSAFE-NEXT:    v_rsq_f32_e32 v0, v0
-; GCN-DAZ-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_missing_contract1:
-; GCN-IEEE-UNSAFE:       ; %bb.0:
-; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e32 v0, v0
-; GCN-IEEE-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN-DAZ-SAFE-LABEL: v_rsq_f32_missing_contract1:
-; GCN-DAZ-SAFE:       ; %bb.0:
-; GCN-DAZ-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT:    v_sqrt_f32_e32 v0, v0
-; GCN-DAZ-SAFE-NEXT:    v_rcp_f32_e32 v0, v0
-; GCN-DAZ-SAFE-NEXT:    s_setpc_b64 s[30:31]
-;
-; SI-IEEE-SAFE-LABEL: v_rsq_f32_missing_contract1:
-; SI-IEEE-SAFE:       ; %bb.0:
-; SI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v1, v0
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v2, s[4:5], -1, v1
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v3, -v2, v1, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v3
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], 1, v1
-; SI-IEEE-SAFE-NEXT:    v_fma_f32 v1, -v3, v1, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v1
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v2, v3, s[4:5]
-; SI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; SI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
-; SI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; SI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x7f800000
-; SI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e32 v1, v0
-; SI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
-; SI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
-; SI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
-; SI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; SI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; SI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
-; SI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-DAZ-LABEL: v_rsq_f32_missing_contract1:
+; GCN-DAZ:       ; %bb.0:
+; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-DAZ-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
-; CI-IEEE-SAFE-LABEL: v_rsq_f32_missing_contract1:
-; CI-IEEE-SAFE:       ; %bb.0:
-; CI-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0xf800000
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; CI-IEEE-SAFE-NEXT:    v_sqrt_f32_e32 v1, v0
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v2, s[4:5], -1, v1
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v3, -v2, v1, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v3
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_add_i32_e64 v3, s[4:5], 1, v1
-; CI-IEEE-SAFE-NEXT:    v_fma_f32 v1, -v3, v1, v0
-; CI-IEEE-SAFE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v1
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, v2, v3, s[4:5]
-; CI-IEEE-SAFE-NEXT:    v_mul_f32_e32 v2, 0x37800000, v1
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; CI-IEEE-SAFE-NEXT:    v_mov_b32_e32 v2, 0x260
-; CI-IEEE-SAFE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v2
-; CI-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
-; CI-IEEE-SAFE-NEXT:    v_frexp_mant_f32_e32 v1, v0
-; CI-IEEE-SAFE-NEXT:    v_rcp_f32_e32 v1, v1
-; CI-IEEE-SAFE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; CI-IEEE-SAFE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
-; CI-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v1, v0
-; CI-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-IEEE-LABEL: v_rsq_f32_missing_contract1:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1
-  %div = fdiv float 1.0, %sqrt, !fpmath !1
+  %div = fdiv arcp float 1.0, %sqrt, !fpmath !1
   ret float %div
 }
 
@@ -2283,25 +870,12 @@ define float @v_rsq_f32_contractable_user(float %val0, float %val1) {
 ; GCN-DAZ-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_contractable_user:
-; GCN-IEEE-UNSAFE:       ; %bb.0:
-; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e32 v0, v0
-; GCN-IEEE-UNSAFE-NEXT:    v_add_f32_e32 v0, v0, v1
-; GCN-IEEE-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN-IEEE-SAFE-LABEL: v_rsq_f32_contractable_user:
-; GCN-IEEE-SAFE:       ; %bb.0:
-; GCN-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x800000
-; GCN-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, 0, 24, vcc
-; GCN-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
-; GCN-IEEE-SAFE-NEXT:    v_rsq_f32_e32 v0, v0
-; GCN-IEEE-SAFE-NEXT:    v_mov_b32_e32 v2, 0x45800000
-; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
-; GCN-IEEE-SAFE-NEXT:    v_fma_f32 v0, v0, v2, v1
-; GCN-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-IEEE-LABEL: v_rsq_f32_contractable_user:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT:    v_add_f32_e32 v0, v0, v1
+; GCN-IEEE-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract float @llvm.sqrt.f32(float %val0), !fpmath !1
   %div = fdiv contract float 1.0, %sqrt, !fpmath !1
   %add = fadd contract float %div, %val1
@@ -2317,25 +891,12 @@ define float @v_rsq_f32_contractable_user_missing_contract0(float %val0, float %
 ; GCN-DAZ-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_contractable_user_missing_contract0:
-; GCN-IEEE-UNSAFE:       ; %bb.0:
-; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e32 v0, v0
-; GCN-IEEE-UNSAFE-NEXT:    v_add_f32_e32 v0, v0, v1
-; GCN-IEEE-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN-IEEE-SAFE-LABEL: v_rsq_f32_contractable_user_missing_contract0:
-; GCN-IEEE-SAFE:       ; %bb.0:
-; GCN-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x800000
-; GCN-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, 0, 24, vcc
-; GCN-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
-; GCN-IEEE-SAFE-NEXT:    v_rsq_f32_e32 v0, v0
-; GCN-IEEE-SAFE-NEXT:    v_mov_b32_e32 v2, 0x45800000
-; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
-; GCN-IEEE-SAFE-NEXT:    v_fma_f32 v0, v0, v2, v1
-; GCN-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-IEEE-LABEL: v_rsq_f32_contractable_user_missing_contract0:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT:    v_add_f32_e32 v0, v0, v1
+; GCN-IEEE-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract float @llvm.sqrt.f32(float %val0), !fpmath !1
   %div = fdiv contract float 1.0, %sqrt, !fpmath !1
   %add = fadd contract float %div, %val1
@@ -2351,25 +912,12 @@ define float @v_rsq_f32_contractable_user_missing_contract1(float %val0, float %
 ; GCN-DAZ-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_contractable_user_missing_contract1:
-; GCN-IEEE-UNSAFE:       ; %bb.0:
-; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e32 v0, v0
-; GCN-IEEE-UNSAFE-NEXT:    v_add_f32_e32 v0, v0, v1
-; GCN-IEEE-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN-IEEE-SAFE-LABEL: v_rsq_f32_contractable_user_missing_contract1:
-; GCN-IEEE-SAFE:       ; %bb.0:
-; GCN-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x800000
-; GCN-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, 0, 24, vcc
-; GCN-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
-; GCN-IEEE-SAFE-NEXT:    v_rsq_f32_e32 v0, v0
-; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v2, 0, 12, vcc
-; GCN-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
-; GCN-IEEE-SAFE-NEXT:    v_add_f32_e32 v0, v0, v1
-; GCN-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-IEEE-LABEL: v_rsq_f32_contractable_user_missing_contract1:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT:    v_add_f32_e32 v0, v0, v1
+; GCN-IEEE-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract float @llvm.sqrt.f32(float %val0), !fpmath !1
   %div = fdiv contract float 1.0, %sqrt, !fpmath !1
   %add = fadd float %div, %val1
@@ -2400,23 +948,11 @@ define float @v_rsq_f32_known_never_posdenormal(float nofpclass(psub) %val) {
 ; GCN-DAZ-NEXT:    v_rsq_f32_e32 v0, v0
 ; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-IEEE-UNSAFE-LABEL: v_rsq_f32_known_never_posdenormal:
-; GCN-IEEE-UNSAFE:       ; %bb.0:
-; GCN-IEEE-UNSAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IEEE-UNSAFE-NEXT:    v_rsq_f32_e32 v0, v0
-; GCN-IEEE-UNSAFE-NEXT:    s_setpc_b64 s[30:31]
-;
-; GCN-IEEE-SAFE-LABEL: v_rsq_f32_known_never_posdenormal:
-; GCN-IEEE-SAFE:       ; %bb.0:
-; GCN-IEEE-SAFE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IEEE-SAFE-NEXT:    s_mov_b32 s4, 0x800000
-; GCN-IEEE-SAFE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, 0, 24, vcc
-; GCN-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
-; GCN-IEEE-SAFE-NEXT:    v_rsq_f32_e32 v0, v0
-; GCN-IEEE-SAFE-NEXT:    v_cndmask_b32_e64 v1, 0, 12, vcc
-; GCN-IEEE-SAFE-NEXT:    v_ldexp_f32_e32 v0, v0, v1
-; GCN-IEEE-SAFE-NEXT:    s_setpc_b64 s[30:31]
+; GCN-IEEE-LABEL: v_rsq_f32_known_never_posdenormal:
+; GCN-IEEE:       ; %bb.0:
+; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-IEEE-NEXT:    v_rsq_f32_e32 v0, v0
+; GCN-IEEE-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract float @llvm.sqrt.f32(float %val), !fpmath !1
   %div = fdiv contract float 1.0, %sqrt, !fpmath !1
   ret float %div
@@ -2427,9 +963,9 @@ define float @v_rsq_f32_known_never_posdenormal(float nofpclass(psub) %val) {
 
 attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CI-DAZ-SAFE: {{.*}}
 ; CI-DAZ-UNSAFE: {{.*}}
 ; CI-IEEE-UNSAFE: {{.*}}
-; SI-DAZ-SAFE: {{.*}}
+; GCN-DAZ-UNSAFE: {{.*}}
+; GCN-IEEE-UNSAFE: {{.*}}
 ; SI-DAZ-UNSAFE: {{.*}}
 ; SI-IEEE-UNSAFE: {{.*}}
diff --git a/llvm/test/CodeGen/NVPTX/sqrt-approx.ll b/llvm/test/CodeGen/NVPTX/sqrt-approx.ll
index a28d264cd8ec0..3989c8e32e458 100644
--- a/llvm/test/CodeGen/NVPTX/sqrt-approx.ll
+++ b/llvm/test/CodeGen/NVPTX/sqrt-approx.ll
@@ -43,7 +43,7 @@ define float @test_rsqrt_ftz(float %a) #0 #1 {
   ret float %ret
 }
 
-define double @test_rsqrt64(double %a) #0 {
+define double @test_rsqrt64(double %a) {
 ; CHECK-LABEL: test_rsqrt64(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
@@ -54,12 +54,12 @@ define double @test_rsqrt64(double %a) #0 {
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd2;
 ; CHECK-NEXT:    ret;
   %val = tail call double @llvm.sqrt.f64(double %a)
-  %ret = fdiv double 1.0, %val
+  %ret = fdiv arcp double 1.0, %val
   ret double %ret
 }
 
 ; There's no rsqrt.approx.ftz.f64 instruction; we just use the non-ftz version.
-define double @test_rsqrt64_ftz(double %a) #0 #1 {
+define double @test_rsqrt64_ftz(double %a) #1 {
 ; CHECK-LABEL: test_rsqrt64_ftz(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b64 %rd<3>;
@@ -70,7 +70,7 @@ define double @test_rsqrt64_ftz(double %a) #0 #1 {
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd2;
 ; CHECK-NEXT:    ret;
   %val = tail call double @llvm.sqrt.f64(double %a)
-  %ret = fdiv double 1.0, %val
+  %ret = fdiv arcp double 1.0, %val
   ret double %ret
 }
 
@@ -229,7 +229,7 @@ define float @test_rsqrt32_refined(float %a) #0 #2 {
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
 ; CHECK-NEXT:    ret;
   %val = tail call float @llvm.sqrt.f32(float %a)
-  %ret = fdiv float 1.0, %val
+  %ret = fdiv arcp float 1.0, %val
   ret float %ret
 }
 
@@ -284,7 +284,7 @@ define double @test_rsqrt64_refined(double %a) #0 #2 {
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd6;
 ; CHECK-NEXT:    ret;
   %val = tail call double @llvm.sqrt.f64(double %a)
-  %ret = fdiv double 1.0, %val
+  %ret = fdiv arcp double 1.0, %val
   ret double %ret
 }
 
@@ -341,7 +341,7 @@ define float @test_rsqrt32_refined_ftz(float %a) #0 #1 #2 {
 ; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
 ; CHECK-NEXT:    ret;
   %val = tail call float @llvm.sqrt.f32(float %a)
-  %ret = fdiv float 1.0, %val
+  %ret = fdiv arcp float 1.0, %val
   ret float %ret
 }
 
@@ -396,7 +396,7 @@ define double @test_rsqrt64_refined_ftz(double %a) #0 #1 #2 {
 ; CHECK-NEXT:    st.param.b64 [func_retval0], %rd6;
 ; CHECK-NEXT:    ret;
   %val = tail call double @llvm.sqrt.f64(double %a)
-  %ret = fdiv double 1.0, %val
+  %ret = fdiv arcp double 1.0, %val
   ret double %ret
 }
 
diff --git a/llvm/test/CodeGen/X86/change-unsafe-fp-math.ll b/llvm/test/CodeGen/X86/change-unsafe-fp-math.ll
index ba09ba8b6402b..2aa79fafe59a5 100644
--- a/llvm/test/CodeGen/X86/change-unsafe-fp-math.ll
+++ b/llvm/test/CodeGen/X86/change-unsafe-fp-math.ll
@@ -4,7 +4,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown \
 ; RUN:   | FileCheck %s --check-prefix=CHECK --check-prefix=SAFE
 
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -enable-unsafe-fp-math \
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown \
 ; RUN:   | FileCheck %s --check-prefix=CHECK --check-prefix=UNSAFE
 
 ; The div in these functions should be converted to a mul when unsafe-fp-math
@@ -12,14 +12,19 @@
 
 ; CHECK-LABEL: unsafe_fp_math_default0:
 define double @unsafe_fp_math_default0(double %x) {
-; SAFE:      divsd
 ; UNSAFE:    mulsd
+  %div = fdiv arcp double %x, 3.0
+  ret double %div
+}
+; CHECK-LABEL: safe_fp_math_default0:
+define double @safe_fp_math_default0(double %x) {
+; SAFE:      divsd
   %div = fdiv double %x, 3.0
   ret double %div
 }
 
 ; CHECK-LABEL: unsafe_fp_math_off:
-define double @unsafe_fp_math_off(double %x) #0 {
+define double @unsafe_fp_math_off(double %x) {
 ; SAFE:      divsd
 ; UNSAFE:    divsd
   %div = fdiv double %x, 3.0
@@ -29,28 +34,37 @@ define double @unsafe_fp_math_off(double %x) #0 {
 ; CHECK-LABEL: unsafe_fp_math_default1:
 define double @unsafe_fp_math_default1(double %x) {
 ; With unsafe math enabled, can change this div to a mul.
-; SAFE:      divsd
 ; UNSAFE:    mulsd
+  %div = fdiv arcp double %x, 3.0
+  ret double %div
+}
+; CHECK-LABEL: safe_fp_math_default1:
+define double @safe_fp_math_default1(double %x) {
+; With unsafe math enabled, can change this div to a mul.
+; SAFE:      divsd
   %div = fdiv double %x, 3.0
   ret double %div
 }
 
 ; CHECK-LABEL: unsafe_fp_math_on:
-define double @unsafe_fp_math_on(double %x) #1 {
+define double @unsafe_fp_math_on(double %x) {
 ; SAFE:      mulsd
 ; UNSAFE:    mulsd
-  %div = fdiv double %x, 3.0
+  %div = fdiv arcp double %x, 3.0
   ret double %div
 }
 
 ; CHECK-LABEL: unsafe_fp_math_default2:
 define double @unsafe_fp_math_default2(double %x) {
 ; With unsafe math enabled, can change this div to a mul.
-; SAFE:      divsd
 ; UNSAFE:    mulsd
+  %div = fdiv arcp double %x, 3.0
+  ret double %div
+}
+; CHECK-LABEL: safe_fp_math_default2:
+define double @safe_fp_math_default2(double %x) {
+; With unsafe math enabled, can change this div to a mul.
+; SAFE:      divsd
   %div = fdiv double %x, 3.0
   ret double %div
 }
-
-attributes #0 = { "unsafe-fp-math"="false" }
-attributes #1 = { "unsafe-fp-math"="true" }
diff --git a/llvm/test/CodeGen/X86/fdiv.ll b/llvm/test/CodeGen/X86/fdiv.ll
index 6d2db80a87bdf..67bad09d5dd7f 100644
--- a/llvm/test/CodeGen/X86/fdiv.ll
+++ b/llvm/test/CodeGen/X86/fdiv.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -enable-unsafe-fp-math | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
 
 define double @exact(double %x) {
 ; Exact division by a constant converted to multiplication.
@@ -17,7 +17,7 @@ define double @inexact(double %x) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT:    retq
-  %div = fdiv double %x, 0x41DFFFFFFFC00000
+  %div = fdiv arcp double %x, 0x41DFFFFFFFC00000
   ret double %div
 }
 

>From 79931a4d25e66141d721727d3e22a9601aa202eb Mon Sep 17 00:00:00 2001
From: PaperChalice <liujunchang97 at outlook.com>
Date: Thu, 26 Jun 2025 15:32:42 +0800
Subject: [PATCH 4/4] preserve fast-math flags when lowering fdiv

---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 15 +++++----
 llvm/test/CodeGen/NVPTX/frem.ll             | 37 ++++++++++++++++++---
 2 files changed, 40 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index d2fafe854e9e4..9b43c6e326bf2 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -2857,15 +2857,16 @@ static SDValue lowerFREM(SDValue Op, SelectionDAG &DAG,
   SDValue X = Op->getOperand(0);
   SDValue Y = Op->getOperand(1);
   EVT Ty = Op.getValueType();
+  SDNodeFlags Flags = Op->getFlags();
 
-  SDValue Div = DAG.getNode(ISD::FDIV, DL, Ty, X, Y);
-  SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, Ty, Div);
-  SDValue Mul =
-      DAG.getNode(ISD::FMUL, DL, Ty, Trunc, Y, SDNodeFlags::AllowContract);
-  SDValue Sub =
-      DAG.getNode(ISD::FSUB, DL, Ty, X, Mul, SDNodeFlags::AllowContract);
+  SDValue Div = DAG.getNode(ISD::FDIV, DL, Ty, X, Y, Flags);
+  SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, Ty, Div, Flags);
+  SDValue Mul = DAG.getNode(ISD::FMUL, DL, Ty, Trunc, Y,
+                            Flags | SDNodeFlags::AllowContract);
+  SDValue Sub = DAG.getNode(ISD::FSUB, DL, Ty, X, Mul,
+                            Flags | SDNodeFlags::AllowContract);
 
-  if (AllowUnsafeFPMath || Op->getFlags().hasNoInfs())
+  if (AllowUnsafeFPMath || Flags.hasNoInfs())
     return Sub;
 
   // If Y is infinite, return X
diff --git a/llvm/test/CodeGen/NVPTX/frem.ll b/llvm/test/CodeGen/NVPTX/frem.ll
index 909f2534f8219..5805aed1bebe6 100644
--- a/llvm/test/CodeGen/NVPTX/frem.ll
+++ b/llvm/test/CodeGen/NVPTX/frem.ll
@@ -222,25 +222,52 @@ define double @frem_f64_ninf(double %a, double %b) {
   ret double %r
 }
 
-define float @frem_f32_imm1(float %a) {
-; FAST-LABEL: frem_f32_imm1(
+define float @frem_f32_imm1_fast(float %a) {
+; FAST-LABEL: frem_f32_imm1_fast(
 ; FAST:       {
 ; FAST-NEXT:    .reg .b32 %r<5>;
 ; FAST-EMPTY:
 ; FAST-NEXT:  // %bb.0:
-; FAST-NEXT:    ld.param.b32 %r1, [frem_f32_imm1_param_0];
+; FAST-NEXT:    ld.param.b32 %r1, [frem_f32_imm1_fast_param_0];
 ; FAST-NEXT:    mul.f32 %r2, %r1, 0f3E124925;
 ; FAST-NEXT:    cvt.rzi.f32.f32 %r3, %r2;
 ; FAST-NEXT:    fma.rn.f32 %r4, %r3, 0fC0E00000, %r1;
 ; FAST-NEXT:    st.param.b32 [func_retval0], %r4;
 ; FAST-NEXT:    ret;
 ;
-; NORMAL-LABEL: frem_f32_imm1(
+; NORMAL-LABEL: frem_f32_imm1_fast(
 ; NORMAL:       {
 ; NORMAL-NEXT:    .reg .b32 %r<5>;
 ; NORMAL-EMPTY:
 ; NORMAL-NEXT:  // %bb.0:
-; NORMAL-NEXT:    ld.param.b32 %r1, [frem_f32_imm1_param_0];
+; NORMAL-NEXT:    ld.param.b32 %r1, [frem_f32_imm1_fast_param_0];
+; NORMAL-NEXT:    mul.rn.f32 %r2, %r1, 0f3E124925;
+; NORMAL-NEXT:    cvt.rzi.f32.f32 %r3, %r2;
+; NORMAL-NEXT:    fma.rn.f32 %r4, %r3, 0fC0E00000, %r1;
+; NORMAL-NEXT:    st.param.b32 [func_retval0], %r4;
+; NORMAL-NEXT:    ret;
+  %r = frem arcp float %a, 7.0
+  ret float %r
+}
+define float @frem_f32_imm1_normal(float %a) {
+; FAST-LABEL: frem_f32_imm1_normal(
+; FAST:       {
+; FAST-NEXT:    .reg .b32 %r<5>;
+; FAST-EMPTY:
+; FAST-NEXT:  // %bb.0:
+; FAST-NEXT:    ld.param.b32 %r1, [frem_f32_imm1_normal_param_0];
+; FAST-NEXT:    div.approx.f32 %r2, %r1, 0f40E00000;
+; FAST-NEXT:    cvt.rzi.f32.f32 %r3, %r2;
+; FAST-NEXT:    fma.rn.f32 %r4, %r3, 0fC0E00000, %r1;
+; FAST-NEXT:    st.param.b32 [func_retval0], %r4;
+; FAST-NEXT:    ret;
+;
+; NORMAL-LABEL: frem_f32_imm1_normal(
+; NORMAL:       {
+; NORMAL-NEXT:    .reg .b32 %r<5>;
+; NORMAL-EMPTY:
+; NORMAL-NEXT:  // %bb.0:
+; NORMAL-NEXT:    ld.param.b32 %r1, [frem_f32_imm1_normal_param_0];
 ; NORMAL-NEXT:    div.rn.f32 %r2, %r1, 0f40E00000;
 ; NORMAL-NEXT:    cvt.rzi.f32.f32 %r3, %r2;
 ; NORMAL-NEXT:    fma.rn.f32 %r4, %r3, 0fC0E00000, %r1;



More information about the llvm-commits mailing list