[llvm] d33ab05 - AMDGPU: Add flag to disable fdiv processing in IR pass

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Thu Jul 20 16:51:22 PDT 2023


Author: Matt Arsenault
Date: 2023-07-20T19:51:15-04:00
New Revision: d33ab0546777127c93dbb719bafeabc06aec5d4f

URL: https://github.com/llvm/llvm-project/commit/d33ab0546777127c93dbb719bafeabc06aec5d4f
DIFF: https://github.com/llvm/llvm-project/commit/d33ab0546777127c93dbb719bafeabc06aec5d4f.diff

LOG: AMDGPU: Add flag to disable fdiv processing in IR pass

fdiv effectively needs multiple implementations, split between the two
instruction selectors plus some pre-processing in the IR pass. Add a flag
to skip that pre-processing, and yet another test that checks that
fast-math flag combinations are interpreted consistently across the
implementations. There is already quite a bit of redundancy among the
overlapping fdiv tests, but there are so many interesting permutations
that covering every detail in any one test is unwieldy, and the existing
tests are hard to follow as they are.

Added: 
    llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index efa0be9f1796c0..5790b9a0ac10ed 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -88,6 +88,13 @@ static cl::opt<bool> DisableIDivExpand(
   cl::ReallyHidden,
   cl::init(false));
 
+// Testing aid: make visitFDiv a no-op so backend fdiv lowering is exercised.
+static cl::opt<bool> DisableFDivExpand(
+  "amdgpu-codegenprepare-disable-fdiv-expansion",
+  cl::desc("Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
+  cl::ReallyHidden,
+  cl::init(false));
+
 class AMDGPUCodeGenPrepareImpl
     : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
 public:
@@ -834,6 +841,9 @@ static Value *optimizeWithFDivFast(Value *Num, Value *Den, float ReqdAccuracy,
 //
 // NOTE: rcp is the preference in cases that both are legal.
 bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
+  if (DisableFDivExpand)
+    return false;
+
   Type *Ty = FDiv.getType()->getScalarType();
   if (!Ty->isFloatTy())
     return false;

diff  --git a/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
new file mode 100644
index 00000000000000..d954e12cbc5536
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
@@ -0,0 +1,881 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; Check for consistency of interpretation of fast math flags on fdiv
+; between implementations.
+
+; RUN: llc -global-isel=0 -amdgpu-codegenprepare-disable-fdiv-expansion=0 -march=amdgcn -mcpu=hawaii -denormal-fp-math=ieee < %s | FileCheck -check-prefixes=CHECK,IEEE,CODEGEN-IEEE-SDAG %s
+; RUN: llc -global-isel=1 -amdgpu-codegenprepare-disable-fdiv-expansion=0 -march=amdgcn -mcpu=hawaii -denormal-fp-math=ieee < %s | FileCheck -check-prefixes=CHECK,IEEE,CODEGEN-IEEE-GISEL %s
+; RUN: llc -global-isel=0 -amdgpu-codegenprepare-disable-fdiv-expansion=1 -march=amdgcn -mcpu=hawaii -denormal-fp-math=ieee < %s | FileCheck -check-prefixes=CHECK,IEEE,IR-IEEE,IR-IEEE-SDAG %s
+; RUN: llc -global-isel=1 -amdgpu-codegenprepare-disable-fdiv-expansion=1 -march=amdgcn -mcpu=hawaii -denormal-fp-math=ieee < %s | FileCheck -check-prefixes=CHECK,IEEE,IR-IEEE,IR-IEEE-GISEL %s
+
+; RUN: llc -global-isel=0 -amdgpu-codegenprepare-disable-fdiv-expansion=0 -march=amdgcn -mcpu=hawaii -denormal-fp-math=preserve-sign < %s | FileCheck -check-prefixes=CHECK,DAZ,CODEGEN-DAZ,CODEGEN-DAZ-SDAG %s
+; RUN: llc -global-isel=1 -amdgpu-codegenprepare-disable-fdiv-expansion=0 -march=amdgcn -mcpu=hawaii -denormal-fp-math=preserve-sign < %s | FileCheck -check-prefixes=CHECK,DAZ,CODEGEN-DAZ,CODEGEN-DAZ-GISEL %s
+; RUN: llc -global-isel=0 -amdgpu-codegenprepare-disable-fdiv-expansion=1 -march=amdgcn -mcpu=hawaii -denormal-fp-math=preserve-sign < %s | FileCheck -check-prefixes=CHECK,DAZ,IR-DAZ,IR-DAZ-SDAG %s
+; RUN: llc -global-isel=1 -amdgpu-codegenprepare-disable-fdiv-expansion=1 -march=amdgcn -mcpu=hawaii -denormal-fp-math=preserve-sign < %s | FileCheck -check-prefixes=CHECK,DAZ,IR-DAZ,IR-DAZ-GISEL %s
+
+define float @v_fdiv_f32(float %x, float %y) {
+; CODEGEN-IEEE-SDAG-LABEL: v_fdiv_f32:
+; CODEGEN-IEEE-SDAG:       ; %bb.0:
+; CODEGEN-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CODEGEN-IEEE-SDAG-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; CODEGEN-IEEE-SDAG-NEXT:    v_rcp_f32_e32 v3, v2
+; CODEGEN-IEEE-SDAG-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
+; CODEGEN-IEEE-SDAG-NEXT:    v_fma_f32 v3, v4, v3, v3
+; CODEGEN-IEEE-SDAG-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
+; CODEGEN-IEEE-SDAG-NEXT:    v_mul_f32_e32 v5, v4, v3
+; CODEGEN-IEEE-SDAG-NEXT:    v_fma_f32 v6, -v2, v5, v4
+; CODEGEN-IEEE-SDAG-NEXT:    v_fma_f32 v5, v6, v3, v5
+; CODEGEN-IEEE-SDAG-NEXT:    v_fma_f32 v2, -v2, v5, v4
+; CODEGEN-IEEE-SDAG-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; CODEGEN-IEEE-SDAG-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; CODEGEN-IEEE-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CODEGEN-IEEE-GISEL-LABEL: v_fdiv_f32:
+; CODEGEN-IEEE-GISEL:       ; %bb.0:
+; CODEGEN-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CODEGEN-IEEE-GISEL-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; CODEGEN-IEEE-GISEL-NEXT:    v_rcp_f32_e32 v3, v2
+; CODEGEN-IEEE-GISEL-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
+; CODEGEN-IEEE-GISEL-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
+; CODEGEN-IEEE-GISEL-NEXT:    v_fma_f32 v3, v5, v3, v3
+; CODEGEN-IEEE-GISEL-NEXT:    v_mul_f32_e32 v5, v4, v3
+; CODEGEN-IEEE-GISEL-NEXT:    v_fma_f32 v6, -v2, v5, v4
+; CODEGEN-IEEE-GISEL-NEXT:    v_fma_f32 v5, v6, v3, v5
+; CODEGEN-IEEE-GISEL-NEXT:    v_fma_f32 v2, -v2, v5, v4
+; CODEGEN-IEEE-GISEL-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; CODEGEN-IEEE-GISEL-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; CODEGEN-IEEE-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; IR-IEEE-SDAG-LABEL: v_fdiv_f32:
+; IR-IEEE-SDAG:       ; %bb.0:
+; IR-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; IR-IEEE-SDAG-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; IR-IEEE-SDAG-NEXT:    v_rcp_f32_e32 v3, v2
+; IR-IEEE-SDAG-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
+; IR-IEEE-SDAG-NEXT:    v_fma_f32 v3, v4, v3, v3
+; IR-IEEE-SDAG-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
+; IR-IEEE-SDAG-NEXT:    v_mul_f32_e32 v5, v4, v3
+; IR-IEEE-SDAG-NEXT:    v_fma_f32 v6, -v2, v5, v4
+; IR-IEEE-SDAG-NEXT:    v_fma_f32 v5, v6, v3, v5
+; IR-IEEE-SDAG-NEXT:    v_fma_f32 v2, -v2, v5, v4
+; IR-IEEE-SDAG-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; IR-IEEE-SDAG-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; IR-IEEE-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; IR-IEEE-GISEL-LABEL: v_fdiv_f32:
+; IR-IEEE-GISEL:       ; %bb.0:
+; IR-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; IR-IEEE-GISEL-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; IR-IEEE-GISEL-NEXT:    v_rcp_f32_e32 v3, v2
+; IR-IEEE-GISEL-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
+; IR-IEEE-GISEL-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
+; IR-IEEE-GISEL-NEXT:    v_fma_f32 v3, v5, v3, v3
+; IR-IEEE-GISEL-NEXT:    v_mul_f32_e32 v5, v4, v3
+; IR-IEEE-GISEL-NEXT:    v_fma_f32 v6, -v2, v5, v4
+; IR-IEEE-GISEL-NEXT:    v_fma_f32 v5, v6, v3, v5
+; IR-IEEE-GISEL-NEXT:    v_fma_f32 v2, -v2, v5, v4
+; IR-IEEE-GISEL-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; IR-IEEE-GISEL-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; IR-IEEE-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; DAZ-LABEL: v_fdiv_f32:
+; DAZ:       ; %bb.0:
+; DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAZ-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; DAZ-NEXT:    v_rcp_f32_e32 v3, v2
+; DAZ-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
+; DAZ-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; DAZ-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
+; DAZ-NEXT:    v_fma_f32 v3, v5, v3, v3
+; DAZ-NEXT:    v_mul_f32_e32 v5, v4, v3
+; DAZ-NEXT:    v_fma_f32 v6, -v2, v5, v4
+; DAZ-NEXT:    v_fma_f32 v5, v6, v3, v5
+; DAZ-NEXT:    v_fma_f32 v2, -v2, v5, v4
+; DAZ-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; DAZ-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; DAZ-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; DAZ-NEXT:    s_setpc_b64 s[30:31]
+  %fdiv = fdiv float %x, %y
+  ret float %fdiv
+}
+
+define float @v_fdiv_f32_afn(float %x, float %y) {
+; CHECK-LABEL: v_fdiv_f32_afn:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_rcp_f32_e32 v1, v1
+; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %fdiv = fdiv afn float %x, %y
+  ret float %fdiv
+}
+
+define float @v_fdiv_f32_arcp(float %x, float %y) {
+; CODEGEN-IEEE-SDAG-LABEL: v_fdiv_f32_arcp:
+; CODEGEN-IEEE-SDAG:       ; %bb.0:
+; CODEGEN-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CODEGEN-IEEE-SDAG-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; CODEGEN-IEEE-SDAG-NEXT:    v_rcp_f32_e32 v3, v2
+; CODEGEN-IEEE-SDAG-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
+; CODEGEN-IEEE-SDAG-NEXT:    v_fma_f32 v3, v4, v3, v3
+; CODEGEN-IEEE-SDAG-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
+; CODEGEN-IEEE-SDAG-NEXT:    v_mul_f32_e32 v5, v4, v3
+; CODEGEN-IEEE-SDAG-NEXT:    v_fma_f32 v6, -v2, v5, v4
+; CODEGEN-IEEE-SDAG-NEXT:    v_fma_f32 v5, v6, v3, v5
+; CODEGEN-IEEE-SDAG-NEXT:    v_fma_f32 v2, -v2, v5, v4
+; CODEGEN-IEEE-SDAG-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; CODEGEN-IEEE-SDAG-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; CODEGEN-IEEE-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CODEGEN-IEEE-GISEL-LABEL: v_fdiv_f32_arcp:
+; CODEGEN-IEEE-GISEL:       ; %bb.0:
+; CODEGEN-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CODEGEN-IEEE-GISEL-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; CODEGEN-IEEE-GISEL-NEXT:    v_rcp_f32_e32 v3, v2
+; CODEGEN-IEEE-GISEL-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
+; CODEGEN-IEEE-GISEL-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
+; CODEGEN-IEEE-GISEL-NEXT:    v_fma_f32 v3, v5, v3, v3
+; CODEGEN-IEEE-GISEL-NEXT:    v_mul_f32_e32 v5, v4, v3
+; CODEGEN-IEEE-GISEL-NEXT:    v_fma_f32 v6, -v2, v5, v4
+; CODEGEN-IEEE-GISEL-NEXT:    v_fma_f32 v5, v6, v3, v5
+; CODEGEN-IEEE-GISEL-NEXT:    v_fma_f32 v2, -v2, v5, v4
+; CODEGEN-IEEE-GISEL-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; CODEGEN-IEEE-GISEL-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; CODEGEN-IEEE-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; IR-IEEE-SDAG-LABEL: v_fdiv_f32_arcp:
+; IR-IEEE-SDAG:       ; %bb.0:
+; IR-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; IR-IEEE-SDAG-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; IR-IEEE-SDAG-NEXT:    v_rcp_f32_e32 v3, v2
+; IR-IEEE-SDAG-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
+; IR-IEEE-SDAG-NEXT:    v_fma_f32 v3, v4, v3, v3
+; IR-IEEE-SDAG-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
+; IR-IEEE-SDAG-NEXT:    v_mul_f32_e32 v5, v4, v3
+; IR-IEEE-SDAG-NEXT:    v_fma_f32 v6, -v2, v5, v4
+; IR-IEEE-SDAG-NEXT:    v_fma_f32 v5, v6, v3, v5
+; IR-IEEE-SDAG-NEXT:    v_fma_f32 v2, -v2, v5, v4
+; IR-IEEE-SDAG-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; IR-IEEE-SDAG-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; IR-IEEE-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; IR-IEEE-GISEL-LABEL: v_fdiv_f32_arcp:
+; IR-IEEE-GISEL:       ; %bb.0:
+; IR-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; IR-IEEE-GISEL-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; IR-IEEE-GISEL-NEXT:    v_rcp_f32_e32 v3, v2
+; IR-IEEE-GISEL-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
+; IR-IEEE-GISEL-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
+; IR-IEEE-GISEL-NEXT:    v_fma_f32 v3, v5, v3, v3
+; IR-IEEE-GISEL-NEXT:    v_mul_f32_e32 v5, v4, v3
+; IR-IEEE-GISEL-NEXT:    v_fma_f32 v6, -v2, v5, v4
+; IR-IEEE-GISEL-NEXT:    v_fma_f32 v5, v6, v3, v5
+; IR-IEEE-GISEL-NEXT:    v_fma_f32 v2, -v2, v5, v4
+; IR-IEEE-GISEL-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; IR-IEEE-GISEL-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; IR-IEEE-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; DAZ-LABEL: v_fdiv_f32_arcp:
+; DAZ:       ; %bb.0:
+; DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAZ-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; DAZ-NEXT:    v_rcp_f32_e32 v3, v2
+; DAZ-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
+; DAZ-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; DAZ-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
+; DAZ-NEXT:    v_fma_f32 v3, v5, v3, v3
+; DAZ-NEXT:    v_mul_f32_e32 v5, v4, v3
+; DAZ-NEXT:    v_fma_f32 v6, -v2, v5, v4
+; DAZ-NEXT:    v_fma_f32 v5, v6, v3, v5
+; DAZ-NEXT:    v_fma_f32 v2, -v2, v5, v4
+; DAZ-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; DAZ-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; DAZ-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; DAZ-NEXT:    s_setpc_b64 s[30:31]
+  %fdiv = fdiv arcp float %x, %y
+  ret float %fdiv
+}
+
+define float @v_fdiv_f32_arcp_afn(float %x, float %y) {
+; CHECK-LABEL: v_fdiv_f32_arcp_afn:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_rcp_f32_e32 v1, v1
+; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %fdiv = fdiv arcp afn float %x, %y
+  ret float %fdiv
+}
+
+define float @v_fdiv_recip_f32(float %x) {
+; IEEE-LABEL: v_fdiv_recip_f32:
+; IEEE:       ; %bb.0:
+; IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; IEEE-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; IEEE-NEXT:    v_rcp_f32_e32 v2, v1
+; IEEE-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; IEEE-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
+; IEEE-NEXT:    v_fma_f32 v2, v4, v2, v2
+; IEEE-NEXT:    v_mul_f32_e32 v4, v3, v2
+; IEEE-NEXT:    v_fma_f32 v5, -v1, v4, v3
+; IEEE-NEXT:    v_fma_f32 v4, v5, v2, v4
+; IEEE-NEXT:    v_fma_f32 v1, -v1, v4, v3
+; IEEE-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
+; IEEE-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; DAZ-LABEL: v_fdiv_recip_f32:
+; DAZ:       ; %bb.0:
+; DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAZ-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; DAZ-NEXT:    v_rcp_f32_e32 v2, v1
+; DAZ-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; DAZ-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; DAZ-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
+; DAZ-NEXT:    v_fma_f32 v2, v4, v2, v2
+; DAZ-NEXT:    v_mul_f32_e32 v4, v3, v2
+; DAZ-NEXT:    v_fma_f32 v5, -v1, v4, v3
+; DAZ-NEXT:    v_fma_f32 v4, v5, v2, v4
+; DAZ-NEXT:    v_fma_f32 v1, -v1, v4, v3
+; DAZ-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; DAZ-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
+; DAZ-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; DAZ-NEXT:    s_setpc_b64 s[30:31]
+  %fdiv = fdiv float 1.0, %x
+  ret float %fdiv
+}
+
+define float @v_fdiv_recip_f32_afn(float %x) {
+; CHECK-LABEL: v_fdiv_recip_f32_afn:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_rcp_f32_e32 v0, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %fdiv = fdiv afn float 1.0, %x
+  ret float %fdiv
+}
+
+define float @v_fdiv_recip_f32_arcp(float %x) {
+; IEEE-LABEL: v_fdiv_recip_f32_arcp:
+; IEEE:       ; %bb.0:
+; IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; IEEE-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; IEEE-NEXT:    v_rcp_f32_e32 v2, v1
+; IEEE-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; IEEE-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
+; IEEE-NEXT:    v_fma_f32 v2, v4, v2, v2
+; IEEE-NEXT:    v_mul_f32_e32 v4, v3, v2
+; IEEE-NEXT:    v_fma_f32 v5, -v1, v4, v3
+; IEEE-NEXT:    v_fma_f32 v4, v5, v2, v4
+; IEEE-NEXT:    v_fma_f32 v1, -v1, v4, v3
+; IEEE-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
+; IEEE-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; DAZ-LABEL: v_fdiv_recip_f32_arcp:
+; DAZ:       ; %bb.0:
+; DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAZ-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; DAZ-NEXT:    v_rcp_f32_e32 v2, v1
+; DAZ-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; DAZ-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; DAZ-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
+; DAZ-NEXT:    v_fma_f32 v2, v4, v2, v2
+; DAZ-NEXT:    v_mul_f32_e32 v4, v3, v2
+; DAZ-NEXT:    v_fma_f32 v5, -v1, v4, v3
+; DAZ-NEXT:    v_fma_f32 v4, v5, v2, v4
+; DAZ-NEXT:    v_fma_f32 v1, -v1, v4, v3
+; DAZ-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; DAZ-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
+; DAZ-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; DAZ-NEXT:    s_setpc_b64 s[30:31]
+  %fdiv = fdiv arcp float 1.0, %x
+  ret float %fdiv
+}
+
+define float @v_fdiv_recip_f32_arcp_afn(float %x) {
+; CHECK-LABEL: v_fdiv_recip_f32_arcp_afn:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_rcp_f32_e32 v0, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %fdiv = fdiv arcp afn float 1.0, %x
+  ret float %fdiv
+}
+
+define float @v_fdiv_recip_sqrt_f32(float %x) {
+; IEEE-LABEL: v_fdiv_recip_sqrt_f32:
+; IEEE:       ; %bb.0:
+; IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
+; IEEE-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; IEEE-NEXT:    v_rcp_f32_e32 v2, v1
+; IEEE-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; IEEE-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
+; IEEE-NEXT:    v_fma_f32 v2, v4, v2, v2
+; IEEE-NEXT:    v_mul_f32_e32 v4, v3, v2
+; IEEE-NEXT:    v_fma_f32 v5, -v1, v4, v3
+; IEEE-NEXT:    v_fma_f32 v4, v5, v2, v4
+; IEEE-NEXT:    v_fma_f32 v1, -v1, v4, v3
+; IEEE-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
+; IEEE-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; DAZ-LABEL: v_fdiv_recip_sqrt_f32:
+; DAZ:       ; %bb.0:
+; DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
+; DAZ-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; DAZ-NEXT:    v_rcp_f32_e32 v2, v1
+; DAZ-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; DAZ-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; DAZ-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
+; DAZ-NEXT:    v_fma_f32 v2, v4, v2, v2
+; DAZ-NEXT:    v_mul_f32_e32 v4, v3, v2
+; DAZ-NEXT:    v_fma_f32 v5, -v1, v4, v3
+; DAZ-NEXT:    v_fma_f32 v4, v5, v2, v4
+; DAZ-NEXT:    v_fma_f32 v1, -v1, v4, v3
+; DAZ-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; DAZ-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
+; DAZ-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; DAZ-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call float @llvm.sqrt.f32(float %x)
+  %fdiv = fdiv float 1.0, %sqrt
+  ret float %fdiv
+}
+
+define float @v_fdiv_recip_sqrt_f32_afn(float %x) {
+; CHECK-LABEL: v_fdiv_recip_sqrt_f32_afn:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_rsq_f32_e32 v0, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call afn float @llvm.sqrt.f32(float %x)
+  %fdiv = fdiv afn float 1.0, %sqrt
+  ret float %fdiv
+}
+
+define float @v_fdiv_recip_sqrt_f32_arcp(float %x) {
+; CODEGEN-IEEE-SDAG-LABEL: v_fdiv_recip_sqrt_f32_arcp:
+; CODEGEN-IEEE-SDAG:       ; %bb.0:
+; CODEGEN-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CODEGEN-IEEE-SDAG-NEXT:    v_rsq_f32_e32 v0, v0
+; CODEGEN-IEEE-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CODEGEN-IEEE-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp:
+; CODEGEN-IEEE-GISEL:       ; %bb.0:
+; CODEGEN-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CODEGEN-IEEE-GISEL-NEXT:    v_sqrt_f32_e32 v0, v0
+; CODEGEN-IEEE-GISEL-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; CODEGEN-IEEE-GISEL-NEXT:    v_rcp_f32_e32 v2, v1
+; CODEGEN-IEEE-GISEL-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; CODEGEN-IEEE-GISEL-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
+; CODEGEN-IEEE-GISEL-NEXT:    v_fma_f32 v2, v4, v2, v2
+; CODEGEN-IEEE-GISEL-NEXT:    v_mul_f32_e32 v4, v3, v2
+; CODEGEN-IEEE-GISEL-NEXT:    v_fma_f32 v5, -v1, v4, v3
+; CODEGEN-IEEE-GISEL-NEXT:    v_fma_f32 v4, v5, v2, v4
+; CODEGEN-IEEE-GISEL-NEXT:    v_fma_f32 v1, -v1, v4, v3
+; CODEGEN-IEEE-GISEL-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
+; CODEGEN-IEEE-GISEL-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; CODEGEN-IEEE-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; IR-IEEE-SDAG-LABEL: v_fdiv_recip_sqrt_f32_arcp:
+; IR-IEEE-SDAG:       ; %bb.0:
+; IR-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; IR-IEEE-SDAG-NEXT:    v_rsq_f32_e32 v0, v0
+; IR-IEEE-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; IR-IEEE-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp:
+; IR-IEEE-GISEL:       ; %bb.0:
+; IR-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; IR-IEEE-GISEL-NEXT:    v_sqrt_f32_e32 v0, v0
+; IR-IEEE-GISEL-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; IR-IEEE-GISEL-NEXT:    v_rcp_f32_e32 v2, v1
+; IR-IEEE-GISEL-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; IR-IEEE-GISEL-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
+; IR-IEEE-GISEL-NEXT:    v_fma_f32 v2, v4, v2, v2
+; IR-IEEE-GISEL-NEXT:    v_mul_f32_e32 v4, v3, v2
+; IR-IEEE-GISEL-NEXT:    v_fma_f32 v5, -v1, v4, v3
+; IR-IEEE-GISEL-NEXT:    v_fma_f32 v4, v5, v2, v4
+; IR-IEEE-GISEL-NEXT:    v_fma_f32 v1, -v1, v4, v3
+; IR-IEEE-GISEL-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
+; IR-IEEE-GISEL-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; IR-IEEE-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; CODEGEN-DAZ-SDAG-LABEL: v_fdiv_recip_sqrt_f32_arcp:
+; CODEGEN-DAZ-SDAG:       ; %bb.0:
+; CODEGEN-DAZ-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CODEGEN-DAZ-SDAG-NEXT:    v_rsq_f32_e32 v0, v0
+; CODEGEN-DAZ-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CODEGEN-DAZ-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp:
+; CODEGEN-DAZ-GISEL:       ; %bb.0:
+; CODEGEN-DAZ-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CODEGEN-DAZ-GISEL-NEXT:    v_sqrt_f32_e32 v0, v0
+; CODEGEN-DAZ-GISEL-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; CODEGEN-DAZ-GISEL-NEXT:    v_rcp_f32_e32 v2, v1
+; CODEGEN-DAZ-GISEL-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; CODEGEN-DAZ-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; CODEGEN-DAZ-GISEL-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
+; CODEGEN-DAZ-GISEL-NEXT:    v_fma_f32 v2, v4, v2, v2
+; CODEGEN-DAZ-GISEL-NEXT:    v_mul_f32_e32 v4, v3, v2
+; CODEGEN-DAZ-GISEL-NEXT:    v_fma_f32 v5, -v1, v4, v3
+; CODEGEN-DAZ-GISEL-NEXT:    v_fma_f32 v4, v5, v2, v4
+; CODEGEN-DAZ-GISEL-NEXT:    v_fma_f32 v1, -v1, v4, v3
+; CODEGEN-DAZ-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; CODEGEN-DAZ-GISEL-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
+; CODEGEN-DAZ-GISEL-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; CODEGEN-DAZ-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; IR-DAZ-SDAG-LABEL: v_fdiv_recip_sqrt_f32_arcp:
+; IR-DAZ-SDAG:       ; %bb.0:
+; IR-DAZ-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; IR-DAZ-SDAG-NEXT:    v_rsq_f32_e32 v0, v0
+; IR-DAZ-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; IR-DAZ-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp:
+; IR-DAZ-GISEL:       ; %bb.0:
+; IR-DAZ-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; IR-DAZ-GISEL-NEXT:    v_sqrt_f32_e32 v0, v0
+; IR-DAZ-GISEL-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; IR-DAZ-GISEL-NEXT:    v_rcp_f32_e32 v2, v1
+; IR-DAZ-GISEL-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; IR-DAZ-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; IR-DAZ-GISEL-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
+; IR-DAZ-GISEL-NEXT:    v_fma_f32 v2, v4, v2, v2
+; IR-DAZ-GISEL-NEXT:    v_mul_f32_e32 v4, v3, v2
+; IR-DAZ-GISEL-NEXT:    v_fma_f32 v5, -v1, v4, v3
+; IR-DAZ-GISEL-NEXT:    v_fma_f32 v4, v5, v2, v4
+; IR-DAZ-GISEL-NEXT:    v_fma_f32 v1, -v1, v4, v3
+; IR-DAZ-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; IR-DAZ-GISEL-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
+; IR-DAZ-GISEL-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; IR-DAZ-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call arcp float @llvm.sqrt.f32(float %x)
+  %fdiv = fdiv arcp float 1.0, %sqrt
+  ret float %fdiv
+}
+
+define float @v_fdiv_recip_sqrt_f32_arcp_afn(float %x) {
+; CHECK-LABEL: v_fdiv_recip_sqrt_f32_arcp_afn:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_rsq_f32_e32 v0, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call arcp afn float @llvm.sqrt.f32(float %x)
+  %fdiv = fdiv arcp afn float 1.0, %sqrt
+  ret float %fdiv
+}
+
+define float @v_fdiv_recip_sqrt_f32_arcp_fdiv_only(float %x) {
+; CODEGEN-IEEE-SDAG-LABEL: v_fdiv_recip_sqrt_f32_arcp_fdiv_only:
+; CODEGEN-IEEE-SDAG:       ; %bb.0:
+; CODEGEN-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CODEGEN-IEEE-SDAG-NEXT:    v_rsq_f32_e32 v0, v0
+; CODEGEN-IEEE-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CODEGEN-IEEE-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp_fdiv_only:
+; CODEGEN-IEEE-GISEL:       ; %bb.0:
+; CODEGEN-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CODEGEN-IEEE-GISEL-NEXT:    v_sqrt_f32_e32 v0, v0
+; CODEGEN-IEEE-GISEL-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; CODEGEN-IEEE-GISEL-NEXT:    v_rcp_f32_e32 v2, v1
+; CODEGEN-IEEE-GISEL-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; CODEGEN-IEEE-GISEL-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
+; CODEGEN-IEEE-GISEL-NEXT:    v_fma_f32 v2, v4, v2, v2
+; CODEGEN-IEEE-GISEL-NEXT:    v_mul_f32_e32 v4, v3, v2
+; CODEGEN-IEEE-GISEL-NEXT:    v_fma_f32 v5, -v1, v4, v3
+; CODEGEN-IEEE-GISEL-NEXT:    v_fma_f32 v4, v5, v2, v4
+; CODEGEN-IEEE-GISEL-NEXT:    v_fma_f32 v1, -v1, v4, v3
+; CODEGEN-IEEE-GISEL-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
+; CODEGEN-IEEE-GISEL-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; CODEGEN-IEEE-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; IR-IEEE-SDAG-LABEL: v_fdiv_recip_sqrt_f32_arcp_fdiv_only:
+; IR-IEEE-SDAG:       ; %bb.0:
+; IR-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; IR-IEEE-SDAG-NEXT:    v_rsq_f32_e32 v0, v0
+; IR-IEEE-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; IR-IEEE-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp_fdiv_only:
+; IR-IEEE-GISEL:       ; %bb.0:
+; IR-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; IR-IEEE-GISEL-NEXT:    v_sqrt_f32_e32 v0, v0
+; IR-IEEE-GISEL-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; IR-IEEE-GISEL-NEXT:    v_rcp_f32_e32 v2, v1
+; IR-IEEE-GISEL-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; IR-IEEE-GISEL-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
+; IR-IEEE-GISEL-NEXT:    v_fma_f32 v2, v4, v2, v2
+; IR-IEEE-GISEL-NEXT:    v_mul_f32_e32 v4, v3, v2
+; IR-IEEE-GISEL-NEXT:    v_fma_f32 v5, -v1, v4, v3
+; IR-IEEE-GISEL-NEXT:    v_fma_f32 v4, v5, v2, v4
+; IR-IEEE-GISEL-NEXT:    v_fma_f32 v1, -v1, v4, v3
+; IR-IEEE-GISEL-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
+; IR-IEEE-GISEL-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; IR-IEEE-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; CODEGEN-DAZ-SDAG-LABEL: v_fdiv_recip_sqrt_f32_arcp_fdiv_only:
+; CODEGEN-DAZ-SDAG:       ; %bb.0:
+; CODEGEN-DAZ-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CODEGEN-DAZ-SDAG-NEXT:    v_rsq_f32_e32 v0, v0
+; CODEGEN-DAZ-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CODEGEN-DAZ-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp_fdiv_only:
+; CODEGEN-DAZ-GISEL:       ; %bb.0:
+; CODEGEN-DAZ-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CODEGEN-DAZ-GISEL-NEXT:    v_sqrt_f32_e32 v0, v0
+; CODEGEN-DAZ-GISEL-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; CODEGEN-DAZ-GISEL-NEXT:    v_rcp_f32_e32 v2, v1
+; CODEGEN-DAZ-GISEL-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; CODEGEN-DAZ-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; CODEGEN-DAZ-GISEL-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
+; CODEGEN-DAZ-GISEL-NEXT:    v_fma_f32 v2, v4, v2, v2
+; CODEGEN-DAZ-GISEL-NEXT:    v_mul_f32_e32 v4, v3, v2
+; CODEGEN-DAZ-GISEL-NEXT:    v_fma_f32 v5, -v1, v4, v3
+; CODEGEN-DAZ-GISEL-NEXT:    v_fma_f32 v4, v5, v2, v4
+; CODEGEN-DAZ-GISEL-NEXT:    v_fma_f32 v1, -v1, v4, v3
+; CODEGEN-DAZ-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; CODEGEN-DAZ-GISEL-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
+; CODEGEN-DAZ-GISEL-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; CODEGEN-DAZ-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; IR-DAZ-SDAG-LABEL: v_fdiv_recip_sqrt_f32_arcp_fdiv_only:
+; IR-DAZ-SDAG:       ; %bb.0:
+; IR-DAZ-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; IR-DAZ-SDAG-NEXT:    v_rsq_f32_e32 v0, v0
+; IR-DAZ-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; IR-DAZ-GISEL-LABEL: v_fdiv_recip_sqrt_f32_arcp_fdiv_only:
+; IR-DAZ-GISEL:       ; %bb.0:
+; IR-DAZ-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; IR-DAZ-GISEL-NEXT:    v_sqrt_f32_e32 v0, v0
+; IR-DAZ-GISEL-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; IR-DAZ-GISEL-NEXT:    v_rcp_f32_e32 v2, v1
+; IR-DAZ-GISEL-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; IR-DAZ-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; IR-DAZ-GISEL-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
+; IR-DAZ-GISEL-NEXT:    v_fma_f32 v2, v4, v2, v2
+; IR-DAZ-GISEL-NEXT:    v_mul_f32_e32 v4, v3, v2
+; IR-DAZ-GISEL-NEXT:    v_fma_f32 v5, -v1, v4, v3
+; IR-DAZ-GISEL-NEXT:    v_fma_f32 v4, v5, v2, v4
+; IR-DAZ-GISEL-NEXT:    v_fma_f32 v1, -v1, v4, v3
+; IR-DAZ-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; IR-DAZ-GISEL-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
+; IR-DAZ-GISEL-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; IR-DAZ-GISEL-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call float @llvm.sqrt.f32(float %x)
+  %fdiv = fdiv arcp float 1.0, %sqrt
+  ret float %fdiv
+}
+
+define float @v_fdiv_recip_sqrt_f32_afn_fdiv_only(float %x) {
+; CHECK-LABEL: v_fdiv_recip_sqrt_f32_afn_fdiv_only:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_rsq_f32_e32 v0, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call float @llvm.sqrt.f32(float %x)
+  %fdiv = fdiv afn float 1.0, %sqrt
+  ret float %fdiv
+}
+
+define float @v_fdiv_recip_sqrt_f32_arcp_afn_fdiv_only(float %x) {
+; CHECK-LABEL: v_fdiv_recip_sqrt_f32_arcp_afn_fdiv_only:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_rsq_f32_e32 v0, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call float @llvm.sqrt.f32(float %x)
+  %fdiv = fdiv arcp afn float 1.0, %sqrt
+  ret float %fdiv
+}
+
+define float @v_fdiv_f32_ulp25(float %x, float %y) {
+; CODEGEN-IEEE-SDAG-LABEL: v_fdiv_f32_ulp25:
+; CODEGEN-IEEE-SDAG:       ; %bb.0:
+; CODEGEN-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CODEGEN-IEEE-SDAG-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; CODEGEN-IEEE-SDAG-NEXT:    v_rcp_f32_e32 v3, v2
+; CODEGEN-IEEE-SDAG-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
+; CODEGEN-IEEE-SDAG-NEXT:    v_fma_f32 v3, v4, v3, v3
+; CODEGEN-IEEE-SDAG-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
+; CODEGEN-IEEE-SDAG-NEXT:    v_mul_f32_e32 v5, v4, v3
+; CODEGEN-IEEE-SDAG-NEXT:    v_fma_f32 v6, -v2, v5, v4
+; CODEGEN-IEEE-SDAG-NEXT:    v_fma_f32 v5, v6, v3, v5
+; CODEGEN-IEEE-SDAG-NEXT:    v_fma_f32 v2, -v2, v5, v4
+; CODEGEN-IEEE-SDAG-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; CODEGEN-IEEE-SDAG-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; CODEGEN-IEEE-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CODEGEN-IEEE-GISEL-LABEL: v_fdiv_f32_ulp25:
+; CODEGEN-IEEE-GISEL:       ; %bb.0:
+; CODEGEN-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CODEGEN-IEEE-GISEL-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; CODEGEN-IEEE-GISEL-NEXT:    v_rcp_f32_e32 v3, v2
+; CODEGEN-IEEE-GISEL-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
+; CODEGEN-IEEE-GISEL-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
+; CODEGEN-IEEE-GISEL-NEXT:    v_fma_f32 v3, v5, v3, v3
+; CODEGEN-IEEE-GISEL-NEXT:    v_mul_f32_e32 v5, v4, v3
+; CODEGEN-IEEE-GISEL-NEXT:    v_fma_f32 v6, -v2, v5, v4
+; CODEGEN-IEEE-GISEL-NEXT:    v_fma_f32 v5, v6, v3, v5
+; CODEGEN-IEEE-GISEL-NEXT:    v_fma_f32 v2, -v2, v5, v4
+; CODEGEN-IEEE-GISEL-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; CODEGEN-IEEE-GISEL-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; CODEGEN-IEEE-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; IR-IEEE-SDAG-LABEL: v_fdiv_f32_ulp25:
+; IR-IEEE-SDAG:       ; %bb.0:
+; IR-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; IR-IEEE-SDAG-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; IR-IEEE-SDAG-NEXT:    v_rcp_f32_e32 v3, v2
+; IR-IEEE-SDAG-NEXT:    v_fma_f32 v4, -v2, v3, 1.0
+; IR-IEEE-SDAG-NEXT:    v_fma_f32 v3, v4, v3, v3
+; IR-IEEE-SDAG-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
+; IR-IEEE-SDAG-NEXT:    v_mul_f32_e32 v5, v4, v3
+; IR-IEEE-SDAG-NEXT:    v_fma_f32 v6, -v2, v5, v4
+; IR-IEEE-SDAG-NEXT:    v_fma_f32 v5, v6, v3, v5
+; IR-IEEE-SDAG-NEXT:    v_fma_f32 v2, -v2, v5, v4
+; IR-IEEE-SDAG-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; IR-IEEE-SDAG-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; IR-IEEE-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; IR-IEEE-GISEL-LABEL: v_fdiv_f32_ulp25:
+; IR-IEEE-GISEL:       ; %bb.0:
+; IR-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; IR-IEEE-GISEL-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; IR-IEEE-GISEL-NEXT:    v_rcp_f32_e32 v3, v2
+; IR-IEEE-GISEL-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
+; IR-IEEE-GISEL-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
+; IR-IEEE-GISEL-NEXT:    v_fma_f32 v3, v5, v3, v3
+; IR-IEEE-GISEL-NEXT:    v_mul_f32_e32 v5, v4, v3
+; IR-IEEE-GISEL-NEXT:    v_fma_f32 v6, -v2, v5, v4
+; IR-IEEE-GISEL-NEXT:    v_fma_f32 v5, v6, v3, v5
+; IR-IEEE-GISEL-NEXT:    v_fma_f32 v2, -v2, v5, v4
+; IR-IEEE-GISEL-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; IR-IEEE-GISEL-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; IR-IEEE-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; CODEGEN-DAZ-SDAG-LABEL: v_fdiv_f32_ulp25:
+; CODEGEN-DAZ-SDAG:       ; %bb.0:
+; CODEGEN-DAZ-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CODEGEN-DAZ-SDAG-NEXT:    s_mov_b32 s4, 0x6f800000
+; CODEGEN-DAZ-SDAG-NEXT:    v_mov_b32_e32 v2, 0x2f800000
+; CODEGEN-DAZ-SDAG-NEXT:    v_cmp_gt_f32_e64 vcc, |v1|, s4
+; CODEGEN-DAZ-SDAG-NEXT:    v_cndmask_b32_e32 v2, 1.0, v2, vcc
+; CODEGEN-DAZ-SDAG-NEXT:    v_mul_f32_e32 v1, v1, v2
+; CODEGEN-DAZ-SDAG-NEXT:    v_rcp_f32_e32 v1, v1
+; CODEGEN-DAZ-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CODEGEN-DAZ-SDAG-NEXT:    v_mul_f32_e32 v0, v2, v0
+; CODEGEN-DAZ-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CODEGEN-DAZ-GISEL-LABEL: v_fdiv_f32_ulp25:
+; CODEGEN-DAZ-GISEL:       ; %bb.0:
+; CODEGEN-DAZ-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CODEGEN-DAZ-GISEL-NEXT:    v_mov_b32_e32 v2, 0x6f800000
+; CODEGEN-DAZ-GISEL-NEXT:    v_mov_b32_e32 v3, 0x2f800000
+; CODEGEN-DAZ-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc, |v1|, v2
+; CODEGEN-DAZ-GISEL-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
+; CODEGEN-DAZ-GISEL-NEXT:    v_mul_f32_e32 v1, v1, v2
+; CODEGEN-DAZ-GISEL-NEXT:    v_rcp_f32_e32 v1, v1
+; CODEGEN-DAZ-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CODEGEN-DAZ-GISEL-NEXT:    v_mul_f32_e32 v0, v2, v0
+; CODEGEN-DAZ-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; IR-DAZ-LABEL: v_fdiv_f32_ulp25:
+; IR-DAZ:       ; %bb.0:
+; IR-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; IR-DAZ-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; IR-DAZ-NEXT:    v_rcp_f32_e32 v3, v2
+; IR-DAZ-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
+; IR-DAZ-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; IR-DAZ-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
+; IR-DAZ-NEXT:    v_fma_f32 v3, v5, v3, v3
+; IR-DAZ-NEXT:    v_mul_f32_e32 v5, v4, v3
+; IR-DAZ-NEXT:    v_fma_f32 v6, -v2, v5, v4
+; IR-DAZ-NEXT:    v_fma_f32 v5, v6, v3, v5
+; IR-DAZ-NEXT:    v_fma_f32 v2, -v2, v5, v4
+; IR-DAZ-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; IR-DAZ-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; IR-DAZ-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
+; IR-DAZ-NEXT:    s_setpc_b64 s[30:31]
+  %fdiv = fdiv float %x, %y, !fpmath !0 ; general x / y, no fast-math flags; only the !fpmath node (!0 = float 2.5) relaxes accuracy
+  ret float %fdiv
+}
+
+define float @v_fdiv_f32_afn_ulp25(float %x, float %y) {
+; CHECK-LABEL: v_fdiv_f32_afn_ulp25:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_rcp_f32_e32 v1, v1
+; CHECK-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %fdiv = fdiv afn float %x, %y, !fpmath !0 ; afn plus !fpmath (!0 = float 2.5); expected lowering is v_rcp_f32 followed by v_mul_f32
+  ret float %fdiv
+}
+
+define float @v_recip_f32_ulp25(float %x) {
+; CODEGEN-IEEE-SDAG-LABEL: v_recip_f32_ulp25:
+; CODEGEN-IEEE-SDAG:       ; %bb.0:
+; CODEGEN-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CODEGEN-IEEE-SDAG-NEXT:    s_mov_b32 s4, 0x6f800000
+; CODEGEN-IEEE-SDAG-NEXT:    v_mov_b32_e32 v1, 0x2f800000
+; CODEGEN-IEEE-SDAG-NEXT:    v_cmp_gt_f32_e64 vcc, |v0|, s4
+; CODEGEN-IEEE-SDAG-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; CODEGEN-IEEE-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CODEGEN-IEEE-SDAG-NEXT:    v_rcp_f32_e32 v0, v0
+; CODEGEN-IEEE-SDAG-NEXT:    v_mul_f32_e32 v0, v1, v0
+; CODEGEN-IEEE-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CODEGEN-IEEE-GISEL-LABEL: v_recip_f32_ulp25:
+; CODEGEN-IEEE-GISEL:       ; %bb.0:
+; CODEGEN-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CODEGEN-IEEE-GISEL-NEXT:    v_mov_b32_e32 v1, 0x6f800000
+; CODEGEN-IEEE-GISEL-NEXT:    v_mov_b32_e32 v2, 0x2f800000
+; CODEGEN-IEEE-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc, |v0|, v1
+; CODEGEN-IEEE-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
+; CODEGEN-IEEE-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CODEGEN-IEEE-GISEL-NEXT:    v_rcp_f32_e32 v0, v0
+; CODEGEN-IEEE-GISEL-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; CODEGEN-IEEE-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; CODEGEN-IEEE-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; IR-IEEE-LABEL: v_recip_f32_ulp25:
+; IR-IEEE:       ; %bb.0:
+; IR-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; IR-IEEE-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; IR-IEEE-NEXT:    v_rcp_f32_e32 v2, v1
+; IR-IEEE-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; IR-IEEE-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
+; IR-IEEE-NEXT:    v_fma_f32 v2, v4, v2, v2
+; IR-IEEE-NEXT:    v_mul_f32_e32 v4, v3, v2
+; IR-IEEE-NEXT:    v_fma_f32 v5, -v1, v4, v3
+; IR-IEEE-NEXT:    v_fma_f32 v4, v5, v2, v4
+; IR-IEEE-NEXT:    v_fma_f32 v1, -v1, v4, v3
+; IR-IEEE-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
+; IR-IEEE-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; IR-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; CODEGEN-DAZ-LABEL: v_recip_f32_ulp25:
+; CODEGEN-DAZ:       ; %bb.0:
+; CODEGEN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CODEGEN-DAZ-NEXT:    v_rcp_f32_e32 v0, v0
+; CODEGEN-DAZ-NEXT:    s_setpc_b64 s[30:31]
+;
+; IR-DAZ-LABEL: v_recip_f32_ulp25:
+; IR-DAZ:       ; %bb.0:
+; IR-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; IR-DAZ-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; IR-DAZ-NEXT:    v_rcp_f32_e32 v2, v1
+; IR-DAZ-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; IR-DAZ-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; IR-DAZ-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
+; IR-DAZ-NEXT:    v_fma_f32 v2, v4, v2, v2
+; IR-DAZ-NEXT:    v_mul_f32_e32 v4, v3, v2
+; IR-DAZ-NEXT:    v_fma_f32 v5, -v1, v4, v3
+; IR-DAZ-NEXT:    v_fma_f32 v4, v5, v2, v4
+; IR-DAZ-NEXT:    v_fma_f32 v1, -v1, v4, v3
+; IR-DAZ-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; IR-DAZ-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
+; IR-DAZ-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; IR-DAZ-NEXT:    s_setpc_b64 s[30:31]
+  %fdiv = fdiv float 1.0, %x, !fpmath !0 ; reciprocal form (constant 1.0 numerator), no fast-math flags, !fpmath (!0 = float 2.5)
+  ret float %fdiv
+}
+
+define float @v_recip_f32_afn_ulp25(float %x) {
+; CHECK-LABEL: v_recip_f32_afn_ulp25:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_rcp_f32_e32 v0, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %fdiv = fdiv afn float 1.0, %x, !fpmath !0 ; afn reciprocal with !fpmath (!0 = float 2.5); expected lowering is a single v_rcp_f32
+  ret float %fdiv
+}
+
+define float @v_recip_sqrt_f32_ulp25(float %x) {
+; CODEGEN-IEEE-SDAG-LABEL: v_recip_sqrt_f32_ulp25:
+; CODEGEN-IEEE-SDAG:       ; %bb.0:
+; CODEGEN-IEEE-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CODEGEN-IEEE-SDAG-NEXT:    v_sqrt_f32_e32 v0, v0
+; CODEGEN-IEEE-SDAG-NEXT:    s_mov_b32 s4, 0x6f800000
+; CODEGEN-IEEE-SDAG-NEXT:    v_mov_b32_e32 v1, 0x2f800000
+; CODEGEN-IEEE-SDAG-NEXT:    v_cmp_gt_f32_e64 vcc, |v0|, s4
+; CODEGEN-IEEE-SDAG-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; CODEGEN-IEEE-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CODEGEN-IEEE-SDAG-NEXT:    v_rcp_f32_e32 v0, v0
+; CODEGEN-IEEE-SDAG-NEXT:    v_mul_f32_e32 v0, v1, v0
+; CODEGEN-IEEE-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; CODEGEN-IEEE-GISEL-LABEL: v_recip_sqrt_f32_ulp25:
+; CODEGEN-IEEE-GISEL:       ; %bb.0:
+; CODEGEN-IEEE-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CODEGEN-IEEE-GISEL-NEXT:    v_sqrt_f32_e32 v0, v0
+; CODEGEN-IEEE-GISEL-NEXT:    v_mov_b32_e32 v1, 0x6f800000
+; CODEGEN-IEEE-GISEL-NEXT:    v_mov_b32_e32 v2, 0x2f800000
+; CODEGEN-IEEE-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc, |v0|, v1
+; CODEGEN-IEEE-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
+; CODEGEN-IEEE-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; CODEGEN-IEEE-GISEL-NEXT:    v_rcp_f32_e32 v0, v0
+; CODEGEN-IEEE-GISEL-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; CODEGEN-IEEE-GISEL-NEXT:    v_mul_f32_e32 v0, v1, v0
+; CODEGEN-IEEE-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; IR-IEEE-LABEL: v_recip_sqrt_f32_ulp25:
+; IR-IEEE:       ; %bb.0:
+; IR-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; IR-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
+; IR-IEEE-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; IR-IEEE-NEXT:    v_rcp_f32_e32 v2, v1
+; IR-IEEE-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; IR-IEEE-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
+; IR-IEEE-NEXT:    v_fma_f32 v2, v4, v2, v2
+; IR-IEEE-NEXT:    v_mul_f32_e32 v4, v3, v2
+; IR-IEEE-NEXT:    v_fma_f32 v5, -v1, v4, v3
+; IR-IEEE-NEXT:    v_fma_f32 v4, v5, v2, v4
+; IR-IEEE-NEXT:    v_fma_f32 v1, -v1, v4, v3
+; IR-IEEE-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
+; IR-IEEE-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; IR-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; CODEGEN-DAZ-LABEL: v_recip_sqrt_f32_ulp25:
+; CODEGEN-DAZ:       ; %bb.0:
+; CODEGEN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CODEGEN-DAZ-NEXT:    v_rsq_f32_e32 v0, v0
+; CODEGEN-DAZ-NEXT:    s_setpc_b64 s[30:31]
+;
+; IR-DAZ-LABEL: v_recip_sqrt_f32_ulp25:
+; IR-DAZ:       ; %bb.0:
+; IR-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; IR-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
+; IR-DAZ-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; IR-DAZ-NEXT:    v_rcp_f32_e32 v2, v1
+; IR-DAZ-NEXT:    v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; IR-DAZ-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; IR-DAZ-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
+; IR-DAZ-NEXT:    v_fma_f32 v2, v4, v2, v2
+; IR-DAZ-NEXT:    v_mul_f32_e32 v4, v3, v2
+; IR-DAZ-NEXT:    v_fma_f32 v5, -v1, v4, v3
+; IR-DAZ-NEXT:    v_fma_f32 v4, v5, v2, v4
+; IR-DAZ-NEXT:    v_fma_f32 v1, -v1, v4, v3
+; IR-DAZ-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; IR-DAZ-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
+; IR-DAZ-NEXT:    v_div_fixup_f32 v0, v1, v0, 1.0
+; IR-DAZ-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call float @llvm.sqrt.f32(float %x), !fpmath !0 ; the sqrt call carries the same !fpmath node as the fdiv below
+  %fdiv = fdiv float 1.0, %sqrt, !fpmath !0 ; 1.0 / sqrt(x) with no fast-math flags on either operation
+  ret float %fdiv
+}
+
+define float @v_recip_sqrt_f32_afn_ulp25(float %x) {
+; CHECK-LABEL: v_recip_sqrt_f32_afn_ulp25:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_rsq_f32_e32 v0, v0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %sqrt = call afn float @llvm.sqrt.f32(float %x), !fpmath !0 ; afn is set on the sqrt call as well as on the fdiv
+  %fdiv = fdiv afn float 1.0, %sqrt, !fpmath !0 ; expected to combine with the sqrt into a single v_rsq_f32
+  ret float %fdiv
+}
+
+declare float @llvm.sqrt.f32(float) ; sqrt intrinsic called by the recip_sqrt tests
+
+!0 = !{float 2.500000e+00} ; node referenced by the !fpmath attachments in the *_ulp25 tests


        


More information about the llvm-commits mailing list