[llvm-branch-commits] [llvm] AMDGPU: Stop requiring afn for f32 rsq formation (PR #172082)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri Dec 12 12:55:36 PST 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Matt Arsenault (arsenm)
<details>
<summary>Changes</summary>
We were checking for afn or !fpmath attached to the sqrt. We
are not trying to replace a correctly rounded rsqrt; we're replacing
the two correctly rounded operations with the contracted operation.
The net result is better precision, so contract on both instructions should
be sufficient. Both the contracted and uncontracted sequences pass
the OpenCL conformance test, and the contracted sequence has a lower maximum error.
---
Patch is 123.29 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/172082.diff
3 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp (+6-25)
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll (+52-45)
- (modified) llvm/test/CodeGen/AMDGPU/rsq.f32-safe.ll (+507-1532)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index e45d0652a65ef..01acb60a68629 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -210,8 +210,7 @@ class AMDGPUCodeGenPrepareImpl
Value *matchFractPat(IntrinsicInst &I);
Value *applyFractPat(IRBuilder<> &Builder, Value *FractArg);
- bool canOptimizeWithRsq(const FPMathOperator *SqrtOp, FastMathFlags DivFMF,
- FastMathFlags SqrtFMF) const;
+ bool canOptimizeWithRsq(FastMathFlags DivFMF, FastMathFlags SqrtFMF) const;
Value *optimizeWithRsq(IRBuilder<> &Builder, Value *Num, Value *Den,
FastMathFlags DivFMF, FastMathFlags SqrtFMF,
@@ -696,29 +695,11 @@ Value *AMDGPUCodeGenPrepareImpl::emitRsqF64(IRBuilder<> &Builder, Value *X,
return Builder.CreateFMA(Y0E, EFMA, Y0);
}
-bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp,
- FastMathFlags DivFMF,
+bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(FastMathFlags DivFMF,
FastMathFlags SqrtFMF) const {
- // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
- if (!DivFMF.allowContract() || !SqrtFMF.allowContract())
- return false;
-
- Type *EltTy = SqrtOp->getType()->getScalarType();
- switch (EltTy->getTypeID()) {
- case Type::FloatTyID:
- // v_rsq_f32 gives 1ulp
- // Separate correctly rounded fdiv + sqrt give ~1.81 ulp.
-
- // FIXME: rsq formation should not depend on approx func or the fpmath
- // accuracy. This strictly improves precision.
- return SqrtFMF.approxFunc() || SqrtOp->getFPAccuracy() >= 1.0f;
- case Type::DoubleTyID:
- return true;
- default:
- return false;
- }
-
- llvm_unreachable("covered switch");
+ // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp for f32 and
+ // f64.
+ return DivFMF.allowContract() && SqrtFMF.allowContract();
}
Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
@@ -927,7 +908,7 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
DenII->hasOneUse()) {
const auto *SqrtOp = cast<FPMathOperator>(DenII);
SqrtFMF = SqrtOp->getFastMathFlags();
- if (canOptimizeWithRsq(SqrtOp, DivFMF, SqrtFMF))
+ if (canOptimizeWithRsq(DivFMF, SqrtFMF))
RsqOp = SqrtOp->getOperand(0);
}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
index 7ff86ac152feb..cc0d279fe4ec8 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
@@ -1563,13 +1563,12 @@ define amdgpu_kernel void @rsq_f32_fpmath(ptr addrspace(1) %out, float %x) {
; IEEE-GOODFREXP-NEXT: [[TMP25:%.*]] = select contract i1 [[TMP21]], float -4.096000e+03, float -1.000000e+00
; IEEE-GOODFREXP-NEXT: [[NEG_FDIV_OPENCL:%.*]] = fmul contract float [[TMP24]], [[TMP25]]
; IEEE-GOODFREXP-NEXT: store volatile float [[NEG_FDIV_OPENCL]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-GOODFREXP-NEXT: [[SQRT_X_HALF_ULP:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1]]
-; IEEE-GOODFREXP-NEXT: [[TMP26:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[SQRT_X_HALF_ULP]])
-; IEEE-GOODFREXP-NEXT: [[TMP27:%.*]] = extractvalue { float, i32 } [[TMP26]], 0
-; IEEE-GOODFREXP-NEXT: [[TMP28:%.*]] = extractvalue { float, i32 } [[TMP26]], 1
-; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = sub i32 0, [[TMP28]]
-; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP27]])
-; IEEE-GOODFREXP-NEXT: [[FDIV_SQRT_MISMATCH_MD0:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP30]], i32 [[TMP29]])
+; IEEE-GOODFREXP-NEXT: [[TMP26:%.*]] = fcmp contract olt float [[X]], 0x3810000000000000
+; IEEE-GOODFREXP-NEXT: [[TMP27:%.*]] = select contract i1 [[TMP26]], float 0x4170000000000000, float 1.000000e+00
+; IEEE-GOODFREXP-NEXT: [[TMP28:%.*]] = fmul contract float [[X]], [[TMP27]]
+; IEEE-GOODFREXP-NEXT: [[TMP29:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP28]])
+; IEEE-GOODFREXP-NEXT: [[TMP30:%.*]] = select contract i1 [[TMP26]], float 4.096000e+03, float 1.000000e+00
+; IEEE-GOODFREXP-NEXT: [[FDIV_SQRT_MISMATCH_MD0:%.*]] = fmul contract float [[TMP29]], [[TMP30]]
; IEEE-GOODFREXP-NEXT: store volatile float [[FDIV_SQRT_MISMATCH_MD0]], ptr addrspace(1) [[OUT]], align 4
; IEEE-GOODFREXP-NEXT: [[SQRT_MISMATCH_MD1:%.*]] = call afn float @llvm.sqrt.f32(float [[X]])
; IEEE-GOODFREXP-NEXT: [[TMP31:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[SQRT_MISMATCH_MD1]])
@@ -1644,13 +1643,12 @@ define amdgpu_kernel void @rsq_f32_fpmath(ptr addrspace(1) %out, float %x) {
; IEEE-BADFREXP-NEXT: [[TMP25:%.*]] = select contract i1 [[TMP21]], float -4.096000e+03, float -1.000000e+00
; IEEE-BADFREXP-NEXT: [[NEG_FDIV_OPENCL:%.*]] = fmul contract float [[TMP24]], [[TMP25]]
; IEEE-BADFREXP-NEXT: store volatile float [[NEG_FDIV_OPENCL]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-BADFREXP-NEXT: [[SQRT_X_HALF_ULP:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1]]
-; IEEE-BADFREXP-NEXT: [[TMP26:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[SQRT_X_HALF_ULP]])
-; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = extractvalue { float, i32 } [[TMP26]], 0
-; IEEE-BADFREXP-NEXT: [[TMP28:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[SQRT_X_HALF_ULP]])
-; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = sub i32 0, [[TMP28]]
-; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP27]])
-; IEEE-BADFREXP-NEXT: [[FDIV_SQRT_MISMATCH_MD0:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP30]], i32 [[TMP29]])
+; IEEE-BADFREXP-NEXT: [[TMP26:%.*]] = fcmp contract olt float [[X]], 0x3810000000000000
+; IEEE-BADFREXP-NEXT: [[TMP27:%.*]] = select contract i1 [[TMP26]], float 0x4170000000000000, float 1.000000e+00
+; IEEE-BADFREXP-NEXT: [[TMP28:%.*]] = fmul contract float [[X]], [[TMP27]]
+; IEEE-BADFREXP-NEXT: [[TMP29:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP28]])
+; IEEE-BADFREXP-NEXT: [[TMP30:%.*]] = select contract i1 [[TMP26]], float 4.096000e+03, float 1.000000e+00
+; IEEE-BADFREXP-NEXT: [[FDIV_SQRT_MISMATCH_MD0:%.*]] = fmul contract float [[TMP29]], [[TMP30]]
; IEEE-BADFREXP-NEXT: store volatile float [[FDIV_SQRT_MISMATCH_MD0]], ptr addrspace(1) [[OUT]], align 4
; IEEE-BADFREXP-NEXT: [[SQRT_MISMATCH_MD1:%.*]] = call afn float @llvm.sqrt.f32(float [[X]])
; IEEE-BADFREXP-NEXT: [[TMP31:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[SQRT_MISMATCH_MD1]])
@@ -1701,8 +1699,7 @@ define amdgpu_kernel void @rsq_f32_fpmath(ptr addrspace(1) %out, float %x) {
; DAZ-NEXT: [[TMP1:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[X]])
; DAZ-NEXT: [[NEG_FDIV_OPENCL:%.*]] = fneg contract float [[TMP1]]
; DAZ-NEXT: store volatile float [[NEG_FDIV_OPENCL]], ptr addrspace(1) [[OUT]], align 4
-; DAZ-NEXT: [[SQRT_X_HALF_ULP:%.*]] = call contract float @llvm.sqrt.f32(float [[X]]), !fpmath [[META1]]
-; DAZ-NEXT: [[FDIV_SQRT_MISMATCH_MD0:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[SQRT_X_HALF_ULP]])
+; DAZ-NEXT: [[FDIV_SQRT_MISMATCH_MD0:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[X]])
; DAZ-NEXT: store volatile float [[FDIV_SQRT_MISMATCH_MD0]], ptr addrspace(1) [[OUT]], align 4
; DAZ-NEXT: [[SQRT_MISMATCH_MD1:%.*]] = call afn float @llvm.sqrt.f32(float [[X]])
; DAZ-NEXT: [[FDIV_SQRT_MISMATCH_MD1:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[SQRT_MISMATCH_MD1]])
@@ -3490,19 +3487,22 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_correct_sqrt(<4 x fl
; IEEE-GOODFREXP-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[DENOM]], i64 1
; IEEE-GOODFREXP-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[DENOM]], i64 2
; IEEE-GOODFREXP-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 3
-; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP1]])
-; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = extractvalue { float, i32 } [[TMP5]], 0
-; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = extractvalue { float, i32 } [[TMP5]], 1
-; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = sub i32 0, [[TMP7]]
-; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP6]])
-; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP9]], i32 [[TMP8]])
-; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = fneg contract float [[TMP2]]
-; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP11]])
-; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0
-; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = extractvalue { float, i32 } [[TMP12]], 1
-; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = sub i32 0, [[TMP14]]
-; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP13]])
-; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP16]], i32 [[TMP15]])
+; IEEE-GOODFREXP-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[ARG]], i64 0
+; IEEE-GOODFREXP-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[ARG]], i64 1
+; IEEE-GOODFREXP-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[ARG]], i64 2
+; IEEE-GOODFREXP-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[ARG]], i64 3
+; IEEE-GOODFREXP-NEXT: [[TMP9:%.*]] = fcmp contract olt float [[TMP5]], 0x3810000000000000
+; IEEE-GOODFREXP-NEXT: [[TMP14:%.*]] = select contract i1 [[TMP9]], float 0x4170000000000000, float 1.000000e+00
+; IEEE-GOODFREXP-NEXT: [[TMP11:%.*]] = fmul contract float [[TMP5]], [[TMP14]]
+; IEEE-GOODFREXP-NEXT: [[TMP12:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP11]])
+; IEEE-GOODFREXP-NEXT: [[TMP13:%.*]] = select contract i1 [[TMP9]], float 4.096000e+03, float 1.000000e+00
+; IEEE-GOODFREXP-NEXT: [[TMP10:%.*]] = fmul contract float [[TMP12]], [[TMP13]]
+; IEEE-GOODFREXP-NEXT: [[TMP15:%.*]] = fcmp contract olt float [[TMP6]], 0x3810000000000000
+; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = select contract i1 [[TMP15]], float 0x4170000000000000, float 1.000000e+00
+; IEEE-GOODFREXP-NEXT: [[TMP41:%.*]] = fmul contract float [[TMP6]], [[TMP16]]
+; IEEE-GOODFREXP-NEXT: [[TMP42:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP41]])
+; IEEE-GOODFREXP-NEXT: [[TMP43:%.*]] = select contract i1 [[TMP15]], float -4.096000e+03, float -1.000000e+00
+; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = fmul contract float [[TMP42]], [[TMP43]]
; IEEE-GOODFREXP-NEXT: [[TMP18:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]])
; IEEE-GOODFREXP-NEXT: [[TMP19:%.*]] = extractvalue { float, i32 } [[TMP18]], 0
; IEEE-GOODFREXP-NEXT: [[TMP20:%.*]] = extractvalue { float, i32 } [[TMP18]], 1
@@ -3536,19 +3536,22 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_correct_sqrt(<4 x fl
; IEEE-BADFREXP-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[DENOM]], i64 1
; IEEE-BADFREXP-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[DENOM]], i64 2
; IEEE-BADFREXP-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 3
-; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP1]])
-; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = extractvalue { float, i32 } [[TMP5]], 0
-; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP1]])
-; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = sub i32 0, [[TMP7]]
-; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP6]])
-; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP9]], i32 [[TMP8]])
-; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = fneg contract float [[TMP2]]
-; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP11]])
-; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = extractvalue { float, i32 } [[TMP12]], 0
-; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP11]])
-; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = sub i32 0, [[TMP14]]
-; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP13]])
-; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = call contract float @llvm.ldexp.f32.i32(float [[TMP16]], i32 [[TMP15]])
+; IEEE-BADFREXP-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[ARG]], i64 0
+; IEEE-BADFREXP-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[ARG]], i64 1
+; IEEE-BADFREXP-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[ARG]], i64 2
+; IEEE-BADFREXP-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[ARG]], i64 3
+; IEEE-BADFREXP-NEXT: [[TMP9:%.*]] = fcmp contract olt float [[TMP5]], 0x3810000000000000
+; IEEE-BADFREXP-NEXT: [[TMP14:%.*]] = select contract i1 [[TMP9]], float 0x4170000000000000, float 1.000000e+00
+; IEEE-BADFREXP-NEXT: [[TMP11:%.*]] = fmul contract float [[TMP5]], [[TMP14]]
+; IEEE-BADFREXP-NEXT: [[TMP12:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP11]])
+; IEEE-BADFREXP-NEXT: [[TMP13:%.*]] = select contract i1 [[TMP9]], float 4.096000e+03, float 1.000000e+00
+; IEEE-BADFREXP-NEXT: [[TMP10:%.*]] = fmul contract float [[TMP12]], [[TMP13]]
+; IEEE-BADFREXP-NEXT: [[TMP15:%.*]] = fcmp contract olt float [[TMP6]], 0x3810000000000000
+; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = select contract i1 [[TMP15]], float 0x4170000000000000, float 1.000000e+00
+; IEEE-BADFREXP-NEXT: [[TMP41:%.*]] = fmul contract float [[TMP6]], [[TMP16]]
+; IEEE-BADFREXP-NEXT: [[TMP42:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP41]])
+; IEEE-BADFREXP-NEXT: [[TMP43:%.*]] = select contract i1 [[TMP15]], float -4.096000e+03, float -1.000000e+00
+; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = fmul contract float [[TMP42]], [[TMP43]]
; IEEE-BADFREXP-NEXT: [[TMP18:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]])
; IEEE-BADFREXP-NEXT: [[TMP19:%.*]] = extractvalue { float, i32 } [[TMP18]], 0
; IEEE-BADFREXP-NEXT: [[TMP20:%.*]] = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float [[TMP3]])
@@ -3582,9 +3585,13 @@ define <4 x float> @rsq_f32_vector_mixed_constant_numerator_correct_sqrt(<4 x fl
; DAZ-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[DENOM]], i64 1
; DAZ-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[DENOM]], i64 2
; DAZ-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[DENOM]], i64 3
-; DAZ-NEXT: [[TMP5:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP1]])
-; DAZ-NEXT: [[TMP6:%.*]] = fneg contract float [[TMP2]]
-; DAZ-NEXT: [[TMP7:%.*]] = call contract float @llvm.amdgcn.rcp.f32(float [[TMP6]])
+; DAZ-NEXT: [[TMP31:%.*]] = extractelement <4 x float> [[ARG]], i64 0
+; DAZ-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[ARG]], i64 1
+; DAZ-NEXT: [[TMP32:%.*]] = extractelement <4 x float> [[ARG]], i64 2
+; DAZ-NEXT: [[TMP33:%.*]] = extractelement <4 x float> [[ARG]], i64 3
+; DAZ-NEXT: [[TMP5:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP31]])
+; DAZ-NEXT: [[TMP34:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[TMP6]])
+; DAZ-NEXT: [[TMP7:%.*]] = fneg contract float [[TMP34]]
; DAZ-NEXT: [[TMP8:%.*]] = call { float, i32 } @llvm.frexp.f32.i32(float [[TMP3]])
; DAZ-NEXT: [[TMP9:%.*]] = extractvalue { float, i32 } [[TMP8]], 0
; DAZ-NEXT: [[TMP10:%.*]] = extractvalue { float, i32 } [[TMP8]], 1
diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f32-safe.ll b/llvm/test/CodeGen/AMDGPU/rsq.f32-safe.ll
index 7f822c135ffb4..d9fdfb38ef344 100644
--- a/llvm/test/CodeGen/AMDGPU/rsq.f32-safe.ll
+++ b/llvm/test/CodeGen/AMDGPU/rsq.f32-safe.ll
@@ -12,123 +12,48 @@ declare float @llvm.sqrt.f32(float) nounwind readnone
declare <2 x float> @llvm.sqrt.v2f32(<2 x float>) nounwind readnone
define amdgpu_kernel void @rsq_f32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) {
-; GCN-DAZ-SAFE-LABEL: rsq_f32:
-; GCN-DAZ-SAFE: ; %bb.0:
-; GCN-DAZ-SAFE-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s7, 0xf000
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s6, -1
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s10, s6
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s11, s7
-; GCN-DAZ-SAFE-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s8, s2
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s9, s3
-; GCN-DAZ-SAFE-NEXT: buffer_load_dword v0, off, s[8:11], 0
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s2, 0xf800000
-; GCN-DAZ-SAFE-NEXT: v_mov_b32_e32 v2, 0x260
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s4, s0
-; GCN-DAZ-SAFE-NEXT: s_mov_b32 s5, s1
-; GCN-DAZ-SAFE-NEXT: s_waitcnt vmcnt(0)
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
-; GCN-DAZ-SAFE-NEXT: v_cmp_gt_f32_e32 vcc, s2, v0
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-DAZ-SAFE-NEXT: v_rsq_f32_e32 v1, v0
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, v0, v1
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v1, 0.5, v1
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v1, v3, 0.5
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v3, v3, v4, v3
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v1, v4, v1
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v4, -v3, v3, v0
-; GCN-DAZ-SAFE-NEXT: v_fma_f32 v1, v4, v1, v3
-; GCN-DAZ-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v1
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
-; GCN-DAZ-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
-; GCN-DAZ-SAFE-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GCN-DAZ-SAFE-NEXT: v_rcp_f32_e32 v0, v0
-; GCN-DAZ-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; GCN-DAZ-SAFE-NEXT: s_endpgm
-;
-; SI-IEEE-SAFE-LABEL: rsq_f32:
-; SI-IEEE-SAFE: ; %bb.0:
-; SI-IEEE-SAFE-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s7, 0xf000
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s6, -1
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, s6
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s3, s7
-; SI-IEEE-SAFE-NEXT: s_waitcnt lgkmcnt(0)
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, s10
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s1, s11
-; SI-IEEE-SAFE-NEXT: buffer_load_dword v0, off, s[0:3], 0
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s0, 0xf800000
-; SI-IEEE-SAFE-NEXT: v_mov_b32_e32 v1, 0x260
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s2, 0x7f800000
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s4, s8
-; SI-IEEE-SAFE-NEXT: s_mov_b32 s5, s9
-; SI-IEEE-SAFE-NEXT: s_waitcnt vmcnt(0)
-; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0
-; SI-IEEE-SAFE-NEXT: v_cmp_gt_f32_e64 s[0:1], s0, v0
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; SI-IEEE-SAFE-NEXT: v_sqrt_f32_e32 v2, v0
-; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v3, vcc, -1, v2
-; SI-IEEE-SAFE-NEXT: v_add_i32_e32 v4, vcc, 1, v2
-; SI-IEEE-SAFE-NEXT: v_fma_f32 v5, -v3, v2, v0
-; SI-IEEE-SAFE-NEXT: v_fma_f32 v6, -v4, v2, v0
-; SI-IEEE-SAFE-NEXT: v_cmp_ge_f32_e32 vcc, 0, v5
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e32 vcc, 0, v6
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
-; SI-IEEE-SAFE-NEXT: v_mul_f32_e32 v3, 0x37800000, v2
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[0:1]
-; SI-IEEE-SAFE-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-IEEE-SAFE-NEXT: v_frexp_mant_f32_e32 v1, v0
-; SI-IEEE-SAFE-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s2
-; SI-IEEE-SAFE-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc
-; SI-IEEE-SAFE-NEXT: v_rcp_f32_e32 v1, v1
-; SI-IEEE-SAFE-NEXT: v_frexp_exp_i32_f32_e32 v0, v0
-; SI-IEEE-SAFE-NEXT: v_sub_i32_e32 v0, vcc, 0, v0
-; SI-IEEE-SAFE-NEXT: v_ldexp_f32_e32 v0, v1, v0
-; SI-IEEE-SAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
-; SI-IEEE-SAFE-NEXT: s_endpgm
+; GCN-DAZ-LABEL: rsq_f32:
+; GCN-DAZ: ; %bb.0:
+; GCN-DAZ-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
+; GCN-DAZ-NEXT: s_mov_b32 s7, 0xf000
+; GCN-DAZ-NEXT: s_mov_b32 s6, -1
+; GCN-DAZ-NEXT: s_mov_b32 s10, s6
+; GCN-DAZ-NEXT: s_mov_b32 s11, s7
+; GCN-DAZ-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-DAZ-NEXT: s_mov_b32 s8, s2
+; GCN-DAZ-NEXT: s_mov_b32 s9, s3
+; GCN-DAZ-NEXT: buffer_load_dword v0, off, s[...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/172082
More information about the llvm-branch-commits
mailing list