[llvm] a738bdf - AMDGPU: Permit more rsq formation in AMDGPUCodeGenPrepare
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 23 17:06:55 PDT 2023
Author: Matt Arsenault
Date: 2023-08-23T20:06:50-04:00
New Revision: a738bdf35eaa3bb48b7d8022e2aaa352cff909ba
URL: https://github.com/llvm/llvm-project/commit/a738bdf35eaa3bb48b7d8022e2aaa352cff909ba
DIFF: https://github.com/llvm/llvm-project/commit/a738bdf35eaa3bb48b7d8022e2aaa352cff909ba.diff
LOG: AMDGPU: Permit more rsq formation in AMDGPUCodeGenPrepare
We were deciding whether to defer the fast case to codegen based on the
fdiv alone, rather than also looking for a foldable sqrt input.
https://reviews.llvm.org/D158127
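For reference, a minimal IR sketch of a pattern this pass can now fold
to llvm.amdgcn.rsq (modeled on the AFN_25ULP case in
amdgpu-codegenprepare-fdiv.ll below; the function name is illustrative):

  define float @recip_sqrt_afn_25ulp(float %x) {
    ; afn on both instructions plus 2.5 ulp fpmath permits the rsq fold
    %sqrt = call contract afn float @llvm.sqrt.f32(float %x), !fpmath !0
    %div = fdiv contract afn float 1.000000e+00, %sqrt, !fpmath !0
    ret float %div
  }

  declare float @llvm.sqrt.f32(float)

  !0 = !{float 2.500000e+00}

Previously the afn on the fdiv alone made visitFDiv defer to codegen
before the sqrt operand was ever inspected.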
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
llvm/test/CodeGen/AMDGPU/rsq.f32.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 8eda801d010966..0467b9bba8ae1e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -914,7 +914,7 @@ Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
DivFMF |= SqrtFMF;
Builder.setFastMathFlags(DivFMF);
- if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) ||
+ if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) || HasUnsafeFPMath ||
canIgnoreDenormalInput(Den, CtxI)) {
Value *Result = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den);
// -1.0 / sqrt(x) -> fneg(rsq(x))
@@ -1078,6 +1078,21 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
const FastMathFlags DivFMF = FPOp->getFastMathFlags();
const float ReqdAccuracy = FPOp->getFPAccuracy();
+ FastMathFlags SqrtFMF;
+
+ Value *Num = FDiv.getOperand(0);
+ Value *Den = FDiv.getOperand(1);
+
+ Value *RsqOp = nullptr;
+ auto *DenII = dyn_cast<IntrinsicInst>(Den);
+ if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
+ DenII->hasOneUse()) {
+ const auto *SqrtOp = cast<FPMathOperator>(DenII);
+ SqrtFMF = SqrtOp->getFastMathFlags();
+ if (canOptimizeWithRsq(SqrtOp, DivFMF, SqrtFMF))
+ RsqOp = SqrtOp->getOperand(0);
+ }
+
// Inaccurate rcp is allowed with unsafe-fp-math or afn.
//
// Defer to codegen to handle this.
@@ -1088,28 +1103,13 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
// don't need any pre-consideration here when we have better information. A
// more conservative interpretation could use handling here.
const bool AllowInaccurateRcp = HasUnsafeFPMath || DivFMF.approxFunc();
- if (AllowInaccurateRcp)
+ if (!RsqOp && AllowInaccurateRcp)
return false;
// Defer the correct implementations to codegen.
if (ReqdAccuracy < 1.0f)
return false;
- FastMathFlags SqrtFMF;
-
- Value *Num = FDiv.getOperand(0);
- Value *Den = FDiv.getOperand(1);
-
- Value *RsqOp = nullptr;
- auto *DenII = dyn_cast<IntrinsicInst>(Den);
- if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
- DenII->hasOneUse()) {
- const auto *SqrtOp = cast<FPMathOperator>(DenII);
- SqrtFMF = SqrtOp->getFastMathFlags();
- if (canOptimizeWithRsq(SqrtOp, DivFMF, SqrtFMF))
- RsqOp = SqrtOp->getOperand(0);
- }
-
IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
Builder.setFastMathFlags(DivFMF);
Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
index 71f33e4c71488f..fd6abce7326a20 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
@@ -1542,14 +1542,12 @@ define amdgpu_kernel void @rsq_f32_fpmath(ptr addrspace(1) %out, float %x) {
; IEEE-GOODFREXP-NEXT: [[SQRT_X_AFN_NO_MD:%.*]] = call contract afn float @llvm.sqrt.f32(float [[X]])
; IEEE-GOODFREXP-NEXT: [[AFN_NO_MD:%.*]] = fdiv contract afn float 1.000000e+00, [[SQRT_X_AFN_NO_MD]]
; IEEE-GOODFREXP-NEXT: store volatile float [[AFN_NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-GOODFREXP-NEXT: [[SQRT_X_AFN_25ULP:%.*]] = call contract afn float @llvm.sqrt.f32(float [[X]]), !fpmath !0
-; IEEE-GOODFREXP-NEXT: [[AFN_25ULP:%.*]] = fdiv contract afn float 1.000000e+00, [[SQRT_X_AFN_25ULP]], !fpmath !0
+; IEEE-GOODFREXP-NEXT: [[AFN_25ULP:%.*]] = call contract afn float @llvm.amdgcn.rsq.f32(float [[X]])
; IEEE-GOODFREXP-NEXT: store volatile float [[AFN_25ULP]], ptr addrspace(1) [[OUT]], align 4
; IEEE-GOODFREXP-NEXT: [[SQRT_X_FAST_NO_MD:%.*]] = call fast float @llvm.sqrt.f32(float [[X]])
; IEEE-GOODFREXP-NEXT: [[FAST_NO_MD:%.*]] = fdiv fast float 1.000000e+00, [[SQRT_X_FAST_NO_MD]]
; IEEE-GOODFREXP-NEXT: store volatile float [[FAST_NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-GOODFREXP-NEXT: [[SQRT_X_FAST_25ULP:%.*]] = call fast float @llvm.sqrt.f32(float [[X]]), !fpmath !0
-; IEEE-GOODFREXP-NEXT: [[FAST_25ULP:%.*]] = fdiv fast float 1.000000e+00, [[SQRT_X_FAST_25ULP]], !fpmath !0
+; IEEE-GOODFREXP-NEXT: [[FAST_25ULP:%.*]] = call fast float @llvm.amdgcn.rsq.f32(float [[X]])
; IEEE-GOODFREXP-NEXT: store volatile float [[FAST_25ULP]], ptr addrspace(1) [[OUT]], align 4
; IEEE-GOODFREXP-NEXT: [[TMP16:%.*]] = fcmp contract olt float [[X]], 0x3810000000000000
; IEEE-GOODFREXP-NEXT: [[TMP17:%.*]] = select contract i1 [[TMP16]], float 0x4170000000000000, float 1.000000e+00
@@ -1620,14 +1618,12 @@ define amdgpu_kernel void @rsq_f32_fpmath(ptr addrspace(1) %out, float %x) {
; IEEE-BADFREXP-NEXT: [[SQRT_X_AFN_NO_MD:%.*]] = call contract afn float @llvm.sqrt.f32(float [[X]])
; IEEE-BADFREXP-NEXT: [[AFN_NO_MD:%.*]] = fdiv contract afn float 1.000000e+00, [[SQRT_X_AFN_NO_MD]]
; IEEE-BADFREXP-NEXT: store volatile float [[AFN_NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-BADFREXP-NEXT: [[SQRT_X_AFN_25ULP:%.*]] = call contract afn float @llvm.sqrt.f32(float [[X]]), !fpmath !0
-; IEEE-BADFREXP-NEXT: [[AFN_25ULP:%.*]] = fdiv contract afn float 1.000000e+00, [[SQRT_X_AFN_25ULP]], !fpmath !0
+; IEEE-BADFREXP-NEXT: [[AFN_25ULP:%.*]] = call contract afn float @llvm.amdgcn.rsq.f32(float [[X]])
; IEEE-BADFREXP-NEXT: store volatile float [[AFN_25ULP]], ptr addrspace(1) [[OUT]], align 4
; IEEE-BADFREXP-NEXT: [[SQRT_X_FAST_NO_MD:%.*]] = call fast float @llvm.sqrt.f32(float [[X]])
; IEEE-BADFREXP-NEXT: [[FAST_NO_MD:%.*]] = fdiv fast float 1.000000e+00, [[SQRT_X_FAST_NO_MD]]
; IEEE-BADFREXP-NEXT: store volatile float [[FAST_NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; IEEE-BADFREXP-NEXT: [[SQRT_X_FAST_25ULP:%.*]] = call fast float @llvm.sqrt.f32(float [[X]]), !fpmath !0
-; IEEE-BADFREXP-NEXT: [[FAST_25ULP:%.*]] = fdiv fast float 1.000000e+00, [[SQRT_X_FAST_25ULP]], !fpmath !0
+; IEEE-BADFREXP-NEXT: [[FAST_25ULP:%.*]] = call fast float @llvm.amdgcn.rsq.f32(float [[X]])
; IEEE-BADFREXP-NEXT: store volatile float [[FAST_25ULP]], ptr addrspace(1) [[OUT]], align 4
; IEEE-BADFREXP-NEXT: [[TMP16:%.*]] = fcmp contract olt float [[X]], 0x3810000000000000
; IEEE-BADFREXP-NEXT: [[TMP17:%.*]] = select contract i1 [[TMP16]], float 0x4170000000000000, float 1.000000e+00
@@ -1683,14 +1679,12 @@ define amdgpu_kernel void @rsq_f32_fpmath(ptr addrspace(1) %out, float %x) {
; DAZ-NEXT: [[SQRT_X_AFN_NO_MD:%.*]] = call contract afn float @llvm.sqrt.f32(float [[X]])
; DAZ-NEXT: [[AFN_NO_MD:%.*]] = fdiv contract afn float 1.000000e+00, [[SQRT_X_AFN_NO_MD]]
; DAZ-NEXT: store volatile float [[AFN_NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; DAZ-NEXT: [[SQRT_X_AFN_25ULP:%.*]] = call contract afn float @llvm.sqrt.f32(float [[X]]), !fpmath !0
-; DAZ-NEXT: [[AFN_25ULP:%.*]] = fdiv contract afn float 1.000000e+00, [[SQRT_X_AFN_25ULP]], !fpmath !0
+; DAZ-NEXT: [[AFN_25ULP:%.*]] = call contract afn float @llvm.amdgcn.rsq.f32(float [[X]])
; DAZ-NEXT: store volatile float [[AFN_25ULP]], ptr addrspace(1) [[OUT]], align 4
; DAZ-NEXT: [[SQRT_X_FAST_NO_MD:%.*]] = call fast float @llvm.sqrt.f32(float [[X]])
; DAZ-NEXT: [[FAST_NO_MD:%.*]] = fdiv fast float 1.000000e+00, [[SQRT_X_FAST_NO_MD]]
; DAZ-NEXT: store volatile float [[FAST_NO_MD]], ptr addrspace(1) [[OUT]], align 4
-; DAZ-NEXT: [[SQRT_X_FAST_25ULP:%.*]] = call fast float @llvm.sqrt.f32(float [[X]]), !fpmath !0
-; DAZ-NEXT: [[FAST_25ULP:%.*]] = fdiv fast float 1.000000e+00, [[SQRT_X_FAST_25ULP]], !fpmath !0
+; DAZ-NEXT: [[FAST_25ULP:%.*]] = call fast float @llvm.amdgcn.rsq.f32(float [[X]])
; DAZ-NEXT: store volatile float [[FAST_25ULP]], ptr addrspace(1) [[OUT]], align 4
; DAZ-NEXT: [[FDIV_OPENCL:%.*]] = call contract float @llvm.amdgcn.rsq.f32(float [[X]])
; DAZ-NEXT: store volatile float [[FDIV_OPENCL]], ptr addrspace(1) [[OUT]], align 4
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
index b9d11619001e66..d7611f3e9023b0 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv_flags.f32.ll
@@ -1035,8 +1035,7 @@ define float @v_recip_sqrt_f32_afn_ulp25_contract(float %x) {
; CODEGEN-IEEE-SDAG-LABEL: v_recip_sqrt_f32_afn_ulp25_contract:
; CODEGEN-IEEE-SDAG: ; %bb.0:
; CODEGEN-IEEE-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CODEGEN-IEEE-SDAG-NEXT: v_sqrt_f32_e32 v0, v0
-; CODEGEN-IEEE-SDAG-NEXT: v_rcp_f32_e32 v0, v0
+; CODEGEN-IEEE-SDAG-NEXT: v_rsq_f32_e32 v0, v0
; CODEGEN-IEEE-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; CODEGEN-IEEE-GISEL-LABEL: v_recip_sqrt_f32_afn_ulp25_contract:
@@ -1058,18 +1057,11 @@ define float @v_recip_sqrt_f32_afn_ulp25_contract(float %x) {
; IR-IEEE-GISEL-NEXT: v_rsq_f32_e32 v0, v0
; IR-IEEE-GISEL-NEXT: s_setpc_b64 s[30:31]
;
-; CODEGEN-DAZ-SDAG-LABEL: v_recip_sqrt_f32_afn_ulp25_contract:
-; CODEGEN-DAZ-SDAG: ; %bb.0:
-; CODEGEN-DAZ-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CODEGEN-DAZ-SDAG-NEXT: v_sqrt_f32_e32 v0, v0
-; CODEGEN-DAZ-SDAG-NEXT: v_rcp_f32_e32 v0, v0
-; CODEGEN-DAZ-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; CODEGEN-DAZ-GISEL-LABEL: v_recip_sqrt_f32_afn_ulp25_contract:
-; CODEGEN-DAZ-GISEL: ; %bb.0:
-; CODEGEN-DAZ-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CODEGEN-DAZ-GISEL-NEXT: v_rsq_f32_e32 v0, v0
-; CODEGEN-DAZ-GISEL-NEXT: s_setpc_b64 s[30:31]
+; CODEGEN-DAZ-LABEL: v_recip_sqrt_f32_afn_ulp25_contract:
+; CODEGEN-DAZ: ; %bb.0:
+; CODEGEN-DAZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CODEGEN-DAZ-NEXT: v_rsq_f32_e32 v0, v0
+; CODEGEN-DAZ-NEXT: s_setpc_b64 s[30:31]
;
; IR-DAZ-SDAG-LABEL: v_recip_sqrt_f32_afn_ulp25_contract:
; IR-DAZ-SDAG: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f32.ll b/llvm/test/CodeGen/AMDGPU/rsq.f32.ll
index b37b3a5eb7fda3..f202f7adc203e6 100644
--- a/llvm/test/CodeGen/AMDGPU/rsq.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/rsq.f32.ll
@@ -406,7 +406,7 @@ define amdgpu_kernel void @neg_rsq_f32(ptr addrspace(1) noalias %out, ptr addrsp
; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s5, s1
; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0)
; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
-; GCN-DAZ-UNSAFE-NEXT: v_sub_f32_e32 v0, 0x80000000, v0
+; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; GCN-DAZ-UNSAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-DAZ-UNSAFE-NEXT: s_endpgm
;
@@ -537,7 +537,7 @@ define amdgpu_kernel void @neg_rsq_neg_f32(ptr addrspace(1) noalias %out, ptr ad
; GCN-DAZ-UNSAFE-NEXT: s_mov_b32 s5, s1
; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0)
; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0
-; GCN-DAZ-UNSAFE-NEXT: v_sub_f32_e32 v0, 0x80000000, v0
+; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; GCN-DAZ-UNSAFE-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GCN-DAZ-UNSAFE-NEXT: s_endpgm
;
@@ -658,7 +658,7 @@ define float @v_neg_rsq_neg_f32(float %val) {
; GCN-DAZ-UNSAFE: ; %bb.0:
; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0
-; GCN-DAZ-UNSAFE-NEXT: v_sub_f32_e32 v0, 0x80000000, v0
+; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31]
;
; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_neg_f32:
@@ -711,8 +711,8 @@ define <2 x float> @v_neg_rsq_neg_v2f32(<2 x float> %val) {
; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e64 v0, -v0
; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e64 v1, -v1
-; GCN-DAZ-UNSAFE-NEXT: v_sub_f32_e32 v0, 0x80000000, v0
-; GCN-DAZ-UNSAFE-NEXT: v_sub_f32_e32 v1, 0x80000000, v1
+; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31]
;
; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_neg_v2f32:
@@ -917,7 +917,7 @@ define float @v_neg_rsq_f32(float %val) {
; GCN-DAZ-UNSAFE: ; %bb.0:
; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
-; GCN-DAZ-UNSAFE-NEXT: v_sub_f32_e32 v0, 0x80000000, v0
+; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31]
;
; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_f32:
@@ -969,8 +969,8 @@ define <2 x float> @v_neg_rsq_v2f32(<2 x float> %val) {
; GCN-DAZ-UNSAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v0, v0
; GCN-DAZ-UNSAFE-NEXT: v_rsq_f32_e32 v1, v1
-; GCN-DAZ-UNSAFE-NEXT: v_sub_f32_e32 v0, 0x80000000, v0
-; GCN-DAZ-UNSAFE-NEXT: v_sub_f32_e32 v1, 0x80000000, v1
+; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-DAZ-UNSAFE-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
; GCN-DAZ-UNSAFE-NEXT: s_setpc_b64 s[30:31]
;
; GCN-IEEE-UNSAFE-LABEL: v_neg_rsq_v2f32: