[llvm] 04185f0 - AMDGPU: Fix broken denormal constant folding of canonicalize
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Mon Jul 17 17:00:40 PDT 2023
Author: Matt Arsenault
Date: 2023-07-17T19:54:20-04:00
New Revision: 04185f0b0b52a906b67c96b654fd35f25f18328d
URL: https://github.com/llvm/llvm-project/commit/04185f0b0b52a906b67c96b654fd35f25f18328d
DIFF: https://github.com/llvm/llvm-project/commit/04185f0b0b52a906b67c96b654fd35f25f18328d.diff
LOG: AMDGPU: Fix broken denormal constant folding of canonicalize
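Constant folding of canonicalize needs to consider the dynamic denormal
mode: when the mode is dynamic, a denormal constant can be neither kept
nor flushed at compile time, so the fold must be skipped. It should be
possible to implement a runtime DAZ (denormals-are-zero) check with a
canonicalize.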
Added:
Modified:
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir
llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 8ecbe01b24aba9..34bca1394ec74a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -10856,10 +10856,15 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
return true;
if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
- auto F = CFP->getValueAPF();
+ const auto &F = CFP->getValueAPF();
if (F.isNaN() && F.isSignaling())
return false;
- return !F.isDenormal() || denormalsEnabledForType(DAG, Op.getValueType());
+ if (!F.isDenormal())
+ return true;
+
+ DenormalMode Mode =
+ DAG.getMachineFunction().getDenormalMode(F.getSemantics());
+ return Mode == DenormalMode::getIEEE();
}
// If source is a result of another standard FP operation it is already in
@@ -10928,6 +10933,7 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
// snans will be quieted, so we only need to worry about denormals.
if (Subtarget->supportsMinMaxDenormModes() ||
+ // FIXME: denormalsEnabledForType is broken for dynamic
denormalsEnabledForType(DAG, Op.getValueType()))
return true;
@@ -11007,6 +11013,7 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
[[fallthrough]];
}
default:
+ // FIXME: denormalsEnabledForType is broken for dynamic
return denormalsEnabledForType(DAG, Op.getValueType()) &&
DAG.isKnownNeverSNaN(Op);
}
@@ -11028,8 +11035,11 @@ bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF,
if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
if (FCR->Value.isSignaling())
return false;
- return !FCR->Value.isDenormal() ||
- denormalsEnabledForType(MRI.getType(FCR->VReg), MF);
+ if (!FCR->Value.isDenormal())
+ return true;
+
+ DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
+ return Mode == DenormalMode::getIEEE();
}
if (MaxDepth == 0)
@@ -11072,6 +11082,7 @@ bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF,
case AMDGPU::G_FMINNUM_IEEE:
case AMDGPU::G_FMAXNUM_IEEE: {
if (Subtarget->supportsMinMaxDenormModes() ||
+ // FIXME: denormalsEnabledForType is broken for dynamic
denormalsEnabledForType(MRI.getType(Reg), MF))
return true;
@@ -11128,9 +11139,16 @@ bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF,
SDValue SITargetLowering::getCanonicalConstantFP(
SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
// Flush denormals to 0 if not enabled.
- if (C.isDenormal() && !denormalsEnabledForType(DAG, VT)) {
- return DAG.getConstantFP(APFloat::getZero(C.getSemantics(),
- C.isNegative()), SL, VT);
+ if (C.isDenormal()) {
+ DenormalMode Mode =
+ DAG.getMachineFunction().getDenormalMode(C.getSemantics());
+ if (Mode == DenormalMode::getPreserveSign()) {
+ return DAG.getConstantFP(
+ APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
+ }
+
+ if (Mode != DenormalMode::getIEEE())
+ return SDValue();
}
if (C.isNaN()) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir
index 4473904195dccc..ee0e83c5e07632 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir
@@ -36,6 +36,7 @@ body: |
$vgpr0 = COPY %1(s32)
...
+# FIXME: Mode fields are redundant and not considered.
---
name: test_denormal_fconstant
tracksRegLiveness: true
@@ -49,8 +50,7 @@ body: |
; CHECK-LABEL: name: test_denormal_fconstant
; CHECK: [[C:%[0-9]+]]:_(s64) = G_FCONSTANT double 1.618950e-319
- ; CHECK-NEXT: [[FCANONICALIZE:%[0-9]+]]:_(s64) = G_FCANONICALIZE [[C]]
- ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[FCANONICALIZE]](s64)
+ ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[C]](s64)
%0:_(s64) = G_FCONSTANT double 0x0000000000008000
%1:_(s64) = G_FCANONICALIZE %0
$vgpr0_vgpr1 = COPY %1(s64)
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
index 29670f672efa31..c81a36d124e589 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
@@ -465,6 +465,114 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr
ret void
}
+define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dynamic(ptr addrspace(1) %out) #5 {
+; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: s_mov_b32 s2, 0x7fffff
+; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v0, s0
+; GFX678-NEXT: v_mov_b32_e32 v1, s1
+; GFX678-NEXT: flat_store_dword v[0:1], v2
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: s_mov_b32 s2, 0x7fffff
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_max_f32_e64 v1, s2, s2
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: v_max_f32_e64 v1, 0x7fffff, 0x7fffff
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+ %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
+ store float %canonicalized, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out(ptr addrspace(1) %out) #6 {
+; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: s_mov_b32 s2, 0x7fffff
+; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v0, s0
+; GFX678-NEXT: v_mov_b32_e32 v1, s1
+; GFX678-NEXT: flat_store_dword v[0:1], v2
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: s_mov_b32 s2, 0x7fffff
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_max_f32_e64 v1, s2, s2
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: v_max_f32_e64 v1, 0x7fffff, 0x7fffff
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+ %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
+ store float %canonicalized, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in(ptr addrspace(1) %out) #7 {
+; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in:
+; GFX678: ; %bb.0:
+; GFX678-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX678-NEXT: s_mov_b32 s2, 0x7fffff
+; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2
+; GFX678-NEXT: s_waitcnt lgkmcnt(0)
+; GFX678-NEXT: v_mov_b32_e32 v0, s0
+; GFX678-NEXT: v_mov_b32_e32 v1, s1
+; GFX678-NEXT: flat_store_dword v[0:1], v2
+; GFX678-NEXT: s_endpgm
+;
+; GFX9-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9-NEXT: s_mov_b32 s2, 0x7fffff
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_max_f32_e64 v1, s2, s2
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
+; GFX9-NEXT: s_endpgm
+;
+; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: v_max_f32_e64 v1, 0x7fffff, 0x7fffff
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+ %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
+ store float %canonicalized, ptr addrspace(1) %out
+ ret void
+}
+
define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr addrspace(1) %out) #3 {
; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f32:
; GFX678: ; %bb.0:
@@ -2400,3 +2508,6 @@ attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign"
attributes #2 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" }
attributes #3 = { nounwind "denormal-fp-math"="ieee,ieee" }
attributes #4 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" }
+attributes #5 = { nounwind "denormal-fp-math-f32"="dynamic,dynamic" }
+attributes #6 = { nounwind "denormal-fp-math-f32"="dynamic,ieee" }
+attributes #7 = { nounwind "denormal-fp-math-f32"="ieee,dynamic" }
More information about the llvm-commits
mailing list