[llvm] 701c4ad - Check for denormal flushing when selecting V_FMA/MAD_MIX*
Mateja Marjanovic via llvm-commits
llvm-commits at lists.llvm.org
Thu Jul 13 06:28:35 PDT 2023
Author: Mateja Marjanovic
Date: 2023-07-13T15:26:20+02:00
New Revision: 701c4adceae5948b88f59b5190e9ac26d9260534
URL: https://github.com/llvm/llvm-project/commit/701c4adceae5948b88f59b5190e9ac26d9260534
DIFF: https://github.com/llvm/llvm-project/commit/701c4adceae5948b88f59b5190e9ac26d9260534.diff
LOG: Check for denormal flushing when selecting V_FMA/MAD_MIX*
Added:
Modified:
llvm/lib/Target/AMDGPU/VOP3PInstructions.td
llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 422d02e237621d..4910d43262feff 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -228,7 +228,7 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
>;
}
-let SubtargetPredicate = HasMadMixInsts in {
+let SubtargetPredicate = HasMadMixInsts, OtherPredicates = [NoFP32Denormals] in {
// These are VOP3a-like opcodes which accept no omod.
// Size of src arguments (16/32) is controlled by op_sel.
@@ -248,7 +248,7 @@ defm V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3P_Mix_Profile<VOP_F
}
defm : MadFmaMixPats<fmad, V_MAD_MIX_F32, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16>;
-} // End SubtargetPredicate = HasMadMixInsts
+} // End SubtargetPredicate = HasMadMixInsts, OtherPredicates = [NoFP32Denormals]
// Essentially the same as the mad_mix versions
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
index 572503032e3562..988bea116c8fbe 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -56,6 +56,53 @@ define half @mixlo_simple(float %src0, float %src1, float %src2) #0 {
ret half %cvt.result
}
+define half @mixlo_simpl_no_flush(float %src0, float %src1, float %src2) {
+; GFX1100-LABEL: mixlo_simpl_no_flush:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: mixlo_simpl_no_flush:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: mixlo_simpl_no_flush:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2
+; GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: mixlo_simpl_no_flush:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mul_f32_e32 v0, v0, v1
+; VI-NEXT: v_add_f32_e32 v0, v0, v2
+; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-CI-LABEL: mixlo_simpl_no_flush:
+; SDAG-CI: ; %bb.0:
+; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-CI-NEXT: v_fma_f32 v0, v0, v1, v2
+; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-CI-LABEL: mixlo_simpl_no_flush:
+; GISEL-CI: ; %bb.0:
+; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-CI-NEXT: v_fma_f32 v0, v0, v1, v2
+; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
+ %result = call float @llvm.fmuladd.f32(float %src0, float %src1, float %src2)
+ %cvt.result = fptrunc float %result to half
+ ret half %cvt.result
+}
+
define half @v_mad_mixlo_f16_f16lo_f16lo_f16lo(half %src0, half %src1, half %src2) #0 {
; GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo:
; GFX1100: ; %bb.0:
@@ -110,6 +157,65 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f16lo(half %src0, half %src1, half %src
ret half %cvt.result
}
+define half @v_mad_mixlo_f16_f16lo_f16lo_f16lo_no_flush(half %src0, half %src1, half %src2) {
+; GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo_no_flush:
+; GFX1100: ; %bb.0:
+; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1100-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX900-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo_no_flush:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX900-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX900-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX900-NEXT: v_fma_f32 v0, v0, v1, v2
+; GFX900-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX906-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo_no_flush:
+; GFX906: ; %bb.0:
+; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX906-NEXT: v_fma_mixlo_f16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX906-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo_no_flush:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; VI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; VI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; VI-NEXT: v_mul_f32_e32 v0, v0, v1
+; VI-NEXT: v_add_f32_e32 v0, v0, v2
+; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; SDAG-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo_no_flush:
+; SDAG-CI: ; %bb.0:
+; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-CI-NEXT: v_fma_f32 v0, v0, v1, v2
+; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo_no_flush:
+; GISEL-CI: ; %bb.0:
+; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GISEL-CI-NEXT: v_fma_f32 v0, v0, v1, v2
+; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
+ %src0.ext = fpext half %src0 to float
+ %src1.ext = fpext half %src1 to float
+ %src2.ext = fpext half %src2 to float
+ %result = tail call float @llvm.fmuladd.f32(float %src0.ext, float %src1.ext, float %src2.ext)
+ %cvt.result = fptrunc float %result to half
+ ret half %cvt.result
+}
+
define half @v_mad_mixlo_f16_f16lo_f16lo_f32(half %src0, half %src1, float %src2) #0 {
; GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32:
; GFX1100: ; %bb.0:
More information about the llvm-commits mailing list