[llvm] [SelectionDAG] Remove UnsafeFPMath check in `visitFADDForFMACombine` (PR #127770)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Feb 20 01:35:19 PST 2025
================
@@ -469,124 +469,29 @@ define { half, half } @v_fneg_add_multi_use_fneg_x_f16(half %a, half %b, half %c
; This one asserted with -enable-no-signed-zeros-fp-math
define amdgpu_ps half @fneg_fadd_0_f16(half inreg %tmp2, half inreg %tmp6, <4 x i32> %arg) #0 {
-; SI-SAFE-LABEL: fneg_fadd_0_f16:
-; SI-SAFE: ; %bb.0: ; %.entry
-; SI-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, s1
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, s0
-; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-SAFE-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, 1.0
-; SI-SAFE-NEXT: v_rcp_f32_e32 v3, v2
-; SI-SAFE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
-; SI-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; SI-SAFE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; SI-SAFE-NEXT: v_fma_f32 v3, v5, v3, v3
-; SI-SAFE-NEXT: v_mul_f32_e32 v5, v4, v3
-; SI-SAFE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; SI-SAFE-NEXT: v_fma_f32 v5, v6, v3, v5
-; SI-SAFE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; SI-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; SI-SAFE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; SI-SAFE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
-; SI-SAFE-NEXT: v_mad_f32 v0, v0, 0, 0
-; SI-SAFE-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
-; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc
-; SI-SAFE-NEXT: v_mov_b32_e32 v1, 0x7fc00000
-; SI-SAFE-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
-; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
-; SI-SAFE-NEXT: ; return to shader part epilog
-;
-; SI-NSZ-LABEL: fneg_fadd_0_f16:
-; SI-NSZ: ; %bb.0: ; %.entry
-; SI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, s1
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, s0
-; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NSZ-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, 1.0
-; SI-NSZ-NEXT: v_rcp_f32_e32 v3, v2
-; SI-NSZ-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
-; SI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; SI-NSZ-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; SI-NSZ-NEXT: v_fma_f32 v3, v5, v3, v3
-; SI-NSZ-NEXT: v_mul_f32_e32 v5, v4, v3
-; SI-NSZ-NEXT: v_fma_f32 v6, -v2, v5, v4
-; SI-NSZ-NEXT: v_fma_f32 v5, v6, v3, v5
-; SI-NSZ-NEXT: v_fma_f32 v2, -v2, v5, v4
-; SI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; SI-NSZ-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; SI-NSZ-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
-; SI-NSZ-NEXT: v_mul_f32_e32 v0, 0x80000000, v0
-; SI-NSZ-NEXT: v_cmp_nlt_f32_e64 vcc, -v0, v1
-; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; SI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000
-; SI-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
-; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
-; SI-NSZ-NEXT: ; return to shader part epilog
-;
-; VI-SAFE-LABEL: fneg_fadd_0_f16:
-; VI-SAFE: ; %bb.0: ; %.entry
-; VI-SAFE-NEXT: v_rcp_f16_e32 v0, s1
-; VI-SAFE-NEXT: v_mov_b32_e32 v1, s0
-; VI-SAFE-NEXT: v_mul_f16_e32 v0, 0, v0
-; VI-SAFE-NEXT: v_add_f16_e32 v0, 0, v0
-; VI-SAFE-NEXT: v_xor_b32_e32 v2, 0x8000, v0
-; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, s0, v0
-; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; VI-SAFE-NEXT: v_mov_b32_e32 v1, 0x7e00
-; VI-SAFE-NEXT: v_cmp_nlt_f16_e32 vcc, 0, v0
-; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
-; VI-SAFE-NEXT: ; return to shader part epilog
-;
-; VI-NSZ-LABEL: fneg_fadd_0_f16:
-; VI-NSZ: ; %bb.0: ; %.entry
-; VI-NSZ-NEXT: v_rcp_f16_e32 v0, s1
-; VI-NSZ-NEXT: v_mov_b32_e32 v1, s0
-; VI-NSZ-NEXT: v_mul_f16_e32 v0, 0x8000, v0
-; VI-NSZ-NEXT: v_cmp_nlt_f16_e64 vcc, -v0, s0
-; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; VI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7e00
-; VI-NSZ-NEXT: v_cmp_nlt_f16_e32 vcc, 0, v0
-; VI-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
-; VI-NSZ-NEXT: ; return to shader part epilog
-;
-; GFX11-SAFE-LABEL: fneg_fadd_0_f16:
-; GFX11-SAFE: ; %bb.0: ; %.entry
-; GFX11-SAFE-NEXT: v_rcp_f16_e32 v0, s1
-; GFX11-SAFE-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-SAFE-NEXT: v_mul_f16_e32 v0, 0, v0
-; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-NEXT: v_add_f16_e32 v0, 0, v0
-; GFX11-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v0
-; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, s0, v0
-; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo
-; GFX11-SAFE-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0
-; GFX11-SAFE-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo
-; GFX11-SAFE-NEXT: ; return to shader part epilog
-;
-; GFX11-NSZ-LABEL: fneg_fadd_0_f16:
-; GFX11-NSZ: ; %bb.0: ; %.entry
-; GFX11-NSZ-NEXT: v_rcp_f16_e32 v0, s1
-; GFX11-NSZ-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NSZ-NEXT: v_mul_f16_e32 v0, 0x8000, v0
-; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NSZ-NEXT: v_cmp_nlt_f16_e64 s1, -v0, s0
-; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1
-; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NSZ-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0
-; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo
-; GFX11-NSZ-NEXT: ; return to shader part epilog
+; SI-LABEL: fneg_fadd_0_f16:
+; SI: ; %bb.0: ; %.entry
+; SI-NEXT: v_mov_b32_e32 v0, 0
+; SI-NEXT: ; return to shader part epilog
+;
+; VI-LABEL: fneg_fadd_0_f16:
+; VI: ; %bb.0: ; %.entry
+; VI-NEXT: v_mov_b32_e32 v0, 0
+; VI-NEXT: ; return to shader part epilog
+;
+; GFX11-LABEL: fneg_fadd_0_f16:
+; GFX11: ; %bb.0: ; %.entry
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: ; return to shader part epilog
.entry:
- %tmp7 = fdiv half 1.000000e+00, %tmp6
- %tmp8 = fmul half 0.000000e+00, %tmp7
- %tmp9 = fmul reassoc nnan arcp contract half 0.000000e+00, %tmp8
- %.i188 = fadd half %tmp9, 0.000000e+00
- %tmp10 = fcmp uge half %.i188, %tmp2
- %tmp11 = fneg half %.i188
+ %tmp7 = fdiv fast half 1.000000e+00, %tmp6
+ %tmp8 = fmul fast half 0.000000e+00, %tmp7
+ %tmp9 = fmul fast reassoc nnan arcp contract half 0.000000e+00, %tmp8
+ %.i188 = fadd fast half %tmp9, 0.000000e+00
+ %tmp10 = fcmp fast uge half %.i188, %tmp2
+ %tmp11 = fneg fast half %.i188
%.i092 = select i1 %tmp10, half %tmp2, half %tmp11
- %tmp12 = fcmp ule half %.i092, 0.000000e+00
+ %tmp12 = fcmp fast ule half %.i092, 0.000000e+00
%.i198 = select i1 %tmp12, half 0.000000e+00, half 0x7FF8000000000000
----------------
paperchalice wrote:
Looked a little deeper into the DAG combiner: `"no-signed-zeros-fp-math"` caused the `fadd` to be optimized first, which prevented further optimization.
https://github.com/llvm/llvm-project/blob/55fa2fa3480928d2ce6e06336842f673ac92f9d3/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp#L16984-L16988
https://github.com/llvm/llvm-project/pull/127770
More information about the llvm-commits
mailing list