[llvm] [AMDGPU][SIInsertWaitcnts] Do not add s_waitcnt when the counters are known to be 0 already (PR #72830)

Jay Foad via llvm-commits llvm-commits at lists.llvm.org
Mon Dec 18 02:17:02 PST 2023


jayfoad wrote:

Thanks for the test case! This is a great example of why we should not promote soft waitcnts in applyPreexistingWaitcnt.

The full codegen looks like this:
```
; %bb.0:
        s_mov_b32 s4, exec_lo
        s_wqm_b32 exec_lo, exec_lo
        v_mov_b32_e32 v7, v2
        v_ashrrev_i32_e32 v4, 31, v3
        v_mov_b32_e32 v6, v1
        v_mov_b32_e32 v5, v0
        v_ashrrev_i32_e32 v8, 31, v7
        v_lshlrev_b64 v[2:3], 4, v[3:4]
        v_lshlrev_b64 v[0:1], 5, v[7:8]
        v_add_co_u32 v0, vcc_lo, s0, v0
        v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
        v_add_co_u32 v2, vcc_lo, s2, v2
        v_add_co_ci_u32_e32 v3, vcc_lo, s3, v3, vcc_lo
        s_clause 0x1
        global_load_dwordx4 v[11:14], v[0:1], off offset:16
        global_load_dwordx4 v[7:10], v[0:1], off
        global_load_dwordx4 v[15:18], v[2:3], off
        s_mov_b32 s0, exec_lo
.LBB0_1:                                ; =>This Inner Loop Header: Depth=1
        v_readfirstlane_b32 s1, v5
        v_readfirstlane_b32 s2, v6
        v_cmp_eq_u32_e64 s1, s1, v5
        v_cmp_eq_u32_e64 s2, s2, v6
        s_and_b32 s1, s1, s2
        s_and_saveexec_b32 s1, s1
        s_waitcnt vmcnt(0)
        v_mov_b32_e32 v0, 0
        s_waitcnt vmcnt(1)
        v_readfirstlane_b32 s8, v7
        v_readfirstlane_b32 s9, v8
        v_readfirstlane_b32 s10, v9
        v_readfirstlane_b32 s11, v10
        v_readfirstlane_b32 s12, v11
        v_readfirstlane_b32 s13, v12
        v_readfirstlane_b32 s14, v13
        v_readfirstlane_b32 s15, v14
        s_waitcnt vmcnt(0)
        v_readfirstlane_b32 s16, v15
        v_readfirstlane_b32 s17, v16
        v_readfirstlane_b32 s18, v17
        v_readfirstlane_b32 s19, v18
        v_mov_b32_e32 v1, v0
                                        ; implicit-def: $vgpr5
                                        ; implicit-def: $vgpr6
                                        ; implicit-def: $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14
                                        ; implicit-def: $vgpr15_vgpr16_vgpr17_vgpr18
        image_sample v[0:3], v[0:1], s[8:15], s[16:19] dmask:0xf dim:SQ_RSRC_IMG_2D
        s_xor_b32 exec_lo, exec_lo, s1
        s_cbranch_execnz .LBB0_1
; %bb.2:
        s_mov_b32 exec_lo, s0
        s_and_b32 exec_lo, exec_lo, s4
        s_waitcnt vmcnt(0)
        ; return to shader part epilog
```
The first time we visit LBB0_1, we have only seen the bb.0 predecessor, so we only know about waits due to the global_load_dwordx4 instructions, and we insert the "s_waitcnt vmcnt(1)" and "s_waitcnt vmcnt(0)" that you highlighted as redundant.

The second time we visit LBB0_1, we have seen the image_sample instruction in the LBB0_1 predecessor, and we insert the "s_waitcnt vmcnt(0)" before the "v_mov_b32_e32 v0, 0". This renders the other two waitcnts redundant, but we can't remove them because we have already promoted them to "hard" waitcnts.
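To make the interaction concrete, here is a toy model of the situation (this is illustrative Python, not the actual SIInsertWaitcnts C++ code; `simplify` and its arguments are hypothetical names). It models a block where the second dataflow visit has just inserted a dominating "s_waitcnt vmcnt(0)" at the top, so no VMEM ops are outstanding afterwards. Any later soft waitcnt is then trivially satisfied and can be dropped, but if the first visit already promoted those waits to hard, they must be kept:

```python
# Toy model of redundant-waitcnt elimination (illustrative only; not
# the real SIInsertWaitcnts API).

def simplify(waits, promote_soft_to_hard):
    """waits: list of (n, is_soft) pairs in program order, appearing
    after a newly inserted dominating vmcnt(0), so the known number of
    outstanding VMEM ops is 0 at each of them.
    Returns the waits that must be kept."""
    kept = []
    outstanding = 0  # after the dominating vmcnt(0), nothing is pending
    for n, soft in waits:
        if promote_soft_to_hard:
            soft = False  # the first visit turned soft waits into hard ones
        if soft and outstanding <= n:
            continue  # soft wait already satisfied: safe to drop
        kept.append((n, soft))  # hard waits must be preserved as-is
    return kept

# The two waits from the example: vmcnt(1) and vmcnt(0), both soft originally.
waits = [(1, True), (0, True)]

# Without early promotion, both redundant waits can be removed.
print(simplify(waits, promote_soft_to_hard=False))  # []

# With promotion on the first visit, both are stuck as hard waits.
print(simplify(waits, promote_soft_to_hard=True))   # [(1, False), (0, False)]
```

The point of the toy model is just that promotion is a one-way door: once a soft wait becomes hard, later, more precise dataflow information can no longer be used to delete it.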

https://github.com/llvm/llvm-project/pull/72830
