[llvm] [AMDGPU] Add support for GFX12 expert scheduling mode 2 (PR #170319)

Nicolai Hähnle via llvm-commits llvm-commits at lists.llvm.org
Fri Dec 12 13:50:59 PST 2025


================
@@ -483,6 +603,51 @@ define amdgpu_kernel void @copy_flat_divergent(ptr nocapture %d, ptr nocapture r
 ; GFX12-SPREFETCH-NEXT:  .LBB4_3: ; %for.end
 ; GFX12-SPREFETCH-NEXT:    s_endpgm
 ;
+; GFX12ES2-SPREFETCH-LABEL: copy_flat_divergent:
+; GFX12ES2-SPREFETCH:       ; %bb.0: ; %entry
+; GFX12ES2-SPREFETCH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_SCHED_MODE, 0, 2), 2
+; GFX12ES2-SPREFETCH-NEXT:    s_load_b32 s0, s[4:5], 0x34
+; GFX12ES2-SPREFETCH-NEXT:    s_wait_kmcnt 0x0
+; GFX12ES2-SPREFETCH-NEXT:    s_cmp_eq_u32 s0, 0
+; GFX12ES2-SPREFETCH-NEXT:    s_cbranch_scc1 .LBB4_3
+; GFX12ES2-SPREFETCH-NEXT:  ; %bb.1: ; %for.body.preheader
+; GFX12ES2-SPREFETCH-NEXT:    s_load_b128 s[4:7], s[4:5], 0x24
+; GFX12ES2-SPREFETCH-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX12ES2-SPREFETCH-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12ES2-SPREFETCH-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX12ES2-SPREFETCH-NEXT:    s_wait_kmcnt 0x0
+; GFX12ES2-SPREFETCH-NEXT:    v_add_co_u32 v2, s1, s6, v0
+; GFX12ES2-SPREFETCH-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX12ES2-SPREFETCH-NEXT:    v_add_co_ci_u32_e64 v3, null, s7, 0, s1
+; GFX12ES2-SPREFETCH-NEXT:    v_add_co_u32 v0, s1, s4, v0
+; GFX12ES2-SPREFETCH-NEXT:    v_add_co_u32 v2, vcc_lo, 0xb0, v2
+; GFX12ES2-SPREFETCH-NEXT:    s_wait_alu depctr_va_sdst(0)
+; GFX12ES2-SPREFETCH-NEXT:    v_add_co_ci_u32_e64 v1, null, s5, 0, s1
+; GFX12ES2-SPREFETCH-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX12ES2-SPREFETCH-NEXT:  .LBB4_2: ; %for.body
+; GFX12ES2-SPREFETCH-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12ES2-SPREFETCH-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12ES2-SPREFETCH-NEXT:    v_add_co_u32 v4, vcc_lo, 0xffffff50, v2
+; GFX12ES2-SPREFETCH-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12ES2-SPREFETCH-NEXT:    v_add_co_ci_u32_e64 v5, null, -1, v3, vcc_lo
+; GFX12ES2-SPREFETCH-NEXT:    v_add_co_u32 v2, vcc_lo, v2, 16
+; GFX12ES2-SPREFETCH-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12ES2-SPREFETCH-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GFX12ES2-SPREFETCH-NEXT:    s_wait_alu depctr_va_vdst(2)
+; GFX12ES2-SPREFETCH-NEXT:    flat_load_b128 v[4:7], v[4:5]
+; GFX12ES2-SPREFETCH-NEXT:    s_add_co_i32 s0, s0, -1
+; GFX12ES2-SPREFETCH-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX12ES2-SPREFETCH-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX12ES2-SPREFETCH-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12ES2-SPREFETCH-NEXT:    flat_store_b128 v[0:1], v[4:7]
+; GFX12ES2-SPREFETCH-NEXT:    s_wait_alu depctr_vm_vsrc(0)
+; GFX12ES2-SPREFETCH-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 16
+; GFX12ES2-SPREFETCH-NEXT:    s_wait_alu depctr_va_vcc(0)
+; GFX12ES2-SPREFETCH-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
----------------
nhaehnle wrote:

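The `s_wait_alu depctr_va_vcc(0)` wait really hurts here, since `vcc_lo` can be forwarded directly from the `v_add_co_u32` that produces it to the `v_add_co_ci_u32_e64` that consumes it.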

https://github.com/llvm/llvm-project/pull/170319


More information about the llvm-commits mailing list