[llvm-branch-commits] [llvm] [AMDGPU] Allocate scratch space for dVGPRs for CWSR (PR #130055)
Matt Arsenault via llvm-branch-commits
llvm-branch-commits@lists.llvm.org
Thu Mar 6 23:09:47 PST 2025
================
@@ -0,0 +1,263 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+dynamic-vgpr < %s | FileCheck -check-prefix=CHECK %s
+
+; Make sure we use a stack pointer and allocate 112 * 4 bytes at the beginning of the stack.
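+; 112 * 4 bytes = 448 bytes = 0x1c0, which matches the conditional
+; s_cmovk_i32 of the frame base in the checks below. The prologue reads a
+; HW_REG_HW_ID2 bit that appears to be set only for waves on a compute
+; queue (the only case where CWSR can occur) and reserves the save area
+; only in that case; the s_alloc_vgpr 0 before each s_endpgm then gives
+; the dynamically allocated VGPR blocks back before the wave exits.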
+
+define amdgpu_cs void @amdgpu_cs() #0 {
+; CHECK-LABEL: amdgpu_cs:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 1)
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT: s_cmp_lg_u32 0, s33
+; CHECK-NEXT: s_cmovk_i32 s33, 0x1c0
+; CHECK-NEXT: s_alloc_vgpr 0
+; CHECK-NEXT: s_endpgm
+ ret void
+}
+
+define amdgpu_kernel void @kernel() #0 {
+; CHECK-LABEL: kernel:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 1)
+; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; CHECK-NEXT: s_cmp_lg_u32 0, s33
+; CHECK-NEXT: s_cmovk_i32 s33, 0x1c0
+; CHECK-NEXT: s_alloc_vgpr 0
+; CHECK-NEXT: s_endpgm
+ ret void
+}
+
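+; The volatile store below uses s33 (0 or 0x1c0 after the queue check) as
+; the scratch base, so the local lands just above the reserved save area.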
+define amdgpu_cs void @with_local() #0 {
+; CHECK-LABEL: with_local:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 1)
+; CHECK-NEXT: v_mov_b32_e32 v0, 13
+; CHECK-NEXT: s_cmp_lg_u32 0, s33
+; CHECK-NEXT: s_cmovk_i32 s33, 0x1c0
+; CHECK-NEXT: scratch_store_b8 off, v0, s33 scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: s_alloc_vgpr 0
+; CHECK-NEXT: s_endpgm
+ %local = alloca i32, addrspace(5)
+ store volatile i8 13, ptr addrspace(5) %local
+ ret void
+}
+
+; Check that we generate s_cselect for SP if we can fit
+; the offset in an inline constant.
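+; In this test SP is 16 without the save area and 0x1c0 + 16 = 0x1d0 with
+; it; 16 fits in an inline constant, so a single s_cselect_b32 (with 0x1d0
+; as the one allowed literal operand) selects between the two values.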
+define amdgpu_cs void @with_calls_inline_const() #0 {
+; CHECK-LABEL: with_calls_inline_const:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 1)
+; CHECK-NEXT: v_mov_b32_e32 v0, 15
+; CHECK-NEXT: s_cmp_lg_u32 0, s33
+; CHECK-NEXT: s_mov_b32 s1, callee@abs32@hi
+; CHECK-NEXT: s_cmovk_i32 s33, 0x1c0
+; CHECK-NEXT: s_mov_b32 s0, callee@abs32@lo
+; CHECK-NEXT: scratch_store_b8 off, v0, s33 scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0x47
+; CHECK-NEXT: s_cselect_b32 s32, 0x1d0, 16
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; CHECK-NEXT: s_alloc_vgpr 0
+; CHECK-NEXT: s_endpgm
+ %local = alloca i32, addrspace(5)
+ store volatile i8 15, ptr addrspace(5) %local
+ call amdgpu_gfx void @callee(i32 71)
+ ret void
+}
+
+; Check that we generate s_movk + s_cmovk if the offset for SP
+; doesn't fit in an inline constant.
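+; Here the locals take 61 * 4 = 244 bytes, rounded up to 0x100, so the two
+; candidate SP values are 0x100 and 0x1c0 + 0x100 = 0x2c0; neither fits in
+; an inline constant, so SP is set with s_movk_i32 and then conditionally
+; overwritten with s_cmovk_i32.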
+define amdgpu_cs void @with_calls_no_inline_const() #0 {
+; CHECK-LABEL: with_calls_no_inline_const:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 1)
+; CHECK-NEXT: v_mov_b32_e32 v0, 15
+; CHECK-NEXT: s_cmp_lg_u32 0, s33
+; CHECK-NEXT: s_mov_b32 s1, callee@abs32@hi
+; CHECK-NEXT: s_cmovk_i32 s33, 0x1c0
+; CHECK-NEXT: s_mov_b32 s0, callee@abs32@lo
+; CHECK-NEXT: scratch_store_b8 off, v0, s33 scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0x47
+; CHECK-NEXT: s_movk_i32 s32, 0x100
+; CHECK-NEXT: s_cmovk_i32 s32, 0x2c0
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; CHECK-NEXT: s_alloc_vgpr 0
+; CHECK-NEXT: s_endpgm
+ %local = alloca i32, i32 61, addrspace(5)
+ store volatile i8 15, ptr addrspace(5) %local
+ call amdgpu_gfx void @callee(i32 71)
+ ret void
+}
+
+; We're going to limit this to 16 VGPRs, so we need to spill the rest.
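+; Attribute #1 ("amdgpu-num-vgpr"="16") caps the function at 16 VGPRs, so
+; the <32 x i32> value can't live entirely in registers; the folded spills
+; and reloads below address scratch relative to s33, above the reserved
+; save area.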
+define amdgpu_cs void @with_spills(ptr addrspace(1) %p1, ptr addrspace(1) %p2) #1 {
+; CHECK-LABEL: with_spills:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 1)
+; CHECK-NEXT: global_load_b128 v[4:7], v[0:1], off offset:96
+; CHECK-NEXT: s_cmp_lg_u32 0, s33
+; CHECK-NEXT: s_cmovk_i32 s33, 0x1c0
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: scratch_store_b128 off, v[4:7], s33 offset:80 ; 16-byte Folded Spill
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: global_load_b128 v[8:11], v[0:1], off offset:112
+; CHECK-NEXT: global_load_b128 v[12:15], v[0:1], off offset:64
+; CHECK-NEXT: global_load_b128 v[4:7], v[0:1], off offset:80
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: scratch_store_b128 off, v[4:7], s33 offset:64 ; 16-byte Folded Spill
+; CHECK-NEXT: global_load_b128 v[4:7], v[0:1], off offset:32
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: scratch_store_b128 off, v[4:7], s33 offset:48 ; 16-byte Folded Spill
+; CHECK-NEXT: global_load_b128 v[4:7], v[0:1], off offset:48
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: scratch_store_b128 off, v[4:7], s33 offset:32 ; 16-byte Folded Spill
+; CHECK-NEXT: global_load_b128 v[4:7], v[0:1], off
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: scratch_store_b128 off, v[4:7], s33 offset:16 ; 16-byte Folded Spill
+; CHECK-NEXT: global_load_b128 v[4:7], v[0:1], off offset:16
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: scratch_store_b128 off, v[4:7], s33 ; 16-byte Folded Spill
+; CHECK-NEXT: scratch_load_b128 v[4:7], off, s33 offset:80 th:TH_LOAD_LU ; 16-byte Folded Reload
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: s_clause 0x2
+; CHECK-NEXT: global_store_b128 v[2:3], v[4:7], off offset:96
+; CHECK-NEXT: global_store_b128 v[2:3], v[8:11], off offset:112
+; CHECK-NEXT: global_store_b128 v[2:3], v[12:15], off offset:64
+; CHECK-NEXT: scratch_load_b128 v[4:7], off, s33 offset:64 th:TH_LOAD_LU ; 16-byte Folded Reload
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: global_store_b128 v[2:3], v[4:7], off offset:80
+; CHECK-NEXT: scratch_load_b128 v[4:7], off, s33 offset:48 th:TH_LOAD_LU ; 16-byte Folded Reload
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: global_store_b128 v[2:3], v[4:7], off offset:32
+; CHECK-NEXT: scratch_load_b128 v[4:7], off, s33 offset:32 th:TH_LOAD_LU ; 16-byte Folded Reload
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: global_store_b128 v[2:3], v[4:7], off offset:48
+; CHECK-NEXT: scratch_load_b128 v[4:7], off, s33 offset:16 th:TH_LOAD_LU ; 16-byte Folded Reload
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: global_store_b128 v[2:3], v[4:7], off
+; CHECK-NEXT: scratch_load_b128 v[4:7], off, s33 th:TH_LOAD_LU ; 16-byte Folded Reload
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: global_store_b128 v[2:3], v[4:7], off offset:16
+; CHECK-NEXT: s_alloc_vgpr 0
+; CHECK-NEXT: s_endpgm
+ %v = load <32 x i32>, ptr addrspace(1) %p1
+ store <32 x i32> %v, ptr addrspace(1) %p2
+ ret void
+}
+
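+; With a 128-byte-aligned alloca the reserved area is padded to keep the
+; frame aligned: the conditional frame base is 0x200 (0x1c0 rounded up to
+; a multiple of 128) instead of 0x1c0.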
+define amdgpu_cs void @realign_stack(<32 x i32> %x) #0 {
+; CHECK-LABEL: realign_stack:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_getreg_b32 s33, hwreg(HW_REG_HW_ID2, 8, 1)
+; CHECK-NEXT: s_mov_b32 s1, callee@abs32@hi
+; CHECK-NEXT: s_cmp_lg_u32 0, s33
+; CHECK-NEXT: s_mov_b32 s0, callee@abs32@lo
+; CHECK-NEXT: s_cmovk_i32 s33, 0x200
+; CHECK-NEXT: s_movk_i32 s32, 0x100
+; CHECK-NEXT: s_clause 0x7
+; CHECK-NEXT: scratch_store_b128 off, v[28:31], s33 offset:112
+; CHECK-NEXT: scratch_store_b128 off, v[24:27], s33 offset:96
+; CHECK-NEXT: scratch_store_b128 off, v[20:23], s33 offset:80
+; CHECK-NEXT: scratch_store_b128 off, v[16:19], s33 offset:64
+; CHECK-NEXT: scratch_store_b128 off, v[12:15], s33 offset:48
+; CHECK-NEXT: scratch_store_b128 off, v[8:11], s33 offset:32
+; CHECK-NEXT: scratch_store_b128 off, v[4:7], s33 offset:16
+; CHECK-NEXT: scratch_store_b128 off, v[0:3], s33
+; CHECK-NEXT: v_mov_b32_e32 v0, 0x47
+; CHECK-NEXT: s_cmovk_i32 s32, 0x300
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; CHECK-NEXT: s_alloc_vgpr 0
+; CHECK-NEXT: s_endpgm
+ %v = alloca <32 x i32>, align 128, addrspace(5)
+ store <32 x i32> %x, ptr addrspace(5) %v
+ call amdgpu_gfx void @callee(i32 71)
+ ret void
+}
+
+; Non-entry functions and graphics shaders can't run on a compute queue,
+; so they don't need to worry about CWSR.
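+; Accordingly there is no HW_REG_HW_ID2 check and no conditional offset in
+; the checks below: @amdgpu_gs sets SP to a plain 16, and @amdgpu_gfx and
+; @default use the ordinary callable-function prologue.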
+define amdgpu_gs void @amdgpu_gs() #0 {
+; CHECK-LABEL: amdgpu_gs:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: v_mov_b32_e32 v0, 15
+; CHECK-NEXT: s_mov_b32 s1, callee@abs32@hi
+; CHECK-NEXT: s_mov_b32 s0, callee@abs32@lo
+; CHECK-NEXT: s_mov_b32 s32, 16
+; CHECK-NEXT: scratch_store_b8 off, v0, off scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0x47
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; CHECK-NEXT: s_alloc_vgpr 0
+; CHECK-NEXT: s_endpgm
+ %local = alloca i32, addrspace(5)
+ store volatile i8 15, ptr addrspace(5) %local
+ call amdgpu_gfx void @callee(i32 71)
+ ret void
+}
+
+define amdgpu_gfx void @amdgpu_gfx() #0 {
+; CHECK-LABEL: amdgpu_gfx:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: s_wait_expcnt 0x0
+; CHECK-NEXT: s_wait_samplecnt 0x0
+; CHECK-NEXT: s_wait_bvhcnt 0x0
+; CHECK-NEXT: s_wait_kmcnt 0x0
+; CHECK-NEXT: s_mov_b32 s0, s33
+; CHECK-NEXT: s_mov_b32 s33, s32
+; CHECK-NEXT: s_or_saveexec_b32 s1, -1
+; CHECK-NEXT: scratch_store_b32 off, v40, s33 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT: s_wait_alu 0xfffe
+; CHECK-NEXT: s_mov_b32 exec_lo, s1
+; CHECK-NEXT: v_writelane_b32 v40, s0, 2
+; CHECK-NEXT: v_mov_b32_e32 v0, 15
+; CHECK-NEXT: s_mov_b32 s1, callee@abs32@hi
+; CHECK-NEXT: s_mov_b32 s0, callee@abs32@lo
+; CHECK-NEXT: s_add_co_i32 s32, s32, 16
+; CHECK-NEXT: v_writelane_b32 v40, s30, 0
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: scratch_store_b8 off, v0, s33 scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
+; CHECK-NEXT: v_mov_b32_e32 v0, 0x47
+; CHECK-NEXT: v_writelane_b32 v40, s31, 1
+; CHECK-NEXT: s_wait_alu 0xfffe
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT: v_readlane_b32 s31, v40, 1
+; CHECK-NEXT: v_readlane_b32 s30, v40, 0
+; CHECK-NEXT: s_mov_b32 s32, s33
+; CHECK-NEXT: v_readlane_b32 s0, v40, 2
+; CHECK-NEXT: s_or_saveexec_b32 s1, -1
+; CHECK-NEXT: scratch_load_b32 v40, off, s33 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT: s_wait_alu 0xfffe
+; CHECK-NEXT: s_mov_b32 exec_lo, s1
+; CHECK-NEXT: s_mov_b32 s33, s0
+; CHECK-NEXT: s_wait_loadcnt 0x0
+; CHECK-NEXT: s_wait_alu 0xfffe
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %local = alloca i32, addrspace(5)
+ store volatile i8 15, ptr addrspace(5) %local
+ call amdgpu_gfx void @callee(i32 71)
+ ret void
+}
+
+define void @default() #0 {
+; CHECK-LABEL: default:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
+; CHECK-NEXT: s_wait_expcnt 0x0
+; CHECK-NEXT: s_wait_samplecnt 0x0
+; CHECK-NEXT: s_wait_bvhcnt 0x0
+; CHECK-NEXT: s_wait_kmcnt 0x0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ ret void
+}
+
+declare amdgpu_gfx void @callee(i32) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind "amdgpu-num-vgpr"="16"}
+
----------------
arsenm wrote:
Check interaction with the frame-pointer attribute?
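A minimal sketch of what such a test could look like (hypothetical; the
function name and attribute set are illustrative, not part of this patch):

; Hypothetical: force a frame pointer on an entry point and check that the
; FP setup composes with the conditional 0x1c0 offset on the frame base.
define amdgpu_cs void @with_fp() #2 {
  %local = alloca i32, addrspace(5)
  store volatile i8 13, ptr addrspace(5) %local
  call amdgpu_gfx void @callee(i32 71)
  ret void
}

attributes #2 = { nounwind "frame-pointer"="all" }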
https://github.com/llvm/llvm-project/pull/130055