[llvm] [AMDGPU] Support alloca in AS0 (PR #136584)

Shilei Tian via llvm-commits llvm-commits at lists.llvm.org
Wed Apr 23 06:13:02 PDT 2025


================
@@ -0,0 +1,417 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -O0 %s -o - | FileCheck %s --check-prefix=ISEL
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -global-isel -O0 %s -o - | FileCheck %s --check-prefix=GI
+
+declare void @bar(ptr)
+
+define i32 @static_alloca() {
+; ISEL-LABEL: static_alloca:
+; ISEL:       ; %bb.0:
+; ISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ISEL-NEXT:    s_mov_b32 s16, s33
+; ISEL-NEXT:    s_mov_b32 s33, s32
+; ISEL-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; ISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; ISEL-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; ISEL-NEXT:    s_mov_b64 exec, s[18:19]
+; ISEL-NEXT:    v_writelane_b32 v40, s16, 2
+; ISEL-NEXT:    s_add_i32 s32, s32, 0x400
+; ISEL-NEXT:    v_writelane_b32 v40, s30, 0
+; ISEL-NEXT:    v_writelane_b32 v40, s31, 1
+; ISEL-NEXT:    s_mov_b32 s18, 32
+; ISEL-NEXT:    s_mov_b64 s[16:17], src_private_base
+; ISEL-NEXT:    s_lshr_b64 s[16:17], s[16:17], s18
+; ISEL-NEXT:    s_mov_b32 s17, s16
+; ISEL-NEXT:    s_mov_b64 s[22:23], 0
+; ISEL-NEXT:    s_mov_b32 s19, s23
+; ISEL-NEXT:    s_mov_b32 s20, -1
+; ISEL-NEXT:    s_lshr_b32 s16, s33, 6
+; ISEL-NEXT:    s_cmp_lg_u32 s16, s20
+; ISEL-NEXT:    s_cselect_b32 s20, s17, s19
+; ISEL-NEXT:    s_mov_b32 s17, s22
+; ISEL-NEXT:    s_cselect_b32 s19, s16, s17
+; ISEL-NEXT:    s_mov_b32 s16, s19
+; ISEL-NEXT:    s_mov_b32 s17, s20
+; ISEL-NEXT:    ; implicit-def: $vgpr41 : SGPR spill to VGPR lane
+; ISEL-NEXT:    v_writelane_b32 v41, s16, 0
+; ISEL-NEXT:    v_writelane_b32 v41, s17, 1
+; ISEL-NEXT:    s_lshr_b64 s[16:17], s[16:17], s18
+; ISEL-NEXT:    s_mov_b32 s18, s16
+; ISEL-NEXT:    s_getpc_b64 s[16:17]
+; ISEL-NEXT:    s_add_u32 s16, s16, bar@gotpcrel32@lo+4
+; ISEL-NEXT:    s_addc_u32 s17, s17, bar@gotpcrel32@hi+12
+; ISEL-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
+; ISEL-NEXT:    s_mov_b64 s[22:23], s[2:3]
+; ISEL-NEXT:    s_mov_b64 s[20:21], s[0:1]
+; ISEL-NEXT:    s_mov_b64 s[0:1], s[20:21]
+; ISEL-NEXT:    s_mov_b64 s[2:3], s[22:23]
+; ISEL-NEXT:    v_mov_b32_e32 v0, s19
+; ISEL-NEXT:    v_mov_b32_e32 v1, s18
+; ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; ISEL-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; ISEL-NEXT:    v_readlane_b32 s4, v41, 0
+; ISEL-NEXT:    v_readlane_b32 s5, v41, 1
+; ISEL-NEXT:    v_mov_b32_e32 v0, s4
+; ISEL-NEXT:    v_mov_b32_e32 v1, s5
+; ISEL-NEXT:    flat_load_dword v0, v[0:1]
+; ISEL-NEXT:    v_readlane_b32 s31, v40, 1
+; ISEL-NEXT:    v_readlane_b32 s30, v40, 0
+; ISEL-NEXT:    s_mov_b32 s32, s33
+; ISEL-NEXT:    v_readlane_b32 s4, v40, 2
+; ISEL-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; ISEL-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; ISEL-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; ISEL-NEXT:    s_mov_b64 exec, s[6:7]
+; ISEL-NEXT:    s_mov_b32 s33, s4
+; ISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; ISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GI-LABEL: static_alloca:
+; GI:       ; %bb.0:
+; GI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GI-NEXT:    s_mov_b32 s16, s33
+; GI-NEXT:    s_mov_b32 s33, s32
+; GI-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GI-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GI-NEXT:    s_mov_b64 exec, s[18:19]
+; GI-NEXT:    v_writelane_b32 v40, s16, 2
+; GI-NEXT:    s_add_i32 s32, s32, 0x400
+; GI-NEXT:    v_writelane_b32 v40, s30, 0
+; GI-NEXT:    v_writelane_b32 v40, s31, 1
+; GI-NEXT:    s_lshr_b32 s17, s33, 6
+; GI-NEXT:    s_mov_b64 s[18:19], src_private_base
+; GI-NEXT:    ; kill: def $sgpr16 killed $sgpr18
+; GI-NEXT:    s_mov_b32 s16, s19
+; GI-NEXT:    s_mov_b32 s18, s17
+; GI-NEXT:    s_mov_b32 s19, s16
+; GI-NEXT:    ; implicit-def: $vgpr41 : SGPR spill to VGPR lane
+; GI-NEXT:    v_writelane_b32 v41, s18, 0
+; GI-NEXT:    v_writelane_b32 v41, s19, 1
+; GI-NEXT:    v_mov_b32_e32 v0, s17
+; GI-NEXT:    v_mov_b32_e32 v1, s16
+; GI-NEXT:    s_mov_b64 s[18:19], s[2:3]
+; GI-NEXT:    s_mov_b64 s[16:17], s[0:1]
+; GI-NEXT:    s_mov_b64 s[0:1], s[16:17]
+; GI-NEXT:    s_mov_b64 s[2:3], s[18:19]
+; GI-NEXT:    s_getpc_b64 s[16:17]
+; GI-NEXT:    s_add_u32 s16, s16, bar@gotpcrel32@lo+4
+; GI-NEXT:    s_addc_u32 s17, s17, bar@gotpcrel32@hi+12
+; GI-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GI-NEXT:    s_waitcnt lgkmcnt(0)
+; GI-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GI-NEXT:    v_readlane_b32 s4, v41, 0
+; GI-NEXT:    v_readlane_b32 s5, v41, 1
+; GI-NEXT:    v_mov_b32_e32 v0, s4
+; GI-NEXT:    v_mov_b32_e32 v1, s5
+; GI-NEXT:    flat_load_dword v0, v[0:1]
+; GI-NEXT:    v_readlane_b32 s31, v40, 1
+; GI-NEXT:    v_readlane_b32 s30, v40, 0
+; GI-NEXT:    s_mov_b32 s32, s33
+; GI-NEXT:    v_readlane_b32 s4, v40, 2
+; GI-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GI-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GI-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GI-NEXT:    s_mov_b64 exec, s[6:7]
+; GI-NEXT:    s_mov_b32 s33, s4
+; GI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GI-NEXT:    s_setpc_b64 s[30:31]
+  %alloca = alloca i32, align 4
+  call void @bar(ptr %alloca)
+  %load = load i32, ptr %alloca
+  ret i32 %load
+}
+
+define i32 @dynamic_alloca(i32 %n) {
+; ISEL-LABEL: dynamic_alloca:
+; ISEL:       ; %bb.0:
+; ISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; ISEL-NEXT:    s_mov_b32 s16, s33
+; ISEL-NEXT:    s_mov_b32 s33, s32
+; ISEL-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; ISEL-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; ISEL-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; ISEL-NEXT:    s_mov_b64 exec, s[18:19]
+; ISEL-NEXT:    v_writelane_b32 v40, s16, 4
+; ISEL-NEXT:    v_writelane_b32 v40, s34, 2
+; ISEL-NEXT:    v_writelane_b32 v40, s35, 3
+; ISEL-NEXT:    s_add_i32 s32, s32, 0x800
+; ISEL-NEXT:    v_writelane_b32 v40, s30, 0
+; ISEL-NEXT:    v_writelane_b32 v40, s31, 1
+; ISEL-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; ISEL-NEXT:    ; implicit-def: $vgpr41 : SGPR spill to VGPR lane
+; ISEL-NEXT:    v_writelane_b32 v41, s15, 0
+; ISEL-NEXT:    v_writelane_b32 v41, s14, 1
+; ISEL-NEXT:    v_writelane_b32 v41, s13, 2
+; ISEL-NEXT:    v_writelane_b32 v41, s12, 3
+; ISEL-NEXT:    v_writelane_b32 v41, s10, 4
+; ISEL-NEXT:    v_writelane_b32 v41, s11, 5
+; ISEL-NEXT:    v_writelane_b32 v41, s8, 6
+; ISEL-NEXT:    v_writelane_b32 v41, s9, 7
+; ISEL-NEXT:    v_writelane_b32 v41, s6, 8
+; ISEL-NEXT:    v_writelane_b32 v41, s7, 9
+; ISEL-NEXT:    v_writelane_b32 v41, s4, 10
+; ISEL-NEXT:    v_writelane_b32 v41, s5, 11
+; ISEL-NEXT:    s_mov_b32 s5, 15
+; ISEL-NEXT:    s_mov_b32 s4, 2
+; ISEL-NEXT:    v_mov_b32_e32 v1, s5
+; ISEL-NEXT:    v_lshl_add_u32 v0, v0, s4, v1
+; ISEL-NEXT:    s_mov_b32 s4, -16
+; ISEL-NEXT:    v_and_b32_e64 v0, v0, s4
+; ISEL-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; ISEL-NEXT:    v_mov_b32_e32 v0, 0
+; ISEL-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; ISEL-NEXT:    s_mov_b64 s[4:5], exec
+; ISEL-NEXT:    s_mov_b32 s6, 0
+; ISEL-NEXT:    v_writelane_b32 v41, s6, 12
+; ISEL-NEXT:    v_writelane_b32 v41, s4, 13
+; ISEL-NEXT:    v_writelane_b32 v41, s5, 14
+; ISEL-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; ISEL-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; ISEL-NEXT:    s_mov_b64 exec, s[34:35]
+; ISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; ISEL-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; ISEL-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
+; ISEL-NEXT:    s_mov_b64 exec, s[34:35]
+; ISEL-NEXT:    s_waitcnt vmcnt(0)
+; ISEL-NEXT:    v_readlane_b32 s4, v41, 13
+; ISEL-NEXT:    v_readlane_b32 s5, v41, 14
+; ISEL-NEXT:    v_readlane_b32 s6, v41, 12
+; ISEL-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; ISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; ISEL-NEXT:    s_waitcnt vmcnt(0)
+; ISEL-NEXT:    v_readlane_b32 s8, v0, s7
+; ISEL-NEXT:    s_max_u32 s6, s6, s8
+; ISEL-NEXT:    v_writelane_b32 v41, s6, 15
+; ISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; ISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; ISEL-NEXT:    v_writelane_b32 v41, s6, 12
+; ISEL-NEXT:    v_writelane_b32 v41, s4, 13
+; ISEL-NEXT:    v_writelane_b32 v41, s5, 14
+; ISEL-NEXT:    s_mov_b64 s[34:35], exec
+; ISEL-NEXT:    s_mov_b64 exec, -1
+; ISEL-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; ISEL-NEXT:    s_mov_b64 exec, s[34:35]
+; ISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; ISEL-NEXT:  ; %bb.2:
+; ISEL-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; ISEL-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
+; ISEL-NEXT:    s_mov_b64 exec, s[34:35]
+; ISEL-NEXT:    s_waitcnt vmcnt(0)
+; ISEL-NEXT:    v_readlane_b32 s15, v41, 0
+; ISEL-NEXT:    v_readlane_b32 s14, v41, 1
+; ISEL-NEXT:    v_readlane_b32 s13, v41, 2
+; ISEL-NEXT:    v_readlane_b32 s12, v41, 3
+; ISEL-NEXT:    v_readlane_b32 s10, v41, 4
+; ISEL-NEXT:    v_readlane_b32 s11, v41, 5
+; ISEL-NEXT:    v_readlane_b32 s8, v41, 6
+; ISEL-NEXT:    v_readlane_b32 s9, v41, 7
+; ISEL-NEXT:    v_readlane_b32 s6, v41, 8
+; ISEL-NEXT:    v_readlane_b32 s7, v41, 9
+; ISEL-NEXT:    v_readlane_b32 s4, v41, 10
+; ISEL-NEXT:    v_readlane_b32 s5, v41, 11
+; ISEL-NEXT:    v_readlane_b32 s16, v41, 15
+; ISEL-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; ISEL-NEXT:    s_mov_b32 s19, s32
+; ISEL-NEXT:    s_mov_b32 s17, 6
+; ISEL-NEXT:    v_mov_b32_e32 v0, s17
+; ISEL-NEXT:    v_mov_b32_e32 v1, s19
+; ISEL-NEXT:    v_lshl_add_u32 v0, s16, v0, v1
+; ISEL-NEXT:    v_readfirstlane_b32 s20, v0
+; ISEL-NEXT:    s_mov_b64 s[16:17], src_private_base
+; ISEL-NEXT:    s_mov_b32 s18, 32
+; ISEL-NEXT:    s_lshr_b64 s[16:17], s[16:17], s18
+; ISEL-NEXT:    ; kill: def $sgpr16 killed $sgpr16 killed $sgpr16_sgpr17
+; ISEL-NEXT:    s_mov_b64 s[22:23], 0
+; ISEL-NEXT:    s_mov_b32 s17, s23
+; ISEL-NEXT:    s_mov_b32 s21, -1
+; ISEL-NEXT:    s_cmp_lg_u32 s19, s21
+; ISEL-NEXT:    s_cselect_b32 s21, s16, s17
+; ISEL-NEXT:    ; implicit-def: $sgpr16
+; ISEL-NEXT:    ; implicit-def: $sgpr17
+; ISEL-NEXT:    ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17
+; ISEL-NEXT:    s_mov_b32 s17, s21
+; ISEL-NEXT:    s_mov_b32 s21, s22
+; ISEL-NEXT:    s_cselect_b32 s19, s19, s21
+; ISEL-NEXT:    s_mov_b32 s32, s20
+; ISEL-NEXT:    s_lshr_b64 s[16:17], s[16:17], s18
+; ISEL-NEXT:    s_mov_b32 s18, s16
+; ISEL-NEXT:    s_getpc_b64 s[16:17]
+; ISEL-NEXT:    s_add_u32 s16, s16, bar@gotpcrel32@lo+4
+; ISEL-NEXT:    s_addc_u32 s17, s17, bar@gotpcrel32@hi+12
+; ISEL-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
+; ISEL-NEXT:    s_mov_b64 s[22:23], s[2:3]
+; ISEL-NEXT:    s_mov_b64 s[20:21], s[0:1]
+; ISEL-NEXT:    s_mov_b64 s[0:1], s[20:21]
+; ISEL-NEXT:    s_mov_b64 s[2:3], s[22:23]
+; ISEL-NEXT:    v_mov_b32_e32 v0, s19
+; ISEL-NEXT:    v_mov_b32_e32 v1, s18
+; ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; ISEL-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; ISEL-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; ISEL-NEXT:    v_readlane_b32 s31, v40, 1
+; ISEL-NEXT:    v_readlane_b32 s30, v40, 0
+; ISEL-NEXT:    s_mov_b32 s32, s33
+; ISEL-NEXT:    v_readlane_b32 s4, v40, 4
+; ISEL-NEXT:    v_readlane_b32 s34, v40, 2
+; ISEL-NEXT:    v_readlane_b32 s35, v40, 3
+; ISEL-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; ISEL-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; ISEL-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; ISEL-NEXT:    s_mov_b64 exec, s[6:7]
+; ISEL-NEXT:    s_mov_b32 s33, s4
+; ISEL-NEXT:    s_waitcnt vmcnt(0)
+; ISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GI-LABEL: dynamic_alloca:
+; GI:       ; %bb.0:
+; GI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GI-NEXT:    s_mov_b32 s16, s33
+; GI-NEXT:    s_mov_b32 s33, s32
+; GI-NEXT:    s_or_saveexec_b64 s[18:19], -1
+; GI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GI-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; GI-NEXT:    s_mov_b64 exec, s[18:19]
+; GI-NEXT:    v_writelane_b32 v40, s16, 4
+; GI-NEXT:    v_writelane_b32 v40, s34, 2
+; GI-NEXT:    v_writelane_b32 v40, s35, 3
+; GI-NEXT:    s_add_i32 s32, s32, 0x800
+; GI-NEXT:    v_writelane_b32 v40, s30, 0
+; GI-NEXT:    v_writelane_b32 v40, s31, 1
+; GI-NEXT:    buffer_store_dword v31, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GI-NEXT:    ; implicit-def: $vgpr41 : SGPR spill to VGPR lane
+; GI-NEXT:    v_writelane_b32 v41, s15, 0
+; GI-NEXT:    v_writelane_b32 v41, s14, 1
+; GI-NEXT:    v_writelane_b32 v41, s13, 2
+; GI-NEXT:    v_writelane_b32 v41, s12, 3
+; GI-NEXT:    v_writelane_b32 v41, s10, 4
+; GI-NEXT:    v_writelane_b32 v41, s11, 5
+; GI-NEXT:    v_writelane_b32 v41, s8, 6
+; GI-NEXT:    v_writelane_b32 v41, s9, 7
+; GI-NEXT:    v_writelane_b32 v41, s6, 8
+; GI-NEXT:    v_writelane_b32 v41, s7, 9
+; GI-NEXT:    v_writelane_b32 v41, s4, 10
+; GI-NEXT:    v_writelane_b32 v41, s5, 11
+; GI-NEXT:    v_mov_b32_e32 v1, v0
+; GI-NEXT:    v_mov_b32_e32 v0, 0
+; GI-NEXT:    ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GI-NEXT:    v_mov_b32_e32 v2, v0
+; GI-NEXT:    s_mov_b32 s4, 2
+; GI-NEXT:    v_mov_b32_e32 v0, s4
+; GI-NEXT:    v_lshlrev_b64 v[1:2], v0, v[1:2]
+; GI-NEXT:    v_mov_b32_e32 v0, v1
+; GI-NEXT:    v_mov_b32_e32 v1, v2
+; GI-NEXT:    s_mov_b32 s4, 15
+; GI-NEXT:    s_mov_b32 s6, 0
+; GI-NEXT:    v_mov_b32_e32 v2, s4
+; GI-NEXT:    v_add_co_u32_e64 v0, s[4:5], v0, v2
+; GI-NEXT:    v_mov_b32_e32 v2, s6
+; GI-NEXT:    v_addc_co_u32_e64 v2, s[4:5], v1, v2, s[4:5]
+; GI-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GI-NEXT:    v_mov_b32_e32 v1, v2
+; GI-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GI-NEXT:    s_mov_b32 s4, -16
+; GI-NEXT:    v_mov_b32_e32 v1, s4
+; GI-NEXT:    v_and_b32_e64 v0, v0, v1
+; GI-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GI-NEXT:    s_mov_b64 s[4:5], exec
+; GI-NEXT:    s_mov_b32 s6, 0
+; GI-NEXT:    v_writelane_b32 v41, s6, 12
+; GI-NEXT:    v_writelane_b32 v41, s4, 13
+; GI-NEXT:    v_writelane_b32 v41, s5, 14
+; GI-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; GI-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GI-NEXT:    s_mov_b64 exec, s[34:35]
+; GI-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GI-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; GI-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
+; GI-NEXT:    s_mov_b64 exec, s[34:35]
+; GI-NEXT:    s_waitcnt vmcnt(0)
+; GI-NEXT:    v_readlane_b32 s4, v41, 13
+; GI-NEXT:    v_readlane_b32 s5, v41, 14
+; GI-NEXT:    v_readlane_b32 s6, v41, 12
+; GI-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GI-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GI-NEXT:    s_waitcnt vmcnt(0)
+; GI-NEXT:    v_readlane_b32 s8, v0, s7
+; GI-NEXT:    s_max_u32 s6, s6, s8
+; GI-NEXT:    v_writelane_b32 v41, s6, 15
+; GI-NEXT:    s_bitset0_b64 s[4:5], s7
+; GI-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GI-NEXT:    v_writelane_b32 v41, s6, 12
+; GI-NEXT:    v_writelane_b32 v41, s4, 13
+; GI-NEXT:    v_writelane_b32 v41, s5, 14
+; GI-NEXT:    s_mov_b64 s[34:35], exec
+; GI-NEXT:    s_mov_b64 exec, -1
+; GI-NEXT:    buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
+; GI-NEXT:    s_mov_b64 exec, s[34:35]
+; GI-NEXT:    s_cbranch_scc1 .LBB1_1
+; GI-NEXT:  ; %bb.2:
+; GI-NEXT:    s_or_saveexec_b64 s[34:35], -1
+; GI-NEXT:    buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
+; GI-NEXT:    s_mov_b64 exec, s[34:35]
+; GI-NEXT:    s_waitcnt vmcnt(0)
+; GI-NEXT:    v_readlane_b32 s15, v41, 0
+; GI-NEXT:    v_readlane_b32 s14, v41, 1
+; GI-NEXT:    v_readlane_b32 s13, v41, 2
+; GI-NEXT:    v_readlane_b32 s12, v41, 3
+; GI-NEXT:    v_readlane_b32 s10, v41, 4
+; GI-NEXT:    v_readlane_b32 s11, v41, 5
+; GI-NEXT:    v_readlane_b32 s8, v41, 6
+; GI-NEXT:    v_readlane_b32 s9, v41, 7
+; GI-NEXT:    v_readlane_b32 s6, v41, 8
+; GI-NEXT:    v_readlane_b32 s7, v41, 9
+; GI-NEXT:    v_readlane_b32 s4, v41, 10
+; GI-NEXT:    v_readlane_b32 s5, v41, 11
+; GI-NEXT:    v_readlane_b32 s16, v41, 15
+; GI-NEXT:    buffer_load_dword v31, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GI-NEXT:    s_mov_b32 s17, 6
+; GI-NEXT:    s_lshl_b32 s16, s16, s17
+; GI-NEXT:    s_mov_b32 s20, s32
+; GI-NEXT:    s_add_u32 s16, s20, s16
+; GI-NEXT:    s_mov_b32 s32, s16
+; GI-NEXT:    s_mov_b64 s[16:17], src_private_base
+; GI-NEXT:    ; kill: def $sgpr18 killed $sgpr16
+; GI-NEXT:    s_mov_b32 s18, s17
+; GI-NEXT:    s_mov_b32 s16, s20
+; GI-NEXT:    s_mov_b32 s17, s18
+; GI-NEXT:    s_mov_b32 s21, -1
+; GI-NEXT:    s_mov_b64 s[18:19], 0
+; GI-NEXT:    s_cmp_lg_u32 s20, s21
+; GI-NEXT:    s_cselect_b32 s20, 1, 0
+; GI-NEXT:    s_cmp_lg_u32 s20, 0
+; GI-NEXT:    s_cselect_b64 s[18:19], s[16:17], s[18:19]
+; GI-NEXT:    s_mov_b32 s17, s18
+; GI-NEXT:    s_mov_b32 s16, s19
+; GI-NEXT:    v_mov_b32_e32 v0, s17
+; GI-NEXT:    v_mov_b32_e32 v1, s16
+; GI-NEXT:    s_mov_b64 s[18:19], s[2:3]
+; GI-NEXT:    s_mov_b64 s[16:17], s[0:1]
+; GI-NEXT:    s_mov_b64 s[0:1], s[16:17]
+; GI-NEXT:    s_mov_b64 s[2:3], s[18:19]
+; GI-NEXT:    s_getpc_b64 s[16:17]
+; GI-NEXT:    s_add_u32 s16, s16, bar@gotpcrel32@lo+4
+; GI-NEXT:    s_addc_u32 s17, s17, bar@gotpcrel32@hi+12
+; GI-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GI-NEXT:    s_waitcnt lgkmcnt(0)
+; GI-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GI-NEXT:    s_mov_b32 s4, 0
+; GI-NEXT:    v_mov_b32_e32 v0, s4
+; GI-NEXT:    v_readlane_b32 s31, v40, 1
+; GI-NEXT:    v_readlane_b32 s30, v40, 0
+; GI-NEXT:    s_mov_b32 s32, s33
+; GI-NEXT:    v_readlane_b32 s4, v40, 4
+; GI-NEXT:    v_readlane_b32 s34, v40, 2
+; GI-NEXT:    v_readlane_b32 s35, v40, 3
+; GI-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GI-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; GI-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GI-NEXT:    s_mov_b64 exec, s[6:7]
+; GI-NEXT:    s_mov_b32 s33, s4
+; GI-NEXT:    s_waitcnt vmcnt(0)
+; GI-NEXT:    s_setpc_b64 s[30:31]
+  %alloca = alloca i32, i32 %n, align 4
+  call void @bar(ptr %alloca)
+  %load = load i32, ptr %alloca
+  ret i32 0
+}
----------------
shiltian wrote:

It is not necessary actually because even if we have `i32` size, it is promoted to `i64` automatically. That's why I have the cast there.

https://github.com/llvm/llvm-project/pull/136584


More information about the llvm-commits mailing list