[llvm] 8aff59d - [NFC][AMDGPU] Auto generate check lines for three test cases (#127352)
via llvm-commits
llvm-commits@lists.llvm.org
Mon Feb 17 08:22:11 PST 2025
Author: Shilei Tian
Date: 2025-02-17T11:22:08-05:00
New Revision: 8aff59d3f4e53751b23cd3bc22a74f8677c57d7d
URL: https://github.com/llvm/llvm-project/commit/8aff59d3f4e53751b23cd3bc22a74f8677c57d7d
DIFF: https://github.com/llvm/llvm-project/commit/8aff59d3f4e53751b23cd3bc22a74f8677c57d7d.diff
LOG: [NFC][AMDGPU] Auto generate check lines for three test cases (#127352)
- `CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll`
- `CodeGen/AMDGPU/call-preserved-registers.ll`
- `CodeGen/AMDGPU/stack-realign.ll`
This is to make preparation for another PR.
Added:
Modified:
llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll
llvm/test/CodeGen/AMDGPU/stack-realign.ll
Removed:
################################################################################
diff --git a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
index ff80e05197b0d..db9ce56ecc3cc 100644
--- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MUBUF %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MUBUF %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MUBUF %s
@@ -5,110 +6,258 @@
declare hidden void @external_void_func_void() #3
-; GCN-LABEL: {{^}}test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
-; GCN: s_getpc_b64 s[34:35]
-; GCN-NEXT: s_add_u32 s34, s34,
-; GCN-NEXT: s_addc_u32 s35, s35,
-; GCN: s_swappc_b64 s[30:31], s[34:35]
-
-; GCN-NEXT: #ASMSTART
-; GCN-NEXT: #ASMEND
-; GCN-NEXT: s_swappc_b64 s[30:31], s[34:35]
define amdgpu_kernel void @test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 {
+; FLATSCR-LABEL: test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
+; FLATSCR: ; %bb.0:
+; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13
+; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
+; FLATSCR-NEXT: s_getpc_b64 s[34:35]
+; FLATSCR-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
+; FLATSCR-NEXT: s_mov_b32 s32, 0
+; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; FLATSCR-NEXT: s_endpgm
call void @external_void_func_void()
call void asm sideeffect "", ""() #0
call void @external_void_func_void()
ret void
}
-; GCN-LABEL: {{^}}test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
-; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33
-; MUBUF: buffer_store_dword
-; FLATSCR: scratch_store_dword
-; GCN: v_writelane_b32 v40, [[FP_SCRATCH_COPY]], 4
-; GCN: v_writelane_b32 v40, s30, 0
-; GCN: v_writelane_b32 v40, s31, 1
-; GCN: v_writelane_b32 v40, s34, 2
-; GCN: v_writelane_b32 v40, s35, 3
-
-; GCN: s_swappc_b64
-; GCN-NEXT: ;;#ASMSTART
-; GCN-NEXT: ;;#ASMEND
-; GCN-NEXT: s_swappc_b64
-; GCN: v_readlane_b32 s35, v40, 3
-; GCN: v_readlane_b32 s34, v40, 2
-; MUBUF-DAG: v_readlane_b32 s31, v40, 1
-; MUBUF-DAG: v_readlane_b32 s30, v40, 0
-; FLATSCR-DAG: v_readlane_b32 s31, v40, 1
-; FLATSCR-DAG: v_readlane_b32 s30, v40, 0
-
-; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v40, 4
-; MUBUF: buffer_load_dword
-; FLATSCR: scratch_load_dword
-; GCN: s_mov_b32 s33, [[FP_SCRATCH_COPY]]
-; GCN: s_setpc_b64 s[30:31]
define void @test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 {
+; MUBUF-LABEL: test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
+; MUBUF: ; %bb.0:
+; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MUBUF-NEXT: s_mov_b32 s4, s33
+; MUBUF-NEXT: s_mov_b32 s33, s32
+; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
+; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
+; MUBUF-NEXT: v_writelane_b32 v40, s4, 4
+; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
+; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
+; MUBUF-NEXT: s_addk_i32 s32, 0x400
+; MUBUF-NEXT: v_writelane_b32 v40, s34, 2
+; MUBUF-NEXT: v_writelane_b32 v40, s35, 3
+; MUBUF-NEXT: s_getpc_b64 s[34:35]
+; MUBUF-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
+; MUBUF-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
+; MUBUF-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; MUBUF-NEXT: ;;#ASMSTART
+; MUBUF-NEXT: ;;#ASMEND
+; MUBUF-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; MUBUF-NEXT: v_readlane_b32 s35, v40, 3
+; MUBUF-NEXT: v_readlane_b32 s34, v40, 2
+; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
+; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT: s_mov_b32 s32, s33
+; MUBUF-NEXT: v_readlane_b32 s4, v40, 4
+; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
+; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
+; MUBUF-NEXT: s_mov_b32 s33, s4
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: s_setpc_b64 s[30:31]
+;
+; FLATSCR-LABEL: test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
+; FLATSCR: ; %bb.0:
+; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FLATSCR-NEXT: s_mov_b32 s0, s33
+; FLATSCR-NEXT: s_mov_b32 s33, s32
+; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
+; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
+; FLATSCR-NEXT: v_writelane_b32 v40, s0, 4
+; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
+; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
+; FLATSCR-NEXT: s_add_i32 s32, s32, 16
+; FLATSCR-NEXT: v_writelane_b32 v40, s34, 2
+; FLATSCR-NEXT: v_writelane_b32 v40, s35, 3
+; FLATSCR-NEXT: s_getpc_b64 s[34:35]
+; FLATSCR-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
+; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; FLATSCR-NEXT: v_readlane_b32 s35, v40, 3
+; FLATSCR-NEXT: v_readlane_b32 s34, v40, 2
+; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
+; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT: s_mov_b32 s32, s33
+; FLATSCR-NEXT: v_readlane_b32 s0, v40, 4
+; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
+; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
+; FLATSCR-NEXT: s_mov_b32 s33, s0
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: s_setpc_b64 s[30:31]
call void @external_void_func_void()
call void asm sideeffect "", ""() #0
call void @external_void_func_void()
ret void
}
-; GCN-LABEL: {{^}}test_func_call_external_void_funcx2:
-; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33
-; GCN: s_mov_b32 s33, s32
-; MUBUF: buffer_store_dword v40
-; FLATSCR: scratch_store_dword off, v40
-; GCN: v_writelane_b32 v40, [[FP_SCRATCH_COPY]], 4
-; MUBUF: s_addk_i32 s32, 0x400
-; FLATSCR: s_add_i32 s32, s32, 16
-
-; GCN: s_swappc_b64
-; GCN-NEXT: s_swappc_b64
-
-; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v40, 4
-; MUBUF: buffer_load_dword v40
-; FLATSCR: scratch_load_dword v40
-; GCN: s_mov_b32 s33, [[FP_SCRATCH_COPY]]
define void @test_func_call_external_void_funcx2() #0 {
+; MUBUF-LABEL: test_func_call_external_void_funcx2:
+; MUBUF: ; %bb.0:
+; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MUBUF-NEXT: s_mov_b32 s4, s33
+; MUBUF-NEXT: s_mov_b32 s33, s32
+; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
+; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
+; MUBUF-NEXT: v_writelane_b32 v40, s4, 4
+; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
+; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
+; MUBUF-NEXT: s_addk_i32 s32, 0x400
+; MUBUF-NEXT: v_writelane_b32 v40, s34, 2
+; MUBUF-NEXT: v_writelane_b32 v40, s35, 3
+; MUBUF-NEXT: s_getpc_b64 s[34:35]
+; MUBUF-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
+; MUBUF-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
+; MUBUF-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; MUBUF-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; MUBUF-NEXT: v_readlane_b32 s35, v40, 3
+; MUBUF-NEXT: v_readlane_b32 s34, v40, 2
+; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
+; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT: s_mov_b32 s32, s33
+; MUBUF-NEXT: v_readlane_b32 s4, v40, 4
+; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
+; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
+; MUBUF-NEXT: s_mov_b32 s33, s4
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: s_setpc_b64 s[30:31]
+;
+; FLATSCR-LABEL: test_func_call_external_void_funcx2:
+; FLATSCR: ; %bb.0:
+; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FLATSCR-NEXT: s_mov_b32 s0, s33
+; FLATSCR-NEXT: s_mov_b32 s33, s32
+; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
+; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
+; FLATSCR-NEXT: v_writelane_b32 v40, s0, 4
+; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
+; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
+; FLATSCR-NEXT: s_add_i32 s32, s32, 16
+; FLATSCR-NEXT: v_writelane_b32 v40, s34, 2
+; FLATSCR-NEXT: v_writelane_b32 v40, s35, 3
+; FLATSCR-NEXT: s_getpc_b64 s[34:35]
+; FLATSCR-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12
+; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[34:35]
+; FLATSCR-NEXT: v_readlane_b32 s35, v40, 3
+; FLATSCR-NEXT: v_readlane_b32 s34, v40, 2
+; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
+; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT: s_mov_b32 s32, s33
+; FLATSCR-NEXT: v_readlane_b32 s0, v40, 4
+; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
+; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
+; FLATSCR-NEXT: s_mov_b32 s33, s0
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: s_setpc_b64 s[30:31]
call void @external_void_func_void()
call void @external_void_func_void()
ret void
}
-; GCN-LABEL: {{^}}void_func_void_clobber_s30_s31:
-; GCN: s_waitcnt
-; GCN: v_writelane_b32 v0, s30, 0
-; GCN: v_writelane_b32 v0, s31, 1
-; GCN-NEXT: #ASMSTART
-; GCN: ; clobber
-; GCN-NEXT: #ASMEND
-; GCN: v_readlane_b32 s31, v0, 1
-; GCN: v_readlane_b32 s30, v0, 0
-; GCN: s_setpc_b64 s[30:31]
define void @void_func_void_clobber_s30_s31() #2 {
+; MUBUF-LABEL: void_func_void_clobber_s30_s31:
+; MUBUF: ; %bb.0:
+; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; MUBUF-NEXT: s_mov_b64 exec, s[4:5]
+; MUBUF-NEXT: v_writelane_b32 v0, s30, 0
+; MUBUF-NEXT: v_writelane_b32 v0, s31, 1
+; MUBUF-NEXT: ;;#ASMSTART
+; MUBUF-NEXT: ; clobber
+; MUBUF-NEXT: ;;#ASMEND
+; MUBUF-NEXT: v_readlane_b32 s31, v0, 1
+; MUBUF-NEXT: v_readlane_b32 s30, v0, 0
+; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; MUBUF-NEXT: s_mov_b64 exec, s[4:5]
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: s_setpc_b64 s[30:31]
+;
+; FLATSCR-LABEL: void_func_void_clobber_s30_s31:
+; FLATSCR: ; %bb.0:
+; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; FLATSCR-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
+; FLATSCR-NEXT: s_mov_b64 exec, s[0:1]
+; FLATSCR-NEXT: v_writelane_b32 v0, s30, 0
+; FLATSCR-NEXT: v_writelane_b32 v0, s31, 1
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; clobber
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: v_readlane_b32 s31, v0, 1
+; FLATSCR-NEXT: v_readlane_b32 s30, v0, 0
+; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; FLATSCR-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
+; FLATSCR-NEXT: s_mov_b64 exec, s[0:1]
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: s_setpc_b64 s[30:31]
call void asm sideeffect "; clobber", "~{s[30:31]}"() #0
ret void
}
-; GCN-LABEL: {{^}}void_func_void_clobber_vcc:
-; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: ;;#ASMSTART
-; GCN-NEXT: ;;#ASMEND
-; GCN-NEXT: s_setpc_b64 s[30:31]
define hidden void @void_func_void_clobber_vcc() #2 {
+; GCN-LABEL: void_func_void_clobber_vcc:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: s_setpc_b64 s[30:31]
call void asm sideeffect "", "~{vcc}"() #0
ret void
}
-; GCN-LABEL: {{^}}test_call_void_func_void_clobber_vcc:
-; GCN: s_getpc_b64
-; GCN-NEXT: s_add_u32
-; GCN-NEXT: s_addc_u32
-; GCN: s_mov_b64 s[34:35], vcc
-; GCN-NEXT: s_swappc_b64
-; GCN: s_mov_b64 vcc, s[34:35]
define amdgpu_kernel void @test_call_void_func_void_clobber_vcc(ptr addrspace(1) %out) #0 {
+; FLATSCR-LABEL: test_call_void_func_void_clobber_vcc:
+; FLATSCR: ; %bb.0:
+; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13
+; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
+; FLATSCR-NEXT: s_add_u32 s8, s4, 8
+; FLATSCR-NEXT: s_addc_u32 s9, s5, 0
+; FLATSCR-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; FLATSCR-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; FLATSCR-NEXT: s_mov_b32 s14, s12
+; FLATSCR-NEXT: s_mov_b32 s13, s11
+; FLATSCR-NEXT: s_mov_b32 s12, s10
+; FLATSCR-NEXT: s_mov_b64 s[10:11], s[6:7]
+; FLATSCR-NEXT: s_getpc_b64 s[16:17]
+; FLATSCR-NEXT: s_add_u32 s16, s16, void_func_void_clobber_vcc@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s17, s17, void_func_void_clobber_vcc@rel32@hi+12
+; FLATSCR-NEXT: v_or3_b32 v31, v0, v1, v2
+; FLATSCR-NEXT: s_mov_b64 s[4:5], s[0:1]
+; FLATSCR-NEXT: s_mov_b64 s[6:7], s[2:3]
+; FLATSCR-NEXT: s_mov_b32 s32, 0
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; def vcc
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: s_mov_b64 s[34:35], vcc
+; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; FLATSCR-NEXT: global_load_dword v0, v[0:1], off glc
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: s_mov_b64 vcc, s[34:35]
+; FLATSCR-NEXT: global_load_dword v0, v[0:1], off glc
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: ; kill: killed $vgpr0_vgpr1
+; FLATSCR-NEXT: ; kill: killed $vgpr0_vgpr1
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; use vcc
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: s_endpgm
%vcc = call i64 asm sideeffect "; def $0", "={vcc}"()
call void @void_func_void_clobber_vcc()
%val0 = load volatile i32, ptr addrspace(1) undef
@@ -117,22 +266,50 @@ define amdgpu_kernel void @test_call_void_func_void_clobber_vcc(ptr addrspace(1)
ret void
}
-; GCN-LABEL: {{^}}test_call_void_func_void_mayclobber_s31:
-; GCN: s_mov_b32 s33, s31
-; GCN: s_swappc_b64
-; GCN-NEXT: s_mov_b32 s31, s33
define amdgpu_kernel void @test_call_void_func_void_mayclobber_s31(ptr addrspace(1) %out) #0 {
+; FLATSCR-LABEL: test_call_void_func_void_mayclobber_s31:
+; FLATSCR: ; %bb.0:
+; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13
+; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
+; FLATSCR-NEXT: s_getpc_b64 s[0:1]
+; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
+; FLATSCR-NEXT: s_mov_b32 s32, 0
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; def s31
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: s_mov_b32 s33, s31
+; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; FLATSCR-NEXT: s_mov_b32 s31, s33
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; use s31
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: s_endpgm
%s31 = call i32 asm sideeffect "; def $0", "={s31}"()
call void @external_void_func_void()
call void asm sideeffect "; use $0", "{s31}"(i32 %s31)
ret void
}
-; GCN-LABEL: {{^}}test_call_void_func_void_mayclobber_v31:
-; GCN: v_mov_b32_e32 v40, v31
-; GCN: s_swappc_b64
-; GCN-NEXT: v_mov_b32_e32 v31, v40
define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1) %out) #0 {
+; FLATSCR-LABEL: test_call_void_func_void_mayclobber_v31:
+; FLATSCR: ; %bb.0:
+; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13
+; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
+; FLATSCR-NEXT: s_getpc_b64 s[0:1]
+; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
+; FLATSCR-NEXT: s_mov_b32 s32, 0
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; def v31
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: v_mov_b32_e32 v40, v31
+; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; FLATSCR-NEXT: v_mov_b32_e32 v31, v40
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; use v31
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: s_endpgm
%v31 = call i32 asm sideeffect "; def $0", "={v31}"()
call void @external_void_func_void()
call void asm sideeffect "; use $0", "{v31}"(i32 %v31)
@@ -140,175 +317,294 @@ define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(ptr addrspace
}
; FIXME: What is the expected behavior for reserved registers here?
-
-; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s33:
-; FLATSCR: s_getpc_b64 s[0:1]
-; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
-; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
-; MUBUF: s_getpc_b64 s[4:5]
-; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
-; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
-
-; GCN: #ASMSTART
-; GCN-NEXT: ; def s33
-; GCN-NEXT: #ASMEND
-
-; GCN-NOT: s33
-
-; FLATSCR: s_swappc_b64 s[30:31], s[0:1]
-; MUBUF: s_swappc_b64 s[30:31], s[4:5]
-
-; GCN-NOT: s33
-
-; GCN: ;;#ASMSTART
-; GCN-NEXT: ; use s33
-; GCN-NEXT: ;;#ASMEND
-; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_void_func_void_preserves_s33(ptr addrspace(1) %out) #0 {
+; FLATSCR-LABEL: test_call_void_func_void_preserves_s33:
+; FLATSCR: ; %bb.0:
+; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13
+; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
+; FLATSCR-NEXT: s_getpc_b64 s[0:1]
+; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
+; FLATSCR-NEXT: s_mov_b32 s32, 0
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; def s33
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; use s33
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: s_endpgm
%s33 = call i32 asm sideeffect "; def $0", "={s33}"()
call void @external_void_func_void()
call void asm sideeffect "; use $0", "{s33}"(i32 %s33)
ret void
}
-; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s34: {{.*}}
-; GCN-NOT: s34
-
-; FLATSCR: s_getpc_b64 s[0:1]
-; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
-; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
-; MUBUF: s_getpc_b64 s[4:5]
-; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
-; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
-; GCN: s_mov_b32 s32, 0
-
-; GCN: ;;#ASMSTART
-; GCN-NEXT: ; def s34
-; GCN-NEXT: ;;#ASMEND
-
-; GCN-NOT: s34
-
-; MUBUF: s_swappc_b64 s[30:31], s[4:5]
-; FLATSCR: s_swappc_b64 s[30:31], s[0:1]
-
-; GCN-NOT: s34
-
-; GCN-NEXT: ;;#ASMSTART
-; GCN-NEXT: ; use s34
-; GCN-NEXT: ;;#ASMEND
-; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_void_func_void_preserves_s34(ptr addrspace(1) %out) #0 {
+; FLATSCR-LABEL: test_call_void_func_void_preserves_s34:
+; FLATSCR: ; %bb.0:
+; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13
+; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
+; FLATSCR-NEXT: s_getpc_b64 s[0:1]
+; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
+; FLATSCR-NEXT: s_mov_b32 s32, 0
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; def s34
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; use s34
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: s_endpgm
%s34 = call i32 asm sideeffect "; def $0", "={s34}"()
call void @external_void_func_void()
call void asm sideeffect "; use $0", "{s34}"(i32 %s34)
ret void
}
-; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v40: {{.*}}
-
-; MUBUF: s_getpc_b64 s[4:5]
-; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
-; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
-; FLATSCR: s_getpc_b64 s[0:1]
-; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
-; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
-; GCN: s_mov_b32 s32, 0
-
-; GCN: ;;#ASMSTART
-; GCN-NEXT: ; def v40
-; GCN-NEXT: ;;#ASMEND
-
-; GCN-NOT: v40
-
-; MUBUF: s_swappc_b64 s[30:31], s[4:5]
-; FLATSCR: s_swappc_b64 s[30:31], s[0:1]
-
-; GCN-NOT: v40
-
-; GCN: ;;#ASMSTART
-; GCN-NEXT: ; use v40
-; GCN-NEXT: ;;#ASMEND
-; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_void_func_void_preserves_v40(ptr addrspace(1) %out) #0 {
+; FLATSCR-LABEL: test_call_void_func_void_preserves_v40:
+; FLATSCR: ; %bb.0:
+; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13
+; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
+; FLATSCR-NEXT: s_getpc_b64 s[0:1]
+; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
+; FLATSCR-NEXT: s_mov_b32 s32, 0
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; def v40
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; use v40
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: s_endpgm
%v40 = call i32 asm sideeffect "; def $0", "={v40}"()
call void @external_void_func_void()
call void asm sideeffect "; use $0", "{v40}"(i32 %v40)
ret void
}
-; GCN-LABEL: {{^}}void_func_void_clobber_s33:
-; GCN: v_writelane_b32 v0, s33, 0
-; GCN-NEXT: #ASMSTART
-; GCN-NEXT: ; clobber
-; GCN-NEXT: #ASMEND
-; GCN-NEXT: v_readlane_b32 s33, v0, 0
-; GCN: s_setpc_b64
define hidden void @void_func_void_clobber_s33() #2 {
+; MUBUF-LABEL: void_func_void_clobber_s33:
+; MUBUF: ; %bb.0:
+; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; MUBUF-NEXT: s_mov_b64 exec, s[4:5]
+; MUBUF-NEXT: v_writelane_b32 v0, s33, 0
+; MUBUF-NEXT: ;;#ASMSTART
+; MUBUF-NEXT: ; clobber
+; MUBUF-NEXT: ;;#ASMEND
+; MUBUF-NEXT: v_readlane_b32 s33, v0, 0
+; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; MUBUF-NEXT: s_mov_b64 exec, s[4:5]
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: s_setpc_b64 s[30:31]
+;
+; FLATSCR-LABEL: void_func_void_clobber_s33:
+; FLATSCR: ; %bb.0:
+; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; FLATSCR-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
+; FLATSCR-NEXT: s_mov_b64 exec, s[0:1]
+; FLATSCR-NEXT: v_writelane_b32 v0, s33, 0
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; clobber
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: v_readlane_b32 s33, v0, 0
+; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; FLATSCR-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
+; FLATSCR-NEXT: s_mov_b64 exec, s[0:1]
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: s_setpc_b64 s[30:31]
call void asm sideeffect "; clobber", "~{s33}"() #0
ret void
}
-; GCN-LABEL: {{^}}void_func_void_clobber_s34:
-; GCN: v_writelane_b32 v0, s34, 0
-; GCN-NEXT: #ASMSTART
-; GCN-NEXT: ; clobber
-; GCN-NEXT: #ASMEND
-; GCN-NEXT: v_readlane_b32 s34, v0, 0
-; GCN: s_setpc_b64
define hidden void @void_func_void_clobber_s34() #2 {
+; MUBUF-LABEL: void_func_void_clobber_s34:
+; MUBUF: ; %bb.0:
+; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; MUBUF-NEXT: s_mov_b64 exec, s[4:5]
+; MUBUF-NEXT: v_writelane_b32 v0, s34, 0
+; MUBUF-NEXT: ;;#ASMSTART
+; MUBUF-NEXT: ; clobber
+; MUBUF-NEXT: ;;#ASMEND
+; MUBUF-NEXT: v_readlane_b32 s34, v0, 0
+; MUBUF-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; MUBUF-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; MUBUF-NEXT: s_mov_b64 exec, s[4:5]
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: s_setpc_b64 s[30:31]
+;
+; FLATSCR-LABEL: void_func_void_clobber_s34:
+; FLATSCR: ; %bb.0:
+; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; FLATSCR-NEXT: scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
+; FLATSCR-NEXT: s_mov_b64 exec, s[0:1]
+; FLATSCR-NEXT: v_writelane_b32 v0, s34, 0
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; clobber
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: v_readlane_b32 s34, v0, 0
+; FLATSCR-NEXT: s_xor_saveexec_b64 s[0:1], -1
+; FLATSCR-NEXT: scratch_load_dword v0, off, s32 ; 4-byte Folded Reload
+; FLATSCR-NEXT: s_mov_b64 exec, s[0:1]
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: s_setpc_b64 s[30:31]
call void asm sideeffect "; clobber", "~{s34}"() #0
ret void
}
-; GCN-LABEL: {{^}}test_call_void_func_void_clobber_s33:
-; GCN: s_getpc_b64
-; GCN-NEXT: s_add_u32
-; GCN-NEXT: s_addc_u32
-; GCN: s_mov_b32 s32, 0
-; GCN: s_swappc_b64
-; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_void_func_void_clobber_s33() #0 {
+; FLATSCR-LABEL: test_call_void_func_void_clobber_s33:
+; FLATSCR: ; %bb.0:
+; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13
+; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
+; FLATSCR-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; FLATSCR-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; FLATSCR-NEXT: s_mov_b32 s14, s12
+; FLATSCR-NEXT: s_mov_b32 s13, s11
+; FLATSCR-NEXT: s_mov_b32 s12, s10
+; FLATSCR-NEXT: s_mov_b64 s[10:11], s[6:7]
+; FLATSCR-NEXT: s_mov_b64 s[8:9], s[4:5]
+; FLATSCR-NEXT: s_getpc_b64 s[16:17]
+; FLATSCR-NEXT: s_add_u32 s16, s16, void_func_void_clobber_s33@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s17, s17, void_func_void_clobber_s33@rel32@hi+12
+; FLATSCR-NEXT: v_or3_b32 v31, v0, v1, v2
+; FLATSCR-NEXT: s_mov_b64 s[4:5], s[0:1]
+; FLATSCR-NEXT: s_mov_b64 s[6:7], s[2:3]
+; FLATSCR-NEXT: s_mov_b32 s32, 0
+; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; FLATSCR-NEXT: s_endpgm
call void @void_func_void_clobber_s33()
ret void
}
-; GCN-LABEL: {{^}}test_call_void_func_void_clobber_s34:
-; GCN: s_getpc_b64
-; GCN-NEXT: s_add_u32
-; GCN-NEXT: s_addc_u32
-; GCN: s_mov_b32 s32, 0
-; GCN: s_swappc_b64
-; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_void_func_void_clobber_s34() #0 {
+; FLATSCR-LABEL: test_call_void_func_void_clobber_s34:
+; FLATSCR: ; %bb.0:
+; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13
+; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
+; FLATSCR-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; FLATSCR-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; FLATSCR-NEXT: s_mov_b32 s14, s12
+; FLATSCR-NEXT: s_mov_b32 s13, s11
+; FLATSCR-NEXT: s_mov_b32 s12, s10
+; FLATSCR-NEXT: s_mov_b64 s[10:11], s[6:7]
+; FLATSCR-NEXT: s_mov_b64 s[8:9], s[4:5]
+; FLATSCR-NEXT: s_getpc_b64 s[16:17]
+; FLATSCR-NEXT: s_add_u32 s16, s16, void_func_void_clobber_s34@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s17, s17, void_func_void_clobber_s34@rel32@hi+12
+; FLATSCR-NEXT: v_or3_b32 v31, v0, v1, v2
+; FLATSCR-NEXT: s_mov_b64 s[4:5], s[0:1]
+; FLATSCR-NEXT: s_mov_b64 s[6:7], s[2:3]
+; FLATSCR-NEXT: s_mov_b32 s32, 0
+; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; FLATSCR-NEXT: s_endpgm
call void @void_func_void_clobber_s34()
ret void
}
-; GCN-LABEL: {{^}}callee_saved_sgpr_func:
-; GCN-NOT: s40
-; GCN: v_writelane_b32 v40, s40
-; GCN: s_swappc_b64
-; GCN-NOT: s40
-; GCN: ; use s40
-; GCN-NOT: s40
-; GCN: v_readlane_b32 s40, v40
-; GCN-NOT: s40
define void @callee_saved_sgpr_func() #2 {
+; MUBUF-LABEL: callee_saved_sgpr_func:
+; MUBUF: ; %bb.0:
+; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MUBUF-NEXT: s_mov_b32 s4, s33
+; MUBUF-NEXT: s_mov_b32 s33, s32
+; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
+; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
+; MUBUF-NEXT: v_writelane_b32 v40, s4, 3
+; MUBUF-NEXT: s_addk_i32 s32, 0x400
+; MUBUF-NEXT: v_writelane_b32 v40, s30, 0
+; MUBUF-NEXT: v_writelane_b32 v40, s31, 1
+; MUBUF-NEXT: s_getpc_b64 s[4:5]
+; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
+; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+12
+; MUBUF-NEXT: v_writelane_b32 v40, s40, 2
+; MUBUF-NEXT: ;;#ASMSTART
+; MUBUF-NEXT: ; def s40
+; MUBUF-NEXT: ;;#ASMEND
+; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; MUBUF-NEXT: ;;#ASMSTART
+; MUBUF-NEXT: ; use s40
+; MUBUF-NEXT: ;;#ASMEND
+; MUBUF-NEXT: v_readlane_b32 s40, v40, 2
+; MUBUF-NEXT: v_readlane_b32 s31, v40, 1
+; MUBUF-NEXT: v_readlane_b32 s30, v40, 0
+; MUBUF-NEXT: s_mov_b32 s32, s33
+; MUBUF-NEXT: v_readlane_b32 s4, v40, 3
+; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
+; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
+; MUBUF-NEXT: s_mov_b32 s33, s4
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: s_setpc_b64 s[30:31]
+;
+; FLATSCR-LABEL: callee_saved_sgpr_func:
+; FLATSCR: ; %bb.0:
+; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FLATSCR-NEXT: s_mov_b32 s0, s33
+; FLATSCR-NEXT: s_mov_b32 s33, s32
+; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
+; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
+; FLATSCR-NEXT: v_writelane_b32 v40, s0, 3
+; FLATSCR-NEXT: s_add_i32 s32, s32, 16
+; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0
+; FLATSCR-NEXT: v_writelane_b32 v40, s31, 1
+; FLATSCR-NEXT: s_getpc_b64 s[0:1]
+; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
+; FLATSCR-NEXT: v_writelane_b32 v40, s40, 2
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; def s40
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; use s40
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: v_readlane_b32 s40, v40, 2
+; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1
+; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0
+; FLATSCR-NEXT: s_mov_b32 s32, s33
+; FLATSCR-NEXT: v_readlane_b32 s0, v40, 3
+; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
+; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
+; FLATSCR-NEXT: s_mov_b32 s33, s0
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: s_setpc_b64 s[30:31]
%s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0
call void @external_void_func_void()
call void asm sideeffect "; use $0", "s"(i32 %s40) #0
ret void
}
-; GCN-LABEL: {{^}}callee_saved_sgpr_kernel:
-; GCN-NOT: s40
-; GCN: ; def s40
-; GCN-NOT: s40
-; GCN: s_swappc_b64
-; GCN-NOT: s40
-; GCN: ; use s40
-; GCN-NOT: s40
define amdgpu_kernel void @callee_saved_sgpr_kernel() #2 {
+; FLATSCR-LABEL: callee_saved_sgpr_kernel:
+; FLATSCR: ; %bb.0:
+; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13
+; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
+; FLATSCR-NEXT: s_getpc_b64 s[0:1]
+; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12
+; FLATSCR-NEXT: s_mov_b32 s32, 0
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; def s40
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; use s40
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: s_endpgm
%s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0
call void @external_void_func_void()
call void asm sideeffect "; use $0", "s"(i32 %s40) #0
@@ -316,16 +612,92 @@ define amdgpu_kernel void @callee_saved_sgpr_kernel() #2 {
}
; First call preserved VGPR is used so it can't be used for SGPR spills.
-; GCN-LABEL: {{^}}callee_saved_sgpr_vgpr_func:
-; GCN-NOT: s40
-; GCN: v_writelane_b32 v41, s40
-; GCN: s_swappc_b64
-; GCN-NOT: s40
-; GCN: ; use s40
-; GCN-NOT: s40
-; GCN: v_readlane_b32 s40, v41
-; GCN-NOT: s40
define void @callee_saved_sgpr_vgpr_func() #2 {
+; MUBUF-LABEL: callee_saved_sgpr_vgpr_func:
+; MUBUF: ; %bb.0:
+; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; MUBUF-NEXT: s_mov_b32 s4, s33
+; MUBUF-NEXT: s_mov_b32 s33, s32
+; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
+; MUBUF-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
+; MUBUF-NEXT: v_writelane_b32 v41, s4, 3
+; MUBUF-NEXT: s_addk_i32 s32, 0x400
+; MUBUF-NEXT: v_writelane_b32 v41, s30, 0
+; MUBUF-NEXT: v_writelane_b32 v41, s31, 1
+; MUBUF-NEXT: s_getpc_b64 s[4:5]
+; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_void at rel32@lo+4
+; MUBUF-NEXT: s_addc_u32 s5, s5, external_void_func_void at rel32@hi+12
+; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; MUBUF-NEXT: v_writelane_b32 v41, s40, 2
+; MUBUF-NEXT: ;;#ASMSTART
+; MUBUF-NEXT: ; def s40
+; MUBUF-NEXT: ;;#ASMEND
+; MUBUF-NEXT: ;;#ASMSTART
+; MUBUF-NEXT: ; def v40
+; MUBUF-NEXT: ;;#ASMEND
+; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; MUBUF-NEXT: ;;#ASMSTART
+; MUBUF-NEXT: ; use s40
+; MUBUF-NEXT: ;;#ASMEND
+; MUBUF-NEXT: ;;#ASMSTART
+; MUBUF-NEXT: ; use v40
+; MUBUF-NEXT: ;;#ASMEND
+; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; MUBUF-NEXT: v_readlane_b32 s40, v41, 2
+; MUBUF-NEXT: v_readlane_b32 s31, v41, 1
+; MUBUF-NEXT: v_readlane_b32 s30, v41, 0
+; MUBUF-NEXT: s_mov_b32 s32, s33
+; MUBUF-NEXT: v_readlane_b32 s4, v41, 3
+; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1
+; MUBUF-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; MUBUF-NEXT: s_mov_b64 exec, s[6:7]
+; MUBUF-NEXT: s_mov_b32 s33, s4
+; MUBUF-NEXT: s_waitcnt vmcnt(0)
+; MUBUF-NEXT: s_setpc_b64 s[30:31]
+;
+; FLATSCR-LABEL: callee_saved_sgpr_vgpr_func:
+; FLATSCR: ; %bb.0:
+; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FLATSCR-NEXT: s_mov_b32 s0, s33
+; FLATSCR-NEXT: s_mov_b32 s33, s32
+; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
+; FLATSCR-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill
+; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
+; FLATSCR-NEXT: v_writelane_b32 v41, s0, 3
+; FLATSCR-NEXT: s_add_i32 s32, s32, 16
+; FLATSCR-NEXT: v_writelane_b32 v41, s30, 0
+; FLATSCR-NEXT: v_writelane_b32 v41, s31, 1
+; FLATSCR-NEXT: s_getpc_b64 s[0:1]
+; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void at rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void at rel32@hi+12
+; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; FLATSCR-NEXT: v_writelane_b32 v41, s40, 2
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; def s40
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; def v40
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; use s40
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; use v40
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; FLATSCR-NEXT: v_readlane_b32 s40, v41, 2
+; FLATSCR-NEXT: v_readlane_b32 s31, v41, 1
+; FLATSCR-NEXT: v_readlane_b32 s30, v41, 0
+; FLATSCR-NEXT: s_mov_b32 s32, s33
+; FLATSCR-NEXT: v_readlane_b32 s0, v41, 3
+; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1
+; FLATSCR-NEXT: scratch_load_dword v41, off, s33 offset:4 ; 4-byte Folded Reload
+; FLATSCR-NEXT: s_mov_b64 exec, s[2:3]
+; FLATSCR-NEXT: s_mov_b32 s33, s0
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: s_setpc_b64 s[30:31]
%s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0
%v40 = call i32 asm sideeffect "; def v40", "={v40}"() #0
call void @external_void_func_void()
@@ -334,15 +706,30 @@ define void @callee_saved_sgpr_vgpr_func() #2 {
ret void
}
-; GCN-LABEL: {{^}}callee_saved_sgpr_vgpr_kernel:
-; GCN-NOT: s40
-; GCN: ; def s40
-; GCN-NOT: s40
-; GCN: s_swappc_b64
-; GCN-NOT: s40
-; GCN: ; use s40
-; GCN-NOT: s40
define amdgpu_kernel void @callee_saved_sgpr_vgpr_kernel() #2 {
+; FLATSCR-LABEL: callee_saved_sgpr_vgpr_kernel:
+; FLATSCR: ; %bb.0:
+; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13
+; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
+; FLATSCR-NEXT: s_getpc_b64 s[0:1]
+; FLATSCR-NEXT: s_add_u32 s0, s0, external_void_func_void at rel32@lo+4
+; FLATSCR-NEXT: s_addc_u32 s1, s1, external_void_func_void at rel32@hi+12
+; FLATSCR-NEXT: s_mov_b32 s32, 0
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; def s40
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; def v32
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: v_mov_b32_e32 v40, v32
+; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; use s40
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: ;;#ASMSTART
+; FLATSCR-NEXT: ; use v40
+; FLATSCR-NEXT: ;;#ASMEND
+; FLATSCR-NEXT: s_endpgm
%s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0
%v32 = call i32 asm sideeffect "; def v32", "={v32}"() #0
call void @external_void_func_void()
diff --git a/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll b/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll
index d2b960fe43f84..0d6bccad89d82 100644
--- a/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll
@@ -1,13 +1,158 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -enable-var-scope %s
-; CHECK-LABEL: {{^}}spill_more_than_wavesize_csr_sgprs:
-; CHECK-DAG: v_writelane_b32 v0, s98, 63
-; CHECK-DAG: v_writelane_b32 v1, s99, 0
-; CHECK-NOT: dummy
-; CHECK-DAG: v_readlane_b32 s99, v1, 0
-; CHECK-DAG: v_readlane_b32 s98, v0, 63
-
define void @spill_more_than_wavesize_csr_sgprs() {
+; CHECK-LABEL: spill_more_than_wavesize_csr_sgprs:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT: s_mov_b64 exec, s[4:5]
+; CHECK-NEXT: v_writelane_b32 v0, s35, 0
+; CHECK-NEXT: v_writelane_b32 v0, s36, 1
+; CHECK-NEXT: v_writelane_b32 v0, s37, 2
+; CHECK-NEXT: v_writelane_b32 v0, s38, 3
+; CHECK-NEXT: v_writelane_b32 v0, s39, 4
+; CHECK-NEXT: v_writelane_b32 v0, s40, 5
+; CHECK-NEXT: v_writelane_b32 v0, s41, 6
+; CHECK-NEXT: v_writelane_b32 v0, s42, 7
+; CHECK-NEXT: v_writelane_b32 v0, s43, 8
+; CHECK-NEXT: v_writelane_b32 v0, s44, 9
+; CHECK-NEXT: v_writelane_b32 v0, s45, 10
+; CHECK-NEXT: v_writelane_b32 v0, s46, 11
+; CHECK-NEXT: v_writelane_b32 v0, s47, 12
+; CHECK-NEXT: v_writelane_b32 v0, s48, 13
+; CHECK-NEXT: v_writelane_b32 v0, s49, 14
+; CHECK-NEXT: v_writelane_b32 v0, s50, 15
+; CHECK-NEXT: v_writelane_b32 v0, s51, 16
+; CHECK-NEXT: v_writelane_b32 v0, s52, 17
+; CHECK-NEXT: v_writelane_b32 v0, s53, 18
+; CHECK-NEXT: v_writelane_b32 v0, s54, 19
+; CHECK-NEXT: v_writelane_b32 v0, s55, 20
+; CHECK-NEXT: v_writelane_b32 v0, s56, 21
+; CHECK-NEXT: v_writelane_b32 v0, s57, 22
+; CHECK-NEXT: v_writelane_b32 v0, s58, 23
+; CHECK-NEXT: v_writelane_b32 v0, s59, 24
+; CHECK-NEXT: v_writelane_b32 v0, s60, 25
+; CHECK-NEXT: v_writelane_b32 v0, s61, 26
+; CHECK-NEXT: v_writelane_b32 v0, s62, 27
+; CHECK-NEXT: v_writelane_b32 v0, s63, 28
+; CHECK-NEXT: v_writelane_b32 v0, s64, 29
+; CHECK-NEXT: v_writelane_b32 v0, s65, 30
+; CHECK-NEXT: v_writelane_b32 v0, s66, 31
+; CHECK-NEXT: v_writelane_b32 v0, s67, 32
+; CHECK-NEXT: v_writelane_b32 v0, s68, 33
+; CHECK-NEXT: v_writelane_b32 v0, s69, 34
+; CHECK-NEXT: v_writelane_b32 v0, s70, 35
+; CHECK-NEXT: v_writelane_b32 v0, s71, 36
+; CHECK-NEXT: v_writelane_b32 v0, s72, 37
+; CHECK-NEXT: v_writelane_b32 v0, s73, 38
+; CHECK-NEXT: v_writelane_b32 v0, s74, 39
+; CHECK-NEXT: v_writelane_b32 v0, s75, 40
+; CHECK-NEXT: v_writelane_b32 v0, s76, 41
+; CHECK-NEXT: v_writelane_b32 v0, s77, 42
+; CHECK-NEXT: v_writelane_b32 v0, s78, 43
+; CHECK-NEXT: v_writelane_b32 v0, s79, 44
+; CHECK-NEXT: v_writelane_b32 v0, s80, 45
+; CHECK-NEXT: v_writelane_b32 v0, s81, 46
+; CHECK-NEXT: v_writelane_b32 v0, s82, 47
+; CHECK-NEXT: v_writelane_b32 v0, s83, 48
+; CHECK-NEXT: v_writelane_b32 v0, s84, 49
+; CHECK-NEXT: v_writelane_b32 v0, s85, 50
+; CHECK-NEXT: v_writelane_b32 v0, s86, 51
+; CHECK-NEXT: v_writelane_b32 v0, s87, 52
+; CHECK-NEXT: v_writelane_b32 v0, s88, 53
+; CHECK-NEXT: v_writelane_b32 v0, s89, 54
+; CHECK-NEXT: v_writelane_b32 v0, s90, 55
+; CHECK-NEXT: v_writelane_b32 v0, s91, 56
+; CHECK-NEXT: v_writelane_b32 v0, s92, 57
+; CHECK-NEXT: v_writelane_b32 v0, s93, 58
+; CHECK-NEXT: v_writelane_b32 v0, s94, 59
+; CHECK-NEXT: v_writelane_b32 v0, s95, 60
+; CHECK-NEXT: v_writelane_b32 v1, s99, 0
+; CHECK-NEXT: v_writelane_b32 v0, s96, 61
+; CHECK-NEXT: v_writelane_b32 v1, s100, 1
+; CHECK-NEXT: v_writelane_b32 v0, s97, 62
+; CHECK-NEXT: v_writelane_b32 v1, s101, 2
+; CHECK-NEXT: v_writelane_b32 v0, s98, 63
+; CHECK-NEXT: v_writelane_b32 v1, s102, 3
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_readlane_b32 s102, v1, 3
+; CHECK-NEXT: v_readlane_b32 s101, v1, 2
+; CHECK-NEXT: v_readlane_b32 s100, v1, 1
+; CHECK-NEXT: v_readlane_b32 s99, v1, 0
+; CHECK-NEXT: v_readlane_b32 s98, v0, 63
+; CHECK-NEXT: v_readlane_b32 s97, v0, 62
+; CHECK-NEXT: v_readlane_b32 s96, v0, 61
+; CHECK-NEXT: v_readlane_b32 s95, v0, 60
+; CHECK-NEXT: v_readlane_b32 s94, v0, 59
+; CHECK-NEXT: v_readlane_b32 s93, v0, 58
+; CHECK-NEXT: v_readlane_b32 s92, v0, 57
+; CHECK-NEXT: v_readlane_b32 s91, v0, 56
+; CHECK-NEXT: v_readlane_b32 s90, v0, 55
+; CHECK-NEXT: v_readlane_b32 s89, v0, 54
+; CHECK-NEXT: v_readlane_b32 s88, v0, 53
+; CHECK-NEXT: v_readlane_b32 s87, v0, 52
+; CHECK-NEXT: v_readlane_b32 s86, v0, 51
+; CHECK-NEXT: v_readlane_b32 s85, v0, 50
+; CHECK-NEXT: v_readlane_b32 s84, v0, 49
+; CHECK-NEXT: v_readlane_b32 s83, v0, 48
+; CHECK-NEXT: v_readlane_b32 s82, v0, 47
+; CHECK-NEXT: v_readlane_b32 s81, v0, 46
+; CHECK-NEXT: v_readlane_b32 s80, v0, 45
+; CHECK-NEXT: v_readlane_b32 s79, v0, 44
+; CHECK-NEXT: v_readlane_b32 s78, v0, 43
+; CHECK-NEXT: v_readlane_b32 s77, v0, 42
+; CHECK-NEXT: v_readlane_b32 s76, v0, 41
+; CHECK-NEXT: v_readlane_b32 s75, v0, 40
+; CHECK-NEXT: v_readlane_b32 s74, v0, 39
+; CHECK-NEXT: v_readlane_b32 s73, v0, 38
+; CHECK-NEXT: v_readlane_b32 s72, v0, 37
+; CHECK-NEXT: v_readlane_b32 s71, v0, 36
+; CHECK-NEXT: v_readlane_b32 s70, v0, 35
+; CHECK-NEXT: v_readlane_b32 s69, v0, 34
+; CHECK-NEXT: v_readlane_b32 s68, v0, 33
+; CHECK-NEXT: v_readlane_b32 s67, v0, 32
+; CHECK-NEXT: v_readlane_b32 s66, v0, 31
+; CHECK-NEXT: v_readlane_b32 s65, v0, 30
+; CHECK-NEXT: v_readlane_b32 s64, v0, 29
+; CHECK-NEXT: v_readlane_b32 s63, v0, 28
+; CHECK-NEXT: v_readlane_b32 s62, v0, 27
+; CHECK-NEXT: v_readlane_b32 s61, v0, 26
+; CHECK-NEXT: v_readlane_b32 s60, v0, 25
+; CHECK-NEXT: v_readlane_b32 s59, v0, 24
+; CHECK-NEXT: v_readlane_b32 s58, v0, 23
+; CHECK-NEXT: v_readlane_b32 s57, v0, 22
+; CHECK-NEXT: v_readlane_b32 s56, v0, 21
+; CHECK-NEXT: v_readlane_b32 s55, v0, 20
+; CHECK-NEXT: v_readlane_b32 s54, v0, 19
+; CHECK-NEXT: v_readlane_b32 s53, v0, 18
+; CHECK-NEXT: v_readlane_b32 s52, v0, 17
+; CHECK-NEXT: v_readlane_b32 s51, v0, 16
+; CHECK-NEXT: v_readlane_b32 s50, v0, 15
+; CHECK-NEXT: v_readlane_b32 s49, v0, 14
+; CHECK-NEXT: v_readlane_b32 s48, v0, 13
+; CHECK-NEXT: v_readlane_b32 s47, v0, 12
+; CHECK-NEXT: v_readlane_b32 s46, v0, 11
+; CHECK-NEXT: v_readlane_b32 s45, v0, 10
+; CHECK-NEXT: v_readlane_b32 s44, v0, 9
+; CHECK-NEXT: v_readlane_b32 s43, v0, 8
+; CHECK-NEXT: v_readlane_b32 s42, v0, 7
+; CHECK-NEXT: v_readlane_b32 s41, v0, 6
+; CHECK-NEXT: v_readlane_b32 s40, v0, 5
+; CHECK-NEXT: v_readlane_b32 s39, v0, 4
+; CHECK-NEXT: v_readlane_b32 s38, v0, 3
+; CHECK-NEXT: v_readlane_b32 s37, v0, 2
+; CHECK-NEXT: v_readlane_b32 s36, v0, 1
+; CHECK-NEXT: v_readlane_b32 s35, v0, 0
+; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT: s_mov_b64 exec, s[4:5]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
call void asm sideeffect "",
"~{s35},~{s36},~{s37},~{s38},~{s39},~{s40},~{s41},~{s42}
,~{s43},~{s44},~{s45},~{s46},~{s47},~{s48},~{s49},~{s50}
@@ -21,13 +166,161 @@ define void @spill_more_than_wavesize_csr_sgprs() {
ret void
}
-; CHECK-LABEL: {{^}}spill_more_than_wavesize_csr_sgprs_with_stack_object:
-; CHECK-DAG: v_writelane_b32 v1, s98, 63
-; CHECK-DAG: v_writelane_b32 v2, s99, 0
-; CHECK-NOT: dummy
-; CHECK-DAG: v_readlane_b32 s99, v2, 0
-; CHECK-DAG: v_readlane_b32 s98, v1, 63
define void @spill_more_than_wavesize_csr_sgprs_with_stack_object() {
+; CHECK-LABEL: spill_more_than_wavesize_csr_sgprs_with_stack_object:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; CHECK-NEXT: s_mov_b64 exec, s[4:5]
+; CHECK-NEXT: v_writelane_b32 v1, s35, 0
+; CHECK-NEXT: v_writelane_b32 v1, s36, 1
+; CHECK-NEXT: v_writelane_b32 v1, s37, 2
+; CHECK-NEXT: v_writelane_b32 v1, s38, 3
+; CHECK-NEXT: v_writelane_b32 v1, s39, 4
+; CHECK-NEXT: v_writelane_b32 v1, s40, 5
+; CHECK-NEXT: v_writelane_b32 v1, s41, 6
+; CHECK-NEXT: v_writelane_b32 v1, s42, 7
+; CHECK-NEXT: v_writelane_b32 v1, s43, 8
+; CHECK-NEXT: v_writelane_b32 v1, s44, 9
+; CHECK-NEXT: v_writelane_b32 v1, s45, 10
+; CHECK-NEXT: v_writelane_b32 v1, s46, 11
+; CHECK-NEXT: v_writelane_b32 v1, s47, 12
+; CHECK-NEXT: v_writelane_b32 v1, s48, 13
+; CHECK-NEXT: v_writelane_b32 v1, s49, 14
+; CHECK-NEXT: v_writelane_b32 v1, s50, 15
+; CHECK-NEXT: v_writelane_b32 v1, s51, 16
+; CHECK-NEXT: v_writelane_b32 v1, s52, 17
+; CHECK-NEXT: v_writelane_b32 v1, s53, 18
+; CHECK-NEXT: v_writelane_b32 v1, s54, 19
+; CHECK-NEXT: v_writelane_b32 v1, s55, 20
+; CHECK-NEXT: v_writelane_b32 v1, s56, 21
+; CHECK-NEXT: v_writelane_b32 v1, s57, 22
+; CHECK-NEXT: v_writelane_b32 v1, s58, 23
+; CHECK-NEXT: v_writelane_b32 v1, s59, 24
+; CHECK-NEXT: v_writelane_b32 v1, s60, 25
+; CHECK-NEXT: v_writelane_b32 v1, s61, 26
+; CHECK-NEXT: v_writelane_b32 v1, s62, 27
+; CHECK-NEXT: v_writelane_b32 v1, s63, 28
+; CHECK-NEXT: v_writelane_b32 v1, s64, 29
+; CHECK-NEXT: v_writelane_b32 v1, s65, 30
+; CHECK-NEXT: v_writelane_b32 v1, s66, 31
+; CHECK-NEXT: v_writelane_b32 v1, s67, 32
+; CHECK-NEXT: v_writelane_b32 v1, s68, 33
+; CHECK-NEXT: v_writelane_b32 v1, s69, 34
+; CHECK-NEXT: v_writelane_b32 v1, s70, 35
+; CHECK-NEXT: v_writelane_b32 v1, s71, 36
+; CHECK-NEXT: v_writelane_b32 v1, s72, 37
+; CHECK-NEXT: v_writelane_b32 v1, s73, 38
+; CHECK-NEXT: v_writelane_b32 v1, s74, 39
+; CHECK-NEXT: v_writelane_b32 v1, s75, 40
+; CHECK-NEXT: v_writelane_b32 v1, s76, 41
+; CHECK-NEXT: v_writelane_b32 v1, s77, 42
+; CHECK-NEXT: v_writelane_b32 v1, s78, 43
+; CHECK-NEXT: v_writelane_b32 v1, s79, 44
+; CHECK-NEXT: v_writelane_b32 v1, s80, 45
+; CHECK-NEXT: v_writelane_b32 v1, s81, 46
+; CHECK-NEXT: v_writelane_b32 v1, s82, 47
+; CHECK-NEXT: v_writelane_b32 v1, s83, 48
+; CHECK-NEXT: v_writelane_b32 v1, s84, 49
+; CHECK-NEXT: v_writelane_b32 v1, s85, 50
+; CHECK-NEXT: v_writelane_b32 v1, s86, 51
+; CHECK-NEXT: v_writelane_b32 v1, s87, 52
+; CHECK-NEXT: v_writelane_b32 v1, s88, 53
+; CHECK-NEXT: v_writelane_b32 v1, s89, 54
+; CHECK-NEXT: v_writelane_b32 v1, s90, 55
+; CHECK-NEXT: v_writelane_b32 v1, s91, 56
+; CHECK-NEXT: v_writelane_b32 v1, s92, 57
+; CHECK-NEXT: v_writelane_b32 v1, s93, 58
+; CHECK-NEXT: v_writelane_b32 v1, s94, 59
+; CHECK-NEXT: v_writelane_b32 v1, s95, 60
+; CHECK-NEXT: v_writelane_b32 v2, s99, 0
+; CHECK-NEXT: v_writelane_b32 v1, s96, 61
+; CHECK-NEXT: v_writelane_b32 v2, s100, 1
+; CHECK-NEXT: v_writelane_b32 v1, s97, 62
+; CHECK-NEXT: v_writelane_b32 v2, s101, 2
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: v_writelane_b32 v1, s98, 63
+; CHECK-NEXT: v_writelane_b32 v2, s102, 3
+; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_readlane_b32 s102, v2, 3
+; CHECK-NEXT: v_readlane_b32 s101, v2, 2
+; CHECK-NEXT: v_readlane_b32 s100, v2, 1
+; CHECK-NEXT: v_readlane_b32 s99, v2, 0
+; CHECK-NEXT: v_readlane_b32 s98, v1, 63
+; CHECK-NEXT: v_readlane_b32 s97, v1, 62
+; CHECK-NEXT: v_readlane_b32 s96, v1, 61
+; CHECK-NEXT: v_readlane_b32 s95, v1, 60
+; CHECK-NEXT: v_readlane_b32 s94, v1, 59
+; CHECK-NEXT: v_readlane_b32 s93, v1, 58
+; CHECK-NEXT: v_readlane_b32 s92, v1, 57
+; CHECK-NEXT: v_readlane_b32 s91, v1, 56
+; CHECK-NEXT: v_readlane_b32 s90, v1, 55
+; CHECK-NEXT: v_readlane_b32 s89, v1, 54
+; CHECK-NEXT: v_readlane_b32 s88, v1, 53
+; CHECK-NEXT: v_readlane_b32 s87, v1, 52
+; CHECK-NEXT: v_readlane_b32 s86, v1, 51
+; CHECK-NEXT: v_readlane_b32 s85, v1, 50
+; CHECK-NEXT: v_readlane_b32 s84, v1, 49
+; CHECK-NEXT: v_readlane_b32 s83, v1, 48
+; CHECK-NEXT: v_readlane_b32 s82, v1, 47
+; CHECK-NEXT: v_readlane_b32 s81, v1, 46
+; CHECK-NEXT: v_readlane_b32 s80, v1, 45
+; CHECK-NEXT: v_readlane_b32 s79, v1, 44
+; CHECK-NEXT: v_readlane_b32 s78, v1, 43
+; CHECK-NEXT: v_readlane_b32 s77, v1, 42
+; CHECK-NEXT: v_readlane_b32 s76, v1, 41
+; CHECK-NEXT: v_readlane_b32 s75, v1, 40
+; CHECK-NEXT: v_readlane_b32 s74, v1, 39
+; CHECK-NEXT: v_readlane_b32 s73, v1, 38
+; CHECK-NEXT: v_readlane_b32 s72, v1, 37
+; CHECK-NEXT: v_readlane_b32 s71, v1, 36
+; CHECK-NEXT: v_readlane_b32 s70, v1, 35
+; CHECK-NEXT: v_readlane_b32 s69, v1, 34
+; CHECK-NEXT: v_readlane_b32 s68, v1, 33
+; CHECK-NEXT: v_readlane_b32 s67, v1, 32
+; CHECK-NEXT: v_readlane_b32 s66, v1, 31
+; CHECK-NEXT: v_readlane_b32 s65, v1, 30
+; CHECK-NEXT: v_readlane_b32 s64, v1, 29
+; CHECK-NEXT: v_readlane_b32 s63, v1, 28
+; CHECK-NEXT: v_readlane_b32 s62, v1, 27
+; CHECK-NEXT: v_readlane_b32 s61, v1, 26
+; CHECK-NEXT: v_readlane_b32 s60, v1, 25
+; CHECK-NEXT: v_readlane_b32 s59, v1, 24
+; CHECK-NEXT: v_readlane_b32 s58, v1, 23
+; CHECK-NEXT: v_readlane_b32 s57, v1, 22
+; CHECK-NEXT: v_readlane_b32 s56, v1, 21
+; CHECK-NEXT: v_readlane_b32 s55, v1, 20
+; CHECK-NEXT: v_readlane_b32 s54, v1, 19
+; CHECK-NEXT: v_readlane_b32 s53, v1, 18
+; CHECK-NEXT: v_readlane_b32 s52, v1, 17
+; CHECK-NEXT: v_readlane_b32 s51, v1, 16
+; CHECK-NEXT: v_readlane_b32 s50, v1, 15
+; CHECK-NEXT: v_readlane_b32 s49, v1, 14
+; CHECK-NEXT: v_readlane_b32 s48, v1, 13
+; CHECK-NEXT: v_readlane_b32 s47, v1, 12
+; CHECK-NEXT: v_readlane_b32 s46, v1, 11
+; CHECK-NEXT: v_readlane_b32 s45, v1, 10
+; CHECK-NEXT: v_readlane_b32 s44, v1, 9
+; CHECK-NEXT: v_readlane_b32 s43, v1, 8
+; CHECK-NEXT: v_readlane_b32 s42, v1, 7
+; CHECK-NEXT: v_readlane_b32 s41, v1, 6
+; CHECK-NEXT: v_readlane_b32 s40, v1, 5
+; CHECK-NEXT: v_readlane_b32 s39, v1, 4
+; CHECK-NEXT: v_readlane_b32 s38, v1, 3
+; CHECK-NEXT: v_readlane_b32 s37, v1, 2
+; CHECK-NEXT: v_readlane_b32 s36, v1, 1
+; CHECK-NEXT: v_readlane_b32 s35, v1, 0
+; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; CHECK-NEXT: s_mov_b64 exec, s[4:5]
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
%alloca = alloca i32, align 4, addrspace(5)
store volatile i32 0, ptr addrspace(5) %alloca
call void asm sideeffect "",
diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
index fed60eecc8a8b..0e568e3071e99 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
@@ -8,92 +8,168 @@
; 4 byte emergency stack slot
; = 144 bytes with padding between them
-; GCN-LABEL: {{^}}needs_align16_default_stack_align:
-; GCN-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, v0
-; GCN-DAG: v_lshrrev_b32_e64 [[FRAMEDIFF:v[0-9]+]], 6, s32
-; GCN: v_add_u32_e32 [[FI:v[0-9]+]], vcc, [[SCALED_IDX]], [[FRAMEDIFF]]
-
-; GCN-NOT: s32
-
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
-; GCN: v_or_b32_e32 v{{[0-9]+}}, 12
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
-
-; GCN-NOT: s32
-
-; GCN: ; ScratchSize: 144
define void @needs_align16_default_stack_align(i32 %idx) #0 {
+; GCN-LABEL: needs_align16_default_stack_align:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GCN-NEXT: v_lshrrev_b32_e64 v2, 6, s32
+; GCN-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GCN-NEXT: v_mov_b32_e32 v2, 1
+; GCN-NEXT: v_mov_b32_e32 v1, 4
+; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v2, 12, v0
+; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v1, 8, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 3
+; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v0, 4, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 2
+; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN: ; ScratchSize: 144
%alloca.align16 = alloca [8 x <4 x i32>], align 16, addrspace(5)
%gep0 = getelementptr inbounds [8 x <4 x i32>], ptr addrspace(5) %alloca.align16, i32 0, i32 %idx
store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(5) %gep0, align 16
ret void
}
-; GCN-LABEL: {{^}}needs_align16_stack_align4:
-; GCN: s_add_i32 [[SCRATCH_REG:s[0-9]+]], s32, 0x3c0{{$}}
-; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xfffffc00
-
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
-; GCN: v_or_b32_e32 v{{[0-9]+}}, 12
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
-; GCN: s_addk_i32 s32, 0x2800{{$}}
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
-
-; GCN: s_mov_b32 s32, s34
-
-; GCN: ; ScratchSize: 160
define void @needs_align16_stack_align4(i32 %idx) #2 {
+; GCN-LABEL: needs_align16_stack_align4:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s4, s33
+; GCN-NEXT: s_add_i32 s33, s32, 0x3c0
+; GCN-NEXT: s_and_b32 s33, s33, 0xfffffc00
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GCN-NEXT: v_lshrrev_b32_e64 v2, 6, s33
+; GCN-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GCN-NEXT: v_mov_b32_e32 v2, 1
+; GCN-NEXT: v_mov_b32_e32 v1, 4
+; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v2, 12, v0
+; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v1, 8, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 3
+; GCN-NEXT: s_mov_b32 s5, s34
+; GCN-NEXT: s_mov_b32 s34, s32
+; GCN-NEXT: s_addk_i32 s32, 0x2800
+; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v0, 4, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 2
+; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_mov_b32 s32, s34
+; GCN-NEXT: s_mov_b32 s34, s5
+; GCN-NEXT: s_mov_b32 s33, s4
+; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN: ; ScratchSize: 160
%alloca.align16 = alloca [8 x <4 x i32>], align 16, addrspace(5)
%gep0 = getelementptr inbounds [8 x <4 x i32>], ptr addrspace(5) %alloca.align16, i32 0, i32 %idx
store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(5) %gep0, align 16
ret void
}
-; GCN-LABEL: {{^}}needs_align32:
-; GCN: s_add_i32 [[SCRATCH_REG:s[0-9]+]], s32, 0x7c0{{$}}
-; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xfffff800
-
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
-; GCN: v_or_b32_e32 v{{[0-9]+}}, 12
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
-; GCN: s_addk_i32 s32, 0x3000{{$}}
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
-; GCN: s_mov_b32 s32, s34
-
-; GCN: ; ScratchSize: 192
define void @needs_align32(i32 %idx) #0 {
+; GCN-LABEL: needs_align32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s4, s33
+; GCN-NEXT: s_add_i32 s33, s32, 0x7c0
+; GCN-NEXT: s_and_b32 s33, s33, 0xfffff800
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GCN-NEXT: v_lshrrev_b32_e64 v2, 6, s33
+; GCN-NEXT: v_add_u32_e32 v0, vcc, v0, v2
+; GCN-NEXT: v_mov_b32_e32 v2, 1
+; GCN-NEXT: v_mov_b32_e32 v1, 4
+; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v2, 12, v0
+; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v1, 8, v0
+; GCN-NEXT: v_mov_b32_e32 v2, 3
+; GCN-NEXT: s_mov_b32 s5, s34
+; GCN-NEXT: s_mov_b32 s34, s32
+; GCN-NEXT: s_addk_i32 s32, 0x3000
+; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_or_b32_e32 v0, 4, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 2
+; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_mov_b32 s32, s34
+; GCN-NEXT: s_mov_b32 s34, s5
+; GCN-NEXT: s_mov_b32 s33, s4
+; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN: ; ScratchSize: 192
%alloca.align16 = alloca [8 x <4 x i32>], align 32, addrspace(5)
%gep0 = getelementptr inbounds [8 x <4 x i32>], ptr addrspace(5) %alloca.align16, i32 0, i32 %idx
store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(5) %gep0, align 32
ret void
}
-; GCN-LABEL: {{^}}force_realign4:
-; GCN: s_add_i32 [[SCRATCH_REG:s[0-9]+]], s32, 0xc0{{$}}
-; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xffffff00
-; GCN: s_addk_i32 s32, 0xd00{{$}}
-
-; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen
-; GCN: s_mov_b32 s32, s34
-
-; GCN: ; ScratchSize: 52
define void @force_realign4(i32 %idx) #1 {
+; GCN-LABEL: force_realign4:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s4, s33
+; GCN-NEXT: s_add_i32 s33, s32, 0xc0
+; GCN-NEXT: s_and_b32 s33, s33, 0xffffff00
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GCN-NEXT: v_lshrrev_b32_e64 v1, 6, s33
+; GCN-NEXT: s_mov_b32 s5, s34
+; GCN-NEXT: s_mov_b32 s34, s32
+; GCN-NEXT: s_addk_i32 s32, 0xd00
+; GCN-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GCN-NEXT: v_mov_b32_e32 v1, 3
+; GCN-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_mov_b32 s32, s34
+; GCN-NEXT: s_mov_b32 s34, s5
+; GCN-NEXT: s_mov_b32 s33, s4
+; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN: ; ScratchSize: 52
%alloca.align16 = alloca [8 x i32], align 4, addrspace(5)
%gep0 = getelementptr inbounds [8 x i32], ptr addrspace(5) %alloca.align16, i32 0, i32 %idx
store volatile i32 3, ptr addrspace(5) %gep0, align 4
ret void
}
-; GCN-LABEL: {{^}}kernel_call_align16_from_8:
-; GCN: s_movk_i32 s32, 0x400{{$}}
-; GCN-NOT: s32
-; GCN: s_swappc_b64
define amdgpu_kernel void @kernel_call_align16_from_8() #0 {
+; GCN-LABEL: kernel_call_align16_from_8:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_add_u32 s0, s0, s17
+; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_mov_b32 s13, s15
+; GCN-NEXT: s_mov_b32 s12, s14
+; GCN-NEXT: s_getpc_b64 s[14:15]
+; GCN-NEXT: s_add_u32 s14, s14, needs_align16_default_stack_align at gotpcrel32@lo+4
+; GCN-NEXT: s_addc_u32 s15, s15, needs_align16_default_stack_align at gotpcrel32@hi+12
+; GCN-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GCN-NEXT: v_or_b32_e32 v0, v0, v1
+; GCN-NEXT: v_mov_b32_e32 v3, 2
+; GCN-NEXT: v_or_b32_e32 v31, v0, v2
+; GCN-NEXT: s_mov_b32 s14, s16
+; GCN-NEXT: v_mov_b32_e32 v0, 1
+; GCN-NEXT: s_movk_i32 s32, 0x400
+; GCN-NEXT: buffer_store_dword v3, off, s[0:3], 0
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GCN-NEXT: s_endpgm
%alloca = alloca i32, align 4, addrspace(5)
store volatile i32 2, ptr addrspace(5) %alloca
call void @needs_align16_default_stack_align(i32 1)
@@ -101,10 +177,32 @@ define amdgpu_kernel void @kernel_call_align16_from_8() #0 {
}
; The call sequence should keep the stack on call aligned to 4
-; GCN-LABEL: {{^}}kernel_call_align16_from_5:
-; GCN: s_movk_i32 s32, 0x400
-; GCN: s_swappc_b64
define amdgpu_kernel void @kernel_call_align16_from_5() {
+; GCN-LABEL: kernel_call_align16_from_5:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_add_u32 s0, s0, s17
+; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_mov_b32 s13, s15
+; GCN-NEXT: s_mov_b32 s12, s14
+; GCN-NEXT: s_getpc_b64 s[14:15]
+; GCN-NEXT: s_add_u32 s14, s14, needs_align16_default_stack_align at gotpcrel32@lo+4
+; GCN-NEXT: s_addc_u32 s15, s15, needs_align16_default_stack_align at gotpcrel32@hi+12
+; GCN-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GCN-NEXT: v_or_b32_e32 v0, v0, v1
+; GCN-NEXT: v_mov_b32_e32 v3, 2
+; GCN-NEXT: v_or_b32_e32 v31, v0, v2
+; GCN-NEXT: s_mov_b32 s14, s16
+; GCN-NEXT: v_mov_b32_e32 v0, 1
+; GCN-NEXT: s_movk_i32 s32, 0x400
+; GCN-NEXT: buffer_store_byte v3, off, s[0:3], 0
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GCN-NEXT: s_endpgm
%alloca0 = alloca i8, align 1, addrspace(5)
store volatile i8 2, ptr addrspace(5) %alloca0
@@ -112,10 +210,32 @@ define amdgpu_kernel void @kernel_call_align16_from_5() {
ret void
}
-; GCN-LABEL: {{^}}kernel_call_align4_from_5:
-; GCN: s_movk_i32 s32, 0x400
-; GCN: s_swappc_b64
define amdgpu_kernel void @kernel_call_align4_from_5() {
+; GCN-LABEL: kernel_call_align4_from_5:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_add_u32 s0, s0, s17
+; GCN-NEXT: s_addc_u32 s1, s1, 0
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_mov_b32 s13, s15
+; GCN-NEXT: s_mov_b32 s12, s14
+; GCN-NEXT: s_getpc_b64 s[14:15]
+; GCN-NEXT: s_add_u32 s14, s14, needs_align16_stack_align4 at gotpcrel32@lo+4
+; GCN-NEXT: s_addc_u32 s15, s15, needs_align16_stack_align4 at gotpcrel32@hi+12
+; GCN-NEXT: s_load_dwordx2 s[18:19], s[14:15], 0x0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GCN-NEXT: v_or_b32_e32 v0, v0, v1
+; GCN-NEXT: v_mov_b32_e32 v3, 2
+; GCN-NEXT: v_or_b32_e32 v31, v0, v2
+; GCN-NEXT: s_mov_b32 s14, s16
+; GCN-NEXT: v_mov_b32_e32 v0, 1
+; GCN-NEXT: s_movk_i32 s32, 0x400
+; GCN-NEXT: buffer_store_byte v3, off, s[0:3], 0
+; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GCN-NEXT: s_endpgm
%alloca0 = alloca i8, align 1, addrspace(5)
store volatile i8 2, ptr addrspace(5) %alloca0
@@ -123,28 +243,36 @@ define amdgpu_kernel void @kernel_call_align4_from_5() {
ret void
}
-; GCN-LABEL: {{^}}default_realign_align128:
-; GCN: s_mov_b32 [[FP_COPY:s[0-9]+]], s33
-; GCN-NEXT: s_add_i32 s33, s32, 0x1fc0
-; GCN-NEXT: s_and_b32 s33, s33, 0xffffe000
-; GCN-NEXT: s_mov_b32 s5, s34
-; GCN-NEXT: s_mov_b32 s34, s32
-; GCN-NEXT: s_addk_i32 s32, 0x4000
-; GCN-NOT: s33
-; GCN: buffer_store_dword v0, off, s[0:3], s33{{$}}
-; GCN: s_mov_b32 s32, s34
-; GCN: s_mov_b32 s33, [[FP_COPY]]
define void @default_realign_align128(i32 %idx) #0 {
+; GCN-LABEL: default_realign_align128:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s4, s33
+; GCN-NEXT: s_add_i32 s33, s32, 0x1fc0
+; GCN-NEXT: s_and_b32 s33, s33, 0xffffe000
+; GCN-NEXT: s_mov_b32 s5, s34
+; GCN-NEXT: s_mov_b32 s34, s32
+; GCN-NEXT: s_addk_i32 s32, 0x4000
+; GCN-NEXT: v_mov_b32_e32 v0, 9
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_mov_b32 s32, s34
+; GCN-NEXT: s_mov_b32 s34, s5
+; GCN-NEXT: s_mov_b32 s33, s4
+; GCN-NEXT: s_setpc_b64 s[30:31]
%alloca.align = alloca i32, align 128, addrspace(5)
store volatile i32 9, ptr addrspace(5) %alloca.align, align 128
ret void
}
-; GCN-LABEL: {{^}}disable_realign_align128:
-; GCN-NOT: s32
-; GCN: buffer_store_dword v0, off, s[0:3], s32{{$}}
-; GCN-NOT: s32
define void @disable_realign_align128(i32 %idx) #3 {
+; GCN-LABEL: disable_realign_align128:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 9
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
%alloca.align = alloca i32, align 128, addrspace(5)
store volatile i32 9, ptr addrspace(5) %alloca.align, align 128
ret void
@@ -156,35 +284,48 @@ define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 {
; since there is a local object with an alignment of 1024.
; Should use BP to access the incoming stack arguments.
; The BP value is saved/restored with a VGPR spill.
-
; GCN-LABEL: func_call_align1024_bp_gets_vgpr_spill:
-; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33
-; GCN-NEXT: s_add_i32 [[SCRATCH_REG:s[0-9]+]], s32, 0xffc0
-; GCN-NEXT: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000
-; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1
-; GCN-NEXT: buffer_store_dword [[VGPR_REG:v[0-9]+]], off, s[0:3], s33 offset:1028 ; 4-byte Folded Spill
-; GCN-NEXT: s_mov_b64 exec, s[18:19]
-; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], [[FP_SCRATCH_COPY]], 2
-; GCN-NEXT: v_mov_b32_e32 v32, 0
-; GCN-DAG: v_writelane_b32 [[VGPR_REG]], s34, 3
-; GCN: s_mov_b32 s34, s32
-; GCN: buffer_store_dword v32, off, s[0:3], s33 offset:1024
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34
-; GCN-DAG: s_add_i32 s32, s32, 0x30000
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32
-; GCN: s_swappc_b64 s[30:31],
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s16, s33
+; GCN-NEXT: s_add_i32 s33, s32, 0xffc0
+; GCN-NEXT: s_and_b32 s33, s33, 0xffff0000
+; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1
+; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:1028 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, s[18:19]
+; GCN-NEXT: v_writelane_b32 v40, s16, 2
+; GCN-NEXT: v_mov_b32_e32 v32, 0
+; GCN-NEXT: v_writelane_b32 v40, s34, 3
+; GCN-NEXT: s_mov_b32 s34, s32
+; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:1024
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s34
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s34 offset:4
+; GCN-NEXT: s_add_i32 s32, s32, 0x30000
+; GCN-NEXT: s_getpc_b64 s[16:17]
+; GCN-NEXT: s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4
+; GCN-NEXT: s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12
+; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GCN-NEXT: v_writelane_b32 v40, s30, 0
+; GCN-NEXT: v_writelane_b32 v40, s31, 1
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32
+; GCN-NEXT: s_waitcnt vmcnt(1)
+; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GCN-NEXT: v_readlane_b32 s31, v40, 1
+; GCN-NEXT: v_readlane_b32 s30, v40, 0
+; GCN-NEXT: s_mov_b32 s32, s34
+; GCN-NEXT: v_readlane_b32 s4, v40, 2
+; GCN-NEXT: v_readlane_b32 s34, v40, 3
+; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:1028 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
+; GCN-NEXT: s_mov_b32 s33, s4
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
-; GCN: v_readlane_b32 s31, [[VGPR_REG]], 1
-; GCN: v_readlane_b32 s30, [[VGPR_REG]], 0
-; GCN-NEXT: s_mov_b32 s32, s34
-; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[VGPR_REG]], 2
-; GCN-NEXT: v_readlane_b32 s34, [[VGPR_REG]], 3
-; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GCN-NEXT: buffer_load_dword [[VGPR_REG]], off, s[0:3], s33 offset:1028 ; 4-byte Folded Reload
-; GCN-NEXT: s_mov_b64 exec, s[6:7]
-; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]]
-; GCN: s_setpc_b64 s[30:31]
%temp = alloca i32, align 1024, addrspace(5)
store volatile i32 0, ptr addrspace(5) %temp, align 1024
call void @extern_func(<32 x i32> %a, i32 %b)
@@ -198,23 +339,56 @@ define i32 @needs_align1024_stack_args_used_inside_loop(ptr addrspace(5) nocaptu
; index variable, the base pointer first get loaded into a VGPR
; and that value should be further referenced to load the incoming values.
; The BP value will get saved/restored in an SGPR at the prologue/epilogue.
-
; GCN-LABEL: needs_align1024_stack_args_used_inside_loop:
-; GCN: s_mov_b32 [[FP_COPY:s[0-9]+]], s33
-; GCN-NEXT: s_add_i32 s33, s32, 0xffc0
-; GCN-NEXT: s_mov_b32 [[BP_COPY:s[0-9]+]], s34
-; GCN-NEXT: s_mov_b32 s34, s32
-; GCN-NEXT: s_and_b32 s33, s33, 0xffff0000
-; GCN-NEXT: v_lshrrev_b32_e64 [[VGPR_REG:v[0-9]+]], 6, s34
-; GCN-NEXT: v_mov_b32_e32 v{{[0-9]+}}, 0
-; GCN: s_add_i32 s32, s32, 0x30000
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:1024
-; GCN: buffer_load_dword v{{[0-9]+}}, [[VGPR_REG]], s[0:3], 0 offen
-; GCN: v_add_u32_e32 [[VGPR_REG]], vcc, 4, [[VGPR_REG]]
-; GCN: s_mov_b32 s32, s34
-; GCN-NEXT: s_mov_b32 s34, [[BP_COPY]]
-; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; GCN: ; %bb.0: ; %begin
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s11, s33
+; GCN-NEXT: s_add_i32 s33, s32, 0xffc0
+; GCN-NEXT: s_mov_b32 s14, s34
+; GCN-NEXT: s_mov_b32 s34, s32
+; GCN-NEXT: s_and_b32 s33, s33, 0xffff0000
+; GCN-NEXT: v_lshrrev_b32_e64 v1, 6, s34
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_mov_b32 s10, 0
+; GCN-NEXT: s_mov_b64 s[4:5], 0
+; GCN-NEXT: s_add_i32 s32, s32, 0x30000
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1024
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; implicit-def: $sgpr6_sgpr7
+; GCN-NEXT: s_branch .LBB10_2
+; GCN-NEXT: .LBB10_1: ; %Flow
+; GCN-NEXT: ; in Loop: Header=BB10_2 Depth=1
+; GCN-NEXT: s_or_b64 exec, exec, s[8:9]
+; GCN-NEXT: s_and_b64 s[8:9], exec, s[6:7]
+; GCN-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
+; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_cbranch_execz .LBB10_4
+; GCN-NEXT: .LBB10_2: ; %loop_body
+; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen
+; GCN-NEXT: s_or_b64 s[6:7], s[6:7], exec
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, s10, v0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
+; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc
+; GCN-NEXT: s_cbranch_execz .LBB10_1
+; GCN-NEXT: ; %bb.3: ; %loop_end
+; GCN-NEXT: ; in Loop: Header=BB10_2 Depth=1
+; GCN-NEXT: s_add_i32 s10, s10, 1
+; GCN-NEXT: s_cmp_eq_u32 s10, 9
+; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
+; GCN-NEXT: s_andn2_b64 s[6:7], s[6:7], exec
+; GCN-NEXT: s_and_b64 s[12:13], s[12:13], exec
+; GCN-NEXT: v_add_u32_e32 v1, vcc, 4, v1
+; GCN-NEXT: v_mov_b32_e32 v0, 1
+; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[12:13]
+; GCN-NEXT: s_branch .LBB10_1
+; GCN-NEXT: .LBB10_4: ; %exit
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_mov_b32 s32, s34
+; GCN-NEXT: s_mov_b32 s34, s14
+; GCN-NEXT: s_mov_b32 s33, s11
+; GCN-NEXT: s_setpc_b64 s[30:31]
begin:
%local_var = alloca i32, align 1024, addrspace(5)
store volatile i32 0, ptr addrspace(5) %local_var, align 1024
@@ -239,16 +413,31 @@ exit: ; preds = %loop_end, %loop_b
define void @no_free_scratch_sgpr_for_bp_copy(<32 x i32> %a, i32 %b) #0 {
; GCN-LABEL: no_free_scratch_sgpr_for_bp_copy:
-; GCN: ; %bb.0:
-; GCN: v_writelane_b32 [[VGPR_REG:v[0-9]+]], s34, 0
-; GCN-NEXT: s_mov_b32 s34, s32
-; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34
-; GCN: v_readlane_b32 s34, [[VGPR_REG:v[0-9]+]], 0
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:128
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: ;;#ASMSTART
-; GCN-NEXT: ;;#ASMEND
-; GCN: s_setpc_b64 s[30:31]
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 vcc_lo, s33
+; GCN-NEXT: s_add_i32 s33, s32, 0x1fc0
+; GCN-NEXT: s_and_b32 s33, s33, 0xffffe000
+; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_writelane_b32 v1, s34, 0
+; GCN-NEXT: s_mov_b32 s34, s32
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s34 offset:4
+; GCN-NEXT: s_addk_i32 s32, 0x6000
+; GCN-NEXT: s_mov_b32 s32, s34
+; GCN-NEXT: v_readlane_b32 s34, v1, 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:128
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_mov_b32 s33, vcc_lo
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
%local_val = alloca i32, align 128, addrspace(5)
store volatile i32 %b, ptr addrspace(5) %local_val, align 128
; Use all clobberable registers, so BP has to spill to a VGPR.
@@ -262,15 +451,172 @@ define void @no_free_scratch_sgpr_for_bp_copy(<32 x i32> %a, i32 %b) #0 {
define void @no_free_regs_spill_bp_to_memory(<32 x i32> %a, i32 %b) #5 {
; If there are no free SGPRs or VGPRs available we must spill the BP to memory.
-
-; GCN-LABEL: no_free_regs_spill_bp_to_mem
-; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33
-; GCN: s_xor_saveexec_b64 s[6:7], -1
-; GCN: buffer_store_dword v39, off, s[0:3], s33
-; GCN: v_mov_b32_e32 v0, [[FP_SCRATCH_COPY]]
-; GCN: buffer_store_dword v0, off, s[0:3], s33
-; GCN: v_mov_b32_e32 v0, s34
-; GCN-DAG: buffer_store_dword v0, off, s[0:3], s33
+; GCN-LABEL: no_free_regs_spill_bp_to_memory:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s4, s33
+; GCN-NEXT: s_add_i32 s33, s32, 0x1fc0
+; GCN-NEXT: s_and_b32 s33, s33, 0xffffe000
+; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1
+; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill
+; GCN-NEXT: v_mov_b32_e32 v0, s34
+; GCN-NEXT: s_mov_b32 s34, s32
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:140 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s34 offset:4
+; GCN-NEXT: v_writelane_b32 v39, s39, 0
+; GCN-NEXT: v_writelane_b32 v39, s40, 1
+; GCN-NEXT: v_writelane_b32 v39, s41, 2
+; GCN-NEXT: v_writelane_b32 v39, s42, 3
+; GCN-NEXT: v_writelane_b32 v39, s43, 4
+; GCN-NEXT: v_writelane_b32 v39, s44, 5
+; GCN-NEXT: v_writelane_b32 v39, s45, 6
+; GCN-NEXT: v_writelane_b32 v39, s46, 7
+; GCN-NEXT: v_writelane_b32 v39, s47, 8
+; GCN-NEXT: v_writelane_b32 v39, s48, 9
+; GCN-NEXT: v_writelane_b32 v39, s49, 10
+; GCN-NEXT: v_writelane_b32 v39, s50, 11
+; GCN-NEXT: v_writelane_b32 v39, s51, 12
+; GCN-NEXT: v_writelane_b32 v39, s52, 13
+; GCN-NEXT: v_writelane_b32 v39, s53, 14
+; GCN-NEXT: v_writelane_b32 v39, s54, 15
+; GCN-NEXT: v_writelane_b32 v39, s55, 16
+; GCN-NEXT: v_writelane_b32 v39, s56, 17
+; GCN-NEXT: v_writelane_b32 v39, s57, 18
+; GCN-NEXT: v_writelane_b32 v39, s58, 19
+; GCN-NEXT: v_writelane_b32 v39, s59, 20
+; GCN-NEXT: v_writelane_b32 v39, s60, 21
+; GCN-NEXT: v_writelane_b32 v39, s61, 22
+; GCN-NEXT: v_writelane_b32 v39, s62, 23
+; GCN-NEXT: v_writelane_b32 v39, s63, 24
+; GCN-NEXT: v_writelane_b32 v39, s64, 25
+; GCN-NEXT: v_writelane_b32 v39, s65, 26
+; GCN-NEXT: v_writelane_b32 v39, s66, 27
+; GCN-NEXT: v_writelane_b32 v39, s67, 28
+; GCN-NEXT: v_writelane_b32 v39, s68, 29
+; GCN-NEXT: v_writelane_b32 v39, s69, 30
+; GCN-NEXT: v_writelane_b32 v39, s70, 31
+; GCN-NEXT: v_writelane_b32 v39, s71, 32
+; GCN-NEXT: v_writelane_b32 v39, s72, 33
+; GCN-NEXT: v_writelane_b32 v39, s73, 34
+; GCN-NEXT: v_writelane_b32 v39, s74, 35
+; GCN-NEXT: v_writelane_b32 v39, s75, 36
+; GCN-NEXT: v_writelane_b32 v39, s76, 37
+; GCN-NEXT: v_writelane_b32 v39, s77, 38
+; GCN-NEXT: v_writelane_b32 v39, s78, 39
+; GCN-NEXT: v_writelane_b32 v39, s79, 40
+; GCN-NEXT: v_writelane_b32 v39, s80, 41
+; GCN-NEXT: v_writelane_b32 v39, s81, 42
+; GCN-NEXT: v_writelane_b32 v39, s82, 43
+; GCN-NEXT: v_writelane_b32 v39, s83, 44
+; GCN-NEXT: v_writelane_b32 v39, s84, 45
+; GCN-NEXT: v_writelane_b32 v39, s85, 46
+; GCN-NEXT: v_writelane_b32 v39, s86, 47
+; GCN-NEXT: v_writelane_b32 v39, s87, 48
+; GCN-NEXT: v_writelane_b32 v39, s88, 49
+; GCN-NEXT: v_writelane_b32 v39, s89, 50
+; GCN-NEXT: v_writelane_b32 v39, s90, 51
+; GCN-NEXT: v_writelane_b32 v39, s91, 52
+; GCN-NEXT: v_writelane_b32 v39, s92, 53
+; GCN-NEXT: v_writelane_b32 v39, s93, 54
+; GCN-NEXT: v_writelane_b32 v39, s94, 55
+; GCN-NEXT: v_writelane_b32 v39, s95, 56
+; GCN-NEXT: v_writelane_b32 v39, s96, 57
+; GCN-NEXT: v_writelane_b32 v39, s97, 58
+; GCN-NEXT: v_writelane_b32 v39, s98, 59
+; GCN-NEXT: v_writelane_b32 v39, s99, 60
+; GCN-NEXT: v_writelane_b32 v39, s100, 61
+; GCN-NEXT: v_writelane_b32 v39, s101, 62
+; GCN-NEXT: v_writelane_b32 v39, s102, 63
+; GCN-NEXT: s_addk_i32 s32, 0x6000
+; GCN-NEXT: s_mov_b32 s32, s34
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:128
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; clobber nonpreserved SGPRs and 64 CSRs
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; clobber all VGPRs
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload
+; GCN-NEXT: v_readlane_b32 s102, v39, 63
+; GCN-NEXT: v_readlane_b32 s101, v39, 62
+; GCN-NEXT: v_readlane_b32 s100, v39, 61
+; GCN-NEXT: v_readlane_b32 s99, v39, 60
+; GCN-NEXT: v_readlane_b32 s98, v39, 59
+; GCN-NEXT: v_readlane_b32 s97, v39, 58
+; GCN-NEXT: v_readlane_b32 s96, v39, 57
+; GCN-NEXT: v_readlane_b32 s95, v39, 56
+; GCN-NEXT: v_readlane_b32 s94, v39, 55
+; GCN-NEXT: v_readlane_b32 s93, v39, 54
+; GCN-NEXT: v_readlane_b32 s92, v39, 53
+; GCN-NEXT: v_readlane_b32 s91, v39, 52
+; GCN-NEXT: v_readlane_b32 s90, v39, 51
+; GCN-NEXT: v_readlane_b32 s89, v39, 50
+; GCN-NEXT: v_readlane_b32 s88, v39, 49
+; GCN-NEXT: v_readlane_b32 s87, v39, 48
+; GCN-NEXT: v_readlane_b32 s86, v39, 47
+; GCN-NEXT: v_readlane_b32 s85, v39, 46
+; GCN-NEXT: v_readlane_b32 s84, v39, 45
+; GCN-NEXT: v_readlane_b32 s83, v39, 44
+; GCN-NEXT: v_readlane_b32 s82, v39, 43
+; GCN-NEXT: v_readlane_b32 s81, v39, 42
+; GCN-NEXT: v_readlane_b32 s80, v39, 41
+; GCN-NEXT: v_readlane_b32 s79, v39, 40
+; GCN-NEXT: v_readlane_b32 s78, v39, 39
+; GCN-NEXT: v_readlane_b32 s77, v39, 38
+; GCN-NEXT: v_readlane_b32 s76, v39, 37
+; GCN-NEXT: v_readlane_b32 s75, v39, 36
+; GCN-NEXT: v_readlane_b32 s74, v39, 35
+; GCN-NEXT: v_readlane_b32 s73, v39, 34
+; GCN-NEXT: v_readlane_b32 s72, v39, 33
+; GCN-NEXT: v_readlane_b32 s71, v39, 32
+; GCN-NEXT: v_readlane_b32 s70, v39, 31
+; GCN-NEXT: v_readlane_b32 s69, v39, 30
+; GCN-NEXT: v_readlane_b32 s68, v39, 29
+; GCN-NEXT: v_readlane_b32 s67, v39, 28
+; GCN-NEXT: v_readlane_b32 s66, v39, 27
+; GCN-NEXT: v_readlane_b32 s65, v39, 26
+; GCN-NEXT: v_readlane_b32 s64, v39, 25
+; GCN-NEXT: v_readlane_b32 s63, v39, 24
+; GCN-NEXT: v_readlane_b32 s62, v39, 23
+; GCN-NEXT: v_readlane_b32 s61, v39, 22
+; GCN-NEXT: v_readlane_b32 s60, v39, 21
+; GCN-NEXT: v_readlane_b32 s59, v39, 20
+; GCN-NEXT: v_readlane_b32 s58, v39, 19
+; GCN-NEXT: v_readlane_b32 s57, v39, 18
+; GCN-NEXT: v_readlane_b32 s56, v39, 17
+; GCN-NEXT: v_readlane_b32 s55, v39, 16
+; GCN-NEXT: v_readlane_b32 s54, v39, 15
+; GCN-NEXT: v_readlane_b32 s53, v39, 14
+; GCN-NEXT: v_readlane_b32 s52, v39, 13
+; GCN-NEXT: v_readlane_b32 s51, v39, 12
+; GCN-NEXT: v_readlane_b32 s50, v39, 11
+; GCN-NEXT: v_readlane_b32 s49, v39, 10
+; GCN-NEXT: v_readlane_b32 s48, v39, 9
+; GCN-NEXT: v_readlane_b32 s47, v39, 8
+; GCN-NEXT: v_readlane_b32 s46, v39, 7
+; GCN-NEXT: v_readlane_b32 s45, v39, 6
+; GCN-NEXT: v_readlane_b32 s44, v39, 5
+; GCN-NEXT: v_readlane_b32 s43, v39, 4
+; GCN-NEXT: v_readlane_b32 s42, v39, 3
+; GCN-NEXT: v_readlane_b32 s41, v39, 2
+; GCN-NEXT: v_readlane_b32 s40, v39, 1
+; GCN-NEXT: v_readlane_b32 s39, v39, 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_readfirstlane_b32 s4, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_readfirstlane_b32 s34, v0
+; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1
+; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
+; GCN-NEXT: s_mov_b32 s33, s4
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
%local_val = alloca i32, align 128, addrspace(5)
store volatile i32 %b, ptr addrspace(5) %local_val, align 128
@@ -297,22 +643,179 @@ define void @no_free_regs_spill_bp_to_memory(<32 x i32> %a, i32 %b) #5 {
define void @spill_bp_to_memory_scratch_reg_needed_mubuf_offset(<32 x i32> %a, i32 %b, ptr addrspace(5) byval([4096 x i8]) align 4 %arg) #5 {
; If the size of the offset exceeds the MUBUF offset field we need another
; scratch VGPR to hold the offset.
-
-; GCN-LABEL: spill_bp_to_memory_scratch_reg_needed_mubuf_offset
-; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33
-; GCN-NEXT: s_add_i32 s33, s32, 0x1fc0
-; GCN-NEXT: s_and_b32 s33, s33, 0xffffe000
-; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1
-; GCN-NEXT: s_add_i32 s5, s33, 0x42100
-; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s5 ; 4-byte Folded Spill
-; GCN-NEXT: s_mov_b64 exec, s[6:7]
-; GCN-NEXT: v_mov_b32_e32 v0, [[FP_SCRATCH_COPY]]
-; GCN-NEXT: s_add_i32 s5, s33, 0x42200
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill
-; GCN-NEXT: v_mov_b32_e32 v0, s34
-; GCN-NEXT: s_add_i32 s5, s33, 0x42300
-; GCN-NEXT: s_mov_b32 s34, s32
-; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill
+; GCN-LABEL: spill_bp_to_memory_scratch_reg_needed_mubuf_offset:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s4, s33
+; GCN-NEXT: s_add_i32 s33, s32, 0x1fc0
+; GCN-NEXT: s_and_b32 s33, s33, 0xffffe000
+; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1
+; GCN-NEXT: s_add_i32 s5, s33, 0x42100
+; GCN-NEXT: buffer_store_dword v39, off, s[0:3], s5 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
+; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: s_add_i32 s5, s33, 0x42200
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill
+; GCN-NEXT: v_mov_b32_e32 v0, s34
+; GCN-NEXT: s_add_i32 s5, s33, 0x42300
+; GCN-NEXT: s_mov_b32 s34, s32
+; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s5 ; 4-byte Folded Spill
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s34 offset:4
+; GCN-NEXT: v_writelane_b32 v39, s39, 0
+; GCN-NEXT: v_writelane_b32 v39, s40, 1
+; GCN-NEXT: v_writelane_b32 v39, s41, 2
+; GCN-NEXT: v_writelane_b32 v39, s42, 3
+; GCN-NEXT: v_writelane_b32 v39, s43, 4
+; GCN-NEXT: v_writelane_b32 v39, s44, 5
+; GCN-NEXT: v_writelane_b32 v39, s45, 6
+; GCN-NEXT: v_writelane_b32 v39, s46, 7
+; GCN-NEXT: v_writelane_b32 v39, s47, 8
+; GCN-NEXT: v_writelane_b32 v39, s48, 9
+; GCN-NEXT: v_writelane_b32 v39, s49, 10
+; GCN-NEXT: v_writelane_b32 v39, s50, 11
+; GCN-NEXT: v_writelane_b32 v39, s51, 12
+; GCN-NEXT: v_writelane_b32 v39, s52, 13
+; GCN-NEXT: v_writelane_b32 v39, s53, 14
+; GCN-NEXT: v_writelane_b32 v39, s54, 15
+; GCN-NEXT: v_writelane_b32 v39, s55, 16
+; GCN-NEXT: v_writelane_b32 v39, s56, 17
+; GCN-NEXT: v_writelane_b32 v39, s57, 18
+; GCN-NEXT: v_writelane_b32 v39, s58, 19
+; GCN-NEXT: v_writelane_b32 v39, s59, 20
+; GCN-NEXT: v_writelane_b32 v39, s60, 21
+; GCN-NEXT: v_writelane_b32 v39, s61, 22
+; GCN-NEXT: v_writelane_b32 v39, s62, 23
+; GCN-NEXT: v_writelane_b32 v39, s63, 24
+; GCN-NEXT: v_writelane_b32 v39, s64, 25
+; GCN-NEXT: v_writelane_b32 v39, s65, 26
+; GCN-NEXT: v_writelane_b32 v39, s66, 27
+; GCN-NEXT: v_writelane_b32 v39, s67, 28
+; GCN-NEXT: v_writelane_b32 v39, s68, 29
+; GCN-NEXT: v_writelane_b32 v39, s69, 30
+; GCN-NEXT: v_writelane_b32 v39, s70, 31
+; GCN-NEXT: v_writelane_b32 v39, s71, 32
+; GCN-NEXT: v_writelane_b32 v39, s72, 33
+; GCN-NEXT: v_writelane_b32 v39, s73, 34
+; GCN-NEXT: v_writelane_b32 v39, s74, 35
+; GCN-NEXT: v_writelane_b32 v39, s75, 36
+; GCN-NEXT: v_writelane_b32 v39, s76, 37
+; GCN-NEXT: v_writelane_b32 v39, s77, 38
+; GCN-NEXT: v_writelane_b32 v39, s78, 39
+; GCN-NEXT: v_writelane_b32 v39, s79, 40
+; GCN-NEXT: v_writelane_b32 v39, s80, 41
+; GCN-NEXT: v_writelane_b32 v39, s81, 42
+; GCN-NEXT: v_writelane_b32 v39, s82, 43
+; GCN-NEXT: v_writelane_b32 v39, s83, 44
+; GCN-NEXT: v_writelane_b32 v39, s84, 45
+; GCN-NEXT: v_writelane_b32 v39, s85, 46
+; GCN-NEXT: v_writelane_b32 v39, s86, 47
+; GCN-NEXT: v_writelane_b32 v39, s87, 48
+; GCN-NEXT: v_writelane_b32 v39, s88, 49
+; GCN-NEXT: v_writelane_b32 v39, s89, 50
+; GCN-NEXT: v_writelane_b32 v39, s90, 51
+; GCN-NEXT: v_writelane_b32 v39, s91, 52
+; GCN-NEXT: v_writelane_b32 v39, s92, 53
+; GCN-NEXT: v_writelane_b32 v39, s93, 54
+; GCN-NEXT: v_writelane_b32 v39, s94, 55
+; GCN-NEXT: v_writelane_b32 v39, s95, 56
+; GCN-NEXT: v_writelane_b32 v39, s96, 57
+; GCN-NEXT: v_writelane_b32 v39, s97, 58
+; GCN-NEXT: v_writelane_b32 v39, s98, 59
+; GCN-NEXT: v_writelane_b32 v39, s99, 60
+; GCN-NEXT: v_writelane_b32 v39, s100, 61
+; GCN-NEXT: v_writelane_b32 v39, s101, 62
+; GCN-NEXT: v_mov_b32_e32 v1, 0x1080
+; GCN-NEXT: v_writelane_b32 v39, s102, 63
+; GCN-NEXT: s_add_i32 s32, s32, 0x46000
+; GCN-NEXT: s_mov_b32 s32, s34
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], s33 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; clobber nonpreserved SGPRs and 64 CSRs
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; clobber all VGPRs
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: s_add_i32 s5, s33, 0x42200
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload
+; GCN-NEXT: s_add_i32 s5, s33, 0x42300
+; GCN-NEXT: v_readlane_b32 s102, v39, 63
+; GCN-NEXT: v_readlane_b32 s101, v39, 62
+; GCN-NEXT: v_readlane_b32 s100, v39, 61
+; GCN-NEXT: v_readlane_b32 s99, v39, 60
+; GCN-NEXT: v_readlane_b32 s98, v39, 59
+; GCN-NEXT: v_readlane_b32 s97, v39, 58
+; GCN-NEXT: v_readlane_b32 s96, v39, 57
+; GCN-NEXT: v_readlane_b32 s95, v39, 56
+; GCN-NEXT: v_readlane_b32 s94, v39, 55
+; GCN-NEXT: v_readlane_b32 s93, v39, 54
+; GCN-NEXT: v_readlane_b32 s92, v39, 53
+; GCN-NEXT: v_readlane_b32 s91, v39, 52
+; GCN-NEXT: v_readlane_b32 s90, v39, 51
+; GCN-NEXT: v_readlane_b32 s89, v39, 50
+; GCN-NEXT: v_readlane_b32 s88, v39, 49
+; GCN-NEXT: v_readlane_b32 s87, v39, 48
+; GCN-NEXT: v_readlane_b32 s86, v39, 47
+; GCN-NEXT: v_readlane_b32 s85, v39, 46
+; GCN-NEXT: v_readlane_b32 s84, v39, 45
+; GCN-NEXT: v_readlane_b32 s83, v39, 44
+; GCN-NEXT: v_readlane_b32 s82, v39, 43
+; GCN-NEXT: v_readlane_b32 s81, v39, 42
+; GCN-NEXT: v_readlane_b32 s80, v39, 41
+; GCN-NEXT: v_readlane_b32 s79, v39, 40
+; GCN-NEXT: v_readlane_b32 s78, v39, 39
+; GCN-NEXT: v_readlane_b32 s77, v39, 38
+; GCN-NEXT: v_readlane_b32 s76, v39, 37
+; GCN-NEXT: v_readlane_b32 s75, v39, 36
+; GCN-NEXT: v_readlane_b32 s74, v39, 35
+; GCN-NEXT: v_readlane_b32 s73, v39, 34
+; GCN-NEXT: v_readlane_b32 s72, v39, 33
+; GCN-NEXT: v_readlane_b32 s71, v39, 32
+; GCN-NEXT: v_readlane_b32 s70, v39, 31
+; GCN-NEXT: v_readlane_b32 s69, v39, 30
+; GCN-NEXT: v_readlane_b32 s68, v39, 29
+; GCN-NEXT: v_readlane_b32 s67, v39, 28
+; GCN-NEXT: v_readlane_b32 s66, v39, 27
+; GCN-NEXT: v_readlane_b32 s65, v39, 26
+; GCN-NEXT: v_readlane_b32 s64, v39, 25
+; GCN-NEXT: v_readlane_b32 s63, v39, 24
+; GCN-NEXT: v_readlane_b32 s62, v39, 23
+; GCN-NEXT: v_readlane_b32 s61, v39, 22
+; GCN-NEXT: v_readlane_b32 s60, v39, 21
+; GCN-NEXT: v_readlane_b32 s59, v39, 20
+; GCN-NEXT: v_readlane_b32 s58, v39, 19
+; GCN-NEXT: v_readlane_b32 s57, v39, 18
+; GCN-NEXT: v_readlane_b32 s56, v39, 17
+; GCN-NEXT: v_readlane_b32 s55, v39, 16
+; GCN-NEXT: v_readlane_b32 s54, v39, 15
+; GCN-NEXT: v_readlane_b32 s53, v39, 14
+; GCN-NEXT: v_readlane_b32 s52, v39, 13
+; GCN-NEXT: v_readlane_b32 s51, v39, 12
+; GCN-NEXT: v_readlane_b32 s50, v39, 11
+; GCN-NEXT: v_readlane_b32 s49, v39, 10
+; GCN-NEXT: v_readlane_b32 s48, v39, 9
+; GCN-NEXT: v_readlane_b32 s47, v39, 8
+; GCN-NEXT: v_readlane_b32 s46, v39, 7
+; GCN-NEXT: v_readlane_b32 s45, v39, 6
+; GCN-NEXT: v_readlane_b32 s44, v39, 5
+; GCN-NEXT: v_readlane_b32 s43, v39, 4
+; GCN-NEXT: v_readlane_b32 s42, v39, 3
+; GCN-NEXT: v_readlane_b32 s41, v39, 2
+; GCN-NEXT: v_readlane_b32 s40, v39, 1
+; GCN-NEXT: v_readlane_b32 s39, v39, 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_readfirstlane_b32 s4, v0
+; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s5 ; 4-byte Folded Reload
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_readfirstlane_b32 s34, v0
+; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1
+; GCN-NEXT: s_add_i32 s5, s33, 0x42100
+; GCN-NEXT: buffer_load_dword v39, off, s[0:3], s5 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
+; GCN-NEXT: s_mov_b32 s33, s4
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
%local_val = alloca i32, align 128, addrspace(5)
store volatile i32 %b, ptr addrspace(5) %local_val, align 128
More information about the llvm-commits
mailing list