[llvm] [AMDGPU] Generate waterfall for calls with SGPR(inreg) argument (PR #146997)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Jul 20 19:42:24 PDT 2025
================
@@ -1,22 +1,452 @@
-; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs=0 -filetype=null %s 2>&1 | FileCheck -enable-var-scope %s
-
-; CHECK: illegal VGPR to SGPR copy
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope %s
declare hidden void @external_void_func_a15i32_inreg([15 x i32] inreg) #0
declare hidden void @external_void_func_a16i32_inreg([16 x i32] inreg) #0
declare hidden void @external_void_func_a15i32_inreg_i32_inreg([15 x i32] inreg, i32 inreg) #0
; Calls @external_void_func_a15i32_inreg, forwarding the [15 x i32] inreg
; argument unchanged. The autogenerated checks below expect the call to be
; wrapped in a waterfall loop (.LBB0_1): v_readfirstlane_b32 picks a value out
; of v0, v_cmp_eq_u32 / s_and_saveexec_b64 restrict exec to the lanes holding
; that value, s_swappc_b64 performs the call, and s_xor_b64 / s_cbranch_execnz
; repeat the loop while exec is non-zero. exec is saved in s[84:85] before the
; loop and restored from it afterwards.
; The callee address is formed with s_getpc_b64 plus the @rel32@lo / @rel32@hi
; relocations of the callee symbol.
define void @test_call_external_void_func_a15i32_inreg([15 x i32] inreg %arg0) #0 {
+; CHECK-LABEL: test_call_external_void_func_a15i32_inreg:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_mov_b32 s40, s33
+; CHECK-NEXT: s_mov_b32 s33, s32
+; CHECK-NEXT: s_or_saveexec_b64 s[42:43], -1
+; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT: s_mov_b64 exec, s[42:43]
+; CHECK-NEXT: v_writelane_b32 v40, s40, 32
+; CHECK-NEXT: v_writelane_b32 v40, s30, 0
+; CHECK-NEXT: v_writelane_b32 v40, s31, 1
+; CHECK-NEXT: v_writelane_b32 v40, s34, 2
+; CHECK-NEXT: v_writelane_b32 v40, s35, 3
+; CHECK-NEXT: v_writelane_b32 v40, s36, 4
+; CHECK-NEXT: v_writelane_b32 v40, s37, 5
+; CHECK-NEXT: v_writelane_b32 v40, s38, 6
+; CHECK-NEXT: v_writelane_b32 v40, s39, 7
+; CHECK-NEXT: v_writelane_b32 v40, s48, 8
+; CHECK-NEXT: v_writelane_b32 v40, s49, 9
+; CHECK-NEXT: v_writelane_b32 v40, s50, 10
+; CHECK-NEXT: v_writelane_b32 v40, s51, 11
+; CHECK-NEXT: v_writelane_b32 v40, s52, 12
+; CHECK-NEXT: v_writelane_b32 v40, s53, 13
+; CHECK-NEXT: v_writelane_b32 v40, s54, 14
+; CHECK-NEXT: v_writelane_b32 v40, s55, 15
+; CHECK-NEXT: v_writelane_b32 v40, s64, 16
+; CHECK-NEXT: v_writelane_b32 v40, s65, 17
+; CHECK-NEXT: v_writelane_b32 v40, s66, 18
+; CHECK-NEXT: v_writelane_b32 v40, s67, 19
+; CHECK-NEXT: v_writelane_b32 v40, s68, 20
+; CHECK-NEXT: v_writelane_b32 v40, s69, 21
+; CHECK-NEXT: v_writelane_b32 v40, s70, 22
+; CHECK-NEXT: v_writelane_b32 v40, s71, 23
+; CHECK-NEXT: v_writelane_b32 v40, s80, 24
+; CHECK-NEXT: v_writelane_b32 v40, s81, 25
+; CHECK-NEXT: v_writelane_b32 v40, s82, 26
+; CHECK-NEXT: v_writelane_b32 v40, s83, 27
+; CHECK-NEXT: v_writelane_b32 v40, s84, 28
+; CHECK-NEXT: v_writelane_b32 v40, s85, 29
+; CHECK-NEXT: v_writelane_b32 v40, s86, 30
+; CHECK-NEXT: s_mov_b32 s50, s29
+; CHECK-NEXT: s_mov_b32 s51, s28
+; CHECK-NEXT: s_mov_b32 s52, s27
+; CHECK-NEXT: s_mov_b32 s53, s26
+; CHECK-NEXT: s_mov_b32 s54, s25
+; CHECK-NEXT: s_mov_b32 s55, s24
+; CHECK-NEXT: s_mov_b32 s64, s23
+; CHECK-NEXT: s_mov_b32 s65, s22
+; CHECK-NEXT: s_mov_b32 s66, s21
+; CHECK-NEXT: s_mov_b32 s67, s20
+; CHECK-NEXT: s_mov_b32 s68, s19
+; CHECK-NEXT: s_mov_b32 s69, s18
+; CHECK-NEXT: s_mov_b32 s70, s17
+; CHECK-NEXT: s_mov_b32 s71, s16
+; CHECK-NEXT: s_mov_b32 s80, s15
+; CHECK-NEXT: s_mov_b32 s81, s14
+; CHECK-NEXT: s_mov_b32 s82, s13
+; CHECK-NEXT: s_mov_b32 s83, s12
+; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
+; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
+; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
+; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
+; CHECK-NEXT: s_mov_b64 s[84:85], exec
+; CHECK-NEXT: s_addk_i32 s32, 0x400
+; CHECK-NEXT: v_writelane_b32 v40, s87, 31
+; CHECK-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: v_readfirstlane_b32 s26, v0
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s26, v0
+; CHECK-NEXT: s_and_saveexec_b64 s[86:87], vcc
+; CHECK-NEXT: s_getpc_b64 s[28:29]
+; CHECK-NEXT: s_add_u32 s28, s28, external_void_func_a15i32_inreg@rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s29, s29, external_void_func_a15i32_inreg@rel32@hi+12
+; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
+; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
+; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37]
+; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
+; CHECK-NEXT: s_mov_b32 s12, s83
+; CHECK-NEXT: s_mov_b32 s13, s82
+; CHECK-NEXT: s_mov_b32 s14, s81
+; CHECK-NEXT: s_mov_b32 s15, s80
+; CHECK-NEXT: s_mov_b32 s0, s71
+; CHECK-NEXT: s_mov_b32 s1, s70
+; CHECK-NEXT: s_mov_b32 s2, s69
+; CHECK-NEXT: s_mov_b32 s3, s68
+; CHECK-NEXT: s_mov_b32 s16, s67
+; CHECK-NEXT: s_mov_b32 s17, s66
+; CHECK-NEXT: s_mov_b32 s18, s65
+; CHECK-NEXT: s_mov_b32 s19, s64
+; CHECK-NEXT: s_mov_b32 s20, s55
+; CHECK-NEXT: s_mov_b32 s21, s54
+; CHECK-NEXT: s_mov_b32 s22, s53
+; CHECK-NEXT: s_mov_b32 s23, s52
+; CHECK-NEXT: s_mov_b32 s24, s51
+; CHECK-NEXT: s_mov_b32 s25, s50
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[28:29]
+; CHECK-NEXT: ; implicit-def: $vgpr0
+; CHECK-NEXT: ; implicit-def: $vgpr31
+; CHECK-NEXT: s_xor_b64 exec, exec, s[86:87]
+; CHECK-NEXT: s_cbranch_execnz .LBB0_1
+; CHECK-NEXT: ; %bb.2:
+; CHECK-NEXT: s_mov_b64 exec, s[84:85]
+; CHECK-NEXT: v_readlane_b32 s87, v40, 31
+; CHECK-NEXT: v_readlane_b32 s86, v40, 30
+; CHECK-NEXT: v_readlane_b32 s85, v40, 29
+; CHECK-NEXT: v_readlane_b32 s84, v40, 28
+; CHECK-NEXT: v_readlane_b32 s83, v40, 27
+; CHECK-NEXT: v_readlane_b32 s82, v40, 26
+; CHECK-NEXT: v_readlane_b32 s81, v40, 25
+; CHECK-NEXT: v_readlane_b32 s80, v40, 24
+; CHECK-NEXT: v_readlane_b32 s71, v40, 23
+; CHECK-NEXT: v_readlane_b32 s70, v40, 22
+; CHECK-NEXT: v_readlane_b32 s69, v40, 21
+; CHECK-NEXT: v_readlane_b32 s68, v40, 20
+; CHECK-NEXT: v_readlane_b32 s67, v40, 19
+; CHECK-NEXT: v_readlane_b32 s66, v40, 18
+; CHECK-NEXT: v_readlane_b32 s65, v40, 17
+; CHECK-NEXT: v_readlane_b32 s64, v40, 16
+; CHECK-NEXT: v_readlane_b32 s55, v40, 15
+; CHECK-NEXT: v_readlane_b32 s54, v40, 14
+; CHECK-NEXT: v_readlane_b32 s53, v40, 13
+; CHECK-NEXT: v_readlane_b32 s52, v40, 12
+; CHECK-NEXT: v_readlane_b32 s51, v40, 11
+; CHECK-NEXT: v_readlane_b32 s50, v40, 10
+; CHECK-NEXT: v_readlane_b32 s49, v40, 9
+; CHECK-NEXT: v_readlane_b32 s48, v40, 8
+; CHECK-NEXT: v_readlane_b32 s39, v40, 7
+; CHECK-NEXT: v_readlane_b32 s38, v40, 6
+; CHECK-NEXT: v_readlane_b32 s37, v40, 5
+; CHECK-NEXT: v_readlane_b32 s36, v40, 4
+; CHECK-NEXT: v_readlane_b32 s35, v40, 3
+; CHECK-NEXT: v_readlane_b32 s34, v40, 2
+; CHECK-NEXT: v_readlane_b32 s31, v40, 1
+; CHECK-NEXT: v_readlane_b32 s30, v40, 0
+; CHECK-NEXT: s_mov_b32 s32, s33
+; CHECK-NEXT: v_readlane_b32 s4, v40, 32
+; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
+; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT: s_mov_b64 exec, s[6:7]
+; CHECK-NEXT: s_mov_b32 s33, s4
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
call void @external_void_func_a15i32_inreg([15 x i32] inreg %arg0)
ret void
}
; Calls @external_void_func_a16i32_inreg, forwarding the [16 x i32] inreg
; argument unchanged. The autogenerated checks below expect the call to be
; wrapped in a waterfall loop (.LBB1_1) keyed on two VGPRs: v_readfirstlane_b32
; picks values out of v0 and v1, the two v_cmp_eq_u32 results are combined with
; s_and_b64, s_and_saveexec_b64 restricts exec to the matching lanes,
; s_swappc_b64 performs the call, and s_xor_b64 / s_cbranch_execnz repeat the
; loop while exec is non-zero. exec is saved in s[84:85] before the loop and
; restored from it afterwards.
; The callee address is formed with s_getpc_b64 plus the @rel32@lo / @rel32@hi
; relocations of the callee symbol.
define void @test_call_external_void_func_a16i32_inreg([16 x i32] inreg %arg0) #0 {
+; CHECK-LABEL: test_call_external_void_func_a16i32_inreg:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_mov_b32 s40, s33
+; CHECK-NEXT: s_mov_b32 s33, s32
+; CHECK-NEXT: s_or_saveexec_b64 s[42:43], -1
+; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT: s_mov_b64 exec, s[42:43]
+; CHECK-NEXT: v_writelane_b32 v40, s40, 32
+; CHECK-NEXT: v_writelane_b32 v40, s30, 0
+; CHECK-NEXT: v_writelane_b32 v40, s31, 1
+; CHECK-NEXT: v_writelane_b32 v40, s34, 2
+; CHECK-NEXT: v_writelane_b32 v40, s35, 3
+; CHECK-NEXT: v_writelane_b32 v40, s36, 4
+; CHECK-NEXT: v_writelane_b32 v40, s37, 5
+; CHECK-NEXT: v_writelane_b32 v40, s38, 6
+; CHECK-NEXT: v_writelane_b32 v40, s39, 7
+; CHECK-NEXT: v_writelane_b32 v40, s48, 8
+; CHECK-NEXT: v_writelane_b32 v40, s49, 9
+; CHECK-NEXT: v_writelane_b32 v40, s50, 10
+; CHECK-NEXT: v_writelane_b32 v40, s51, 11
+; CHECK-NEXT: v_writelane_b32 v40, s52, 12
+; CHECK-NEXT: v_writelane_b32 v40, s53, 13
+; CHECK-NEXT: v_writelane_b32 v40, s54, 14
+; CHECK-NEXT: v_writelane_b32 v40, s55, 15
+; CHECK-NEXT: v_writelane_b32 v40, s64, 16
+; CHECK-NEXT: v_writelane_b32 v40, s65, 17
+; CHECK-NEXT: v_writelane_b32 v40, s66, 18
+; CHECK-NEXT: v_writelane_b32 v40, s67, 19
+; CHECK-NEXT: v_writelane_b32 v40, s68, 20
+; CHECK-NEXT: v_writelane_b32 v40, s69, 21
+; CHECK-NEXT: v_writelane_b32 v40, s70, 22
+; CHECK-NEXT: v_writelane_b32 v40, s71, 23
+; CHECK-NEXT: v_writelane_b32 v40, s80, 24
+; CHECK-NEXT: v_writelane_b32 v40, s81, 25
+; CHECK-NEXT: v_writelane_b32 v40, s82, 26
+; CHECK-NEXT: v_writelane_b32 v40, s83, 27
+; CHECK-NEXT: v_writelane_b32 v40, s84, 28
+; CHECK-NEXT: v_writelane_b32 v40, s85, 29
+; CHECK-NEXT: v_writelane_b32 v40, s86, 30
+; CHECK-NEXT: s_mov_b32 s50, s29
+; CHECK-NEXT: s_mov_b32 s51, s28
+; CHECK-NEXT: s_mov_b32 s52, s27
+; CHECK-NEXT: s_mov_b32 s53, s26
+; CHECK-NEXT: s_mov_b32 s54, s25
+; CHECK-NEXT: s_mov_b32 s55, s24
+; CHECK-NEXT: s_mov_b32 s64, s23
+; CHECK-NEXT: s_mov_b32 s65, s22
+; CHECK-NEXT: s_mov_b32 s66, s21
+; CHECK-NEXT: s_mov_b32 s67, s20
+; CHECK-NEXT: s_mov_b32 s68, s19
+; CHECK-NEXT: s_mov_b32 s69, s18
+; CHECK-NEXT: s_mov_b32 s70, s17
+; CHECK-NEXT: s_mov_b32 s71, s16
+; CHECK-NEXT: s_mov_b32 s80, s15
+; CHECK-NEXT: s_mov_b32 s81, s14
+; CHECK-NEXT: s_mov_b32 s82, s13
+; CHECK-NEXT: s_mov_b32 s83, s12
+; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
+; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
+; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
+; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
+; CHECK-NEXT: s_mov_b64 s[84:85], exec
+; CHECK-NEXT: s_addk_i32 s32, 0x400
+; CHECK-NEXT: v_writelane_b32 v40, s87, 31
+; CHECK-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: v_readfirstlane_b32 s26, v0
+; CHECK-NEXT: v_readfirstlane_b32 s27, v1
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s26, v0
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], s27, v1
+; CHECK-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; CHECK-NEXT: s_and_saveexec_b64 s[86:87], s[4:5]
+; CHECK-NEXT: s_getpc_b64 s[28:29]
+; CHECK-NEXT: s_add_u32 s28, s28, external_void_func_a16i32_inreg@rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s29, s29, external_void_func_a16i32_inreg@rel32@hi+12
+; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
+; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
+; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37]
+; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
+; CHECK-NEXT: s_mov_b32 s12, s83
+; CHECK-NEXT: s_mov_b32 s13, s82
+; CHECK-NEXT: s_mov_b32 s14, s81
+; CHECK-NEXT: s_mov_b32 s15, s80
+; CHECK-NEXT: s_mov_b32 s0, s71
+; CHECK-NEXT: s_mov_b32 s1, s70
+; CHECK-NEXT: s_mov_b32 s2, s69
+; CHECK-NEXT: s_mov_b32 s3, s68
+; CHECK-NEXT: s_mov_b32 s16, s67
+; CHECK-NEXT: s_mov_b32 s17, s66
+; CHECK-NEXT: s_mov_b32 s18, s65
+; CHECK-NEXT: s_mov_b32 s19, s64
+; CHECK-NEXT: s_mov_b32 s20, s55
+; CHECK-NEXT: s_mov_b32 s21, s54
+; CHECK-NEXT: s_mov_b32 s22, s53
+; CHECK-NEXT: s_mov_b32 s23, s52
+; CHECK-NEXT: s_mov_b32 s24, s51
+; CHECK-NEXT: s_mov_b32 s25, s50
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[28:29]
+; CHECK-NEXT: ; implicit-def: $vgpr0
+; CHECK-NEXT: ; implicit-def: $vgpr1
+; CHECK-NEXT: ; implicit-def: $vgpr31
+; CHECK-NEXT: s_xor_b64 exec, exec, s[86:87]
+; CHECK-NEXT: s_cbranch_execnz .LBB1_1
+; CHECK-NEXT: ; %bb.2:
+; CHECK-NEXT: s_mov_b64 exec, s[84:85]
+; CHECK-NEXT: v_readlane_b32 s87, v40, 31
+; CHECK-NEXT: v_readlane_b32 s86, v40, 30
+; CHECK-NEXT: v_readlane_b32 s85, v40, 29
+; CHECK-NEXT: v_readlane_b32 s84, v40, 28
+; CHECK-NEXT: v_readlane_b32 s83, v40, 27
+; CHECK-NEXT: v_readlane_b32 s82, v40, 26
+; CHECK-NEXT: v_readlane_b32 s81, v40, 25
+; CHECK-NEXT: v_readlane_b32 s80, v40, 24
+; CHECK-NEXT: v_readlane_b32 s71, v40, 23
+; CHECK-NEXT: v_readlane_b32 s70, v40, 22
+; CHECK-NEXT: v_readlane_b32 s69, v40, 21
+; CHECK-NEXT: v_readlane_b32 s68, v40, 20
+; CHECK-NEXT: v_readlane_b32 s67, v40, 19
+; CHECK-NEXT: v_readlane_b32 s66, v40, 18
+; CHECK-NEXT: v_readlane_b32 s65, v40, 17
+; CHECK-NEXT: v_readlane_b32 s64, v40, 16
+; CHECK-NEXT: v_readlane_b32 s55, v40, 15
+; CHECK-NEXT: v_readlane_b32 s54, v40, 14
+; CHECK-NEXT: v_readlane_b32 s53, v40, 13
+; CHECK-NEXT: v_readlane_b32 s52, v40, 12
+; CHECK-NEXT: v_readlane_b32 s51, v40, 11
+; CHECK-NEXT: v_readlane_b32 s50, v40, 10
+; CHECK-NEXT: v_readlane_b32 s49, v40, 9
+; CHECK-NEXT: v_readlane_b32 s48, v40, 8
+; CHECK-NEXT: v_readlane_b32 s39, v40, 7
+; CHECK-NEXT: v_readlane_b32 s38, v40, 6
+; CHECK-NEXT: v_readlane_b32 s37, v40, 5
+; CHECK-NEXT: v_readlane_b32 s36, v40, 4
+; CHECK-NEXT: v_readlane_b32 s35, v40, 3
+; CHECK-NEXT: v_readlane_b32 s34, v40, 2
+; CHECK-NEXT: v_readlane_b32 s31, v40, 1
+; CHECK-NEXT: v_readlane_b32 s30, v40, 0
+; CHECK-NEXT: s_mov_b32 s32, s33
+; CHECK-NEXT: v_readlane_b32 s4, v40, 32
+; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
+; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT: s_mov_b64 exec, s[6:7]
+; CHECK-NEXT: s_mov_b32 s33, s4
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
call void @external_void_func_a16i32_inreg([16 x i32] inreg %arg0)
ret void
}
define void @test_call_external_void_func_a15i32_inreg_i32_inreg([15 x i32] inreg %arg0, i32 inreg %arg1) #0 {
+; CHECK-LABEL: test_call_external_void_func_a15i32_inreg_i32_inreg:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: s_mov_b32 s40, s33
+; CHECK-NEXT: s_mov_b32 s33, s32
+; CHECK-NEXT: s_or_saveexec_b64 s[42:43], -1
+; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; CHECK-NEXT: s_mov_b64 exec, s[42:43]
+; CHECK-NEXT: v_writelane_b32 v40, s40, 32
+; CHECK-NEXT: v_writelane_b32 v40, s30, 0
+; CHECK-NEXT: v_writelane_b32 v40, s31, 1
+; CHECK-NEXT: v_writelane_b32 v40, s34, 2
+; CHECK-NEXT: v_writelane_b32 v40, s35, 3
+; CHECK-NEXT: v_writelane_b32 v40, s36, 4
+; CHECK-NEXT: v_writelane_b32 v40, s37, 5
+; CHECK-NEXT: v_writelane_b32 v40, s38, 6
+; CHECK-NEXT: v_writelane_b32 v40, s39, 7
+; CHECK-NEXT: v_writelane_b32 v40, s48, 8
+; CHECK-NEXT: v_writelane_b32 v40, s49, 9
+; CHECK-NEXT: v_writelane_b32 v40, s50, 10
+; CHECK-NEXT: v_writelane_b32 v40, s51, 11
+; CHECK-NEXT: v_writelane_b32 v40, s52, 12
+; CHECK-NEXT: v_writelane_b32 v40, s53, 13
+; CHECK-NEXT: v_writelane_b32 v40, s54, 14
+; CHECK-NEXT: v_writelane_b32 v40, s55, 15
+; CHECK-NEXT: v_writelane_b32 v40, s64, 16
+; CHECK-NEXT: v_writelane_b32 v40, s65, 17
+; CHECK-NEXT: v_writelane_b32 v40, s66, 18
+; CHECK-NEXT: v_writelane_b32 v40, s67, 19
+; CHECK-NEXT: v_writelane_b32 v40, s68, 20
+; CHECK-NEXT: v_writelane_b32 v40, s69, 21
+; CHECK-NEXT: v_writelane_b32 v40, s70, 22
+; CHECK-NEXT: v_writelane_b32 v40, s71, 23
+; CHECK-NEXT: v_writelane_b32 v40, s80, 24
+; CHECK-NEXT: v_writelane_b32 v40, s81, 25
+; CHECK-NEXT: v_writelane_b32 v40, s82, 26
+; CHECK-NEXT: v_writelane_b32 v40, s83, 27
+; CHECK-NEXT: v_writelane_b32 v40, s84, 28
+; CHECK-NEXT: v_writelane_b32 v40, s85, 29
+; CHECK-NEXT: v_writelane_b32 v40, s86, 30
+; CHECK-NEXT: s_mov_b32 s50, s29
+; CHECK-NEXT: s_mov_b32 s51, s28
+; CHECK-NEXT: s_mov_b32 s52, s27
+; CHECK-NEXT: s_mov_b32 s53, s26
+; CHECK-NEXT: s_mov_b32 s54, s25
+; CHECK-NEXT: s_mov_b32 s55, s24
+; CHECK-NEXT: s_mov_b32 s64, s23
+; CHECK-NEXT: s_mov_b32 s65, s22
+; CHECK-NEXT: s_mov_b32 s66, s21
+; CHECK-NEXT: s_mov_b32 s67, s20
+; CHECK-NEXT: s_mov_b32 s68, s19
+; CHECK-NEXT: s_mov_b32 s69, s18
+; CHECK-NEXT: s_mov_b32 s70, s17
+; CHECK-NEXT: s_mov_b32 s71, s16
+; CHECK-NEXT: s_mov_b32 s80, s15
+; CHECK-NEXT: s_mov_b32 s81, s14
+; CHECK-NEXT: s_mov_b32 s82, s13
+; CHECK-NEXT: s_mov_b32 s83, s12
+; CHECK-NEXT: s_mov_b64 s[34:35], s[10:11]
+; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9]
+; CHECK-NEXT: s_mov_b64 s[38:39], s[6:7]
+; CHECK-NEXT: s_mov_b64 s[48:49], s[4:5]
+; CHECK-NEXT: s_mov_b64 s[84:85], exec
+; CHECK-NEXT: s_addk_i32 s32, 0x400
+; CHECK-NEXT: v_writelane_b32 v40, s87, 31
+; CHECK-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: v_readfirstlane_b32 s26, v0
+; CHECK-NEXT: v_readfirstlane_b32 s27, v1
+; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s26, v0
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], s27, v1
+; CHECK-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; CHECK-NEXT: s_and_saveexec_b64 s[86:87], s[4:5]
+; CHECK-NEXT: s_getpc_b64 s[28:29]
+; CHECK-NEXT: s_add_u32 s28, s28, external_void_func_a15i32_inreg_i32_inreg@rel32@lo+4
+; CHECK-NEXT: s_addc_u32 s29, s29, external_void_func_a15i32_inreg_i32_inreg@rel32@hi+12
+; CHECK-NEXT: s_mov_b64 s[4:5], s[48:49]
+; CHECK-NEXT: s_mov_b64 s[6:7], s[38:39]
+; CHECK-NEXT: s_mov_b64 s[8:9], s[36:37]
+; CHECK-NEXT: s_mov_b64 s[10:11], s[34:35]
+; CHECK-NEXT: s_mov_b32 s12, s83
+; CHECK-NEXT: s_mov_b32 s13, s82
+; CHECK-NEXT: s_mov_b32 s14, s81
+; CHECK-NEXT: s_mov_b32 s15, s80
+; CHECK-NEXT: s_mov_b32 s0, s71
+; CHECK-NEXT: s_mov_b32 s1, s70
+; CHECK-NEXT: s_mov_b32 s2, s69
+; CHECK-NEXT: s_mov_b32 s3, s68
+; CHECK-NEXT: s_mov_b32 s16, s67
+; CHECK-NEXT: s_mov_b32 s17, s66
+; CHECK-NEXT: s_mov_b32 s18, s65
+; CHECK-NEXT: s_mov_b32 s19, s64
+; CHECK-NEXT: s_mov_b32 s20, s55
+; CHECK-NEXT: s_mov_b32 s21, s54
+; CHECK-NEXT: s_mov_b32 s22, s53
+; CHECK-NEXT: s_mov_b32 s23, s52
+; CHECK-NEXT: s_mov_b32 s24, s51
+; CHECK-NEXT: s_mov_b32 s25, s50
+; CHECK-NEXT: s_swappc_b64 s[30:31], s[28:29]
+; CHECK-NEXT: ; implicit-def: $vgpr0
+; CHECK-NEXT: ; implicit-def: $vgpr1
+; CHECK-NEXT: ; implicit-def: $vgpr31
+; CHECK-NEXT: s_xor_b64 exec, exec, s[86:87]
+; CHECK-NEXT: s_cbranch_execnz .LBB2_1
+; CHECK-NEXT: ; %bb.2:
+; CHECK-NEXT: s_mov_b64 exec, s[84:85]
+; CHECK-NEXT: v_readlane_b32 s87, v40, 31
+; CHECK-NEXT: v_readlane_b32 s86, v40, 30
+; CHECK-NEXT: v_readlane_b32 s85, v40, 29
+; CHECK-NEXT: v_readlane_b32 s84, v40, 28
+; CHECK-NEXT: v_readlane_b32 s83, v40, 27
+; CHECK-NEXT: v_readlane_b32 s82, v40, 26
+; CHECK-NEXT: v_readlane_b32 s81, v40, 25
+; CHECK-NEXT: v_readlane_b32 s80, v40, 24
+; CHECK-NEXT: v_readlane_b32 s71, v40, 23
+; CHECK-NEXT: v_readlane_b32 s70, v40, 22
+; CHECK-NEXT: v_readlane_b32 s69, v40, 21
+; CHECK-NEXT: v_readlane_b32 s68, v40, 20
+; CHECK-NEXT: v_readlane_b32 s67, v40, 19
+; CHECK-NEXT: v_readlane_b32 s66, v40, 18
+; CHECK-NEXT: v_readlane_b32 s65, v40, 17
+; CHECK-NEXT: v_readlane_b32 s64, v40, 16
+; CHECK-NEXT: v_readlane_b32 s55, v40, 15
+; CHECK-NEXT: v_readlane_b32 s54, v40, 14
+; CHECK-NEXT: v_readlane_b32 s53, v40, 13
+; CHECK-NEXT: v_readlane_b32 s52, v40, 12
+; CHECK-NEXT: v_readlane_b32 s51, v40, 11
+; CHECK-NEXT: v_readlane_b32 s50, v40, 10
+; CHECK-NEXT: v_readlane_b32 s49, v40, 9
+; CHECK-NEXT: v_readlane_b32 s48, v40, 8
+; CHECK-NEXT: v_readlane_b32 s39, v40, 7
+; CHECK-NEXT: v_readlane_b32 s38, v40, 6
+; CHECK-NEXT: v_readlane_b32 s37, v40, 5
+; CHECK-NEXT: v_readlane_b32 s36, v40, 4
+; CHECK-NEXT: v_readlane_b32 s35, v40, 3
+; CHECK-NEXT: v_readlane_b32 s34, v40, 2
+; CHECK-NEXT: v_readlane_b32 s31, v40, 1
+; CHECK-NEXT: v_readlane_b32 s30, v40, 0
+; CHECK-NEXT: s_mov_b32 s32, s33
+; CHECK-NEXT: v_readlane_b32 s4, v40, 32
+; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1
+; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; CHECK-NEXT: s_mov_b64 exec, s[6:7]
+; CHECK-NEXT: s_mov_b32 s33, s4
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: s_setpc_b64 s[30:31]
call void @external_void_func_a15i32_inreg_i32_inreg([15 x i32] inreg %arg0, i32 inreg %arg1)
----------------
Shoreshen wrote:
Hi @jmmartinez, this does not seem to create a copy from VGPR to SGPR. I will add a multiple-operand case in `llvm/test/CodeGen/AMDGPU/tail-call-inreg-arguments.error.ll`.
https://github.com/llvm/llvm-project/pull/146997
More information about the llvm-commits
mailing list