[llvm] 9aa026e - [AMDGPU][GFX11] Add test coverage for 16-bit conversions, part 9.

Ivan Kosarev via llvm-commits llvm-commits at lists.llvm.org
Thu Jun 15 03:02:15 PDT 2023


Author: Ivan Kosarev
Date: 2023-06-15T11:02:08+01:00
New Revision: 9aa026e9ffc32f1d0531abd306933e249fdc8059

URL: https://github.com/llvm/llvm-project/commit/9aa026e9ffc32f1d0531abd306933e249fdc8059
DIFF: https://github.com/llvm/llvm-project/commit/9aa026e9ffc32f1d0531abd306933e249fdc8059.diff

LOG: [AMDGPU][GFX11] Add test coverage for 16-bit conversions, part 9.

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D152902

Added: 
    

Modified: 
    llvm/test/CodeGen/AMDGPU/call-argument-types.ll
    llvm/test/CodeGen/AMDGPU/calling-conventions.ll
    llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 1d00ab14c0d85..ded7081033f0e 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -1,7 +1,9 @@
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,MESA %s
-; RUN: llc -march=amdgcn -mcpu=hawaii -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,MESA %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,VI,MESA %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,HSA %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
+; RUN: llc -march=amdgcn -mcpu=hawaii -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=HSA %s
 
 declare hidden void @external_void_func_i1(i1) #0
 declare hidden void @external_void_func_i1_signext(i1 signext) #0
@@ -57,221 +59,1422 @@ declare hidden void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32(p
 
 declare hidden void @external_void_func_v16i8(<16 x i8>) #0
 
-
 ; FIXME: Should be passing -1
-; GCN-LABEL: {{^}}test_call_external_void_func_i1_imm:
-; MESA: s_mov_b32 s36, SCRATCH_RSRC_DWORD
-
-; MESA-DAG: s_mov_b64 s[0:1], s[36:37]
-
-; GCN-DAG: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]]
-; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1@rel32@lo+4
-; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1@rel32@hi+12
-; GCN-DAG: v_mov_b32_e32 v0, 1{{$}}
-; MESA-DAG: s_mov_b64 s[2:3], s[38:39]
-
-; GCN: s_swappc_b64 s[30:31], s[[[PC_LO]]:[[PC_HI]]]
-; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
+; VI-LABEL: test_call_external_void_func_i1_imm:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    v_mov_b32_e32 v0, 1
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_i1_imm:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    v_mov_b32_e32 v0, 1
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_i1_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_i1_imm:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_mov_b32_e32 v0, 1
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i1@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i1@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_i1_imm:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s9
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    v_mov_b32_e32 v0, 1
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_i1@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_i1@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   call void @external_void_func_i1(i1 true)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_i1_signext:
-
-; HSA: buffer_load_ubyte [[VAR:v[0-9]+]]
-; HSA: s_mov_b32 s32, 0
-; MESA-DAG: buffer_load_ubyte [[VAR:v[0-9]+]]
-; MESA-DAG: s_mov_b32 s32, 0{{$}}
-
-; GCN: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]]
-; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_signext@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_signext@rel32@hi+12
-; GCN-NEXT: v_bfe_i32 v0, [[VAR]], 0, 1
-; GCN-NEXT: s_swappc_b64 s[30:31], s[[[PC_LO]]:[[PC_HI]]]
-; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
+; VI-LABEL: test_call_external_void_func_i1_signext:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s5
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12
+; VI-NEXT:    v_bfe_i32 v0, v0, 0, 1
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_i1_signext:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 glc
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s5
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12
+; CI-NEXT:    v_bfe_i32 v0, v0, 0, 1
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_i1_signext:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s5
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12
+; GFX9-NEXT:    v_bfe_i32 v0, v0, 0, 1
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_i1_signext:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    buffer_load_u8 v0, off, s[0:3], 0 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i1_signext@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i1_signext@rel32@hi+12
+; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 1
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_i1_signext:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
+; HSA-NEXT:    s_mov_b32 s6, -1
+; HSA-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0 glc
+; HSA-NEXT:    s_waitcnt vmcnt(0)
+; HSA-NEXT:    s_add_i32 s8, s8, s11
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s11
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s9
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_i1_signext@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_i1_signext@rel32@hi+12
+; HSA-NEXT:    v_bfe_i32 v0, v0, 0, 1
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   %var = load volatile i1, ptr addrspace(1) undef
   call void @external_void_func_i1_signext(i1 signext %var)
   ret void
 }
 
 ; FIXME: load should be scheduled before getpc
-; GCN-LABEL: {{^}}test_call_external_void_func_i1_zeroext:
-
-; HSA: buffer_load_ubyte [[VAL:v[0-9]+]]
-; HSA-DAG: s_mov_b32 s32, 0{{$}}
-
-; MESA: buffer_load_ubyte [[VAL:v[0-9]+]]
-; MESA-DAG: s_mov_b32 s32, 0{{$}}
-
-; GCN: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]]
-; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_zeroext@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i1_zeroext@rel32@hi+12
-; GCN-NEXT: v_and_b32_e32 v0, 1, [[VAL]]
-; GCN-NEXT: s_swappc_b64 s[30:31], s[[[PC_LO]]:[[PC_HI]]]
-; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
+; VI-LABEL: test_call_external_void_func_i1_zeroext:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s5
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12
+; VI-NEXT:    v_and_b32_e32 v0, 1, v0
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_i1_zeroext:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 glc
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s5
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12
+; CI-NEXT:    v_and_b32_e32 v0, 1, v0
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_i1_zeroext:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s5
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_i1_zeroext:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    buffer_load_u8 v0, off, s[0:3], 0 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i1_zeroext@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i1_zeroext@rel32@hi+12
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_i1_zeroext:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
+; HSA-NEXT:    s_mov_b32 s6, -1
+; HSA-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0 glc
+; HSA-NEXT:    s_waitcnt vmcnt(0)
+; HSA-NEXT:    s_add_i32 s8, s8, s11
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s11
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s9
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_i1_zeroext@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_i1_zeroext@rel32@hi+12
+; HSA-NEXT:    v_and_b32_e32 v0, 1, v0
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   %var = load volatile i1, ptr addrspace(1) undef
   call void @external_void_func_i1_zeroext(i1 zeroext %var)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_i8_imm:
-
-; GCN-DAG: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]]
-; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8@rel32@lo+4
-; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8@rel32@hi+12
-; GCN-DAG: v_mov_b32_e32 v0, 0x7b
-
-; GCN-DAG: s_mov_b32 s32, 0{{$}}
-
-; GCN: s_swappc_b64 s[30:31], s[[[PC_LO]]:[[PC_HI]]]
-; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 {
+; VI-LABEL: test_call_external_void_func_i8_imm:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s5
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    v_mov_b32_e32 v0, 0x7b
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_i8_imm:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s5
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    v_mov_b32_e32 v0, 0x7b
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_i8_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s5
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_i8_imm:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i8@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i8@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_i8_imm:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s8, s8, s11
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s11
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    v_mov_b32_e32 v0, 0x7b
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s9
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_i8@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_i8@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   call void @external_void_func_i8(i8 123)
   ret void
 }
 
 ; FIXME: don't wait before call
-; GCN-LABEL: {{^}}test_call_external_void_func_i8_signext:
-
-; GCN-DAG: buffer_load_sbyte [[VAL:v[0-9]+]]
-; GCN-DAG: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]]
-; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_signext@rel32@lo+4
-; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_signext@rel32@hi+12
-
-; GCN-DAG: s_mov_b32 s32, 0
-
-; GCN-NOT: s_waitcnt
-; GCN: s_swappc_b64 s[30:31], s[[[PC_LO]]:[[PC_HI]]]
-; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
+; VI-LABEL: test_call_external_void_func_i8_signext:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s5
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_i8_signext@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_i8_signext:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0 glc
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s5
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_i8_signext@rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_i8_signext:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    buffer_load_sbyte v0, off, s[0:3], 0 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s5
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i8_signext@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_i8_signext:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    buffer_load_i8 v0, off, s[0:3], 0 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i8_signext@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i8_signext@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_i8_signext:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
+; HSA-NEXT:    s_mov_b32 s6, -1
+; HSA-NEXT:    buffer_load_sbyte v0, off, s[4:7], 0 glc
+; HSA-NEXT:    s_waitcnt vmcnt(0)
+; HSA-NEXT:    s_add_i32 s8, s8, s11
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s11
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s9
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_i8_signext@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_i8_signext@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   %var = load volatile i8, ptr addrspace(1) undef
   call void @external_void_func_i8_signext(i8 signext %var)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_i8_zeroext:
-
-; GCN-DAG: buffer_load_ubyte [[VAL:v[0-9]+]]
-; GCN-DAG: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]]
-; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_zeroext@rel32@lo+4
-; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_zeroext@rel32@hi+12
-
-; GCN-DAG: s_mov_b32 s32, 0
-
-; GCN-NOT: s_waitcnt
-; GCN: s_swappc_b64 s[30:31], s[[[PC_LO]]:[[PC_HI]]]
-; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 {
+; VI-LABEL: test_call_external_void_func_i8_zeroext:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s5
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_i8_zeroext@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_i8_zeroext:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 glc
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s5
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_i8_zeroext@rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_i8_zeroext:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s5
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i8_zeroext@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_i8_zeroext:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    buffer_load_u8 v0, off, s[0:3], 0 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i8_zeroext@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i8_zeroext@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_i8_zeroext:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
+; HSA-NEXT:    s_mov_b32 s6, -1
+; HSA-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0 glc
+; HSA-NEXT:    s_waitcnt vmcnt(0)
+; HSA-NEXT:    s_add_i32 s8, s8, s11
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s11
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s9
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_i8_zeroext@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_i8_zeroext@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   %var = load volatile i8, ptr addrspace(1) undef
   call void @external_void_func_i8_zeroext(i8 zeroext %var)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_i16_imm:
-; GCN-DAG: v_mov_b32_e32 v0, 0x7b{{$}}
-
-; GCN-DAG: s_mov_b32 s32, 0
-
-; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 {
+; VI-LABEL: test_call_external_void_func_i16_imm:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    v_mov_b32_e32 v0, 0x7b
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_i16_imm:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    v_mov_b32_e32 v0, 0x7b
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_i16_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_i16_imm:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i16@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i16@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_i16_imm:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s9
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    v_mov_b32_e32 v0, 0x7b
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_i16@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_i16@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   call void @external_void_func_i16(i16 123)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_i16_signext:
-
-; GCN-DAG: buffer_load_sshort [[VAL:v[0-9]+]]
-; GCN-DAG: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]]
-; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_signext@rel32@lo+4
-; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_signext@rel32@hi+12
-
-; GCN-DAG: s_mov_b32 s32, 0
-
-; GCN-NOT: s_waitcnt
-; GCN: s_swappc_b64 s[30:31], s[[[PC_LO]]:[[PC_HI]]]
-; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 {
+; VI-LABEL: test_call_external_void_func_i16_signext:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    buffer_load_sshort v0, off, s[0:3], 0 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s5
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_i16_signext@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_i16_signext:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    buffer_load_sshort v0, off, s[0:3], 0 glc
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s5
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_i16_signext@rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_i16_signext:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    buffer_load_sshort v0, off, s[0:3], 0 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s5
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i16_signext@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_i16_signext:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    buffer_load_i16 v0, off, s[0:3], 0 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i16_signext@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i16_signext@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_i16_signext:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
+; HSA-NEXT:    s_mov_b32 s6, -1
+; HSA-NEXT:    buffer_load_sshort v0, off, s[4:7], 0 glc
+; HSA-NEXT:    s_waitcnt vmcnt(0)
+; HSA-NEXT:    s_add_i32 s8, s8, s11
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s11
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s9
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_i16_signext@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_i16_signext@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   %var = load volatile i16, ptr addrspace(1) undef
   call void @external_void_func_i16_signext(i16 signext %var)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_i16_zeroext:
-
-; GCN-DAG: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]]
-; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_zeroext@rel32@lo+4
-; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_zeroext@rel32@hi+12
-
-; GCN-DAG: s_mov_b32 s32, 0
-
-; GCN-NOT: s_waitcnt
-; GCN: s_swappc_b64 s[30:31], s[[[PC_LO]]:[[PC_HI]]]
-; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 {
+; VI-LABEL: test_call_external_void_func_i16_zeroext:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s5
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_i16_zeroext@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_i16_zeroext:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 glc
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s5
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_i16_zeroext@rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_i16_zeroext:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    buffer_load_ushort v0, off, s[0:3], 0 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s5
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i16_zeroext@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_i16_zeroext:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    buffer_load_u16 v0, off, s[0:3], 0 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i16_zeroext@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i16_zeroext@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_i16_zeroext:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
+; HSA-NEXT:    s_mov_b32 s6, -1
+; HSA-NEXT:    buffer_load_ushort v0, off, s[4:7], 0 glc
+; HSA-NEXT:    s_waitcnt vmcnt(0)
+; HSA-NEXT:    s_add_i32 s8, s8, s11
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s11
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s9
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_i16_zeroext@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_i16_zeroext@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   %var = load volatile i16, ptr addrspace(1) undef
   call void @external_void_func_i16_zeroext(i16 zeroext %var)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_i32_imm:
-
-; GCN-DAG: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]]
-; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i32@rel32@lo+4
-; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i32@rel32@hi+12
-; GCN-DAG: v_mov_b32_e32 v0, 42
-; GCN-DAG: s_mov_b32 s32, 0
-
-; GCN: s_swappc_b64 s[30:31], s[[[PC_LO]]:[[PC_HI]]]
-; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 {
+; VI-LABEL: test_call_external_void_func_i32_imm:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s5
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    v_mov_b32_e32 v0, 42
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_i32_imm:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s5
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    v_mov_b32_e32 v0, 42
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_i32_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s5
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_i32_imm:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_mov_b32_e32 v0, 42
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i32@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i32@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_i32_imm:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s8, s8, s11
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s11
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    v_mov_b32_e32 v0, 42
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s9
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_i32@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_i32@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   call void @external_void_func_i32(i32 42)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_i64_imm:
-; GCN-DAG: v_mov_b32_e32 v0, 0x7b{{$}}
-; GCN-DAG: v_mov_b32_e32 v1, 0{{$}}
-; GCN-DAG: s_getpc_b64 s[[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]]
-; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i64@rel32@lo+4
-; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i64@rel32@hi+12
-; GCN: s_swappc_b64 s[30:31], s[[[PC_LO]]:[[PC_HI]]]
-; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 {
+; VI-LABEL: test_call_external_void_func_i64_imm:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    v_mov_b32_e32 v0, 0x7b
+; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_i64_imm:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    v_mov_b32_e32 v0, 0x7b
+; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_i64_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x7b
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_i64_imm:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_i64@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_i64@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_i64_imm:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s9
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    v_mov_b32_e32 v0, 0x7b
+; HSA-NEXT:    v_mov_b32_e32 v1, 0
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_i64@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_i64@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   call void @external_void_func_i64(i64 123)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_v2i64:
-; GCN: buffer_load_dwordx4 v[0:3]
-; GCN-NOT: s_waitcnt
-; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 {
+; VI-LABEL: test_call_external_void_func_v2i64:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b32 s0, 0
+; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s1, s0
+; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_v2i64:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b32 s0, 0
+; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    s_mov_b32 s1, s0
+; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_v2i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b32 s0, 0
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s1, s0
+; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_v2i64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mov_b32 s4, 0
+; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s6, -1
+; GFX11-NEXT:    s_mov_b32 s5, s4
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[4:7], 0
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2i64@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2i64@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_v2i64:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_mov_b32 s8, 0
+; HSA-NEXT:    s_add_u32 s0, s0, s9
+; HSA-NEXT:    s_mov_b32 s11, 0x1100f000
+; HSA-NEXT:    s_mov_b32 s10, -1
+; HSA-NEXT:    s_mov_b32 s9, s8
+; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v2i64@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v2i64@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   %val = load <2 x i64>, ptr addrspace(1) null
   call void @external_void_func_v2i64(<2 x i64> %val)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_v2i64_imm:
-; GCN-DAG: v_mov_b32_e32 v0, 1
-; GCN-DAG: v_mov_b32_e32 v1, 2
-; GCN-DAG: v_mov_b32_e32 v2, 3
-; GCN-DAG: v_mov_b32_e32 v3, 4
-; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 {
+; VI-LABEL: test_call_external_void_func_v2i64_imm:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    v_mov_b32_e32 v0, 1
+; VI-NEXT:    v_mov_b32_e32 v1, 2
+; VI-NEXT:    v_mov_b32_e32 v2, 3
+; VI-NEXT:    v_mov_b32_e32 v3, 4
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_v2i64_imm:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    v_mov_b32_e32 v0, 1
+; CI-NEXT:    v_mov_b32_e32 v1, 2
+; CI-NEXT:    v_mov_b32_e32 v2, 3
+; CI-NEXT:    v_mov_b32_e32 v3, 4
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_v2i64_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-NEXT:    v_mov_b32_e32 v1, 2
+; GFX9-NEXT:    v_mov_b32_e32 v2, 3
+; GFX9-NEXT:    v_mov_b32_e32 v3, 4
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_v2i64_imm:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
+; GFX11-NEXT:    v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2i64@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2i64@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_v2i64_imm:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s9
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    v_mov_b32_e32 v0, 1
+; HSA-NEXT:    v_mov_b32_e32 v1, 2
+; HSA-NEXT:    v_mov_b32_e32 v2, 3
+; HSA-NEXT:    v_mov_b32_e32 v3, 4
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v2i64@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v2i64@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   call void @external_void_func_v2i64(<2 x i64> <i64 8589934593, i64 17179869187>)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_v3i64:
-; GCN: buffer_load_dwordx4 v[0:3]
-; GCN: v_mov_b32_e32 v4, 1
-; GCN: v_mov_b32_e32 v5, 2
-; GCN-NOT: s_waitcnt
-; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 {
+; VI-LABEL: test_call_external_void_func_v3i64:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b32 s0, 0
+; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s1, s0
+; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    v_mov_b32_e32 v4, 1
+; VI-NEXT:    v_mov_b32_e32 v5, 2
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v3i64@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i64@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_v3i64:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b32 s0, 0
+; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    s_mov_b32 s1, s0
+; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    v_mov_b32_e32 v4, 1
+; CI-NEXT:    v_mov_b32_e32 v5, 2
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v3i64@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i64@rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_v3i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b32 s0, 0
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s1, s0
+; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    v_mov_b32_e32 v4, 1
+; GFX9-NEXT:    v_mov_b32_e32 v5, 2
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3i64@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i64@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_v3i64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mov_b32 s4, 0
+; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s6, -1
+; GFX11-NEXT:    s_mov_b32 s5, s4
+; GFX11-NEXT:    v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v5, 2
+; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[4:7], 0
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3i64@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3i64@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_v3i64:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_mov_b32 s8, 0
+; HSA-NEXT:    s_add_u32 s0, s0, s9
+; HSA-NEXT:    s_mov_b32 s11, 0x1100f000
+; HSA-NEXT:    s_mov_b32 s10, -1
+; HSA-NEXT:    s_mov_b32 s9, s8
+; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    v_mov_b32_e32 v4, 1
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    v_mov_b32_e32 v5, 2
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v3i64@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v3i64@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   %load = load <2 x i64>, ptr addrspace(1) null
   %val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 undef>, <3 x i32> <i32 0, i32 1, i32 2>
 
@@ -279,343 +1482,3110 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 {
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_v4i64:
-; GCN: buffer_load_dwordx4 v[0:3]
-; GCN-DAG: v_mov_b32_e32 v4, 1
-; GCN-DAG: v_mov_b32_e32 v5, 2
-; GCN-DAG: v_mov_b32_e32 v6, 3
-; GCN-DAG: v_mov_b32_e32 v7, 4
-
-; GCN-NOT: s_waitcnt
-; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 {
+; VI-LABEL: test_call_external_void_func_v4i64:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b32 s0, 0
+; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_mov_b32 s1, s0
+; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    v_mov_b32_e32 v4, 1
+; VI-NEXT:    v_mov_b32_e32 v5, 2
+; VI-NEXT:    v_mov_b32_e32 v6, 3
+; VI-NEXT:    v_mov_b32_e32 v7, 4
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v4i64@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i64@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_v4i64:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b32 s0, 0
+; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    s_mov_b32 s1, s0
+; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    v_mov_b32_e32 v4, 1
+; CI-NEXT:    v_mov_b32_e32 v5, 2
+; CI-NEXT:    v_mov_b32_e32 v6, 3
+; CI-NEXT:    v_mov_b32_e32 v7, 4
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v4i64@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i64@rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_v4i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b32 s0, 0
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_mov_b32 s1, s0
+; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    v_mov_b32_e32 v4, 1
+; GFX9-NEXT:    v_mov_b32_e32 v5, 2
+; GFX9-NEXT:    v_mov_b32_e32 v6, 3
+; GFX9-NEXT:    v_mov_b32_e32 v7, 4
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v4i64@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i64@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_v4i64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mov_b32 s4, 0
+; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s6, -1
+; GFX11-NEXT:    s_mov_b32 s5, s4
+; GFX11-NEXT:    v_dual_mov_b32 v4, 1 :: v_dual_mov_b32 v5, 2
+; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[4:7], 0
+; GFX11-NEXT:    v_dual_mov_b32 v6, 3 :: v_dual_mov_b32 v7, 4
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v4i64@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v4i64@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_v4i64:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_mov_b32 s8, 0
+; HSA-NEXT:    s_add_u32 s0, s0, s9
+; HSA-NEXT:    s_mov_b32 s11, 0x1100f000
+; HSA-NEXT:    s_mov_b32 s10, -1
+; HSA-NEXT:    s_mov_b32 s9, s8
+; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    v_mov_b32_e32 v4, 1
+; HSA-NEXT:    v_mov_b32_e32 v5, 2
+; HSA-NEXT:    v_mov_b32_e32 v6, 3
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    v_mov_b32_e32 v7, 4
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v4i64@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v4i64@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   %load = load <2 x i64>, ptr addrspace(1) null
   %val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 17179869187>, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   call void @external_void_func_v4i64(<4 x i64> %val)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_f16_imm:
-; VI: v_mov_b32_e32 v0, 0x4400
-; CI: v_mov_b32_e32 v0, 4.0
-; GCN-NOT: v0
-; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 {
+; VI-LABEL: test_call_external_void_func_f16_imm:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    v_mov_b32_e32 v0, 0x4400
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_f16_imm:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    v_mov_b32_e32 v0, 4.0
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_f16_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4400
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_f16_imm:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0x4400
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_f16@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_f16@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_f16_imm:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s9
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    v_mov_b32_e32 v0, 0x4400
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_f16@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_f16@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   call void @external_void_func_f16(half 4.0)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_f32_imm:
-; GCN: v_mov_b32_e32 v0, 4.0
-; GCN-NOT: v0
-; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 {
+; VI-LABEL: test_call_external_void_func_f32_imm:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    v_mov_b32_e32 v0, 4.0
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_f32_imm:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    v_mov_b32_e32 v0, 4.0
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_f32_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 4.0
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_f32_imm:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_mov_b32_e32 v0, 4.0
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_f32@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_f32@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_f32_imm:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s9
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    v_mov_b32_e32 v0, 4.0
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_f32@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_f32@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   call void @external_void_func_f32(float 4.0)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_v2f32_imm:
-; GCN-DAG: v_mov_b32_e32 v0, 1.0
-; GCN-DAG: v_mov_b32_e32 v1, 2.0
-; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 {
+; VI-LABEL: test_call_external_void_func_v2f32_imm:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    v_mov_b32_e32 v0, 1.0
+; VI-NEXT:    v_mov_b32_e32 v1, 2.0
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_v2f32_imm:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    v_mov_b32_e32 v0, 1.0
+; CI-NEXT:    v_mov_b32_e32 v1, 2.0
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_v2f32_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 1.0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 2.0
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_v2f32_imm:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2f32@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2f32@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_v2f32_imm:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s9
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    v_mov_b32_e32 v0, 1.0
+; HSA-NEXT:    v_mov_b32_e32 v1, 2.0
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v2f32@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v2f32@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   call void @external_void_func_v2f32(<2 x float> <float 1.0, float 2.0>)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_v3f32_imm:
-; GCN-DAG: v_mov_b32_e32 v0, 1.0
-; GCN-DAG: v_mov_b32_e32 v1, 2.0
-; GCN-DAG: v_mov_b32_e32 v2, 4.0
-; GCN-NOT: v3,
-; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 {
+; VI-LABEL: test_call_external_void_func_v3f32_imm:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    v_mov_b32_e32 v0, 1.0
+; VI-NEXT:    v_mov_b32_e32 v1, 2.0
+; VI-NEXT:    v_mov_b32_e32 v2, 4.0
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_v3f32_imm:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    v_mov_b32_e32 v0, 1.0
+; CI-NEXT:    v_mov_b32_e32 v1, 2.0
+; CI-NEXT:    v_mov_b32_e32 v2, 4.0
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_v3f32_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 1.0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 2.0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 4.0
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_v3f32_imm:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0
+; GFX11-NEXT:    v_mov_b32_e32 v2, 4.0
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3f32@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3f32@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_v3f32_imm:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s9
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    v_mov_b32_e32 v0, 1.0
+; HSA-NEXT:    v_mov_b32_e32 v1, 2.0
+; HSA-NEXT:    v_mov_b32_e32 v2, 4.0
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v3f32@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v3f32@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   call void @external_void_func_v3f32(<3 x float> <float 1.0, float 2.0, float 4.0>)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_v5f32_imm:
-; GCN-DAG: v_mov_b32_e32 v0, 1.0
-; GCN-DAG: v_mov_b32_e32 v1, 2.0
-; GCN-DAG: v_mov_b32_e32 v2, 4.0
-; GCN-DAG: v_mov_b32_e32 v3, -1.0
-; GCN-DAG: v_mov_b32_e32 v4, 0.5
-; GCN-NOT: v5,
-; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 {
+; VI-LABEL: test_call_external_void_func_v5f32_imm:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    v_mov_b32_e32 v0, 1.0
+; VI-NEXT:    v_mov_b32_e32 v1, 2.0
+; VI-NEXT:    v_mov_b32_e32 v2, 4.0
+; VI-NEXT:    v_mov_b32_e32 v3, -1.0
+; VI-NEXT:    v_mov_b32_e32 v4, 0.5
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_v5f32_imm:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    v_mov_b32_e32 v0, 1.0
+; CI-NEXT:    v_mov_b32_e32 v1, 2.0
+; CI-NEXT:    v_mov_b32_e32 v2, 4.0
+; CI-NEXT:    v_mov_b32_e32 v3, -1.0
+; CI-NEXT:    v_mov_b32_e32 v4, 0.5
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_v5f32_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 1.0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 2.0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 4.0
+; GFX9-NEXT:    v_mov_b32_e32 v3, -1.0
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0.5
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_v5f32_imm:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0
+; GFX11-NEXT:    v_dual_mov_b32 v2, 4.0 :: v_dual_mov_b32 v3, -1.0
+; GFX11-NEXT:    v_mov_b32_e32 v4, 0.5
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v5f32@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v5f32@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_v5f32_imm:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s9
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    v_mov_b32_e32 v0, 1.0
+; HSA-NEXT:    v_mov_b32_e32 v1, 2.0
+; HSA-NEXT:    v_mov_b32_e32 v2, 4.0
+; HSA-NEXT:    v_mov_b32_e32 v3, -1.0
+; HSA-NEXT:    v_mov_b32_e32 v4, 0.5
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v5f32@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v5f32@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   call void @external_void_func_v5f32(<5 x float> <float 1.0, float 2.0, float 4.0, float -1.0, float 0.5>)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_f64_imm:
-; GCN: v_mov_b32_e32 v0, 0{{$}}
-; GCN: v_mov_b32_e32 v1, 0x40100000
-; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 {
+; VI-LABEL: test_call_external_void_func_f64_imm:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_f64_imm:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    v_mov_b32_e32 v0, 0
+; CI-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_f64_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_f64_imm:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40100000
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_f64@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_f64@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_f64_imm:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s9
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    v_mov_b32_e32 v0, 0
+; HSA-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_f64@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_f64@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   call void @external_void_func_f64(double 4.0)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_v2f64_imm:
-; GCN: v_mov_b32_e32 v0, 0{{$}}
-; GCN: v_mov_b32_e32 v1, 2.0
-; GCN: v_mov_b32_e32 v2, 0{{$}}
-; GCN: v_mov_b32_e32 v3, 0x40100000
-; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 {
+; VI-LABEL: test_call_external_void_func_v2f64_imm:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    v_mov_b32_e32 v1, 2.0
+; VI-NEXT:    v_mov_b32_e32 v2, 0
+; VI-NEXT:    v_mov_b32_e32 v3, 0x40100000
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_v2f64_imm:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    v_mov_b32_e32 v0, 0
+; CI-NEXT:    v_mov_b32_e32 v1, 2.0
+; CI-NEXT:    v_mov_b32_e32 v2, 0
+; CI-NEXT:    v_mov_b32_e32 v3, 0x40100000
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_v2f64_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 2.0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x40100000
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_v2f64_imm:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40100000
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2f64@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2f64@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_v2f64_imm:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s9
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    v_mov_b32_e32 v0, 0
+; HSA-NEXT:    v_mov_b32_e32 v1, 2.0
+; HSA-NEXT:    v_mov_b32_e32 v2, 0
+; HSA-NEXT:    v_mov_b32_e32 v3, 0x40100000
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v2f64@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v2f64@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   call void @external_void_func_v2f64(<2 x double> <double 2.0, double 4.0>)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_v3f64_imm:
-; GCN-DAG: v_mov_b32_e32 v0, 0{{$}}
-; GCN-DAG: v_mov_b32_e32 v1, 2.0
-; GCN-DAG: v_mov_b32_e32 v2, 0{{$}}
-; GCN-DAG: v_mov_b32_e32 v3, 0x40100000
-; GCN-DAG: v_mov_b32_e32 v4, 0{{$}}
-; GCN-DAG: v_mov_b32_e32 v5, 0x40200000
-; GCN-DAG: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 {
+; VI-LABEL: test_call_external_void_func_v3f64_imm:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    v_mov_b32_e32 v1, 2.0
+; VI-NEXT:    v_mov_b32_e32 v2, 0
+; VI-NEXT:    v_mov_b32_e32 v3, 0x40100000
+; VI-NEXT:    v_mov_b32_e32 v4, 0
+; VI-NEXT:    v_mov_b32_e32 v5, 0x40200000
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_v3f64_imm:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    v_mov_b32_e32 v0, 0
+; CI-NEXT:    v_mov_b32_e32 v1, 2.0
+; CI-NEXT:    v_mov_b32_e32 v2, 0
+; CI-NEXT:    v_mov_b32_e32 v3, 0x40100000
+; CI-NEXT:    v_mov_b32_e32 v4, 0
+; CI-NEXT:    v_mov_b32_e32 v5, 0x40200000
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_v3f64_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 2.0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0x40100000
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x40200000
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_v3f64_imm:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40100000
+; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x40200000
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3f64@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3f64@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_v3f64_imm:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s9
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    v_mov_b32_e32 v0, 0
+; HSA-NEXT:    v_mov_b32_e32 v1, 2.0
+; HSA-NEXT:    v_mov_b32_e32 v2, 0
+; HSA-NEXT:    v_mov_b32_e32 v3, 0x40100000
+; HSA-NEXT:    v_mov_b32_e32 v4, 0
+; HSA-NEXT:    v_mov_b32_e32 v5, 0x40200000
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v3f64@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v3f64@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   call void @external_void_func_v3f64(<3 x double> <double 2.0, double 4.0, double 8.0>)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_v2i16:
-; GFX9: buffer_load_dword v0
-; GFX9-NOT: v0
-; GFX9: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 {
+; VI-LABEL: test_call_external_void_func_v2i16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_v2i16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_v2i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_v2i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2i16@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2i16@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_v2i16:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
+; HSA-NEXT:    s_mov_b32 s6, -1
+; HSA-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; HSA-NEXT:    s_add_u32 s0, s0, s9
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v2i16@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v2i16@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   %val = load <2 x i16>, ptr addrspace(1) undef
   call void @external_void_func_v2i16(<2 x i16> %val)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_v3i16:
-; GFX9: buffer_load_dwordx2 v[0:1]
-; GFX9-NOT: v0
-; GFX9-NOT: v1
-; GFX9: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 {
+; VI-LABEL: test_call_external_void_func_v3i16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_v3i16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[0:3], 0
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_alignbit_b32 v1, v3, v2, 16
+; CI-NEXT:    v_mov_b32_e32 v0, v2
+; CI-NEXT:    v_mov_b32_e32 v2, v3
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_v3i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_v3i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3i16@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3i16@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_v3i16:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
+; HSA-NEXT:    s_mov_b32 s6, -1
+; HSA-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; HSA-NEXT:    s_add_u32 s0, s0, s9
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v3i16@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v3i16@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   %val = load <3 x i16>, ptr addrspace(1) undef
   call void @external_void_func_v3i16(<3 x i16> %val)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_v3f16:
-; GFX9: buffer_load_dwordx2 v[0:1]
-; GFX9-NOT: v0
-; GFX9-NOT: v1
-; GFX9: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 {
+; VI-LABEL: test_call_external_void_func_v3f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_v3f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    buffer_load_dwordx2 v[1:2], off, s[0:3], 0
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_cvt_f32_f16_e32 v0, v1
+; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_v3f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_v3f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3f16@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3f16@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_v3f16:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
+; HSA-NEXT:    s_mov_b32 s6, -1
+; HSA-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; HSA-NEXT:    s_add_u32 s0, s0, s9
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v3f16@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v3f16@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   %val = load <3 x half>, ptr addrspace(1) undef
   call void @external_void_func_v3f16(<3 x half> %val)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_v3i16_imm:
-; GFX9: v_mov_b32_e32 v0, 0x20001
-; GFX9: v_mov_b32_e32 v1, 3
-; GFX9: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 {
+; VI-LABEL: test_call_external_void_func_v3i16_imm:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    v_mov_b32_e32 v0, 0x20001
+; VI-NEXT:    v_mov_b32_e32 v1, 3
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_v3i16_imm:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    v_mov_b32_e32 v0, 1
+; CI-NEXT:    v_mov_b32_e32 v1, 2
+; CI-NEXT:    v_mov_b32_e32 v2, 3
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_v3i16_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x20001
+; GFX9-NEXT:    v_mov_b32_e32 v1, 3
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_v3i16_imm:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0x20001 :: v_dual_mov_b32 v1, 3
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3i16@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3i16@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_v3i16_imm:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s9
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    v_mov_b32_e32 v0, 0x20001
+; HSA-NEXT:    v_mov_b32_e32 v1, 3
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v3i16@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v3i16@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   call void @external_void_func_v3i16(<3 x i16> <i16 1, i16 2, i16 3>)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_v3f16_imm:
-; GFX9: v_mov_b32_e32 v0, 0x40003c00
-; GFX9: v_mov_b32_e32 v1, 0x4400
-; GFX9: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 {
+; VI-LABEL: test_call_external_void_func_v3f16_imm:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    v_mov_b32_e32 v0, 0x40003c00
+; VI-NEXT:    v_mov_b32_e32 v1, 0x4400
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_v3f16_imm:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    v_mov_b32_e32 v0, 1.0
+; CI-NEXT:    v_mov_b32_e32 v1, 2.0
+; CI-NEXT:    v_mov_b32_e32 v2, 4.0
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_v3f16_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x40003c00
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4400
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_v3f16_imm:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0x40003c00
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0x4400
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3f16@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3f16@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_v3f16_imm:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s9
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    v_mov_b32_e32 v0, 0x40003c00
+; HSA-NEXT:    v_mov_b32_e32 v1, 0x4400
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v3f16@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v3f16@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   call void @external_void_func_v3f16(<3 x half> <half 1.0, half 2.0, half 4.0>)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_v4i16:
-; GFX9: buffer_load_dwordx2 v[0:1]
-; GFX9-NOT: v0
-; GFX9-NOT: v1
-; GFX9: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 {
+; VI-LABEL: test_call_external_void_func_v4i16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_v4i16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; CI-NEXT:    v_mov_b32_e32 v2, v1
+; CI-NEXT:    v_mov_b32_e32 v1, v4
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_v4i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_v4i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v4i16@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v4i16@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_v4i16:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
+; HSA-NEXT:    s_mov_b32 s6, -1
+; HSA-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; HSA-NEXT:    s_add_u32 s0, s0, s9
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v4i16@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v4i16@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   %val = load <4 x i16>, ptr addrspace(1) undef
   call void @external_void_func_v4i16(<4 x i16> %val)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_v4i16_imm:
-; GFX9-DAG: v_mov_b32_e32 v0, 0x20001
-; GFX9-DAG: v_mov_b32_e32 v1, 0x40003
-; GFX9: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 {
+; VI-LABEL: test_call_external_void_func_v4i16_imm:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    v_mov_b32_e32 v0, 0x20001
+; VI-NEXT:    v_mov_b32_e32 v1, 0x40003
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_v4i16_imm:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    v_mov_b32_e32 v0, 1
+; CI-NEXT:    v_mov_b32_e32 v1, 2
+; CI-NEXT:    v_mov_b32_e32 v2, 3
+; CI-NEXT:    v_mov_b32_e32 v3, 4
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_v4i16_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x20001
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x40003
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_v4i16_imm:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0x20001
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0x40003
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v4i16@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v4i16@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_v4i16_imm:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s9
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    v_mov_b32_e32 v0, 0x20001
+; HSA-NEXT:    v_mov_b32_e32 v1, 0x40003
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v4i16@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v4i16@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   call void @external_void_func_v4i16(<4 x i16> <i16 1, i16 2, i16 3, i16 4>)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_v2f16:
-; GFX9: buffer_load_dword v0
-; GFX9-NOT: v0
-; GFX9: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 {
+; VI-LABEL: test_call_external_void_func_v2f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], 0
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_v2f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    buffer_load_dword v1, off, s[0:3], 0
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_cvt_f32_f16_e32 v0, v1
+; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], 0
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_v2f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2f16@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2f16@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_v2f16:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
+; HSA-NEXT:    s_mov_b32 s6, -1
+; HSA-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; HSA-NEXT:    s_add_u32 s0, s0, s9
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v2f16@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v2f16@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   %val = load <2 x half>, ptr addrspace(1) undef
   call void @external_void_func_v2f16(<2 x half> %val)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_v2i32:
-; GCN: buffer_load_dwordx2 v[0:1]
-; GCN-NOT: s_waitcnt
-; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 {
+; VI-LABEL: test_call_external_void_func_v2i32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_v2i32:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_v2i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_v2i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2i32@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2i32@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_v2i32:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
+; HSA-NEXT:    s_mov_b32 s6, -1
+; HSA-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; HSA-NEXT:    s_add_u32 s0, s0, s9
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v2i32@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v2i32@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   %val = load <2 x i32>, ptr addrspace(1) undef
   call void @external_void_func_v2i32(<2 x i32> %val)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_v2i32_imm:
-; GCN-DAG: v_mov_b32_e32 v0, 1
-; GCN-DAG: v_mov_b32_e32 v1, 2
-; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 {
+; VI-LABEL: test_call_external_void_func_v2i32_imm:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    v_mov_b32_e32 v0, 1
+; VI-NEXT:    v_mov_b32_e32 v1, 2
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_v2i32_imm:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    v_mov_b32_e32 v0, 1
+; CI-NEXT:    v_mov_b32_e32 v1, 2
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_v2i32_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-NEXT:    v_mov_b32_e32 v1, 2
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_v2i32_imm:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v2i32@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v2i32@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_v2i32_imm:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s9
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    v_mov_b32_e32 v0, 1
+; HSA-NEXT:    v_mov_b32_e32 v1, 2
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v2i32@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v2i32@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   call void @external_void_func_v2i32(<2 x i32> <i32 1, i32 2>)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_v3i32_imm: {{.*}}
-
-; GCN-NOT: v3{{$}}
-; GCN-DAG: v_mov_b32_e32 v0, 3
-; GCN-DAG: v_mov_b32_e32 v1, 4
-; GCN-DAG: v_mov_b32_e32 v2, 5
-
-; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 {
+; VI-LABEL: test_call_external_void_func_v3i32_imm:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s5
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    v_mov_b32_e32 v0, 3
+; VI-NEXT:    v_mov_b32_e32 v1, 4
+; VI-NEXT:    v_mov_b32_e32 v2, 5
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_v3i32_imm:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s5
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    v_mov_b32_e32 v0, 3
+; CI-NEXT:    v_mov_b32_e32 v1, 4
+; CI-NEXT:    v_mov_b32_e32 v2, 5
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_v3i32_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s5
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 3
+; GFX9-NEXT:    v_mov_b32_e32 v1, 4
+; GFX9-NEXT:    v_mov_b32_e32 v2, 5
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_v3i32_imm:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4
+; GFX11-NEXT:    v_mov_b32_e32 v2, 5
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3i32@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3i32@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_v3i32_imm:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s8, s8, s11
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s11
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    v_mov_b32_e32 v0, 3
+; HSA-NEXT:    v_mov_b32_e32 v1, 4
+; HSA-NEXT:    v_mov_b32_e32 v2, 5
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s9
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v3i32@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v3i32@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   call void @external_void_func_v3i32(<3 x i32> <i32 3, i32 4, i32 5>)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_v3i32_i32:
-; GCN-DAG: v_mov_b32_e32 v0, 3
-; GCN-DAG: v_mov_b32_e32 v1, 4
-; GCN-DAG: v_mov_b32_e32 v2, 5
-; GCN-DAG: v_mov_b32_e32 v3, 6
 define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 {
+; VI-LABEL: test_call_external_void_func_v3i32_i32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s5
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    v_mov_b32_e32 v0, 3
+; VI-NEXT:    v_mov_b32_e32 v1, 4
+; VI-NEXT:    v_mov_b32_e32 v2, 5
+; VI-NEXT:    v_mov_b32_e32 v3, 6
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_v3i32_i32:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s5
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    v_mov_b32_e32 v0, 3
+; CI-NEXT:    v_mov_b32_e32 v1, 4
+; CI-NEXT:    v_mov_b32_e32 v2, 5
+; CI-NEXT:    v_mov_b32_e32 v3, 6
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_v3i32_i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s5
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 3
+; GFX9-NEXT:    v_mov_b32_e32 v1, 4
+; GFX9-NEXT:    v_mov_b32_e32 v2, 5
+; GFX9-NEXT:    v_mov_b32_e32 v3, 6
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_v3i32_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4
+; GFX11-NEXT:    v_dual_mov_b32 v2, 5 :: v_dual_mov_b32 v3, 6
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v3i32_i32@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v3i32_i32@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_v3i32_i32:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s8, s8, s11
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s11
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    v_mov_b32_e32 v0, 3
+; HSA-NEXT:    v_mov_b32_e32 v1, 4
+; HSA-NEXT:    v_mov_b32_e32 v2, 5
+; HSA-NEXT:    v_mov_b32_e32 v3, 6
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s9
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v3i32_i32@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v3i32_i32@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   call void @external_void_func_v3i32_i32(<3 x i32> <i32 3, i32 4, i32 5>, i32 6)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_v4i32:
-; GCN: buffer_load_dwordx4 v[0:3]
-; GCN-NOT: s_waitcnt
-; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 {
+; VI-LABEL: test_call_external_void_func_v4i32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_v4i32:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_v4i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_v4i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v4i32@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v4i32@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_v4i32:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
+; HSA-NEXT:    s_mov_b32 s6, -1
+; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; HSA-NEXT:    s_add_u32 s0, s0, s9
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v4i32@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v4i32@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   %val = load <4 x i32>, ptr addrspace(1) undef
   call void @external_void_func_v4i32(<4 x i32> %val)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_v4i32_imm:
-; GCN-DAG: v_mov_b32_e32 v0, 1
-; GCN-DAG: v_mov_b32_e32 v1, 2
-; GCN-DAG: v_mov_b32_e32 v2, 3
-; GCN-DAG: v_mov_b32_e32 v3, 4
-; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 {
+; VI-LABEL: test_call_external_void_func_v4i32_imm:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    v_mov_b32_e32 v0, 1
+; VI-NEXT:    v_mov_b32_e32 v1, 2
+; VI-NEXT:    v_mov_b32_e32 v2, 3
+; VI-NEXT:    v_mov_b32_e32 v3, 4
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_v4i32_imm:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    v_mov_b32_e32 v0, 1
+; CI-NEXT:    v_mov_b32_e32 v1, 2
+; CI-NEXT:    v_mov_b32_e32 v2, 3
+; CI-NEXT:    v_mov_b32_e32 v3, 4
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_v4i32_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-NEXT:    v_mov_b32_e32 v1, 2
+; GFX9-NEXT:    v_mov_b32_e32 v2, 3
+; GFX9-NEXT:    v_mov_b32_e32 v3, 4
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_v4i32_imm:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
+; GFX11-NEXT:    v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v4i32@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v4i32@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_v4i32_imm:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s9
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    v_mov_b32_e32 v0, 1
+; HSA-NEXT:    v_mov_b32_e32 v1, 2
+; HSA-NEXT:    v_mov_b32_e32 v2, 3
+; HSA-NEXT:    v_mov_b32_e32 v3, 4
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v4i32@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v4i32@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   call void @external_void_func_v4i32(<4 x i32> <i32 1, i32 2, i32 3, i32 4>)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_v5i32_imm:
-; GCN-DAG: v_mov_b32_e32 v0, 1
-; GCN-DAG: v_mov_b32_e32 v1, 2
-; GCN-DAG: v_mov_b32_e32 v2, 3
-; GCN-DAG: v_mov_b32_e32 v3, 4
-; GCN-DAG: v_mov_b32_e32 v4, 5
-; GCN-NOT: v5,
-; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 {
+; VI-LABEL: test_call_external_void_func_v5i32_imm:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    v_mov_b32_e32 v0, 1
+; VI-NEXT:    v_mov_b32_e32 v1, 2
+; VI-NEXT:    v_mov_b32_e32 v2, 3
+; VI-NEXT:    v_mov_b32_e32 v3, 4
+; VI-NEXT:    v_mov_b32_e32 v4, 5
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_v5i32_imm:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    v_mov_b32_e32 v0, 1
+; CI-NEXT:    v_mov_b32_e32 v1, 2
+; CI-NEXT:    v_mov_b32_e32 v2, 3
+; CI-NEXT:    v_mov_b32_e32 v3, 4
+; CI-NEXT:    v_mov_b32_e32 v4, 5
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_v5i32_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-NEXT:    v_mov_b32_e32 v1, 2
+; GFX9-NEXT:    v_mov_b32_e32 v2, 3
+; GFX9-NEXT:    v_mov_b32_e32 v3, 4
+; GFX9-NEXT:    v_mov_b32_e32 v4, 5
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_v5i32_imm:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
+; GFX11-NEXT:    v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
+; GFX11-NEXT:    v_mov_b32_e32 v4, 5
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v5i32@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v5i32@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_v5i32_imm:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s9
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    v_mov_b32_e32 v0, 1
+; HSA-NEXT:    v_mov_b32_e32 v1, 2
+; HSA-NEXT:    v_mov_b32_e32 v2, 3
+; HSA-NEXT:    v_mov_b32_e32 v3, 4
+; HSA-NEXT:    v_mov_b32_e32 v4, 5
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v5i32@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v5i32@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   call void @external_void_func_v5i32(<5 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5>)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_v8i32:
-; GCN-DAG: buffer_load_dwordx4 v[0:3], off
-; GCN-DAG: buffer_load_dwordx4 v[4:7], off
-; GCN-NOT: s_waitcnt
-; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 {
+; VI-LABEL: test_call_external_void_func_v8i32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_v8i32:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; CI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_v8i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; GFX9-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_v8i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s6, -1
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v8i32@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v8i32@rel32@hi+12
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[4:7], 0
+; GFX11-NEXT:    buffer_load_b128 v[4:7], off, s[4:7], 0 offset:16
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_v8i32:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s9
+; HSA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; HSA-NEXT:    s_mov_b32 s11, 0x1100f000
+; HSA-NEXT:    s_mov_b32 s10, -1
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; HSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v8i32@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v8i32@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   %ptr = load ptr addrspace(1), ptr addrspace(4) undef
   %val = load <8 x i32>, ptr addrspace(1) %ptr
   call void @external_void_func_v8i32(<8 x i32> %val)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_v8i32_imm:
-; GCN-DAG: v_mov_b32_e32 v0, 1
-; GCN-DAG: v_mov_b32_e32 v1, 2
-; GCN-DAG: v_mov_b32_e32 v2, 3
-; GCN-DAG: v_mov_b32_e32 v3, 4
-; GCN-DAG: v_mov_b32_e32 v4, 5
-; GCN-DAG: v_mov_b32_e32 v5, 6
-; GCN-DAG: v_mov_b32_e32 v6, 7
-; GCN-DAG: v_mov_b32_e32 v7, 8
-; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 {
+; VI-LABEL: test_call_external_void_func_v8i32_imm:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    v_mov_b32_e32 v0, 1
+; VI-NEXT:    v_mov_b32_e32 v1, 2
+; VI-NEXT:    v_mov_b32_e32 v2, 3
+; VI-NEXT:    v_mov_b32_e32 v3, 4
+; VI-NEXT:    v_mov_b32_e32 v4, 5
+; VI-NEXT:    v_mov_b32_e32 v5, 6
+; VI-NEXT:    v_mov_b32_e32 v6, 7
+; VI-NEXT:    v_mov_b32_e32 v7, 8
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_v8i32_imm:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    v_mov_b32_e32 v0, 1
+; CI-NEXT:    v_mov_b32_e32 v1, 2
+; CI-NEXT:    v_mov_b32_e32 v2, 3
+; CI-NEXT:    v_mov_b32_e32 v3, 4
+; CI-NEXT:    v_mov_b32_e32 v4, 5
+; CI-NEXT:    v_mov_b32_e32 v5, 6
+; CI-NEXT:    v_mov_b32_e32 v6, 7
+; CI-NEXT:    v_mov_b32_e32 v7, 8
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_v8i32_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-NEXT:    v_mov_b32_e32 v1, 2
+; GFX9-NEXT:    v_mov_b32_e32 v2, 3
+; GFX9-NEXT:    v_mov_b32_e32 v3, 4
+; GFX9-NEXT:    v_mov_b32_e32 v4, 5
+; GFX9-NEXT:    v_mov_b32_e32 v5, 6
+; GFX9-NEXT:    v_mov_b32_e32 v6, 7
+; GFX9-NEXT:    v_mov_b32_e32 v7, 8
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_v8i32_imm:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2
+; GFX11-NEXT:    v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4
+; GFX11-NEXT:    v_dual_mov_b32 v4, 5 :: v_dual_mov_b32 v5, 6
+; GFX11-NEXT:    v_dual_mov_b32 v6, 7 :: v_dual_mov_b32 v7, 8
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v8i32@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v8i32@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_v8i32_imm:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s9
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    v_mov_b32_e32 v0, 1
+; HSA-NEXT:    v_mov_b32_e32 v1, 2
+; HSA-NEXT:    v_mov_b32_e32 v2, 3
+; HSA-NEXT:    v_mov_b32_e32 v3, 4
+; HSA-NEXT:    v_mov_b32_e32 v4, 5
+; HSA-NEXT:    v_mov_b32_e32 v5, 6
+; HSA-NEXT:    v_mov_b32_e32 v6, 7
+; HSA-NEXT:    v_mov_b32_e32 v7, 8
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v8i32@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v8i32@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   call void @external_void_func_v8i32(<8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_v16i32:
-; GCN-DAG: buffer_load_dwordx4 v[0:3], off
-; GCN-DAG: buffer_load_dwordx4 v[4:7], off
-; GCN-DAG: buffer_load_dwordx4 v[8:11], off
-; GCN-DAG: buffer_load_dwordx4 v[12:15], off
-; GCN-NOT: s_waitcnt
-; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 {
+; VI-LABEL: test_call_external_void_func_v16i32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; VI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; VI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_v16i32:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; CI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; CI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; CI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_v16i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; GFX9-NEXT:    buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; GFX9-NEXT:    buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; GFX9-NEXT:    buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_v16i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s6, -1
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v16i32@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v16i32@rel32@hi+12
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[4:7], 0
+; GFX11-NEXT:    buffer_load_b128 v[4:7], off, s[4:7], 0 offset:16
+; GFX11-NEXT:    buffer_load_b128 v[8:11], off, s[4:7], 0 offset:32
+; GFX11-NEXT:    buffer_load_b128 v[12:15], off, s[4:7], 0 offset:48
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_v16i32:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s9
+; HSA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; HSA-NEXT:    s_mov_b32 s11, 0x1100f000
+; HSA-NEXT:    s_mov_b32 s10, -1
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; HSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; HSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
+; HSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v16i32@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v16i32@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   %ptr = load ptr addrspace(1), ptr addrspace(4) undef
   %val = load <16 x i32>, ptr addrspace(1) %ptr
   call void @external_void_func_v16i32(<16 x i32> %val)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_v32i32:
-; GCN-DAG: buffer_load_dwordx4 v[0:3], off
-; GCN-DAG: buffer_load_dwordx4 v[4:7], off
-; GCN-DAG: buffer_load_dwordx4 v[8:11], off
-; GCN-DAG: buffer_load_dwordx4 v[12:15], off
-; GCN-DAG: buffer_load_dwordx4 v[16:19], off
-; GCN-DAG: buffer_load_dwordx4 v[20:23], off
-; GCN-DAG: buffer_load_dwordx4 v[24:27], off
-; GCN-DAG: buffer_load_dwordx4 v[28:31], off
-; GCN: buffer_store_dword v31, off, s{{\[[0-9]+:[0-9]+\]}}, s32
-; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
+; VI-LABEL: test_call_external_void_func_v32i32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112
+; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; VI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32
+; VI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
+; VI-NEXT:    buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
+; VI-NEXT:    buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
+; VI-NEXT:    buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    s_getpc_b64 s[8:9]
+; VI-NEXT:    s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12
+; VI-NEXT:    s_waitcnt vmcnt(7)
+; VI-NEXT:    buffer_store_dword v31, off, s[36:39], s32
+; VI-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_v32i32:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112
+; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; CI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; CI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32
+; CI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
+; CI-NEXT:    buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
+; CI-NEXT:    buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
+; CI-NEXT:    buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    s_getpc_b64 s[8:9]
+; CI-NEXT:    s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12
+; CI-NEXT:    s_waitcnt vmcnt(7)
+; CI-NEXT:    buffer_store_dword v31, off, s[36:39], s32
+; CI-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_v32i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112
+; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GFX9-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GFX9-NEXT:    buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32
+; GFX9-NEXT:    buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
+; GFX9-NEXT:    buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
+; GFX9-NEXT:    buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
+; GFX9-NEXT:    buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_getpc_b64 s[8:9]
+; GFX9-NEXT:    s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12
+; GFX9-NEXT:    s_waitcnt vmcnt(7)
+; GFX9-NEXT:    buffer_store_dword v31, off, s[36:39], s32
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_v32i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s6, -1
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v32i32@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v32i32@rel32@hi+12
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x7
+; GFX11-NEXT:    buffer_load_b128 v[28:31], off, s[4:7], 0 offset:112
+; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[4:7], 0
+; GFX11-NEXT:    buffer_load_b128 v[4:7], off, s[4:7], 0 offset:16
+; GFX11-NEXT:    buffer_load_b128 v[8:11], off, s[4:7], 0 offset:32
+; GFX11-NEXT:    buffer_load_b128 v[12:15], off, s[4:7], 0 offset:48
+; GFX11-NEXT:    buffer_load_b128 v[16:19], off, s[4:7], 0 offset:64
+; GFX11-NEXT:    buffer_load_b128 v[20:23], off, s[4:7], 0 offset:80
+; GFX11-NEXT:    buffer_load_b128 v[24:27], off, s[4:7], 0 offset:96
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-NEXT:    scratch_store_b32 off, v31, s32
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_v32i32:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s9
+; HSA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; HSA-NEXT:    s_mov_b32 s11, 0x1100f000
+; HSA-NEXT:    s_mov_b32 s10, -1
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; HSA-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:112
+; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; HSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; HSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
+; HSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48
+; HSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:64
+; HSA-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:80
+; HSA-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:96
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    s_getpc_b64 s[12:13]
+; HSA-NEXT:    s_add_u32 s12, s12, external_void_func_v32i32@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s13, s13, external_void_func_v32i32@rel32@hi+12
+; HSA-NEXT:    s_waitcnt vmcnt(7)
+; HSA-NEXT:    buffer_store_dword v31, off, s[0:3], s32
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[12:13]
+; HSA-NEXT:    s_endpgm
   %ptr = load ptr addrspace(1), ptr addrspace(4) undef
   %val = load <32 x i32>, ptr addrspace(1) %ptr
   call void @external_void_func_v32i32(<32 x i32> %val)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_v32i32_i32:
-; HSA-NOT: s_add_u32 s32
-
-; MESA-NOT: s_add_u32 s32
-
-; GCN-DAG: buffer_load_dword [[VAL1:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-; GCN-DAG: buffer_load_dwordx4 v[0:3], off
-; GCN-DAG: buffer_load_dwordx4 v[4:7], off
-; GCN-DAG: buffer_load_dwordx4 v[8:11], off
-; GCN-DAG: buffer_load_dwordx4 v[12:15], off
-; GCN-DAG: buffer_load_dwordx4 v[16:19], off
-; GCN-DAG: buffer_load_dwordx4 v[20:23], off
-; GCN-DAG: buffer_load_dwordx4 v[24:27], off
-; GCN-DAG: buffer_load_dwordx4 v[28:31], off
-
-; GCN: s_waitcnt
-; GCN-DAG: buffer_store_dword [[VAL1]], off, s[{{[0-9]+}}:{{[0-9]+}}], s32 offset:4{{$}}
-; GCN-DAG: buffer_store_dword v31, off, s[{{[0-9]+}}:{{[0-9]+}}], s32{{$}}
-; GCN: s_swappc_b64
-; GCN-NEXT: s_endpgm
 define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
+; VI-LABEL: test_call_external_void_func_v32i32_i32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s5
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    buffer_load_dword v32, off, s[4:7], 0
+; VI-NEXT:    buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112
+; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; VI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32
+; VI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
+; VI-NEXT:    buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
+; VI-NEXT:    buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
+; VI-NEXT:    buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12
+; VI-NEXT:    s_waitcnt vmcnt(8)
+; VI-NEXT:    buffer_store_dword v32, off, s[36:39], s32 offset:4
+; VI-NEXT:    s_waitcnt vmcnt(8)
+; VI-NEXT:    buffer_store_dword v31, off, s[36:39], s32
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_v32i32_i32:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s5
+; CI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_load_dword v32, off, s[4:7], 0
+; CI-NEXT:    buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112
+; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; CI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; CI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32
+; CI-NEXT:    buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
+; CI-NEXT:    buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
+; CI-NEXT:    buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
+; CI-NEXT:    buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12
+; CI-NEXT:    s_waitcnt vmcnt(8)
+; CI-NEXT:    buffer_store_dword v32, off, s[36:39], s32 offset:4
+; CI-NEXT:    s_waitcnt vmcnt(8)
+; CI-NEXT:    buffer_store_dword v31, off, s[36:39], s32
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_v32i32_i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s5
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v32, off, s[4:7], 0
+; GFX9-NEXT:    buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112
+; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GFX9-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GFX9-NEXT:    buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32
+; GFX9-NEXT:    buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
+; GFX9-NEXT:    buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
+; GFX9-NEXT:    buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
+; GFX9-NEXT:    buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12
+; GFX9-NEXT:    s_waitcnt vmcnt(8)
+; GFX9-NEXT:    buffer_store_dword v32, off, s[36:39], s32 offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(8)
+; GFX9-NEXT:    buffer_store_dword v31, off, s[36:39], s32
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_v32i32_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s6, -1
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v32i32_i32@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v32i32_i32@rel32@hi+12
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x8
+; GFX11-NEXT:    buffer_load_b128 v[28:31], off, s[4:7], 0 offset:112
+; GFX11-NEXT:    buffer_load_b32 v32, off, s[4:7], 0
+; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[4:7], 0
+; GFX11-NEXT:    buffer_load_b128 v[4:7], off, s[4:7], 0 offset:16
+; GFX11-NEXT:    buffer_load_b128 v[8:11], off, s[4:7], 0 offset:32
+; GFX11-NEXT:    buffer_load_b128 v[12:15], off, s[4:7], 0 offset:48
+; GFX11-NEXT:    buffer_load_b128 v[16:19], off, s[4:7], 0 offset:64
+; GFX11-NEXT:    buffer_load_b128 v[20:23], off, s[4:7], 0 offset:80
+; GFX11-NEXT:    buffer_load_b128 v[24:27], off, s[4:7], 0 offset:96
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_add_i32 s4, s32, 4
+; GFX11-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-NEXT:    scratch_store_b32 off, v31, s32
+; GFX11-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-NEXT:    scratch_store_b32 off, v32, s4
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_v32i32_i32:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s8, s8, s11
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s9
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
+; HSA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; HSA-NEXT:    s_add_u32 s0, s0, s11
+; HSA-NEXT:    s_mov_b32 s11, 0x1100f000
+; HSA-NEXT:    s_mov_b32 s10, -1
+; HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; HSA-NEXT:    buffer_load_dword v32, off, s[8:11], 0
+; HSA-NEXT:    buffer_load_dwordx4 v[28:31], off, s[8:11], 0 offset:112
+; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; HSA-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
+; HSA-NEXT:    buffer_load_dwordx4 v[8:11], off, s[8:11], 0 offset:32
+; HSA-NEXT:    buffer_load_dwordx4 v[12:15], off, s[8:11], 0 offset:48
+; HSA-NEXT:    buffer_load_dwordx4 v[16:19], off, s[8:11], 0 offset:64
+; HSA-NEXT:    buffer_load_dwordx4 v[20:23], off, s[8:11], 0 offset:80
+; HSA-NEXT:    buffer_load_dwordx4 v[24:27], off, s[8:11], 0 offset:96
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v32i32_i32@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v32i32_i32@rel32@hi+12
+; HSA-NEXT:    s_waitcnt vmcnt(8)
+; HSA-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:4
+; HSA-NEXT:    s_waitcnt vmcnt(8)
+; HSA-NEXT:    buffer_store_dword v31, off, s[0:3], s32
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   %ptr0 = load ptr addrspace(1), ptr addrspace(4) undef
   %val0 = load <32 x i32>, ptr addrspace(1) %ptr0
   %val1 = load i32, ptr addrspace(1) undef
@@ -623,54 +4593,366 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_i32_func_i32_imm:
-; GCN: v_mov_b32_e32 v0, 42
-; GCN: s_swappc_b64 s[30:31],
-; GCN-NOT: s_waitcnt
-; GCN: buffer_store_dword v0, off, s[36:39], 0
 define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %out) #0 {
+; VI-LABEL: test_call_external_i32_func_i32_imm:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s42, -1
+; VI-NEXT:    s_mov_b32 s43, 0xe80000
+; VI-NEXT:    s_add_u32 s40, s40, s5
+; VI-NEXT:    s_load_dwordx2 s[36:37], s[2:3], 0x24
+; VI-NEXT:    s_addc_u32 s41, s41, 0
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b64 s[0:1], s[40:41]
+; VI-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; VI-NEXT:    v_mov_b32_e32 v0, 42
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_mov_b32 s39, 0xf000
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    buffer_store_dword v0, off, s[36:39], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_i32_func_i32_imm:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s42, -1
+; CI-NEXT:    s_mov_b32 s43, 0xe8f000
+; CI-NEXT:    s_add_u32 s40, s40, s5
+; CI-NEXT:    s_load_dwordx2 s[36:37], s[2:3], 0x9
+; CI-NEXT:    s_addc_u32 s41, s41, 0
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b64 s[0:1], s[40:41]
+; CI-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; CI-NEXT:    v_mov_b32_e32 v0, 42
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_mov_b32 s39, 0xf000
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    buffer_store_dword v0, off, s[36:39], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_i32_func_i32_imm:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s42, -1
+; GFX9-NEXT:    s_mov_b32 s43, 0xe00000
+; GFX9-NEXT:    s_add_u32 s40, s40, s5
+; GFX9-NEXT:    s_load_dwordx2 s[36:37], s[2:3], 0x24
+; GFX9-NEXT:    s_addc_u32 s41, s41, 0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[40:41]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[42:43]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_mov_b32 s39, 0xf000
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    buffer_store_dword v0, off, s[36:39], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_i32_func_i32_imm:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[36:37], s[2:3], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v0, 42
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_mov_b32 s39, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s38, -1
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_i32_func_i32@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_i32_func_i32@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    buffer_store_b32 v0, off, s[36:39], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_i32_func_i32_imm:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s8, s8, s11
+; HSA-NEXT:    s_load_dwordx2 s[36:37], s[6:7], 0x0
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s11
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    v_mov_b32_e32 v0, 42
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s9
+; HSA-NEXT:    s_mov_b32 s39, 0x1100f000
+; HSA-NEXT:    s_mov_b32 s38, -1
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_i32_func_i32@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_i32_func_i32@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    buffer_store_dword v0, off, s[36:39], 0
+; HSA-NEXT:    s_waitcnt vmcnt(0)
+; HSA-NEXT:    s_endpgm
   %val = call i32 @external_i32_func_i32(i32 42)
   store volatile i32 %val, ptr addrspace(1) %out
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_struct_i8_i32:
-; GCN: buffer_load_ubyte v0, off
-; GCN: buffer_load_dword v1, off
-; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 {
+; VI-LABEL: test_call_external_void_func_struct_i8_i32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:4
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_struct_i8_i32@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_struct_i8_i32:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
+; CI-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:4
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_struct_i8_i32@rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_struct_i8_i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0
+; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:4
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_struct_i8_i32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_struct_i8_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s6, -1
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_struct_i8_i32@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_struct_i8_i32@rel32@hi+12
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    buffer_load_u8 v0, off, s[4:7], 0
+; GFX11-NEXT:    buffer_load_b32 v1, off, s[4:7], 0 offset:4
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_struct_i8_i32:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s9
+; HSA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; HSA-NEXT:    s_mov_b32 s11, 0x1100f000
+; HSA-NEXT:    s_mov_b32 s10, -1
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; HSA-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
+; HSA-NEXT:    buffer_load_dword v1, off, s[8:11], 0 offset:4
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_struct_i8_i32@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_struct_i8_i32@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   %ptr0 = load ptr addrspace(1), ptr addrspace(4) undef
   %val = load { i8, i32 }, ptr addrspace(1) %ptr0
   call void @external_void_func_struct_i8_i32({ i8, i32 } %val)
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_byval_struct_i8_i32:
-; GCN-DAG: v_mov_b32_e32 [[VAL0:v[0-9]+]], 3
-; GCN-DAG: v_mov_b32_e32 [[VAL1:v[0-9]+]], 8
-; MESA-DAG: buffer_store_byte [[VAL0]], off, s[36:39], 0 offset:8
-; MESA-DAG: buffer_store_dword [[VAL1]], off, s[36:39], 0 offset:12
-
-; HSA-DAG: buffer_store_byte [[VAL0]], off, s[0:3], 0 offset:8
-; HSA-DAG: buffer_store_dword [[VAL1]], off, s[0:3], 0 offset:12
-
-; HSA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[0:3], 0 offset:12
-; HSA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[0:3], 0 offset:8
-
-; MESA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[36:39], 0 offset:12
-; MESA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[36:39], 0 offset:8
-
-; GCN-DAG: s_movk_i32 [[SP:s[0-9]+]], 0x400{{$}}
-
-; HSA-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s[0:3], [[SP]]{{$}}
-; HSA-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s[0:3], [[SP]] offset:4
-
-; MESA-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s[36:39], [[SP]]{{$}}
-; MESA-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s[36:39], [[SP]] offset:4
-
-; GCN-NEXT: s_swappc_b64
-; GCN-NOT: [[SP]]
 define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 {
+; VI-LABEL: test_call_external_void_func_byval_struct_i8_i32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    v_mov_b32_e32 v0, 3
+; VI-NEXT:    buffer_store_byte v0, off, s[36:39], 0 offset:8
+; VI-NEXT:    v_mov_b32_e32 v0, 8
+; VI-NEXT:    buffer_store_dword v0, off, s[36:39], 0 offset:12
+; VI-NEXT:    buffer_load_dword v0, off, s[36:39], 0 offset:12
+; VI-NEXT:    buffer_load_dword v1, off, s[36:39], 0 offset:8
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_movk_i32 s32, 0x400
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    buffer_store_dword v0, off, s[36:39], s32 offset:4
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    buffer_store_dword v1, off, s[36:39], s32
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_byval_struct_i8_i32:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    v_mov_b32_e32 v0, 3
+; CI-NEXT:    buffer_store_byte v0, off, s[36:39], 0 offset:8
+; CI-NEXT:    v_mov_b32_e32 v0, 8
+; CI-NEXT:    buffer_store_dword v0, off, s[36:39], 0 offset:12
+; CI-NEXT:    buffer_load_dword v0, off, s[36:39], 0 offset:12
+; CI-NEXT:    buffer_load_dword v1, off, s[36:39], 0 offset:8
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_movk_i32 s32, 0x400
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12
+; CI-NEXT:    s_waitcnt vmcnt(1)
+; CI-NEXT:    buffer_store_dword v0, off, s[36:39], s32 offset:4
+; CI-NEXT:    s_waitcnt vmcnt(1)
+; CI-NEXT:    buffer_store_dword v1, off, s[36:39], s32
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_byval_struct_i8_i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 3
+; GFX9-NEXT:    buffer_store_byte v0, off, s[36:39], 0 offset:8
+; GFX9-NEXT:    v_mov_b32_e32 v0, 8
+; GFX9-NEXT:    buffer_store_dword v0, off, s[36:39], 0 offset:12
+; GFX9-NEXT:    buffer_load_dword v0, off, s[36:39], 0 offset:12
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_load_dword v1, off, s[36:39], 0 offset:8
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_movk_i32 s32, 0x400
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    buffer_store_dword v0, off, s[36:39], s32 offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    buffer_store_dword v1, off, s[36:39], s32
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_byval_struct_i8_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8
+; GFX11-NEXT:    s_mov_b32 s32, 16
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_byval_struct_i8_i32@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_byval_struct_i8_i32@rel32@hi+12
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b8 off, v0, off offset:8
+; GFX11-NEXT:    scratch_store_b32 off, v1, off offset:12
+; GFX11-NEXT:    scratch_load_b64 v[0:1], off, off offset:8
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    scratch_store_b64 off, v[0:1], s32
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_byval_struct_i8_i32:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s9
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    v_mov_b32_e32 v0, 3
+; HSA-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:8
+; HSA-NEXT:    v_mov_b32_e32 v0, 8
+; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:12
+; HSA-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:12
+; HSA-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:8
+; HSA-NEXT:    s_movk_i32 s32, 0x400
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_byval_struct_i8_i32@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_byval_struct_i8_i32@rel32@hi+12
+; HSA-NEXT:    s_waitcnt vmcnt(1)
+; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
+; HSA-NEXT:    s_waitcnt vmcnt(1)
+; HSA-NEXT:    buffer_store_dword v1, off, s[0:3], s32
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   %val = alloca { i8, i32 }, align 8, addrspace(5)
   %gep0 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %val, i32 0, i32 0
   %gep1 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %val, i32 0, i32 1
@@ -680,28 +4962,186 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
-; GCN-DAG: s_movk_i32 [[SP:s[0-9]+]], 0x800{{$}}
-
-; GCN-DAG: v_mov_b32_e32 [[VAL0:v[0-9]+]], 3
-; GCN-DAG: v_mov_b32_e32 [[VAL1:v[0-9]+]], 8
-; GCN-DAG: buffer_store_byte [[VAL0]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
-; GCN-DAG: buffer_store_dword [[VAL1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12
-
-; GCN-DAG: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
-; GCN-DAG: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12
-
-; GCN-NOT: s_add_u32 [[SP]]
-; GCN-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]]{{$}}
-; GCN-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:4
-; GCN: s_swappc_b64
-; GCN-DAG: buffer_load_ubyte [[LOAD_OUT_VAL0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16
-; GCN-DAG: buffer_load_dword [[LOAD_OUT_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:20
-; GCN-NOT: s_sub_u32 [[SP]]
-
-; GCN: buffer_store_byte [[LOAD_OUT_VAL0]], off
-; GCN: buffer_store_dword [[LOAD_OUT_VAL1]], off
 define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32(i32) #0 {
+; VI-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s5
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    v_mov_b32_e32 v0, 3
+; VI-NEXT:    buffer_store_byte v0, off, s[36:39], 0 offset:8
+; VI-NEXT:    v_mov_b32_e32 v0, 8
+; VI-NEXT:    buffer_store_dword v0, off, s[36:39], 0 offset:12
+; VI-NEXT:    buffer_load_dword v0, off, s[36:39], 0 offset:12
+; VI-NEXT:    buffer_load_dword v1, off, s[36:39], 0 offset:8
+; VI-NEXT:    s_movk_i32 s32, 0x800
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    buffer_store_dword v0, off, s[36:39], s32 offset:4
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    buffer_store_dword v1, off, s[36:39], s32
+; VI-NEXT:    v_mov_b32_e32 v0, 16
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    buffer_load_ubyte v0, off, s[36:39], 0 offset:16
+; VI-NEXT:    buffer_load_dword v1, off, s[36:39], 0 offset:20
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s5
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    v_mov_b32_e32 v0, 3
+; CI-NEXT:    buffer_store_byte v0, off, s[36:39], 0 offset:8
+; CI-NEXT:    v_mov_b32_e32 v0, 8
+; CI-NEXT:    buffer_store_dword v0, off, s[36:39], 0 offset:12
+; CI-NEXT:    buffer_load_dword v0, off, s[36:39], 0 offset:12
+; CI-NEXT:    buffer_load_dword v1, off, s[36:39], 0 offset:8
+; CI-NEXT:    s_movk_i32 s32, 0x800
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12
+; CI-NEXT:    s_waitcnt vmcnt(1)
+; CI-NEXT:    buffer_store_dword v0, off, s[36:39], s32 offset:4
+; CI-NEXT:    s_waitcnt vmcnt(1)
+; CI-NEXT:    buffer_store_dword v1, off, s[36:39], s32
+; CI-NEXT:    v_mov_b32_e32 v0, 16
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    buffer_load_ubyte v0, off, s[36:39], 0 offset:16
+; CI-NEXT:    buffer_load_dword v1, off, s[36:39], 0 offset:20
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    s_waitcnt vmcnt(1)
+; CI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dword v1, off, s[0:3], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s5
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 3
+; GFX9-NEXT:    buffer_store_byte v0, off, s[36:39], 0 offset:8
+; GFX9-NEXT:    v_mov_b32_e32 v0, 8
+; GFX9-NEXT:    buffer_store_dword v0, off, s[36:39], 0 offset:12
+; GFX9-NEXT:    buffer_load_dword v0, off, s[36:39], 0 offset:12
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_load_dword v1, off, s[36:39], 0 offset:8
+; GFX9-NEXT:    s_movk_i32 s32, 0x800
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    buffer_store_dword v0, off, s[36:39], s32 offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    buffer_store_dword v1, off, s[36:39], s32
+; GFX9-NEXT:    v_mov_b32_e32 v0, 16
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    buffer_load_ubyte v0, off, s[36:39], 0 offset:16
+; GFX9-NEXT:    buffer_load_dword v1, off, s[36:39], 0 offset:20
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v1, off, s[0:3], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8
+; GFX11-NEXT:    s_mov_b32 s32, 32
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b8 off, v0, off offset:8
+; GFX11-NEXT:    scratch_store_b32 off, v1, off offset:12
+; GFX11-NEXT:    scratch_load_b64 v[0:1], off, off offset:8
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    scratch_store_b64 off, v[0:1], s32
+; GFX11-NEXT:    v_mov_b32_e32 v0, 16
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_u8 v0, off, off offset:16
+; GFX11-NEXT:    scratch_load_b32 v1, off, off offset:20
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    buffer_store_b8 v0, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_store_b32 v1, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s8, s8, s11
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s11
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    v_mov_b32_e32 v0, 3
+; HSA-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:8
+; HSA-NEXT:    v_mov_b32_e32 v0, 8
+; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:12
+; HSA-NEXT:    buffer_load_dword v0, off, s[0:3], 0 offset:12
+; HSA-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:8
+; HSA-NEXT:    s_movk_i32 s32, 0x800
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s9
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12
+; HSA-NEXT:    s_waitcnt vmcnt(1)
+; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
+; HSA-NEXT:    s_waitcnt vmcnt(1)
+; HSA-NEXT:    buffer_store_dword v1, off, s[0:3], s32
+; HSA-NEXT:    v_mov_b32_e32 v0, 16
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    buffer_load_ubyte v0, off, s[0:3], 0 offset:16
+; HSA-NEXT:    buffer_load_dword v1, off, s[0:3], 0 offset:20
+; HSA-NEXT:    s_mov_b32 s7, 0x1100f000
+; HSA-NEXT:    s_mov_b32 s6, -1
+; HSA-NEXT:    s_waitcnt vmcnt(1)
+; HSA-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; HSA-NEXT:    s_waitcnt vmcnt(0)
+; HSA-NEXT:    buffer_store_dword v1, off, s[4:7], 0
+; HSA-NEXT:    s_waitcnt vmcnt(0)
+; HSA-NEXT:    s_endpgm
   %in.val = alloca { i8, i32 }, align 8, addrspace(5)
   %out.val = alloca { i8, i32 }, align 8, addrspace(5)
   %in.gep0 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %in.val, i32 0, i32 0
@@ -719,74 +5159,1000 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
   ret void
 }
 
-; GCN-LABEL: {{^}}test_call_external_void_func_v16i8:
 define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 {
+; VI-LABEL: test_call_external_void_func_v16i8:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; VI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s38, -1
+; VI-NEXT:    s_mov_b32 s39, 0xe80000
+; VI-NEXT:    s_add_u32 s36, s36, s3
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; VI-NEXT:    s_addc_u32 s37, s37, 0
+; VI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; VI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
+; VI-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
+; VI-NEXT:    v_lshrrev_b32_e32 v18, 24, v0
+; VI-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; VI-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; VI-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; VI-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; VI-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
+; VI-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
+; VI-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
+; VI-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
+; VI-NEXT:    v_mov_b32_e32 v4, v1
+; VI-NEXT:    v_mov_b32_e32 v8, v2
+; VI-NEXT:    v_mov_b32_e32 v12, v3
+; VI-NEXT:    v_mov_b32_e32 v1, v16
+; VI-NEXT:    v_mov_b32_e32 v2, v17
+; VI-NEXT:    v_mov_b32_e32 v3, v18
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: test_call_external_void_func_v16i8:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; CI-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s38, -1
+; CI-NEXT:    s_mov_b32 s39, 0xe8f000
+; CI-NEXT:    s_add_u32 s36, s36, s3
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; CI-NEXT:    s_addc_u32 s37, s37, 0
+; CI-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; CI-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
+; CI-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
+; CI-NEXT:    v_lshrrev_b32_e32 v18, 24, v0
+; CI-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; CI-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; CI-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; CI-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; CI-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; CI-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
+; CI-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
+; CI-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
+; CI-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
+; CI-NEXT:    v_mov_b32_e32 v4, v1
+; CI-NEXT:    v_mov_b32_e32 v8, v2
+; CI-NEXT:    v_mov_b32_e32 v12, v3
+; CI-NEXT:    v_mov_b32_e32 v1, v16
+; CI-NEXT:    v_mov_b32_e32 v2, v17
+; CI-NEXT:    v_mov_b32_e32 v3, v18
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: test_call_external_void_func_v16i8:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX9-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s38, -1
+; GFX9-NEXT:    s_mov_b32 s39, 0xe00000
+; GFX9-NEXT:    s_add_u32 s36, s36, s3
+; GFX9-NEXT:    s_mov_b32 s3, 0xf000
+; GFX9-NEXT:    s_mov_b32 s2, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; GFX9-NEXT:    s_addc_u32 s37, s37, 0
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v18, 24, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
+; GFX9-NEXT:    v_mov_b32_e32 v4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v8, v2
+; GFX9-NEXT:    v_mov_b32_e32 v12, v3
+; GFX9-NEXT:    v_mov_b32_e32 v1, v16
+; GFX9-NEXT:    v_mov_b32_e32 v2, v17
+; GFX9-NEXT:    v_mov_b32_e32 v3, v18
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: test_call_external_void_func_v16i8:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[4:5], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s6, -1
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, external_void_func_v16i8@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, external_void_func_v16i8@rel32@hi+12
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[4:7], 0
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 24, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
+; GFX11-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v16
+; GFX11-NEXT:    v_mov_b32_e32 v8, v2
+; GFX11-NEXT:    v_dual_mov_b32 v12, v3 :: v_dual_mov_b32 v3, v18
+; GFX11-NEXT:    v_mov_b32_e32 v2, v17
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: test_call_external_void_func_v16i8:
+; HSA:       ; %bb.0:
+; HSA-NEXT:    s_add_i32 s6, s6, s9
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; HSA-NEXT:    s_add_u32 s0, s0, s9
+; HSA-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; HSA-NEXT:    s_mov_b32 s11, 0x1100f000
+; HSA-NEXT:    s_mov_b32 s10, -1
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; HSA-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_getpc_b64 s[8:9]
+; HSA-NEXT:    s_add_u32 s8, s8, external_void_func_v16i8@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s9, s9, external_void_func_v16i8@rel32@hi+12
+; HSA-NEXT:    s_waitcnt vmcnt(0)
+; HSA-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
+; HSA-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
+; HSA-NEXT:    v_lshrrev_b32_e32 v18, 24, v0
+; HSA-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; HSA-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; HSA-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; HSA-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; HSA-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; HSA-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
+; HSA-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
+; HSA-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
+; HSA-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
+; HSA-NEXT:    v_mov_b32_e32 v4, v1
+; HSA-NEXT:    v_mov_b32_e32 v8, v2
+; HSA-NEXT:    v_mov_b32_e32 v12, v3
+; HSA-NEXT:    v_mov_b32_e32 v1, v16
+; HSA-NEXT:    v_mov_b32_e32 v2, v17
+; HSA-NEXT:    v_mov_b32_e32 v3, v18
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; HSA-NEXT:    s_endpgm
   %ptr = load ptr addrspace(1), ptr addrspace(4) undef
   %val = load <16 x i8>, ptr addrspace(1) %ptr
   call void @external_void_func_v16i8(<16 x i8> %val)
   ret void
 }
 
-; GCN-LABEL: {{^}}stack_passed_arg_alignment_v32i32_f64:
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s32{{$}}
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:4
-; GCN: s_swappc_b64
 define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, double %tmp) #0 {
+; VI-LABEL: stack_passed_arg_alignment_v32i32_f64:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_mov_b32 s52, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s53, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s54, -1
+; VI-NEXT:    s_mov_b32 s55, 0xe80000
+; VI-NEXT:    s_add_u32 s52, s52, s5
+; VI-NEXT:    s_load_dwordx16 s[8:23], s[2:3], 0x64
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0xa4
+; VI-NEXT:    s_load_dwordx16 s[36:51], s[2:3], 0x24
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_addc_u32 s53, s53, 0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s23
+; VI-NEXT:    buffer_store_dword v0, off, s[52:55], s32
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    buffer_store_dword v0, off, s[52:55], s32 offset:4
+; VI-NEXT:    v_mov_b32_e32 v0, s5
+; VI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; VI-NEXT:    s_mov_b64 s[0:1], s[52:53]
+; VI-NEXT:    buffer_store_dword v0, off, s[52:55], s32 offset:8
+; VI-NEXT:    s_mov_b64 s[2:3], s[54:55]
+; VI-NEXT:    v_mov_b32_e32 v0, s36
+; VI-NEXT:    v_mov_b32_e32 v1, s37
+; VI-NEXT:    v_mov_b32_e32 v2, s38
+; VI-NEXT:    v_mov_b32_e32 v3, s39
+; VI-NEXT:    v_mov_b32_e32 v4, s40
+; VI-NEXT:    v_mov_b32_e32 v5, s41
+; VI-NEXT:    v_mov_b32_e32 v6, s42
+; VI-NEXT:    v_mov_b32_e32 v7, s43
+; VI-NEXT:    v_mov_b32_e32 v8, s44
+; VI-NEXT:    v_mov_b32_e32 v9, s45
+; VI-NEXT:    v_mov_b32_e32 v10, s46
+; VI-NEXT:    v_mov_b32_e32 v11, s47
+; VI-NEXT:    v_mov_b32_e32 v12, s48
+; VI-NEXT:    v_mov_b32_e32 v13, s49
+; VI-NEXT:    v_mov_b32_e32 v14, s50
+; VI-NEXT:    v_mov_b32_e32 v15, s51
+; VI-NEXT:    v_mov_b32_e32 v16, s8
+; VI-NEXT:    v_mov_b32_e32 v17, s9
+; VI-NEXT:    v_mov_b32_e32 v18, s10
+; VI-NEXT:    v_mov_b32_e32 v19, s11
+; VI-NEXT:    v_mov_b32_e32 v20, s12
+; VI-NEXT:    v_mov_b32_e32 v21, s13
+; VI-NEXT:    v_mov_b32_e32 v22, s14
+; VI-NEXT:    v_mov_b32_e32 v23, s15
+; VI-NEXT:    v_mov_b32_e32 v24, s16
+; VI-NEXT:    v_mov_b32_e32 v25, s17
+; VI-NEXT:    v_mov_b32_e32 v26, s18
+; VI-NEXT:    v_mov_b32_e32 v27, s19
+; VI-NEXT:    v_mov_b32_e32 v28, s20
+; VI-NEXT:    v_mov_b32_e32 v29, s21
+; VI-NEXT:    v_mov_b32_e32 v30, s22
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    s_endpgm
+;
+; CI-LABEL: stack_passed_arg_alignment_v32i32_f64:
+; CI:       ; %bb.0: ; %entry
+; CI-NEXT:    s_mov_b32 s52, SCRATCH_RSRC_DWORD0
+; CI-NEXT:    s_mov_b32 s53, SCRATCH_RSRC_DWORD1
+; CI-NEXT:    s_mov_b32 s54, -1
+; CI-NEXT:    s_mov_b32 s55, 0xe8f000
+; CI-NEXT:    s_add_u32 s52, s52, s5
+; CI-NEXT:    s_load_dwordx16 s[8:23], s[2:3], 0x19
+; CI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x29
+; CI-NEXT:    s_load_dwordx16 s[36:51], s[2:3], 0x9
+; CI-NEXT:    s_mov_b32 s32, 0
+; CI-NEXT:    s_addc_u32 s53, s53, 0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    v_mov_b32_e32 v0, s23
+; CI-NEXT:    buffer_store_dword v0, off, s[52:55], s32
+; CI-NEXT:    v_mov_b32_e32 v0, s4
+; CI-NEXT:    buffer_store_dword v0, off, s[52:55], s32 offset:4
+; CI-NEXT:    v_mov_b32_e32 v0, s5
+; CI-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; CI-NEXT:    s_mov_b64 s[0:1], s[52:53]
+; CI-NEXT:    buffer_store_dword v0, off, s[52:55], s32 offset:8
+; CI-NEXT:    s_mov_b64 s[2:3], s[54:55]
+; CI-NEXT:    v_mov_b32_e32 v0, s36
+; CI-NEXT:    v_mov_b32_e32 v1, s37
+; CI-NEXT:    v_mov_b32_e32 v2, s38
+; CI-NEXT:    v_mov_b32_e32 v3, s39
+; CI-NEXT:    v_mov_b32_e32 v4, s40
+; CI-NEXT:    v_mov_b32_e32 v5, s41
+; CI-NEXT:    v_mov_b32_e32 v6, s42
+; CI-NEXT:    v_mov_b32_e32 v7, s43
+; CI-NEXT:    v_mov_b32_e32 v8, s44
+; CI-NEXT:    v_mov_b32_e32 v9, s45
+; CI-NEXT:    v_mov_b32_e32 v10, s46
+; CI-NEXT:    v_mov_b32_e32 v11, s47
+; CI-NEXT:    v_mov_b32_e32 v12, s48
+; CI-NEXT:    v_mov_b32_e32 v13, s49
+; CI-NEXT:    v_mov_b32_e32 v14, s50
+; CI-NEXT:    v_mov_b32_e32 v15, s51
+; CI-NEXT:    v_mov_b32_e32 v16, s8
+; CI-NEXT:    v_mov_b32_e32 v17, s9
+; CI-NEXT:    v_mov_b32_e32 v18, s10
+; CI-NEXT:    v_mov_b32_e32 v19, s11
+; CI-NEXT:    v_mov_b32_e32 v20, s12
+; CI-NEXT:    v_mov_b32_e32 v21, s13
+; CI-NEXT:    v_mov_b32_e32 v22, s14
+; CI-NEXT:    v_mov_b32_e32 v23, s15
+; CI-NEXT:    v_mov_b32_e32 v24, s16
+; CI-NEXT:    v_mov_b32_e32 v25, s17
+; CI-NEXT:    v_mov_b32_e32 v26, s18
+; CI-NEXT:    v_mov_b32_e32 v27, s19
+; CI-NEXT:    v_mov_b32_e32 v28, s20
+; CI-NEXT:    v_mov_b32_e32 v29, s21
+; CI-NEXT:    v_mov_b32_e32 v30, s22
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    s_endpgm
+;
+; GFX9-LABEL: stack_passed_arg_alignment_v32i32_f64:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_mov_b32 s52, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT:    s_mov_b32 s53, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT:    s_mov_b32 s54, -1
+; GFX9-NEXT:    s_mov_b32 s55, 0xe00000
+; GFX9-NEXT:    s_add_u32 s52, s52, s5
+; GFX9-NEXT:    s_load_dwordx16 s[8:23], s[2:3], 0x64
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0xa4
+; GFX9-NEXT:    s_load_dwordx16 s[36:51], s[2:3], 0x24
+; GFX9-NEXT:    s_mov_b32 s32, 0
+; GFX9-NEXT:    s_addc_u32 s53, s53, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s23
+; GFX9-NEXT:    buffer_store_dword v0, off, s[52:55], s32
+; GFX9-NEXT:    v_mov_b32_e32 v0, s4
+; GFX9-NEXT:    buffer_store_dword v0, off, s[52:55], s32 offset:4
+; GFX9-NEXT:    v_mov_b32_e32 v0, s5
+; GFX9-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX9-NEXT:    s_mov_b64 s[0:1], s[52:53]
+; GFX9-NEXT:    buffer_store_dword v0, off, s[52:55], s32 offset:8
+; GFX9-NEXT:    s_mov_b64 s[2:3], s[54:55]
+; GFX9-NEXT:    v_mov_b32_e32 v0, s36
+; GFX9-NEXT:    v_mov_b32_e32 v1, s37
+; GFX9-NEXT:    v_mov_b32_e32 v2, s38
+; GFX9-NEXT:    v_mov_b32_e32 v3, s39
+; GFX9-NEXT:    v_mov_b32_e32 v4, s40
+; GFX9-NEXT:    v_mov_b32_e32 v5, s41
+; GFX9-NEXT:    v_mov_b32_e32 v6, s42
+; GFX9-NEXT:    v_mov_b32_e32 v7, s43
+; GFX9-NEXT:    v_mov_b32_e32 v8, s44
+; GFX9-NEXT:    v_mov_b32_e32 v9, s45
+; GFX9-NEXT:    v_mov_b32_e32 v10, s46
+; GFX9-NEXT:    v_mov_b32_e32 v11, s47
+; GFX9-NEXT:    v_mov_b32_e32 v12, s48
+; GFX9-NEXT:    v_mov_b32_e32 v13, s49
+; GFX9-NEXT:    v_mov_b32_e32 v14, s50
+; GFX9-NEXT:    v_mov_b32_e32 v15, s51
+; GFX9-NEXT:    v_mov_b32_e32 v16, s8
+; GFX9-NEXT:    v_mov_b32_e32 v17, s9
+; GFX9-NEXT:    v_mov_b32_e32 v18, s10
+; GFX9-NEXT:    v_mov_b32_e32 v19, s11
+; GFX9-NEXT:    v_mov_b32_e32 v20, s12
+; GFX9-NEXT:    v_mov_b32_e32 v21, s13
+; GFX9-NEXT:    v_mov_b32_e32 v22, s14
+; GFX9-NEXT:    v_mov_b32_e32 v23, s15
+; GFX9-NEXT:    v_mov_b32_e32 v24, s16
+; GFX9-NEXT:    v_mov_b32_e32 v25, s17
+; GFX9-NEXT:    v_mov_b32_e32 v26, s18
+; GFX9-NEXT:    v_mov_b32_e32 v27, s19
+; GFX9-NEXT:    v_mov_b32_e32 v28, s20
+; GFX9-NEXT:    v_mov_b32_e32 v29, s21
+; GFX9-NEXT:    v_mov_b32_e32 v30, s22
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX11-LABEL: stack_passed_arg_alignment_v32i32_f64:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    s_load_b64 s[20:21], s[2:3], 0xa4
+; GFX11-NEXT:    s_load_b512 s[4:19], s[2:3], 0x64
+; GFX11-NEXT:    s_load_b512 s[36:51], s[2:3], 0x24
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_i32 s22, s32, 8
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s21 :: v_dual_mov_b32 v1, s20
+; GFX11-NEXT:    v_mov_b32_e32 v2, s19
+; GFX11-NEXT:    s_add_i32 s19, s32, 4
+; GFX11-NEXT:    v_dual_mov_b32 v4, s40 :: v_dual_mov_b32 v7, s43
+; GFX11-NEXT:    scratch_store_b32 off, v0, s22
+; GFX11-NEXT:    scratch_store_b32 off, v1, s19
+; GFX11-NEXT:    scratch_store_b32 off, v2, s32
+; GFX11-NEXT:    v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v3, s39
+; GFX11-NEXT:    v_dual_mov_b32 v1, s37 :: v_dual_mov_b32 v2, s38
+; GFX11-NEXT:    v_dual_mov_b32 v5, s41 :: v_dual_mov_b32 v6, s42
+; GFX11-NEXT:    v_dual_mov_b32 v9, s45 :: v_dual_mov_b32 v8, s44
+; GFX11-NEXT:    v_dual_mov_b32 v11, s47 :: v_dual_mov_b32 v10, s46
+; GFX11-NEXT:    v_dual_mov_b32 v13, s49 :: v_dual_mov_b32 v12, s48
+; GFX11-NEXT:    v_dual_mov_b32 v15, s51 :: v_dual_mov_b32 v14, s50
+; GFX11-NEXT:    v_dual_mov_b32 v17, s5 :: v_dual_mov_b32 v16, s4
+; GFX11-NEXT:    v_dual_mov_b32 v19, s7 :: v_dual_mov_b32 v18, s6
+; GFX11-NEXT:    v_dual_mov_b32 v21, s9 :: v_dual_mov_b32 v20, s8
+; GFX11-NEXT:    v_dual_mov_b32 v23, s11 :: v_dual_mov_b32 v22, s10
+; GFX11-NEXT:    v_dual_mov_b32 v25, s13 :: v_dual_mov_b32 v24, s12
+; GFX11-NEXT:    v_dual_mov_b32 v27, s15 :: v_dual_mov_b32 v26, s14
+; GFX11-NEXT:    v_dual_mov_b32 v29, s17 :: v_dual_mov_b32 v28, s16
+; GFX11-NEXT:    v_mov_b32_e32 v30, s18
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX11-NEXT:    s_getpc_b64 s[2:3]
+; GFX11-NEXT:    s_add_u32 s2, s2, stack_passed_f64_arg@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s3, s3, stack_passed_f64_arg@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT:    s_endpgm
+;
+; HSA-LABEL: stack_passed_arg_alignment_v32i32_f64:
+; HSA:       ; %bb.0: ; %entry
+; HSA-NEXT:    s_add_i32 s8, s8, s11
+; HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
+; HSA-NEXT:    s_mov_b32 flat_scratch_lo, s9
+; HSA-NEXT:    s_add_u32 s0, s0, s11
+; HSA-NEXT:    s_load_dwordx16 s[8:23], s[6:7], 0x40
+; HSA-NEXT:    s_load_dwordx2 s[24:25], s[6:7], 0x80
+; HSA-NEXT:    s_load_dwordx16 s[36:51], s[6:7], 0x0
+; HSA-NEXT:    s_mov_b32 s32, 0
+; HSA-NEXT:    s_addc_u32 s1, s1, 0
+; HSA-NEXT:    s_waitcnt lgkmcnt(0)
+; HSA-NEXT:    v_mov_b32_e32 v0, s23
+; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32
+; HSA-NEXT:    v_mov_b32_e32 v0, s24
+; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
+; HSA-NEXT:    v_mov_b32_e32 v0, s25
+; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
+; HSA-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; HSA-NEXT:    v_mov_b32_e32 v0, s36
+; HSA-NEXT:    v_mov_b32_e32 v1, s37
+; HSA-NEXT:    v_mov_b32_e32 v2, s38
+; HSA-NEXT:    v_mov_b32_e32 v3, s39
+; HSA-NEXT:    v_mov_b32_e32 v4, s40
+; HSA-NEXT:    v_mov_b32_e32 v5, s41
+; HSA-NEXT:    v_mov_b32_e32 v6, s42
+; HSA-NEXT:    v_mov_b32_e32 v7, s43
+; HSA-NEXT:    v_mov_b32_e32 v8, s44
+; HSA-NEXT:    v_mov_b32_e32 v9, s45
+; HSA-NEXT:    v_mov_b32_e32 v10, s46
+; HSA-NEXT:    v_mov_b32_e32 v11, s47
+; HSA-NEXT:    v_mov_b32_e32 v12, s48
+; HSA-NEXT:    v_mov_b32_e32 v13, s49
+; HSA-NEXT:    v_mov_b32_e32 v14, s50
+; HSA-NEXT:    v_mov_b32_e32 v15, s51
+; HSA-NEXT:    v_mov_b32_e32 v16, s8
+; HSA-NEXT:    v_mov_b32_e32 v17, s9
+; HSA-NEXT:    v_mov_b32_e32 v18, s10
+; HSA-NEXT:    v_mov_b32_e32 v19, s11
+; HSA-NEXT:    v_mov_b32_e32 v20, s12
+; HSA-NEXT:    v_mov_b32_e32 v21, s13
+; HSA-NEXT:    v_mov_b32_e32 v22, s14
+; HSA-NEXT:    v_mov_b32_e32 v23, s15
+; HSA-NEXT:    v_mov_b32_e32 v24, s16
+; HSA-NEXT:    v_mov_b32_e32 v25, s17
+; HSA-NEXT:    v_mov_b32_e32 v26, s18
+; HSA-NEXT:    v_mov_b32_e32 v27, s19
+; HSA-NEXT:    v_mov_b32_e32 v28, s20
+; HSA-NEXT:    v_mov_b32_e32 v29, s21
+; HSA-NEXT:    v_mov_b32_e32 v30, s22
+; HSA-NEXT:    s_getpc_b64 s[24:25]
+; HSA-NEXT:    s_add_u32 s24, s24, stack_passed_f64_arg@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s25, s25, stack_passed_f64_arg@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[24:25]
+; HSA-NEXT:    s_endpgm
 entry:
   call void @stack_passed_f64_arg(<32 x i32> %val, double %tmp)
   ret void
 }
 
-; GCN-LABEL: {{^}}tail_call_byval_align16:
-; GCN-NOT: s32
-; GCN: buffer_load_dword [[VREG2:v[0-9]+]], off, s[0:3], s32 offset:28
-; GCN: buffer_load_dword [[VREG1:v[0-9]+]], off, s[0:3], s32{{$}}
-
-; GCN: s_getpc_b64
-
-; GCN: buffer_store_dword [[VREG2]], off, s[0:3], s32 offset:20
-; GCN: buffer_load_dword [[VREG3:v[0-9]+]], off, s[0:3], s32 offset:24{{$}}
-; GCN: buffer_store_dword [[VREG1]], off, s[0:3], s32{{$}}
-; GCN: buffer_store_dword [[VREG3]], off, s[0:3], s32 offset:16{{$}}
-; GCN-NOT: s32
-; GCN: s_setpc_b64
 define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
+; VI-LABEL: tail_call_byval_align16:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:28
+; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:20
+; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:24
+; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:16
+; VI-NEXT:    s_setpc_b64 s[4:5]
+;
+; CI-LABEL: tail_call_byval_align16:
+; CI:       ; %bb.0: ; %entry
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:28
+; CI-NEXT:    buffer_load_dword v32, off, s[0:3], s32
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12
+; CI-NEXT:    s_waitcnt vmcnt(1)
+; CI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:20
+; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:24
+; CI-NEXT:    s_waitcnt vmcnt(2)
+; CI-NEXT:    buffer_store_dword v32, off, s[0:3], s32
+; CI-NEXT:    s_waitcnt vmcnt(1)
+; CI-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:16
+; CI-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX9-LABEL: tail_call_byval_align16:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:28
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:24
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32
+; GFX9-NEXT:    s_waitcnt vmcnt(1)
+; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX11-LABEL: tail_call_byval_align16:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, byval_align16_f64_arg@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, byval_align16_f64_arg@rel32@hi+12
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    scratch_store_b32 off, v31, s32
+; GFX11-NEXT:    scratch_load_b64 v[31:32], off, s32 offset:24
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    scratch_store_b64 off, v[31:32], s32 offset:16
+; GFX11-NEXT:    s_setpc_b64 s[0:1]
+;
+; HSA-LABEL: tail_call_byval_align16:
+; HSA:       ; %bb.0: ; %entry
+; HSA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; HSA-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:28
+; HSA-NEXT:    buffer_load_dword v32, off, s[0:3], s32
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12
+; HSA-NEXT:    s_waitcnt vmcnt(1)
+; HSA-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:20
+; HSA-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:24
+; HSA-NEXT:    s_waitcnt vmcnt(2)
+; HSA-NEXT:    buffer_store_dword v32, off, s[0:3], s32
+; HSA-NEXT:    s_waitcnt vmcnt(1)
+; HSA-NEXT:    buffer_store_dword v31, off, s[0:3], s32 offset:16
+; HSA-NEXT:    s_setpc_b64 s[4:5]
 entry:
   %alloca = alloca double, align 8, addrspace(5)
   tail call void @byval_align16_f64_arg(<32 x i32> %val, ptr addrspace(5) byval(double) align 16 %alloca)
   ret void
 }
 
-; GCN-LABEL: {{^}}tail_call_stack_passed_arg_alignment_v32i32_f64:
-; GCN-NOT: s32
-; GCN-DAG: buffer_load_dword v33, off, s[0:3], s32 offset:8
-; GCN-DAG: buffer_load_dword v32, off, s[0:3], s32 offset:4
-; GCN-DAG: buffer_load_dword v31, off, s[0:3], s32{{$}}
-; GCN: s_getpc_b64
-; GCN: buffer_store_dword v31, off, s[0:3], s32{{$}}
-; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4
-; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:8
-; GCN-NOT: s32
-; GCN: s_setpc_b64
 define void @tail_call_stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, double %tmp) #0 {
+; VI-LABEL: tail_call_stack_passed_arg_alignment_v32i32_f64:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
+; VI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12
+; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    buffer_store_dword v31, off, s[0:3], s32
+; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:4
+; VI-NEXT:    s_waitcnt vmcnt(2)
+; VI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:8
+; VI-NEXT:    s_setpc_b64 s[4:5]
+;
+; CI-LABEL: tail_call_stack_passed_arg_alignment_v32i32_f64:
+; CI:       ; %bb.0: ; %entry
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; CI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
+; CI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12
+; CI-NEXT:    s_waitcnt vmcnt(2)
+; CI-NEXT:    buffer_store_dword v31, off, s[0:3], s32
+; CI-NEXT:    s_waitcnt vmcnt(2)
+; CI-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:4
+; CI-NEXT:    s_waitcnt vmcnt(2)
+; CI-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:8
+; CI-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX9-LABEL: tail_call_stack_passed_arg_alignment_v32i32_f64:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    buffer_store_dword v31, off, s[0:3], s32
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    s_waitcnt vmcnt(2)
+; GFX9-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX11-LABEL: tail_call_stack_passed_arg_alignment_v32i32_f64:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v33, off, s32
+; GFX11-NEXT:    scratch_load_b64 v[31:32], off, s32 offset:4
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, stack_passed_f64_arg@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, stack_passed_f64_arg@rel32@hi+12
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    scratch_store_b32 off, v33, s32
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    scratch_store_b64 off, v[31:32], s32 offset:4
+; GFX11-NEXT:    s_setpc_b64 s[0:1]
+;
+; HSA-LABEL: tail_call_stack_passed_arg_alignment_v32i32_f64:
+; HSA:       ; %bb.0: ; %entry
+; HSA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; HSA-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; HSA-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
+; HSA-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12
+; HSA-NEXT:    s_waitcnt vmcnt(2)
+; HSA-NEXT:    buffer_store_dword v31, off, s[0:3], s32
+; HSA-NEXT:    s_waitcnt vmcnt(2)
+; HSA-NEXT:    buffer_store_dword v32, off, s[0:3], s32 offset:4
+; HSA-NEXT:    s_waitcnt vmcnt(2)
+; HSA-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:8
+; HSA-NEXT:    s_setpc_b64 s[4:5]
 entry:
   tail call void @stack_passed_f64_arg(<32 x i32> %val, double %tmp)
   ret void
 }
 
-; GCN-LABEL: {{^}}stack_12xv3i32:
-; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 11
-; GCN: buffer_store_dword [[REG11]], off, s[0:3], s32{{$}}
-; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
-; GCN: buffer_store_dword [[REG12]], {{.*$}}
-; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
-; GCN: buffer_store_dword [[REG13]], {{.*}} offset:8
-; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
-; GCN: buffer_store_dword [[REG14]], {{.*}} offset:12
-; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
-; GCN: buffer_store_dword [[REG15]], {{.*}} offset:16
-; GCN: s_getpc
 define void @stack_12xv3i32() #0 {
+; VI-LABEL: stack_12xv3i32:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s4, s33
+; VI-NEXT:    s_mov_b32 s33, s32
+; VI-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; VI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; VI-NEXT:    s_mov_b64 exec, s[8:9]
+; VI-NEXT:    s_addk_i32 s32, 0x400
+; VI-NEXT:    v_mov_b32_e32 v0, 11
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32
+; VI-NEXT:    v_mov_b32_e32 v0, 12
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
+; VI-NEXT:    v_mov_b32_e32 v0, 13
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
+; VI-NEXT:    v_mov_b32_e32 v0, 14
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
+; VI-NEXT:    v_mov_b32_e32 v0, 15
+; VI-NEXT:    v_writelane_b32 v40, s30, 0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
+; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    v_mov_b32_e32 v2, 0
+; VI-NEXT:    v_mov_b32_e32 v3, 1
+; VI-NEXT:    v_mov_b32_e32 v4, 1
+; VI-NEXT:    v_mov_b32_e32 v5, 1
+; VI-NEXT:    v_mov_b32_e32 v6, 2
+; VI-NEXT:    v_mov_b32_e32 v7, 2
+; VI-NEXT:    v_mov_b32_e32 v8, 2
+; VI-NEXT:    v_mov_b32_e32 v9, 3
+; VI-NEXT:    v_mov_b32_e32 v10, 3
+; VI-NEXT:    v_mov_b32_e32 v11, 3
+; VI-NEXT:    v_mov_b32_e32 v12, 4
+; VI-NEXT:    v_mov_b32_e32 v13, 4
+; VI-NEXT:    v_mov_b32_e32 v14, 4
+; VI-NEXT:    v_mov_b32_e32 v15, 5
+; VI-NEXT:    v_mov_b32_e32 v16, 5
+; VI-NEXT:    v_mov_b32_e32 v17, 5
+; VI-NEXT:    v_mov_b32_e32 v18, 6
+; VI-NEXT:    v_mov_b32_e32 v19, 6
+; VI-NEXT:    v_mov_b32_e32 v20, 6
+; VI-NEXT:    v_mov_b32_e32 v21, 7
+; VI-NEXT:    v_mov_b32_e32 v22, 7
+; VI-NEXT:    v_mov_b32_e32 v23, 7
+; VI-NEXT:    v_mov_b32_e32 v24, 8
+; VI-NEXT:    v_mov_b32_e32 v25, 8
+; VI-NEXT:    v_mov_b32_e32 v26, 8
+; VI-NEXT:    v_mov_b32_e32 v27, 9
+; VI-NEXT:    v_mov_b32_e32 v28, 9
+; VI-NEXT:    v_mov_b32_e32 v29, 9
+; VI-NEXT:    v_mov_b32_e32 v30, 10
+; VI-NEXT:    v_writelane_b32 v41, s4, 0
+; VI-NEXT:    v_writelane_b32 v40, s31, 1
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_12xv3i32@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_12xv3i32@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    v_readlane_b32 s31, v40, 1
+; VI-NEXT:    v_readlane_b32 s30, v40, 0
+; VI-NEXT:    v_readlane_b32 s4, v41, 0
+; VI-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; VI-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; VI-NEXT:    s_mov_b64 exec, s[6:7]
+; VI-NEXT:    s_addk_i32 s32, 0xfc00
+; VI-NEXT:    s_mov_b32 s33, s4
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; CI-LABEL: stack_12xv3i32:
+; CI:       ; %bb.0: ; %entry
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 s4, s33
+; CI-NEXT:    s_mov_b32 s33, s32
+; CI-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; CI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; CI-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; CI-NEXT:    s_mov_b64 exec, s[8:9]
+; CI-NEXT:    s_addk_i32 s32, 0x400
+; CI-NEXT:    v_mov_b32_e32 v0, 11
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32
+; CI-NEXT:    v_mov_b32_e32 v0, 12
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
+; CI-NEXT:    v_mov_b32_e32 v0, 13
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
+; CI-NEXT:    v_mov_b32_e32 v0, 14
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
+; CI-NEXT:    v_mov_b32_e32 v0, 15
+; CI-NEXT:    v_writelane_b32 v40, s30, 0
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
+; CI-NEXT:    v_mov_b32_e32 v0, 0
+; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    v_mov_b32_e32 v2, 0
+; CI-NEXT:    v_mov_b32_e32 v3, 1
+; CI-NEXT:    v_mov_b32_e32 v4, 1
+; CI-NEXT:    v_mov_b32_e32 v5, 1
+; CI-NEXT:    v_mov_b32_e32 v6, 2
+; CI-NEXT:    v_mov_b32_e32 v7, 2
+; CI-NEXT:    v_mov_b32_e32 v8, 2
+; CI-NEXT:    v_mov_b32_e32 v9, 3
+; CI-NEXT:    v_mov_b32_e32 v10, 3
+; CI-NEXT:    v_mov_b32_e32 v11, 3
+; CI-NEXT:    v_mov_b32_e32 v12, 4
+; CI-NEXT:    v_mov_b32_e32 v13, 4
+; CI-NEXT:    v_mov_b32_e32 v14, 4
+; CI-NEXT:    v_mov_b32_e32 v15, 5
+; CI-NEXT:    v_mov_b32_e32 v16, 5
+; CI-NEXT:    v_mov_b32_e32 v17, 5
+; CI-NEXT:    v_mov_b32_e32 v18, 6
+; CI-NEXT:    v_mov_b32_e32 v19, 6
+; CI-NEXT:    v_mov_b32_e32 v20, 6
+; CI-NEXT:    v_mov_b32_e32 v21, 7
+; CI-NEXT:    v_mov_b32_e32 v22, 7
+; CI-NEXT:    v_mov_b32_e32 v23, 7
+; CI-NEXT:    v_mov_b32_e32 v24, 8
+; CI-NEXT:    v_mov_b32_e32 v25, 8
+; CI-NEXT:    v_mov_b32_e32 v26, 8
+; CI-NEXT:    v_mov_b32_e32 v27, 9
+; CI-NEXT:    v_mov_b32_e32 v28, 9
+; CI-NEXT:    v_mov_b32_e32 v29, 9
+; CI-NEXT:    v_mov_b32_e32 v30, 10
+; CI-NEXT:    v_writelane_b32 v41, s4, 0
+; CI-NEXT:    v_writelane_b32 v40, s31, 1
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_12xv3i32@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_12xv3i32@rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    v_readlane_b32 s31, v40, 1
+; CI-NEXT:    v_readlane_b32 s30, v40, 0
+; CI-NEXT:    v_readlane_b32 s4, v41, 0
+; CI-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; CI-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; CI-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; CI-NEXT:    s_mov_b64 exec, s[6:7]
+; CI-NEXT:    s_addk_i32 s32, 0xfc00
+; CI-NEXT:    s_mov_b32 s33, s4
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: stack_12xv3i32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s4, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[8:9]
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_mov_b32_e32 v0, 11
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32
+; GFX9-NEXT:    v_mov_b32_e32 v0, 12
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    v_mov_b32_e32 v0, 13
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    v_mov_b32_e32 v0, 14
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    v_mov_b32_e32 v0, 15
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 1
+; GFX9-NEXT:    v_mov_b32_e32 v4, 1
+; GFX9-NEXT:    v_mov_b32_e32 v5, 1
+; GFX9-NEXT:    v_mov_b32_e32 v6, 2
+; GFX9-NEXT:    v_mov_b32_e32 v7, 2
+; GFX9-NEXT:    v_mov_b32_e32 v8, 2
+; GFX9-NEXT:    v_mov_b32_e32 v9, 3
+; GFX9-NEXT:    v_mov_b32_e32 v10, 3
+; GFX9-NEXT:    v_mov_b32_e32 v11, 3
+; GFX9-NEXT:    v_mov_b32_e32 v12, 4
+; GFX9-NEXT:    v_mov_b32_e32 v13, 4
+; GFX9-NEXT:    v_mov_b32_e32 v14, 4
+; GFX9-NEXT:    v_mov_b32_e32 v15, 5
+; GFX9-NEXT:    v_mov_b32_e32 v16, 5
+; GFX9-NEXT:    v_mov_b32_e32 v17, 5
+; GFX9-NEXT:    v_mov_b32_e32 v18, 6
+; GFX9-NEXT:    v_mov_b32_e32 v19, 6
+; GFX9-NEXT:    v_mov_b32_e32 v20, 6
+; GFX9-NEXT:    v_mov_b32_e32 v21, 7
+; GFX9-NEXT:    v_mov_b32_e32 v22, 7
+; GFX9-NEXT:    v_mov_b32_e32 v23, 7
+; GFX9-NEXT:    v_mov_b32_e32 v24, 8
+; GFX9-NEXT:    v_mov_b32_e32 v25, 8
+; GFX9-NEXT:    v_mov_b32_e32 v26, 8
+; GFX9-NEXT:    v_mov_b32_e32 v27, 9
+; GFX9-NEXT:    v_mov_b32_e32 v28, 9
+; GFX9-NEXT:    v_mov_b32_e32 v29, 9
+; GFX9-NEXT:    v_mov_b32_e32 v30, 10
+; GFX9-NEXT:    v_writelane_b32 v41, s4, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_12xv3i32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_12xv3i32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s4, v41, 0
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: stack_12xv3i32:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s0, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    v_dual_mov_b32 v0, 11 :: v_dual_mov_b32 v1, 12
+; GFX11-NEXT:    v_dual_mov_b32 v2, 13 :: v_dual_mov_b32 v3, 14
+; GFX11-NEXT:    v_mov_b32_e32 v4, 15
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX11-NEXT:    s_add_i32 s0, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
+; GFX11-NEXT:    scratch_store_b32 off, v4, s0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, 0
+; GFX11-NEXT:    v_dual_mov_b32 v3, 1 :: v_dual_mov_b32 v2, 0
+; GFX11-NEXT:    v_dual_mov_b32 v5, 1 :: v_dual_mov_b32 v4, 1
+; GFX11-NEXT:    v_dual_mov_b32 v7, 2 :: v_dual_mov_b32 v6, 2
+; GFX11-NEXT:    v_dual_mov_b32 v9, 3 :: v_dual_mov_b32 v8, 2
+; GFX11-NEXT:    v_dual_mov_b32 v11, 3 :: v_dual_mov_b32 v10, 3
+; GFX11-NEXT:    v_dual_mov_b32 v13, 4 :: v_dual_mov_b32 v12, 4
+; GFX11-NEXT:    v_dual_mov_b32 v15, 5 :: v_dual_mov_b32 v14, 4
+; GFX11-NEXT:    v_dual_mov_b32 v17, 5 :: v_dual_mov_b32 v16, 5
+; GFX11-NEXT:    v_dual_mov_b32 v19, 6 :: v_dual_mov_b32 v18, 6
+; GFX11-NEXT:    v_dual_mov_b32 v21, 7 :: v_dual_mov_b32 v20, 6
+; GFX11-NEXT:    v_dual_mov_b32 v23, 7 :: v_dual_mov_b32 v22, 7
+; GFX11-NEXT:    v_dual_mov_b32 v25, 8 :: v_dual_mov_b32 v24, 8
+; GFX11-NEXT:    v_dual_mov_b32 v27, 9 :: v_dual_mov_b32 v26, 8
+; GFX11-NEXT:    v_dual_mov_b32 v29, 9 :: v_dual_mov_b32 v28, 9
+; GFX11-NEXT:    v_mov_b32_e32 v30, 10
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_12xv3i32@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_12xv3i32@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; HSA-LABEL: stack_12xv3i32:
+; HSA:       ; %bb.0: ; %entry
+; HSA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; HSA-NEXT:    s_mov_b32 s4, s33
+; HSA-NEXT:    s_mov_b32 s33, s32
+; HSA-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; HSA-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; HSA-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; HSA-NEXT:    s_mov_b64 exec, s[8:9]
+; HSA-NEXT:    s_addk_i32 s32, 0x400
+; HSA-NEXT:    v_mov_b32_e32 v0, 11
+; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32
+; HSA-NEXT:    v_mov_b32_e32 v0, 12
+; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
+; HSA-NEXT:    v_mov_b32_e32 v0, 13
+; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
+; HSA-NEXT:    v_mov_b32_e32 v0, 14
+; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
+; HSA-NEXT:    v_mov_b32_e32 v0, 15
+; HSA-NEXT:    v_writelane_b32 v40, s30, 0
+; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
+; HSA-NEXT:    v_mov_b32_e32 v0, 0
+; HSA-NEXT:    v_mov_b32_e32 v1, 0
+; HSA-NEXT:    v_mov_b32_e32 v2, 0
+; HSA-NEXT:    v_mov_b32_e32 v3, 1
+; HSA-NEXT:    v_mov_b32_e32 v4, 1
+; HSA-NEXT:    v_mov_b32_e32 v5, 1
+; HSA-NEXT:    v_mov_b32_e32 v6, 2
+; HSA-NEXT:    v_mov_b32_e32 v7, 2
+; HSA-NEXT:    v_mov_b32_e32 v8, 2
+; HSA-NEXT:    v_mov_b32_e32 v9, 3
+; HSA-NEXT:    v_mov_b32_e32 v10, 3
+; HSA-NEXT:    v_mov_b32_e32 v11, 3
+; HSA-NEXT:    v_mov_b32_e32 v12, 4
+; HSA-NEXT:    v_mov_b32_e32 v13, 4
+; HSA-NEXT:    v_mov_b32_e32 v14, 4
+; HSA-NEXT:    v_mov_b32_e32 v15, 5
+; HSA-NEXT:    v_mov_b32_e32 v16, 5
+; HSA-NEXT:    v_mov_b32_e32 v17, 5
+; HSA-NEXT:    v_mov_b32_e32 v18, 6
+; HSA-NEXT:    v_mov_b32_e32 v19, 6
+; HSA-NEXT:    v_mov_b32_e32 v20, 6
+; HSA-NEXT:    v_mov_b32_e32 v21, 7
+; HSA-NEXT:    v_mov_b32_e32 v22, 7
+; HSA-NEXT:    v_mov_b32_e32 v23, 7
+; HSA-NEXT:    v_mov_b32_e32 v24, 8
+; HSA-NEXT:    v_mov_b32_e32 v25, 8
+; HSA-NEXT:    v_mov_b32_e32 v26, 8
+; HSA-NEXT:    v_mov_b32_e32 v27, 9
+; HSA-NEXT:    v_mov_b32_e32 v28, 9
+; HSA-NEXT:    v_mov_b32_e32 v29, 9
+; HSA-NEXT:    v_mov_b32_e32 v30, 10
+; HSA-NEXT:    v_writelane_b32 v41, s4, 0
+; HSA-NEXT:    v_writelane_b32 v40, s31, 1
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_12xv3i32@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_12xv3i32@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; HSA-NEXT:    v_readlane_b32 s31, v40, 1
+; HSA-NEXT:    v_readlane_b32 s30, v40, 0
+; HSA-NEXT:    v_readlane_b32 s4, v41, 0
+; HSA-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; HSA-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; HSA-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; HSA-NEXT:    s_mov_b64 exec, s[6:7]
+; HSA-NEXT:    s_addk_i32 s32, 0xfc00
+; HSA-NEXT:    s_mov_b32 s33, s4
+; HSA-NEXT:    s_waitcnt vmcnt(0)
+; HSA-NEXT:    s_setpc_b64 s[30:31]
 entry:
   call void @external_void_func_12xv3i32(
       <3 x i32><i32 0, i32 0, i32 0>,
@@ -804,19 +6170,345 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}stack_12xv3f32:
-; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000
-; GCN: buffer_store_dword [[REG11]], {{.*$}}
-; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000
-; GCN: buffer_store_dword [[REG12]], {{.*}} offset:4
-; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000
-; GCN: buffer_store_dword [[REG13]], {{.*}} offset:8
-; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000
-; GCN: buffer_store_dword [[REG14]], {{.*}} offset:12
-; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000
-; GCN: buffer_store_dword [[REG15]], {{.*}} offset:16
-; GCN: s_getpc
 define void @stack_12xv3f32() #0 {
+; VI-LABEL: stack_12xv3f32:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s4, s33
+; VI-NEXT:    s_mov_b32 s33, s32
+; VI-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; VI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; VI-NEXT:    s_mov_b64 exec, s[8:9]
+; VI-NEXT:    s_addk_i32 s32, 0x400
+; VI-NEXT:    v_mov_b32_e32 v0, 0x41300000
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32
+; VI-NEXT:    v_mov_b32_e32 v0, 0x41400000
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
+; VI-NEXT:    v_mov_b32_e32 v0, 0x41500000
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
+; VI-NEXT:    v_mov_b32_e32 v0, 0x41600000
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
+; VI-NEXT:    v_mov_b32_e32 v0, 0x41700000
+; VI-NEXT:    v_writelane_b32 v40, s30, 0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
+; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    v_mov_b32_e32 v2, 0
+; VI-NEXT:    v_mov_b32_e32 v3, 1.0
+; VI-NEXT:    v_mov_b32_e32 v4, 1.0
+; VI-NEXT:    v_mov_b32_e32 v5, 1.0
+; VI-NEXT:    v_mov_b32_e32 v6, 2.0
+; VI-NEXT:    v_mov_b32_e32 v7, 2.0
+; VI-NEXT:    v_mov_b32_e32 v8, 2.0
+; VI-NEXT:    v_mov_b32_e32 v9, 0x40400000
+; VI-NEXT:    v_mov_b32_e32 v10, 0x40400000
+; VI-NEXT:    v_mov_b32_e32 v11, 0x40400000
+; VI-NEXT:    v_mov_b32_e32 v12, 4.0
+; VI-NEXT:    v_mov_b32_e32 v13, 4.0
+; VI-NEXT:    v_mov_b32_e32 v14, 4.0
+; VI-NEXT:    v_mov_b32_e32 v15, 0x40a00000
+; VI-NEXT:    v_mov_b32_e32 v16, 0x40a00000
+; VI-NEXT:    v_mov_b32_e32 v17, 0x40a00000
+; VI-NEXT:    v_mov_b32_e32 v18, 0x40c00000
+; VI-NEXT:    v_mov_b32_e32 v19, 0x40c00000
+; VI-NEXT:    v_mov_b32_e32 v20, 0x40c00000
+; VI-NEXT:    v_mov_b32_e32 v21, 0x40e00000
+; VI-NEXT:    v_mov_b32_e32 v22, 0x40e00000
+; VI-NEXT:    v_mov_b32_e32 v23, 0x40e00000
+; VI-NEXT:    v_mov_b32_e32 v24, 0x41000000
+; VI-NEXT:    v_mov_b32_e32 v25, 0x41000000
+; VI-NEXT:    v_mov_b32_e32 v26, 0x41000000
+; VI-NEXT:    v_mov_b32_e32 v27, 0x41100000
+; VI-NEXT:    v_mov_b32_e32 v28, 0x41100000
+; VI-NEXT:    v_mov_b32_e32 v29, 0x41100000
+; VI-NEXT:    v_mov_b32_e32 v30, 0x41200000
+; VI-NEXT:    v_writelane_b32 v41, s4, 0
+; VI-NEXT:    v_writelane_b32 v40, s31, 1
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_12xv3f32@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_12xv3f32@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    v_readlane_b32 s31, v40, 1
+; VI-NEXT:    v_readlane_b32 s30, v40, 0
+; VI-NEXT:    v_readlane_b32 s4, v41, 0
+; VI-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; VI-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; VI-NEXT:    s_mov_b64 exec, s[6:7]
+; VI-NEXT:    s_addk_i32 s32, 0xfc00
+; VI-NEXT:    s_mov_b32 s33, s4
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; CI-LABEL: stack_12xv3f32:
+; CI:       ; %bb.0: ; %entry
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 s4, s33
+; CI-NEXT:    s_mov_b32 s33, s32
+; CI-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; CI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; CI-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; CI-NEXT:    s_mov_b64 exec, s[8:9]
+; CI-NEXT:    s_addk_i32 s32, 0x400
+; CI-NEXT:    v_mov_b32_e32 v0, 0x41300000
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32
+; CI-NEXT:    v_mov_b32_e32 v0, 0x41400000
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
+; CI-NEXT:    v_mov_b32_e32 v0, 0x41500000
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
+; CI-NEXT:    v_mov_b32_e32 v0, 0x41600000
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
+; CI-NEXT:    v_mov_b32_e32 v0, 0x41700000
+; CI-NEXT:    v_writelane_b32 v40, s30, 0
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
+; CI-NEXT:    v_mov_b32_e32 v0, 0
+; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    v_mov_b32_e32 v2, 0
+; CI-NEXT:    v_mov_b32_e32 v3, 1.0
+; CI-NEXT:    v_mov_b32_e32 v4, 1.0
+; CI-NEXT:    v_mov_b32_e32 v5, 1.0
+; CI-NEXT:    v_mov_b32_e32 v6, 2.0
+; CI-NEXT:    v_mov_b32_e32 v7, 2.0
+; CI-NEXT:    v_mov_b32_e32 v8, 2.0
+; CI-NEXT:    v_mov_b32_e32 v9, 0x40400000
+; CI-NEXT:    v_mov_b32_e32 v10, 0x40400000
+; CI-NEXT:    v_mov_b32_e32 v11, 0x40400000
+; CI-NEXT:    v_mov_b32_e32 v12, 4.0
+; CI-NEXT:    v_mov_b32_e32 v13, 4.0
+; CI-NEXT:    v_mov_b32_e32 v14, 4.0
+; CI-NEXT:    v_mov_b32_e32 v15, 0x40a00000
+; CI-NEXT:    v_mov_b32_e32 v16, 0x40a00000
+; CI-NEXT:    v_mov_b32_e32 v17, 0x40a00000
+; CI-NEXT:    v_mov_b32_e32 v18, 0x40c00000
+; CI-NEXT:    v_mov_b32_e32 v19, 0x40c00000
+; CI-NEXT:    v_mov_b32_e32 v20, 0x40c00000
+; CI-NEXT:    v_mov_b32_e32 v21, 0x40e00000
+; CI-NEXT:    v_mov_b32_e32 v22, 0x40e00000
+; CI-NEXT:    v_mov_b32_e32 v23, 0x40e00000
+; CI-NEXT:    v_mov_b32_e32 v24, 0x41000000
+; CI-NEXT:    v_mov_b32_e32 v25, 0x41000000
+; CI-NEXT:    v_mov_b32_e32 v26, 0x41000000
+; CI-NEXT:    v_mov_b32_e32 v27, 0x41100000
+; CI-NEXT:    v_mov_b32_e32 v28, 0x41100000
+; CI-NEXT:    v_mov_b32_e32 v29, 0x41100000
+; CI-NEXT:    v_mov_b32_e32 v30, 0x41200000
+; CI-NEXT:    v_writelane_b32 v41, s4, 0
+; CI-NEXT:    v_writelane_b32 v40, s31, 1
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_12xv3f32@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_12xv3f32@rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    v_readlane_b32 s31, v40, 1
+; CI-NEXT:    v_readlane_b32 s30, v40, 0
+; CI-NEXT:    v_readlane_b32 s4, v41, 0
+; CI-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; CI-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; CI-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; CI-NEXT:    s_mov_b64 exec, s[6:7]
+; CI-NEXT:    s_addk_i32 s32, 0xfc00
+; CI-NEXT:    s_mov_b32 s33, s4
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: stack_12xv3f32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s4, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[8:9]
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41300000
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41400000
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41500000
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41600000
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41700000
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 1.0
+; GFX9-NEXT:    v_mov_b32_e32 v4, 1.0
+; GFX9-NEXT:    v_mov_b32_e32 v5, 1.0
+; GFX9-NEXT:    v_mov_b32_e32 v6, 2.0
+; GFX9-NEXT:    v_mov_b32_e32 v7, 2.0
+; GFX9-NEXT:    v_mov_b32_e32 v8, 2.0
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0x40400000
+; GFX9-NEXT:    v_mov_b32_e32 v10, 0x40400000
+; GFX9-NEXT:    v_mov_b32_e32 v11, 0x40400000
+; GFX9-NEXT:    v_mov_b32_e32 v12, 4.0
+; GFX9-NEXT:    v_mov_b32_e32 v13, 4.0
+; GFX9-NEXT:    v_mov_b32_e32 v14, 4.0
+; GFX9-NEXT:    v_mov_b32_e32 v15, 0x40a00000
+; GFX9-NEXT:    v_mov_b32_e32 v16, 0x40a00000
+; GFX9-NEXT:    v_mov_b32_e32 v17, 0x40a00000
+; GFX9-NEXT:    v_mov_b32_e32 v18, 0x40c00000
+; GFX9-NEXT:    v_mov_b32_e32 v19, 0x40c00000
+; GFX9-NEXT:    v_mov_b32_e32 v20, 0x40c00000
+; GFX9-NEXT:    v_mov_b32_e32 v21, 0x40e00000
+; GFX9-NEXT:    v_mov_b32_e32 v22, 0x40e00000
+; GFX9-NEXT:    v_mov_b32_e32 v23, 0x40e00000
+; GFX9-NEXT:    v_mov_b32_e32 v24, 0x41000000
+; GFX9-NEXT:    v_mov_b32_e32 v25, 0x41000000
+; GFX9-NEXT:    v_mov_b32_e32 v26, 0x41000000
+; GFX9-NEXT:    v_mov_b32_e32 v27, 0x41100000
+; GFX9-NEXT:    v_mov_b32_e32 v28, 0x41100000
+; GFX9-NEXT:    v_mov_b32_e32 v29, 0x41100000
+; GFX9-NEXT:    v_mov_b32_e32 v30, 0x41200000
+; GFX9-NEXT:    v_writelane_b32 v41, s4, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_12xv3f32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_12xv3f32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s4, v41, 0
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: stack_12xv3f32:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s0, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0x41300000
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0x41400000
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0x41500000
+; GFX11-NEXT:    v_mov_b32_e32 v3, 0x41600000
+; GFX11-NEXT:    v_dual_mov_b32 v4, 0x41700000 :: v_dual_mov_b32 v5, 1.0
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX11-NEXT:    s_add_i32 s0, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
+; GFX11-NEXT:    scratch_store_b32 off, v4, s0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 1.0
+; GFX11-NEXT:    v_dual_mov_b32 v4, 1.0 :: v_dual_mov_b32 v7, 2.0
+; GFX11-NEXT:    v_dual_mov_b32 v6, 2.0 :: v_dual_mov_b32 v9, 0x40400000
+; GFX11-NEXT:    v_dual_mov_b32 v8, 2.0 :: v_dual_mov_b32 v11, 0x40400000
+; GFX11-NEXT:    v_dual_mov_b32 v10, 0x40400000 :: v_dual_mov_b32 v13, 4.0
+; GFX11-NEXT:    v_dual_mov_b32 v12, 4.0 :: v_dual_mov_b32 v15, 0x40a00000
+; GFX11-NEXT:    v_dual_mov_b32 v14, 4.0 :: v_dual_mov_b32 v17, 0x40a00000
+; GFX11-NEXT:    v_mov_b32_e32 v16, 0x40a00000
+; GFX11-NEXT:    v_dual_mov_b32 v18, 0x40c00000 :: v_dual_mov_b32 v19, 0x40c00000
+; GFX11-NEXT:    v_mov_b32_e32 v20, 0x40c00000
+; GFX11-NEXT:    v_dual_mov_b32 v21, 0x40e00000 :: v_dual_mov_b32 v22, 0x40e00000
+; GFX11-NEXT:    v_mov_b32_e32 v23, 0x40e00000
+; GFX11-NEXT:    v_dual_mov_b32 v24, 0x41000000 :: v_dual_mov_b32 v25, 0x41000000
+; GFX11-NEXT:    v_mov_b32_e32 v26, 0x41000000
+; GFX11-NEXT:    v_dual_mov_b32 v27, 0x41100000 :: v_dual_mov_b32 v28, 0x41100000
+; GFX11-NEXT:    v_mov_b32_e32 v29, 0x41100000
+; GFX11-NEXT:    v_mov_b32_e32 v30, 0x41200000
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_12xv3f32@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_12xv3f32@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; HSA-LABEL: stack_12xv3f32:
+; HSA:       ; %bb.0: ; %entry
+; HSA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; HSA-NEXT:    s_mov_b32 s4, s33
+; HSA-NEXT:    s_mov_b32 s33, s32
+; HSA-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; HSA-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; HSA-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; HSA-NEXT:    s_mov_b64 exec, s[8:9]
+; HSA-NEXT:    s_addk_i32 s32, 0x400
+; HSA-NEXT:    v_mov_b32_e32 v0, 0x41300000
+; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32
+; HSA-NEXT:    v_mov_b32_e32 v0, 0x41400000
+; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
+; HSA-NEXT:    v_mov_b32_e32 v0, 0x41500000
+; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
+; HSA-NEXT:    v_mov_b32_e32 v0, 0x41600000
+; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
+; HSA-NEXT:    v_mov_b32_e32 v0, 0x41700000
+; HSA-NEXT:    v_writelane_b32 v40, s30, 0
+; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
+; HSA-NEXT:    v_mov_b32_e32 v0, 0
+; HSA-NEXT:    v_mov_b32_e32 v1, 0
+; HSA-NEXT:    v_mov_b32_e32 v2, 0
+; HSA-NEXT:    v_mov_b32_e32 v3, 1.0
+; HSA-NEXT:    v_mov_b32_e32 v4, 1.0
+; HSA-NEXT:    v_mov_b32_e32 v5, 1.0
+; HSA-NEXT:    v_mov_b32_e32 v6, 2.0
+; HSA-NEXT:    v_mov_b32_e32 v7, 2.0
+; HSA-NEXT:    v_mov_b32_e32 v8, 2.0
+; HSA-NEXT:    v_mov_b32_e32 v9, 0x40400000
+; HSA-NEXT:    v_mov_b32_e32 v10, 0x40400000
+; HSA-NEXT:    v_mov_b32_e32 v11, 0x40400000
+; HSA-NEXT:    v_mov_b32_e32 v12, 4.0
+; HSA-NEXT:    v_mov_b32_e32 v13, 4.0
+; HSA-NEXT:    v_mov_b32_e32 v14, 4.0
+; HSA-NEXT:    v_mov_b32_e32 v15, 0x40a00000
+; HSA-NEXT:    v_mov_b32_e32 v16, 0x40a00000
+; HSA-NEXT:    v_mov_b32_e32 v17, 0x40a00000
+; HSA-NEXT:    v_mov_b32_e32 v18, 0x40c00000
+; HSA-NEXT:    v_mov_b32_e32 v19, 0x40c00000
+; HSA-NEXT:    v_mov_b32_e32 v20, 0x40c00000
+; HSA-NEXT:    v_mov_b32_e32 v21, 0x40e00000
+; HSA-NEXT:    v_mov_b32_e32 v22, 0x40e00000
+; HSA-NEXT:    v_mov_b32_e32 v23, 0x40e00000
+; HSA-NEXT:    v_mov_b32_e32 v24, 0x41000000
+; HSA-NEXT:    v_mov_b32_e32 v25, 0x41000000
+; HSA-NEXT:    v_mov_b32_e32 v26, 0x41000000
+; HSA-NEXT:    v_mov_b32_e32 v27, 0x41100000
+; HSA-NEXT:    v_mov_b32_e32 v28, 0x41100000
+; HSA-NEXT:    v_mov_b32_e32 v29, 0x41100000
+; HSA-NEXT:    v_mov_b32_e32 v30, 0x41200000
+; HSA-NEXT:    v_writelane_b32 v41, s4, 0
+; HSA-NEXT:    v_writelane_b32 v40, s31, 1
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_12xv3f32@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_12xv3f32@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; HSA-NEXT:    v_readlane_b32 s31, v40, 1
+; HSA-NEXT:    v_readlane_b32 s30, v40, 0
+; HSA-NEXT:    v_readlane_b32 s4, v41, 0
+; HSA-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; HSA-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; HSA-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; HSA-NEXT:    s_mov_b64 exec, s[6:7]
+; HSA-NEXT:    s_addk_i32 s32, 0xfc00
+; HSA-NEXT:    s_mov_b32 s33, s4
+; HSA-NEXT:    s_waitcnt vmcnt(0)
+; HSA-NEXT:    s_setpc_b64 s[30:31]
 entry:
   call void @external_void_func_12xv3f32(
       <3 x float><float 0.0, float 0.0, float 0.0>,
@@ -834,27 +6526,378 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}stack_8xv5i32:
-; GCN: v_mov_b32_e32 [[REG7:v[0-9]+]], 7
-; GCN: buffer_store_dword [[REG7]], {{.*$}}
-; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 8
-; GCN: buffer_store_dword [[REG8]], {{.*}} offset:4
-; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 9
-; GCN: buffer_store_dword [[REG9]], {{.*}} offset:8
-; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 10
-; GCN: buffer_store_dword [[REG10]], {{.*}} offset:12
-; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 11
-; GCN: buffer_store_dword [[REG11]], {{.*}} offset:16
-; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 12
-; GCN: buffer_store_dword [[REG12]], {{.*}} offset:20
-; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 13
-; GCN: buffer_store_dword [[REG13]], {{.*}} offset:24
-; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 14
-; GCN: buffer_store_dword [[REG14]], {{.*}} offset:28
-; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 15
-; GCN: buffer_store_dword [[REG15]], {{.*}} offset:32
-; GCN: s_getpc
 define void @stack_8xv5i32() #0 {
+; VI-LABEL: stack_8xv5i32:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s4, s33
+; VI-NEXT:    s_mov_b32 s33, s32
+; VI-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; VI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; VI-NEXT:    s_mov_b64 exec, s[8:9]
+; VI-NEXT:    s_addk_i32 s32, 0x400
+; VI-NEXT:    v_mov_b32_e32 v0, 7
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32
+; VI-NEXT:    v_mov_b32_e32 v0, 8
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
+; VI-NEXT:    v_mov_b32_e32 v0, 9
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
+; VI-NEXT:    v_mov_b32_e32 v0, 10
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
+; VI-NEXT:    v_mov_b32_e32 v0, 11
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
+; VI-NEXT:    v_mov_b32_e32 v0, 12
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
+; VI-NEXT:    v_mov_b32_e32 v0, 13
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
+; VI-NEXT:    v_mov_b32_e32 v0, 14
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:28
+; VI-NEXT:    v_mov_b32_e32 v0, 15
+; VI-NEXT:    v_writelane_b32 v40, s30, 0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:32
+; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    v_mov_b32_e32 v2, 0
+; VI-NEXT:    v_mov_b32_e32 v3, 0
+; VI-NEXT:    v_mov_b32_e32 v4, 0
+; VI-NEXT:    v_mov_b32_e32 v5, 1
+; VI-NEXT:    v_mov_b32_e32 v6, 1
+; VI-NEXT:    v_mov_b32_e32 v7, 1
+; VI-NEXT:    v_mov_b32_e32 v8, 1
+; VI-NEXT:    v_mov_b32_e32 v9, 1
+; VI-NEXT:    v_mov_b32_e32 v10, 2
+; VI-NEXT:    v_mov_b32_e32 v11, 2
+; VI-NEXT:    v_mov_b32_e32 v12, 2
+; VI-NEXT:    v_mov_b32_e32 v13, 2
+; VI-NEXT:    v_mov_b32_e32 v14, 2
+; VI-NEXT:    v_mov_b32_e32 v15, 3
+; VI-NEXT:    v_mov_b32_e32 v16, 3
+; VI-NEXT:    v_mov_b32_e32 v17, 3
+; VI-NEXT:    v_mov_b32_e32 v18, 3
+; VI-NEXT:    v_mov_b32_e32 v19, 3
+; VI-NEXT:    v_mov_b32_e32 v20, 4
+; VI-NEXT:    v_mov_b32_e32 v21, 4
+; VI-NEXT:    v_mov_b32_e32 v22, 4
+; VI-NEXT:    v_mov_b32_e32 v23, 4
+; VI-NEXT:    v_mov_b32_e32 v24, 4
+; VI-NEXT:    v_mov_b32_e32 v25, 5
+; VI-NEXT:    v_mov_b32_e32 v26, 5
+; VI-NEXT:    v_mov_b32_e32 v27, 5
+; VI-NEXT:    v_mov_b32_e32 v28, 5
+; VI-NEXT:    v_mov_b32_e32 v29, 5
+; VI-NEXT:    v_mov_b32_e32 v30, 6
+; VI-NEXT:    v_writelane_b32 v41, s4, 0
+; VI-NEXT:    v_writelane_b32 v40, s31, 1
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_8xv5i32@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_8xv5i32@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    v_readlane_b32 s31, v40, 1
+; VI-NEXT:    v_readlane_b32 s30, v40, 0
+; VI-NEXT:    v_readlane_b32 s4, v41, 0
+; VI-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; VI-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; VI-NEXT:    s_mov_b64 exec, s[6:7]
+; VI-NEXT:    s_addk_i32 s32, 0xfc00
+; VI-NEXT:    s_mov_b32 s33, s4
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; CI-LABEL: stack_8xv5i32:
+; CI:       ; %bb.0: ; %entry
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 s4, s33
+; CI-NEXT:    s_mov_b32 s33, s32
+; CI-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; CI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; CI-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; CI-NEXT:    s_mov_b64 exec, s[8:9]
+; CI-NEXT:    s_addk_i32 s32, 0x400
+; CI-NEXT:    v_mov_b32_e32 v0, 7
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32
+; CI-NEXT:    v_mov_b32_e32 v0, 8
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
+; CI-NEXT:    v_mov_b32_e32 v0, 9
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
+; CI-NEXT:    v_mov_b32_e32 v0, 10
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
+; CI-NEXT:    v_mov_b32_e32 v0, 11
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
+; CI-NEXT:    v_mov_b32_e32 v0, 12
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
+; CI-NEXT:    v_mov_b32_e32 v0, 13
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
+; CI-NEXT:    v_mov_b32_e32 v0, 14
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:28
+; CI-NEXT:    v_mov_b32_e32 v0, 15
+; CI-NEXT:    v_writelane_b32 v40, s30, 0
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:32
+; CI-NEXT:    v_mov_b32_e32 v0, 0
+; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    v_mov_b32_e32 v2, 0
+; CI-NEXT:    v_mov_b32_e32 v3, 0
+; CI-NEXT:    v_mov_b32_e32 v4, 0
+; CI-NEXT:    v_mov_b32_e32 v5, 1
+; CI-NEXT:    v_mov_b32_e32 v6, 1
+; CI-NEXT:    v_mov_b32_e32 v7, 1
+; CI-NEXT:    v_mov_b32_e32 v8, 1
+; CI-NEXT:    v_mov_b32_e32 v9, 1
+; CI-NEXT:    v_mov_b32_e32 v10, 2
+; CI-NEXT:    v_mov_b32_e32 v11, 2
+; CI-NEXT:    v_mov_b32_e32 v12, 2
+; CI-NEXT:    v_mov_b32_e32 v13, 2
+; CI-NEXT:    v_mov_b32_e32 v14, 2
+; CI-NEXT:    v_mov_b32_e32 v15, 3
+; CI-NEXT:    v_mov_b32_e32 v16, 3
+; CI-NEXT:    v_mov_b32_e32 v17, 3
+; CI-NEXT:    v_mov_b32_e32 v18, 3
+; CI-NEXT:    v_mov_b32_e32 v19, 3
+; CI-NEXT:    v_mov_b32_e32 v20, 4
+; CI-NEXT:    v_mov_b32_e32 v21, 4
+; CI-NEXT:    v_mov_b32_e32 v22, 4
+; CI-NEXT:    v_mov_b32_e32 v23, 4
+; CI-NEXT:    v_mov_b32_e32 v24, 4
+; CI-NEXT:    v_mov_b32_e32 v25, 5
+; CI-NEXT:    v_mov_b32_e32 v26, 5
+; CI-NEXT:    v_mov_b32_e32 v27, 5
+; CI-NEXT:    v_mov_b32_e32 v28, 5
+; CI-NEXT:    v_mov_b32_e32 v29, 5
+; CI-NEXT:    v_mov_b32_e32 v30, 6
+; CI-NEXT:    v_writelane_b32 v41, s4, 0
+; CI-NEXT:    v_writelane_b32 v40, s31, 1
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_8xv5i32@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_8xv5i32@rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    v_readlane_b32 s31, v40, 1
+; CI-NEXT:    v_readlane_b32 s30, v40, 0
+; CI-NEXT:    v_readlane_b32 s4, v41, 0
+; CI-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; CI-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; CI-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; CI-NEXT:    s_mov_b64 exec, s[6:7]
+; CI-NEXT:    s_addk_i32 s32, 0xfc00
+; CI-NEXT:    s_mov_b32 s33, s4
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: stack_8xv5i32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s4, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[8:9]
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_mov_b32_e32 v0, 7
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32
+; GFX9-NEXT:    v_mov_b32_e32 v0, 8
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    v_mov_b32_e32 v0, 9
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    v_mov_b32_e32 v0, 10
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    v_mov_b32_e32 v0, 11
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    v_mov_b32_e32 v0, 12
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    v_mov_b32_e32 v0, 13
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 14
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:28
+; GFX9-NEXT:    v_mov_b32_e32 v0, 15
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:32
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, 1
+; GFX9-NEXT:    v_mov_b32_e32 v6, 1
+; GFX9-NEXT:    v_mov_b32_e32 v7, 1
+; GFX9-NEXT:    v_mov_b32_e32 v8, 1
+; GFX9-NEXT:    v_mov_b32_e32 v9, 1
+; GFX9-NEXT:    v_mov_b32_e32 v10, 2
+; GFX9-NEXT:    v_mov_b32_e32 v11, 2
+; GFX9-NEXT:    v_mov_b32_e32 v12, 2
+; GFX9-NEXT:    v_mov_b32_e32 v13, 2
+; GFX9-NEXT:    v_mov_b32_e32 v14, 2
+; GFX9-NEXT:    v_mov_b32_e32 v15, 3
+; GFX9-NEXT:    v_mov_b32_e32 v16, 3
+; GFX9-NEXT:    v_mov_b32_e32 v17, 3
+; GFX9-NEXT:    v_mov_b32_e32 v18, 3
+; GFX9-NEXT:    v_mov_b32_e32 v19, 3
+; GFX9-NEXT:    v_mov_b32_e32 v20, 4
+; GFX9-NEXT:    v_mov_b32_e32 v21, 4
+; GFX9-NEXT:    v_mov_b32_e32 v22, 4
+; GFX9-NEXT:    v_mov_b32_e32 v23, 4
+; GFX9-NEXT:    v_mov_b32_e32 v24, 4
+; GFX9-NEXT:    v_mov_b32_e32 v25, 5
+; GFX9-NEXT:    v_mov_b32_e32 v26, 5
+; GFX9-NEXT:    v_mov_b32_e32 v27, 5
+; GFX9-NEXT:    v_mov_b32_e32 v28, 5
+; GFX9-NEXT:    v_mov_b32_e32 v29, 5
+; GFX9-NEXT:    v_mov_b32_e32 v30, 6
+; GFX9-NEXT:    v_writelane_b32 v41, s4, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_8xv5i32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_8xv5i32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s4, v41, 0
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: stack_8xv5i32:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s0, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    v_dual_mov_b32 v0, 7 :: v_dual_mov_b32 v1, 8
+; GFX11-NEXT:    v_dual_mov_b32 v2, 9 :: v_dual_mov_b32 v3, 10
+; GFX11-NEXT:    v_dual_mov_b32 v8, 15 :: v_dual_mov_b32 v5, 12
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_dual_mov_b32 v4, 11 :: v_dual_mov_b32 v7, 14
+; GFX11-NEXT:    v_mov_b32_e32 v6, 13
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX11-NEXT:    s_add_i32 s0, s32, 32
+; GFX11-NEXT:    s_add_i32 s1, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    scratch_store_b32 off, v8, s0
+; GFX11-NEXT:    scratch_store_b128 off, v[4:7], s1
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, 0
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v5, 1
+; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v7, 1
+; GFX11-NEXT:    v_dual_mov_b32 v6, 1 :: v_dual_mov_b32 v9, 1
+; GFX11-NEXT:    v_dual_mov_b32 v8, 1 :: v_dual_mov_b32 v11, 2
+; GFX11-NEXT:    v_dual_mov_b32 v10, 2 :: v_dual_mov_b32 v13, 2
+; GFX11-NEXT:    v_dual_mov_b32 v12, 2 :: v_dual_mov_b32 v15, 3
+; GFX11-NEXT:    v_dual_mov_b32 v14, 2 :: v_dual_mov_b32 v17, 3
+; GFX11-NEXT:    v_dual_mov_b32 v16, 3 :: v_dual_mov_b32 v19, 3
+; GFX11-NEXT:    v_dual_mov_b32 v18, 3 :: v_dual_mov_b32 v21, 4
+; GFX11-NEXT:    v_dual_mov_b32 v20, 4 :: v_dual_mov_b32 v23, 4
+; GFX11-NEXT:    v_dual_mov_b32 v22, 4 :: v_dual_mov_b32 v25, 5
+; GFX11-NEXT:    v_dual_mov_b32 v24, 4 :: v_dual_mov_b32 v27, 5
+; GFX11-NEXT:    v_dual_mov_b32 v26, 5 :: v_dual_mov_b32 v29, 5
+; GFX11-NEXT:    v_mov_b32_e32 v28, 5
+; GFX11-NEXT:    v_mov_b32_e32 v30, 6
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_8xv5i32@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_8xv5i32@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; HSA-LABEL: stack_8xv5i32:
+; HSA:       ; %bb.0: ; %entry
+; HSA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; HSA-NEXT:    s_mov_b32 s4, s33
+; HSA-NEXT:    s_mov_b32 s33, s32
+; HSA-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; HSA-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; HSA-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; HSA-NEXT:    s_mov_b64 exec, s[8:9]
+; HSA-NEXT:    s_addk_i32 s32, 0x400
+; HSA-NEXT:    v_mov_b32_e32 v0, 7
+; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32
+; HSA-NEXT:    v_mov_b32_e32 v0, 8
+; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
+; HSA-NEXT:    v_mov_b32_e32 v0, 9
+; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
+; HSA-NEXT:    v_mov_b32_e32 v0, 10
+; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
+; HSA-NEXT:    v_mov_b32_e32 v0, 11
+; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
+; HSA-NEXT:    v_mov_b32_e32 v0, 12
+; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
+; HSA-NEXT:    v_mov_b32_e32 v0, 13
+; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
+; HSA-NEXT:    v_mov_b32_e32 v0, 14
+; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:28
+; HSA-NEXT:    v_mov_b32_e32 v0, 15
+; HSA-NEXT:    v_writelane_b32 v40, s30, 0
+; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:32
+; HSA-NEXT:    v_mov_b32_e32 v0, 0
+; HSA-NEXT:    v_mov_b32_e32 v1, 0
+; HSA-NEXT:    v_mov_b32_e32 v2, 0
+; HSA-NEXT:    v_mov_b32_e32 v3, 0
+; HSA-NEXT:    v_mov_b32_e32 v4, 0
+; HSA-NEXT:    v_mov_b32_e32 v5, 1
+; HSA-NEXT:    v_mov_b32_e32 v6, 1
+; HSA-NEXT:    v_mov_b32_e32 v7, 1
+; HSA-NEXT:    v_mov_b32_e32 v8, 1
+; HSA-NEXT:    v_mov_b32_e32 v9, 1
+; HSA-NEXT:    v_mov_b32_e32 v10, 2
+; HSA-NEXT:    v_mov_b32_e32 v11, 2
+; HSA-NEXT:    v_mov_b32_e32 v12, 2
+; HSA-NEXT:    v_mov_b32_e32 v13, 2
+; HSA-NEXT:    v_mov_b32_e32 v14, 2
+; HSA-NEXT:    v_mov_b32_e32 v15, 3
+; HSA-NEXT:    v_mov_b32_e32 v16, 3
+; HSA-NEXT:    v_mov_b32_e32 v17, 3
+; HSA-NEXT:    v_mov_b32_e32 v18, 3
+; HSA-NEXT:    v_mov_b32_e32 v19, 3
+; HSA-NEXT:    v_mov_b32_e32 v20, 4
+; HSA-NEXT:    v_mov_b32_e32 v21, 4
+; HSA-NEXT:    v_mov_b32_e32 v22, 4
+; HSA-NEXT:    v_mov_b32_e32 v23, 4
+; HSA-NEXT:    v_mov_b32_e32 v24, 4
+; HSA-NEXT:    v_mov_b32_e32 v25, 5
+; HSA-NEXT:    v_mov_b32_e32 v26, 5
+; HSA-NEXT:    v_mov_b32_e32 v27, 5
+; HSA-NEXT:    v_mov_b32_e32 v28, 5
+; HSA-NEXT:    v_mov_b32_e32 v29, 5
+; HSA-NEXT:    v_mov_b32_e32 v30, 6
+; HSA-NEXT:    v_writelane_b32 v41, s4, 0
+; HSA-NEXT:    v_writelane_b32 v40, s31, 1
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_8xv5i32@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_8xv5i32@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; HSA-NEXT:    v_readlane_b32 s31, v40, 1
+; HSA-NEXT:    v_readlane_b32 s30, v40, 0
+; HSA-NEXT:    v_readlane_b32 s4, v41, 0
+; HSA-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; HSA-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; HSA-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; HSA-NEXT:    s_mov_b64 exec, s[6:7]
+; HSA-NEXT:    s_addk_i32 s32, 0xfc00
+; HSA-NEXT:    s_mov_b32 s33, s4
+; HSA-NEXT:    s_waitcnt vmcnt(0)
+; HSA-NEXT:    s_setpc_b64 s[30:31]
 entry:
   call void @external_void_func_8xv5i32(
       <5 x i32><i32 0, i32 0, i32 0, i32 0, i32 0>,
@@ -868,27 +6911,381 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}stack_8xv5f32:
-; GCN: v_mov_b32_e32 [[REG7:v[0-9]+]], 0x40e00000
-; GCN: buffer_store_dword [[REG7]], {{.*$}}
-; GCN: v_mov_b32_e32 [[REG8:v[0-9]+]], 0x41000000
-; GCN: buffer_store_dword [[REG8]], {{.*}} offset:4
-; GCN: v_mov_b32_e32 [[REG9:v[0-9]+]], 0x41100000
-; GCN: buffer_store_dword [[REG9]], {{.*}} offset:8
-; GCN: v_mov_b32_e32 [[REG10:v[0-9]+]], 0x41200000
-; GCN: buffer_store_dword [[REG10]], {{.*}} offset:12
-; GCN: v_mov_b32_e32 [[REG11:v[0-9]+]], 0x41300000
-; GCN: buffer_store_dword [[REG11]], {{.*}} offset:16
-; GCN: v_mov_b32_e32 [[REG12:v[0-9]+]], 0x41400000
-; GCN: buffer_store_dword [[REG12]], {{.*}} offset:20
-; GCN: v_mov_b32_e32 [[REG13:v[0-9]+]], 0x41500000
-; GCN: buffer_store_dword [[REG13]], {{.*}} offset:24
-; GCN: v_mov_b32_e32 [[REG14:v[0-9]+]], 0x41600000
-; GCN: buffer_store_dword [[REG14]], {{.*}} offset:28
-; GCN: v_mov_b32_e32 [[REG15:v[0-9]+]], 0x41700000
-; GCN: buffer_store_dword [[REG15]], {{.*}} offset:32
-; GCN: s_getpc
 define void @stack_8xv5f32() #0 {
+; VI-LABEL: stack_8xv5f32:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s4, s33
+; VI-NEXT:    s_mov_b32 s33, s32
+; VI-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; VI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; VI-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; VI-NEXT:    s_mov_b64 exec, s[8:9]
+; VI-NEXT:    s_addk_i32 s32, 0x400
+; VI-NEXT:    v_mov_b32_e32 v0, 0x40e00000
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32
+; VI-NEXT:    v_mov_b32_e32 v0, 0x41000000
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
+; VI-NEXT:    v_mov_b32_e32 v0, 0x41100000
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
+; VI-NEXT:    v_mov_b32_e32 v0, 0x41200000
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
+; VI-NEXT:    v_mov_b32_e32 v0, 0x41300000
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
+; VI-NEXT:    v_mov_b32_e32 v0, 0x41400000
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
+; VI-NEXT:    v_mov_b32_e32 v0, 0x41500000
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
+; VI-NEXT:    v_mov_b32_e32 v0, 0x41600000
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:28
+; VI-NEXT:    v_mov_b32_e32 v0, 0x41700000
+; VI-NEXT:    v_writelane_b32 v40, s30, 0
+; VI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:32
+; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    v_mov_b32_e32 v2, 0
+; VI-NEXT:    v_mov_b32_e32 v3, 0
+; VI-NEXT:    v_mov_b32_e32 v4, 0
+; VI-NEXT:    v_mov_b32_e32 v5, 1.0
+; VI-NEXT:    v_mov_b32_e32 v6, 1.0
+; VI-NEXT:    v_mov_b32_e32 v7, 1.0
+; VI-NEXT:    v_mov_b32_e32 v8, 1.0
+; VI-NEXT:    v_mov_b32_e32 v9, 1.0
+; VI-NEXT:    v_mov_b32_e32 v10, 2.0
+; VI-NEXT:    v_mov_b32_e32 v11, 2.0
+; VI-NEXT:    v_mov_b32_e32 v12, 2.0
+; VI-NEXT:    v_mov_b32_e32 v13, 2.0
+; VI-NEXT:    v_mov_b32_e32 v14, 2.0
+; VI-NEXT:    v_mov_b32_e32 v15, 0x40400000
+; VI-NEXT:    v_mov_b32_e32 v16, 0x40400000
+; VI-NEXT:    v_mov_b32_e32 v17, 0x40400000
+; VI-NEXT:    v_mov_b32_e32 v18, 0x40400000
+; VI-NEXT:    v_mov_b32_e32 v19, 0x40400000
+; VI-NEXT:    v_mov_b32_e32 v20, 4.0
+; VI-NEXT:    v_mov_b32_e32 v21, 4.0
+; VI-NEXT:    v_mov_b32_e32 v22, 4.0
+; VI-NEXT:    v_mov_b32_e32 v23, 4.0
+; VI-NEXT:    v_mov_b32_e32 v24, 4.0
+; VI-NEXT:    v_mov_b32_e32 v25, 0x40a00000
+; VI-NEXT:    v_mov_b32_e32 v26, 0x40a00000
+; VI-NEXT:    v_mov_b32_e32 v27, 0x40a00000
+; VI-NEXT:    v_mov_b32_e32 v28, 0x40a00000
+; VI-NEXT:    v_mov_b32_e32 v29, 0x40a00000
+; VI-NEXT:    v_mov_b32_e32 v30, 0x40c00000
+; VI-NEXT:    v_writelane_b32 v41, s4, 0
+; VI-NEXT:    v_writelane_b32 v40, s31, 1
+; VI-NEXT:    s_getpc_b64 s[4:5]
+; VI-NEXT:    s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4
+; VI-NEXT:    s_addc_u32 s5, s5, external_void_func_8xv5f32@rel32@hi+12
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    v_readlane_b32 s31, v40, 1
+; VI-NEXT:    v_readlane_b32 s30, v40, 0
+; VI-NEXT:    v_readlane_b32 s4, v41, 0
+; VI-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; VI-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; VI-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; VI-NEXT:    s_mov_b64 exec, s[6:7]
+; VI-NEXT:    s_addk_i32 s32, 0xfc00
+; VI-NEXT:    s_mov_b32 s33, s4
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; CI-LABEL: stack_8xv5f32:
+; CI:       ; %bb.0: ; %entry
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 s4, s33
+; CI-NEXT:    s_mov_b32 s33, s32
+; CI-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; CI-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; CI-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; CI-NEXT:    s_mov_b64 exec, s[8:9]
+; CI-NEXT:    s_addk_i32 s32, 0x400
+; CI-NEXT:    v_mov_b32_e32 v0, 0x40e00000
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32
+; CI-NEXT:    v_mov_b32_e32 v0, 0x41000000
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
+; CI-NEXT:    v_mov_b32_e32 v0, 0x41100000
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
+; CI-NEXT:    v_mov_b32_e32 v0, 0x41200000
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
+; CI-NEXT:    v_mov_b32_e32 v0, 0x41300000
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
+; CI-NEXT:    v_mov_b32_e32 v0, 0x41400000
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
+; CI-NEXT:    v_mov_b32_e32 v0, 0x41500000
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
+; CI-NEXT:    v_mov_b32_e32 v0, 0x41600000
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:28
+; CI-NEXT:    v_mov_b32_e32 v0, 0x41700000
+; CI-NEXT:    v_writelane_b32 v40, s30, 0
+; CI-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:32
+; CI-NEXT:    v_mov_b32_e32 v0, 0
+; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    v_mov_b32_e32 v2, 0
+; CI-NEXT:    v_mov_b32_e32 v3, 0
+; CI-NEXT:    v_mov_b32_e32 v4, 0
+; CI-NEXT:    v_mov_b32_e32 v5, 1.0
+; CI-NEXT:    v_mov_b32_e32 v6, 1.0
+; CI-NEXT:    v_mov_b32_e32 v7, 1.0
+; CI-NEXT:    v_mov_b32_e32 v8, 1.0
+; CI-NEXT:    v_mov_b32_e32 v9, 1.0
+; CI-NEXT:    v_mov_b32_e32 v10, 2.0
+; CI-NEXT:    v_mov_b32_e32 v11, 2.0
+; CI-NEXT:    v_mov_b32_e32 v12, 2.0
+; CI-NEXT:    v_mov_b32_e32 v13, 2.0
+; CI-NEXT:    v_mov_b32_e32 v14, 2.0
+; CI-NEXT:    v_mov_b32_e32 v15, 0x40400000
+; CI-NEXT:    v_mov_b32_e32 v16, 0x40400000
+; CI-NEXT:    v_mov_b32_e32 v17, 0x40400000
+; CI-NEXT:    v_mov_b32_e32 v18, 0x40400000
+; CI-NEXT:    v_mov_b32_e32 v19, 0x40400000
+; CI-NEXT:    v_mov_b32_e32 v20, 4.0
+; CI-NEXT:    v_mov_b32_e32 v21, 4.0
+; CI-NEXT:    v_mov_b32_e32 v22, 4.0
+; CI-NEXT:    v_mov_b32_e32 v23, 4.0
+; CI-NEXT:    v_mov_b32_e32 v24, 4.0
+; CI-NEXT:    v_mov_b32_e32 v25, 0x40a00000
+; CI-NEXT:    v_mov_b32_e32 v26, 0x40a00000
+; CI-NEXT:    v_mov_b32_e32 v27, 0x40a00000
+; CI-NEXT:    v_mov_b32_e32 v28, 0x40a00000
+; CI-NEXT:    v_mov_b32_e32 v29, 0x40a00000
+; CI-NEXT:    v_mov_b32_e32 v30, 0x40c00000
+; CI-NEXT:    v_writelane_b32 v41, s4, 0
+; CI-NEXT:    v_writelane_b32 v40, s31, 1
+; CI-NEXT:    s_getpc_b64 s[4:5]
+; CI-NEXT:    s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4
+; CI-NEXT:    s_addc_u32 s5, s5, external_void_func_8xv5f32@rel32@hi+12
+; CI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; CI-NEXT:    v_readlane_b32 s31, v40, 1
+; CI-NEXT:    v_readlane_b32 s30, v40, 0
+; CI-NEXT:    v_readlane_b32 s4, v41, 0
+; CI-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; CI-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; CI-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; CI-NEXT:    s_mov_b64 exec, s[6:7]
+; CI-NEXT:    s_addk_i32 s32, 0xfc00
+; CI-NEXT:    s_mov_b32 s33, s4
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: stack_8xv5f32:
+; GFX9:       ; %bb.0: ; %entry
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s4, s33
+; GFX9-NEXT:    s_mov_b32 s33, s32
+; GFX9-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT:    s_mov_b64 exec, s[8:9]
+; GFX9-NEXT:    s_addk_i32 s32, 0x400
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x40e00000
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41000000
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41100000
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41200000
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41300000
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41400000
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41500000
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41600000
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:28
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x41700000
+; GFX9-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:32
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    v_mov_b32_e32 v5, 1.0
+; GFX9-NEXT:    v_mov_b32_e32 v6, 1.0
+; GFX9-NEXT:    v_mov_b32_e32 v7, 1.0
+; GFX9-NEXT:    v_mov_b32_e32 v8, 1.0
+; GFX9-NEXT:    v_mov_b32_e32 v9, 1.0
+; GFX9-NEXT:    v_mov_b32_e32 v10, 2.0
+; GFX9-NEXT:    v_mov_b32_e32 v11, 2.0
+; GFX9-NEXT:    v_mov_b32_e32 v12, 2.0
+; GFX9-NEXT:    v_mov_b32_e32 v13, 2.0
+; GFX9-NEXT:    v_mov_b32_e32 v14, 2.0
+; GFX9-NEXT:    v_mov_b32_e32 v15, 0x40400000
+; GFX9-NEXT:    v_mov_b32_e32 v16, 0x40400000
+; GFX9-NEXT:    v_mov_b32_e32 v17, 0x40400000
+; GFX9-NEXT:    v_mov_b32_e32 v18, 0x40400000
+; GFX9-NEXT:    v_mov_b32_e32 v19, 0x40400000
+; GFX9-NEXT:    v_mov_b32_e32 v20, 4.0
+; GFX9-NEXT:    v_mov_b32_e32 v21, 4.0
+; GFX9-NEXT:    v_mov_b32_e32 v22, 4.0
+; GFX9-NEXT:    v_mov_b32_e32 v23, 4.0
+; GFX9-NEXT:    v_mov_b32_e32 v24, 4.0
+; GFX9-NEXT:    v_mov_b32_e32 v25, 0x40a00000
+; GFX9-NEXT:    v_mov_b32_e32 v26, 0x40a00000
+; GFX9-NEXT:    v_mov_b32_e32 v27, 0x40a00000
+; GFX9-NEXT:    v_mov_b32_e32 v28, 0x40a00000
+; GFX9-NEXT:    v_mov_b32_e32 v29, 0x40a00000
+; GFX9-NEXT:    v_mov_b32_e32 v30, 0x40c00000
+; GFX9-NEXT:    v_writelane_b32 v41, s4, 0
+; GFX9-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX9-NEXT:    s_getpc_b64 s[4:5]
+; GFX9-NEXT:    s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4
+; GFX9-NEXT:    s_addc_u32 s5, s5, external_void_func_8xv5f32@rel32@hi+12
+; GFX9-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX9-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX9-NEXT:    v_readlane_b32 s4, v41, 0
+; GFX9-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; GFX9-NEXT:    s_mov_b64 exec, s[6:7]
+; GFX9-NEXT:    s_addk_i32 s32, 0xfc00
+; GFX9-NEXT:    s_mov_b32 s33, s4
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: stack_8xv5f32:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s0, s33
+; GFX11-NEXT:    s_mov_b32 s33, s32
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b32 off, v40, s33
+; GFX11-NEXT:    scratch_store_b32 off, v41, s33 offset:4
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0x40e00000
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0x41000000
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0x41100000
+; GFX11-NEXT:    v_mov_b32_e32 v3, 0x41200000
+; GFX11-NEXT:    v_mov_b32_e32 v8, 0x41700000
+; GFX11-NEXT:    s_add_i32 s32, s32, 16
+; GFX11-NEXT:    v_mov_b32_e32 v4, 0x41300000
+; GFX11-NEXT:    v_mov_b32_e32 v5, 0x41400000
+; GFX11-NEXT:    v_dual_mov_b32 v6, 0x41500000 :: v_dual_mov_b32 v9, 1.0
+; GFX11-NEXT:    v_mov_b32_e32 v7, 0x41600000
+; GFX11-NEXT:    v_writelane_b32 v41, s0, 0
+; GFX11-NEXT:    s_add_i32 s0, s32, 32
+; GFX11-NEXT:    s_add_i32 s1, s32, 16
+; GFX11-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
+; GFX11-NEXT:    scratch_store_b32 off, v8, s0
+; GFX11-NEXT:    scratch_store_b128 off, v[4:7], s1
+; GFX11-NEXT:    v_mov_b32_e32 v6, 1.0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0
+; GFX11-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 1.0
+; GFX11-NEXT:    v_dual_mov_b32 v7, 1.0 :: v_dual_mov_b32 v8, 1.0
+; GFX11-NEXT:    v_dual_mov_b32 v11, 2.0 :: v_dual_mov_b32 v10, 2.0
+; GFX11-NEXT:    v_dual_mov_b32 v13, 2.0 :: v_dual_mov_b32 v12, 2.0
+; GFX11-NEXT:    v_dual_mov_b32 v15, 0x40400000 :: v_dual_mov_b32 v14, 2.0
+; GFX11-NEXT:    v_dual_mov_b32 v17, 0x40400000 :: v_dual_mov_b32 v16, 0x40400000
+; GFX11-NEXT:    v_dual_mov_b32 v19, 0x40400000 :: v_dual_mov_b32 v18, 0x40400000
+; GFX11-NEXT:    v_dual_mov_b32 v21, 4.0 :: v_dual_mov_b32 v20, 4.0
+; GFX11-NEXT:    v_dual_mov_b32 v23, 4.0 :: v_dual_mov_b32 v22, 4.0
+; GFX11-NEXT:    v_dual_mov_b32 v25, 0x40a00000 :: v_dual_mov_b32 v24, 4.0
+; GFX11-NEXT:    v_dual_mov_b32 v27, 0x40a00000 :: v_dual_mov_b32 v26, 0x40a00000
+; GFX11-NEXT:    v_dual_mov_b32 v29, 0x40a00000 :: v_dual_mov_b32 v28, 0x40a00000
+; GFX11-NEXT:    v_mov_b32_e32 v30, 0x40c00000
+; GFX11-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, external_void_func_8xv5f32@rel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, external_void_func_8xv5f32@rel32@hi+12
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX11-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX11-NEXT:    v_readlane_b32 s0, v41, 0
+; GFX11-NEXT:    s_or_saveexec_b32 s1, -1
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v40, off, s33
+; GFX11-NEXT:    scratch_load_b32 v41, off, s33 offset:4
+; GFX11-NEXT:    s_mov_b32 exec_lo, s1
+; GFX11-NEXT:    s_add_i32 s32, s32, -16
+; GFX11-NEXT:    s_mov_b32 s33, s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; HSA-LABEL: stack_8xv5f32:
+; HSA:       ; %bb.0: ; %entry
+; HSA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; HSA-NEXT:    s_mov_b32 s4, s33
+; HSA-NEXT:    s_mov_b32 s33, s32
+; HSA-NEXT:    s_or_saveexec_b64 s[8:9], -1
+; HSA-NEXT:    buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; HSA-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; HSA-NEXT:    s_mov_b64 exec, s[8:9]
+; HSA-NEXT:    s_addk_i32 s32, 0x400
+; HSA-NEXT:    v_mov_b32_e32 v0, 0x40e00000
+; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32
+; HSA-NEXT:    v_mov_b32_e32 v0, 0x41000000
+; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4
+; HSA-NEXT:    v_mov_b32_e32 v0, 0x41100000
+; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:8
+; HSA-NEXT:    v_mov_b32_e32 v0, 0x41200000
+; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:12
+; HSA-NEXT:    v_mov_b32_e32 v0, 0x41300000
+; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:16
+; HSA-NEXT:    v_mov_b32_e32 v0, 0x41400000
+; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:20
+; HSA-NEXT:    v_mov_b32_e32 v0, 0x41500000
+; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:24
+; HSA-NEXT:    v_mov_b32_e32 v0, 0x41600000
+; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:28
+; HSA-NEXT:    v_mov_b32_e32 v0, 0x41700000
+; HSA-NEXT:    v_writelane_b32 v40, s30, 0
+; HSA-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:32
+; HSA-NEXT:    v_mov_b32_e32 v0, 0
+; HSA-NEXT:    v_mov_b32_e32 v1, 0
+; HSA-NEXT:    v_mov_b32_e32 v2, 0
+; HSA-NEXT:    v_mov_b32_e32 v3, 0
+; HSA-NEXT:    v_mov_b32_e32 v4, 0
+; HSA-NEXT:    v_mov_b32_e32 v5, 1.0
+; HSA-NEXT:    v_mov_b32_e32 v6, 1.0
+; HSA-NEXT:    v_mov_b32_e32 v7, 1.0
+; HSA-NEXT:    v_mov_b32_e32 v8, 1.0
+; HSA-NEXT:    v_mov_b32_e32 v9, 1.0
+; HSA-NEXT:    v_mov_b32_e32 v10, 2.0
+; HSA-NEXT:    v_mov_b32_e32 v11, 2.0
+; HSA-NEXT:    v_mov_b32_e32 v12, 2.0
+; HSA-NEXT:    v_mov_b32_e32 v13, 2.0
+; HSA-NEXT:    v_mov_b32_e32 v14, 2.0
+; HSA-NEXT:    v_mov_b32_e32 v15, 0x40400000
+; HSA-NEXT:    v_mov_b32_e32 v16, 0x40400000
+; HSA-NEXT:    v_mov_b32_e32 v17, 0x40400000
+; HSA-NEXT:    v_mov_b32_e32 v18, 0x40400000
+; HSA-NEXT:    v_mov_b32_e32 v19, 0x40400000
+; HSA-NEXT:    v_mov_b32_e32 v20, 4.0
+; HSA-NEXT:    v_mov_b32_e32 v21, 4.0
+; HSA-NEXT:    v_mov_b32_e32 v22, 4.0
+; HSA-NEXT:    v_mov_b32_e32 v23, 4.0
+; HSA-NEXT:    v_mov_b32_e32 v24, 4.0
+; HSA-NEXT:    v_mov_b32_e32 v25, 0x40a00000
+; HSA-NEXT:    v_mov_b32_e32 v26, 0x40a00000
+; HSA-NEXT:    v_mov_b32_e32 v27, 0x40a00000
+; HSA-NEXT:    v_mov_b32_e32 v28, 0x40a00000
+; HSA-NEXT:    v_mov_b32_e32 v29, 0x40a00000
+; HSA-NEXT:    v_mov_b32_e32 v30, 0x40c00000
+; HSA-NEXT:    v_writelane_b32 v41, s4, 0
+; HSA-NEXT:    v_writelane_b32 v40, s31, 1
+; HSA-NEXT:    s_getpc_b64 s[4:5]
+; HSA-NEXT:    s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4
+; HSA-NEXT:    s_addc_u32 s5, s5, external_void_func_8xv5f32@rel32@hi+12
+; HSA-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; HSA-NEXT:    v_readlane_b32 s31, v40, 1
+; HSA-NEXT:    v_readlane_b32 s30, v40, 0
+; HSA-NEXT:    v_readlane_b32 s4, v41, 0
+; HSA-NEXT:    s_or_saveexec_b64 s[6:7], -1
+; HSA-NEXT:    buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; HSA-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; HSA-NEXT:    s_mov_b64 exec, s[6:7]
+; HSA-NEXT:    s_addk_i32 s32, 0xfc00
+; HSA-NEXT:    s_mov_b32 s33, s4
+; HSA-NEXT:    s_waitcnt vmcnt(0)
+; HSA-NEXT:    s_setpc_b64 s[30:31]
 entry:
   call void @external_void_func_8xv5f32(
       <5 x float><float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>,
@@ -912,9 +7309,7 @@ declare hidden void @external_void_func_12xv3f32(<3 x float>, <3 x float>, <3 x
     <3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>, <3 x float>) #0
 declare hidden void @external_void_func_8xv5f32(<5 x float>, <5 x float>, <5 x float>, <5 x float>,
     <5 x float>, <5 x float>, <5 x float>, <5 x float>) #0
+
 attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
 attributes #1 = { nounwind readnone }
 attributes #2 = { nounwind noinline }
-
-
-

diff  --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
index 85c76b5146e4b..e90d8cd3ae39c 100644
--- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
+++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
@@ -1,11 +1,39 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SIVI,SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SIVI,VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11 %s
 
 ; Make sure we don't crash or assert on spir_kernel calling convention.
 
-; GCN-LABEL: {{^}}kernel:
-; GCN: s_endpgm
 define spir_kernel void @kernel(ptr addrspace(1) %out) {
+; SI-LABEL: kernel:
+; SI:       ; %bb.0: ; %entry
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, 0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: kernel:
+; VI:       ; %bb.0: ; %entry
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    v_mov_b32_e32 v2, 0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    flat_store_dword v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: kernel:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_store_b32 v0, v0, s[0:1]
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
 entry:
   store i32 0, ptr addrspace(1) %out
   ret void
@@ -20,324 +48,909 @@ entry:
 ;   ret void
 ; }
 
-; GCN-LABEL: {{^}}ps_ret_cc_f16:
-; SI: v_cvt_f16_f32_e32 v0, v0
-; SI: v_cvt_f32_f16_e32 v0, v0
-; SI: v_add_f32_e32 v0, 1.0, v0
-
-; VI: v_add_f16_e32 v0, 1.0, v0
-; VI: ; return
 define amdgpu_ps half @ps_ret_cc_f16(half %arg0) {
+; SI-LABEL: ps_ret_cc_f16:
+; SI:       ; %bb.0:
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; SI-NEXT:    ; return to shader part epilog
+;
+; VI-LABEL: ps_ret_cc_f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: ps_ret_cc_f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; GFX11-NEXT:    ; return to shader part epilog
   %add = fadd half %arg0, 1.0
   ret half %add
 }
 
-; GCN-LABEL: {{^}}ps_ret_cc_inreg_f16:
-; SI: v_cvt_f16_f32_e32 v0, s0
-; SI: v_cvt_f32_f16_e32 v0, v0
-; SI: v_add_f32_e32 v0, 1.0, v0
-
-; VI: v_add_f16_e64 v0, s0, 1.0
-; VI: ; return
 define amdgpu_ps half @ps_ret_cc_inreg_f16(half inreg %arg0) {
+; SI-LABEL: ps_ret_cc_inreg_f16:
+; SI:       ; %bb.0:
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, s0
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; SI-NEXT:    ; return to shader part epilog
+;
+; VI-LABEL: ps_ret_cc_inreg_f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_add_f16_e64 v0, s0, 1.0
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: ps_ret_cc_inreg_f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_add_f16_e64 v0, s0, 1.0
+; GFX11-NEXT:    ; return to shader part epilog
   %add = fadd half %arg0, 1.0
   ret half %add
 }
 
-; GCN-LABEL: {{^}}fastcc:
-; GCN: v_add_f32_e32 v0, 4.0, v0
 define fastcc float @fastcc(float %arg0) #0 {
+; SIVI-LABEL: fastcc:
+; SIVI:       ; %bb.0:
+; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SIVI-NEXT:    v_add_f32_e32 v0, 4.0, v0
+; SIVI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fastcc:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_add_f32_e32 v0, 4.0, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %add = fadd float %arg0, 4.0
   ret float %add
 }
 
-; GCN-LABEL: {{^}}coldcc:
-; GCN: v_add_f32_e32 v0, 4.0, v0
 define coldcc float @coldcc(float %arg0) #0 {
+; SIVI-LABEL: coldcc:
+; SIVI:       ; %bb.0:
+; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SIVI-NEXT:    v_add_f32_e32 v0, 4.0, v0
+; SIVI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: coldcc:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_add_f32_e32 v0, 4.0, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %add = fadd float %arg0, 4.0
  ret float %add
 }
 
-; GCN-LABEL: {{^}}call_coldcc:
-; GCN: v_mov_b32_e32 v0, 1.0
-; GCN: s_swappc_b64
 define amdgpu_kernel void @call_coldcc() #0 {
+; SI-LABEL: call_coldcc:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_mov_b32 s32, 0
+; SI-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; SI-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; SI-NEXT:    s_mov_b32 s10, -1
+; SI-NEXT:    s_mov_b32 s11, 0xe8f000
+; SI-NEXT:    s_add_u32 s8, s8, s1
+; SI-NEXT:    s_addc_u32 s9, s9, 0
+; SI-NEXT:    s_getpc_b64 s[0:1]
+; SI-NEXT:    s_add_u32 s0, s0, coldcc@gotpcrel32@lo+4
+; SI-NEXT:    s_addc_u32 s1, s1, coldcc@gotpcrel32@hi+12
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SI-NEXT:    v_mov_b32_e32 v0, 1.0
+; SI-NEXT:    s_mov_b64 s[0:1], s[8:9]
+; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: call_coldcc:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s90, -1
+; VI-NEXT:    s_mov_b32 s91, 0xe80000
+; VI-NEXT:    s_add_u32 s88, s88, s1
+; VI-NEXT:    s_addc_u32 s89, s89, 0
+; VI-NEXT:    s_getpc_b64 s[0:1]
+; VI-NEXT:    s_add_u32 s0, s0, coldcc@gotpcrel32@lo+4
+; VI-NEXT:    s_addc_u32 s1, s1, coldcc@gotpcrel32@hi+12
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; VI-NEXT:    s_mov_b64 s[0:1], s[88:89]
+; VI-NEXT:    s_mov_b64 s[2:3], s[90:91]
+; VI-NEXT:    v_mov_b32_e32 v0, 1.0
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    flat_store_dword v[0:1], v0
+; VI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: call_coldcc:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, coldcc@gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, coldcc@gotpcrel32@hi+12
+; GFX11-NEXT:    v_mov_b32_e32 v0, 1.0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   %val = call float @coldcc(float 1.0)
   store float %val, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}call_fastcc:
-; GCN: v_mov_b32_e32 v0, 1.0
-; GCN: s_swappc_b64
 define amdgpu_kernel void @call_fastcc() #0 {
+; SI-LABEL: call_fastcc:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_mov_b32 s32, 0
+; SI-NEXT:    s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; SI-NEXT:    s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; SI-NEXT:    s_mov_b32 s10, -1
+; SI-NEXT:    s_mov_b32 s11, 0xe8f000
+; SI-NEXT:    s_add_u32 s8, s8, s1
+; SI-NEXT:    s_addc_u32 s9, s9, 0
+; SI-NEXT:    s_getpc_b64 s[0:1]
+; SI-NEXT:    s_add_u32 s0, s0, fastcc@gotpcrel32@lo+4
+; SI-NEXT:    s_addc_u32 s1, s1, fastcc@gotpcrel32@hi+12
+; SI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SI-NEXT:    v_mov_b32_e32 v0, 1.0
+; SI-NEXT:    s_mov_b64 s[0:1], s[8:9]
+; SI-NEXT:    s_mov_b64 s[2:3], s[10:11]
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: call_fastcc:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_mov_b32 s88, SCRATCH_RSRC_DWORD0
+; VI-NEXT:    s_mov_b32 s89, SCRATCH_RSRC_DWORD1
+; VI-NEXT:    s_mov_b32 s90, -1
+; VI-NEXT:    s_mov_b32 s91, 0xe80000
+; VI-NEXT:    s_add_u32 s88, s88, s1
+; VI-NEXT:    s_addc_u32 s89, s89, 0
+; VI-NEXT:    s_getpc_b64 s[0:1]
+; VI-NEXT:    s_add_u32 s0, s0, fastcc@gotpcrel32@lo+4
+; VI-NEXT:    s_addc_u32 s1, s1, fastcc@gotpcrel32@hi+12
+; VI-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; VI-NEXT:    s_mov_b64 s[0:1], s[88:89]
+; VI-NEXT:    s_mov_b64 s[2:3], s[90:91]
+; VI-NEXT:    v_mov_b32_e32 v0, 1.0
+; VI-NEXT:    s_mov_b32 s32, 0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; VI-NEXT:    flat_store_dword v[0:1], v0
+; VI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: call_fastcc:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_getpc_b64 s[0:1]
+; GFX11-NEXT:    s_add_u32 s0, s0, fastcc@gotpcrel32@lo+4
+; GFX11-NEXT:    s_addc_u32 s1, s1, fastcc@gotpcrel32@hi+12
+; GFX11-NEXT:    v_mov_b32_e32 v0, 1.0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s32, 0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   %val = call float @fastcc(float 1.0)
   store float %val, ptr addrspace(1) undef
   ret void
 }
 
 ; Mesa compute shader: check for 47176 (COMPUTE_PGM_RSRC1) in .AMDGPU.config
-; GCN-LABEL: .AMDGPU.config
-; GCN: .long  47176
-; GCN-LABEL: {{^}}cs_mesa:
 define amdgpu_cs half @cs_mesa(half %arg0) {
+; SI-LABEL: cs_mesa:
+; SI:       ; %bb.0:
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; SI-NEXT:    ; return to shader part epilog
+;
+; VI-LABEL: cs_mesa:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: cs_mesa:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; GFX11-NEXT:    ; return to shader part epilog
   %add = fadd half %arg0, 1.0
   ret half %add
 }
 
 ; Mesa pixel shader: check for 45096 (SPI_SHADER_PGM_RSRC1_PS) in .AMDGPU.config
-; GCN-LABEL: .AMDGPU.config
-; GCN: .long  45096
-; GCN-LABEL: {{^}}ps_mesa_f16:
 define amdgpu_ps half @ps_mesa_f16(half %arg0) {
+; SI-LABEL: ps_mesa_f16:
+; SI:       ; %bb.0:
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; SI-NEXT:    ; return to shader part epilog
+;
+; VI-LABEL: ps_mesa_f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: ps_mesa_f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; GFX11-NEXT:    ; return to shader part epilog
   %add = fadd half %arg0, 1.0
   ret half %add
 }
 
 ; Mesa vertex shader: check for 45352 (SPI_SHADER_PGM_RSRC1_VS) in .AMDGPU.config
-; GCN-LABEL: .AMDGPU.config
-; GCN: .long  45352
-; GCN-LABEL: {{^}}vs_mesa:
 define amdgpu_vs half @vs_mesa(half %arg0) {
+; SI-LABEL: vs_mesa:
+; SI:       ; %bb.0:
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; SI-NEXT:    ; return to shader part epilog
+;
+; VI-LABEL: vs_mesa:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: vs_mesa:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; GFX11-NEXT:    ; return to shader part epilog
   %add = fadd half %arg0, 1.0
   ret half %add
 }
 
 ; Mesa geometry shader: check for 45608 (SPI_SHADER_PGM_RSRC1_GS) in .AMDGPU.config
-; GCN-LABEL: .AMDGPU.config
-; GCN: .long  45608
-; GCN-LABEL: {{^}}gs_mesa:
 define amdgpu_gs half @gs_mesa(half %arg0) {
+; SI-LABEL: gs_mesa:
+; SI:       ; %bb.0:
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; SI-NEXT:    ; return to shader part epilog
+;
+; VI-LABEL: gs_mesa:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: gs_mesa:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; GFX11-NEXT:    ; return to shader part epilog
   %add = fadd half %arg0, 1.0
   ret half %add
 }
 
 ; Mesa hull shader: check for 46120 (SPI_SHADER_PGM_RSRC1_HS) in .AMDGPU.config
-; GCN-LABEL: .AMDGPU.config
-; GCN: .long  46120
-; GCN-LABEL: {{^}}hs_mesa:
 define amdgpu_hs half @hs_mesa(half %arg0) {
+; SI-LABEL: hs_mesa:
+; SI:       ; %bb.0:
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; SI-NEXT:    ; return to shader part epilog
+;
+; VI-LABEL: hs_mesa:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: hs_mesa:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; GFX11-NEXT:    ; return to shader part epilog
   %add = fadd half %arg0, 1.0
   ret half %add
 }
 
 ; FIXME: Inconsistent ABI between targets
-; GCN-LABEL: {{^}}ps_mesa_v2f16:
-; VI: v_mov_b32_e32 v1, 0x3c00
-; VI-NEXT: v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT: v_add_f16_e32 v0, 1.0, v0
-; VI-NEXT: v_or_b32_e32 v0, v0, v1
-; VI-NEXT: ; return
-
-; SI-DAG: v_cvt_f16_f32_e32 [[CVT_ELT0:v[0-9]+]], v0
-; SI-DAG: v_cvt_f16_f32_e32 [[CVT_ELT1:v[0-9]+]], v1
-; SI-DAG: v_cvt_f32_f16_e32 [[RECVT_ELT0:v[0-9]+]], [[CVT_ELT0]]
-; SI-DAG: v_cvt_f32_f16_e32 [[RECVT_ELT1:v[0-9]+]], [[CVT_ELT1]]
-; SI-DAG: v_add_f32_e32 v0, 1.0, [[RECVT_ELT0]]
-; SI-DAG: v_add_f32_e32 v1, 1.0, [[RECVT_ELT1]]
-; SI: ; return to shader part epilog
+
 define amdgpu_ps <2 x half> @ps_mesa_v2f16(<2 x half> %arg0) {
+; SI-LABEL: ps_mesa_v2f16:
+; SI:       ; %bb.0:
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; SI-NEXT:    ; return to shader part epilog
+;
+; VI-LABEL: ps_mesa_v2f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_mov_b32_e32 v1, 0x3c00
+; VI-NEXT:    v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; VI-NEXT:    v_or_b32_e32 v0, v0, v1
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: ps_mesa_v2f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; GFX11-NEXT:    ; return to shader part epilog
   %add = fadd <2 x half> %arg0, <half 1.0, half 1.0>
   ret <2 x half> %add
 }
 
-; GCN-LABEL: {{^}}ps_mesa_inreg_v2f16:
-; VI: s_lshr_b32 s1, s0, 16
-; VI-NEXT: v_mov_b32_e32 v0, s1
-; VI-NEXT: v_mov_b32_e32 v1, 0x3c00
-; VI-NEXT: v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: v_add_f16_e64 v1, s0, 1.0
-; VI-NEXT: v_or_b32_e32 v0, v1, v0
-; VI-NEXT: ; return to shader part epilog
-
-; SI-DAG: v_cvt_f16_f32_e32 [[CVT_ELT0:v[0-9]+]], s0
-; SI-DAG: v_cvt_f16_f32_e32 [[CVT_ELT1:v[0-9]+]], s1
-; SI-DAG: v_cvt_f32_f16_e32 [[RECVT_ELT0:v[0-9]+]], [[CVT_ELT0]]
-; SI-DAG: v_cvt_f32_f16_e32 [[RECVT_ELT1:v[0-9]+]], [[CVT_ELT1]]
-; SI-DAG: v_add_f32_e32 v0, 1.0, [[RECVT_ELT0]]
-; SI-DAG: v_add_f32_e32 v1, 1.0, [[RECVT_ELT1]]
-; SI: ; return to shader part epilog
 define amdgpu_ps <2 x half> @ps_mesa_inreg_v2f16(<2 x half> inreg %arg0) {
+; SI-LABEL: ps_mesa_inreg_v2f16:
+; SI:       ; %bb.0:
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, s1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, s0
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v1
+; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_add_f32_e32 v1, 1.0, v2
+; SI-NEXT:    ; return to shader part epilog
+;
+; VI-LABEL: ps_mesa_inreg_v2f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_lshr_b32 s1, s0, 16
+; VI-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NEXT:    v_mov_b32_e32 v1, 0x3c00
+; VI-NEXT:    v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_add_f16_e64 v1, s0, 1.0
+; VI-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: ps_mesa_inreg_v2f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_pk_add_f16 v0, s0, 1.0 op_sel_hi:[1,0]
+; GFX11-NEXT:    ; return to shader part epilog
   %add = fadd <2 x half> %arg0, <half 1.0, half 1.0>
   ret <2 x half> %add
 }
 
-; GCN-LABEL: {{^}}ps_mesa_v2i16:
-; VI: v_mov_b32_e32 v2, 1
-; VI: v_add_u16_e32 v1, 1, v0
-; VI: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI: v_or_b32_e32 v0, v1, v0
-
-
-; SI: v_lshlrev_b32_e32 v1, 16, v1
-; SI: v_add_i32_e32 v0, vcc, 1, v0
-; SI: v_and_b32
-; SI: v_or_b32
-; SI: v_add_i32_e32 v0, vcc, 0x10000, v0
 define amdgpu_ps void @ps_mesa_v2i16(<2 x i16> %arg0) {
+; SI-LABEL: ps_mesa_v2i16:
+; SI:       ; %bb.0:
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT:    v_or_b32_e32 v0, v1, v0
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 0x10000, v0
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: ps_mesa_v2i16:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_mov_b32_e32 v2, 1
+; VI-NEXT:    v_add_u16_e32 v1, 1, v0
+; VI-NEXT:    v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-NEXT:    flat_store_dword v[0:1], v0
+; VI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: ps_mesa_v2i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_pk_sub_u16 v0, v0, -1 op_sel_hi:[1,0]
+; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   %add = add <2 x i16> %arg0, <i16 1, i16 1>
   store <2 x i16> %add, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}ps_mesa_inreg_v2i16:
-; VI: s_and_b32 s1, s0, 0xffff0000
-; VI: s_add_i32 s0, s0, 1
-; VI: s_and_b32 s0, s0, 0xffff
-; VI: s_or_b32 s0, s1, s0
-; VI: s_add_i32 s0, s0, 0x10000
-; VI: v_mov_b32_e32 v0, s0
-
-; SI: s_lshl_b32 s1, s1, 16
-; SI: s_add_i32 s0, s0, 1
-; SI: s_and_b32 s0, s0, 0xffff
-; SI: s_or_b32 s0, s1, s0
-; SI: s_add_i32 s0, s0, 0x10000
 define amdgpu_ps void @ps_mesa_inreg_v2i16(<2 x i16> inreg %arg0) {
+; SI-LABEL: ps_mesa_inreg_v2i16:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_lshl_b32 s1, s1, 16
+; SI-NEXT:    s_add_i32 s0, s0, 1
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_and_b32 s0, s0, 0xffff
+; SI-NEXT:    s_or_b32 s0, s1, s0
+; SI-NEXT:    s_add_i32 s0, s0, 0x10000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: ps_mesa_inreg_v2i16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_and_b32 s1, s0, 0xffff0000
+; VI-NEXT:    s_add_i32 s0, s0, 1
+; VI-NEXT:    s_and_b32 s0, s0, 0xffff
+; VI-NEXT:    s_or_b32 s0, s1, s0
+; VI-NEXT:    s_add_i32 s0, s0, 0x10000
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    flat_store_dword v[0:1], v0
+; VI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: ps_mesa_inreg_v2i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_pk_sub_u16 v0, s0, -1 op_sel_hi:[1,0]
+; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   %add = add <2 x i16> %arg0, <i16 1, i16 1>
   store <2 x i16> %add, ptr addrspace(1) undef
   ret void
 }
 
 ; FIXME: Different ABI for VI+
-; GCN-LABEL: {{^}}ps_mesa_v4f16:
-; SI: v_cvt_f16_f32_e32 v3, v3
-; SI: v_cvt_f16_f32_e32 v2, v2
-; SI: v_cvt_f16_f32_e32 v1, v1
-; SI: v_cvt_f16_f32_e32 v0, v0
-
-; VI: v_add_f16_e32 v2, 1.0, v1
-; VI: v_add_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI: v_add_f16_e32 v4, 1.0, v0
-; VI: v_add_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+
 define amdgpu_ps <4 x half> @ps_mesa_v4f16(<4 x half> %arg0) {
+; SI-LABEL: ps_mesa_v4f16:
+; SI:       ; %bb.0:
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_add_f32_e32 v2, 1.0, v2
+; SI-NEXT:    v_add_f32_e32 v3, 1.0, v3
+; SI-NEXT:    ; return to shader part epilog
+;
+; VI-LABEL: ps_mesa_v4f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_mov_b32_e32 v3, 0x3c00
+; VI-NEXT:    v_add_f16_e32 v2, 1.0, v1
+; VI-NEXT:    v_add_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_add_f16_e32 v4, 1.0, v0
+; VI-NEXT:    v_add_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v0, v4, v0
+; VI-NEXT:    v_or_b32_e32 v1, v2, v1
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: ps_mesa_v4f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; GFX11-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
+; GFX11-NEXT:    ; return to shader part epilog
   %add = fadd <4 x half> %arg0, <half 1.0, half 1.0, half 1.0, half 1.0>
   ret <4 x half> %add
 }
 
-; GCN-LABEL: {{^}}ps_mesa_inreg_v4f16:
-; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, s3
-; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, s2
-; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, s1
-; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, s0
-
-; VI: v_add_f16_e64
-; VI: v_add_f16_sdwa
-; VI: v_add_f16_e64
-; VI: v_add_f16_sdwa
 define amdgpu_ps <4 x half> @ps_mesa_inreg_v4f16(<4 x half> inreg %arg0) {
+; SI-LABEL: ps_mesa_inreg_v4f16:
+; SI:       ; %bb.0:
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, s3
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, s2
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, s1
+; SI-NEXT:    v_cvt_f16_f32_e32 v3, s0
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v3
+; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_add_f32_e32 v1, 1.0, v1
+; SI-NEXT:    v_add_f32_e32 v2, 1.0, v5
+; SI-NEXT:    v_add_f32_e32 v3, 1.0, v4
+; SI-NEXT:    ; return to shader part epilog
+;
+; VI-LABEL: ps_mesa_inreg_v4f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_add_f16_e64 v1, s1, 1.0
+; VI-NEXT:    s_lshr_b32 s1, s1, 16
+; VI-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NEXT:    v_mov_b32_e32 v2, 0x3c00
+; VI-NEXT:    v_add_f16_sdwa v3, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_add_f16_e64 v0, s0, 1.0
+; VI-NEXT:    s_lshr_b32 s0, s0, 16
+; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    v_add_f16_sdwa v2, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT:    v_or_b32_e32 v0, v0, v2
+; VI-NEXT:    v_or_b32_e32 v1, v1, v3
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: ps_mesa_inreg_v4f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_pk_add_f16 v0, s0, 1.0 op_sel_hi:[1,0]
+; GFX11-NEXT:    v_pk_add_f16 v1, s1, 1.0 op_sel_hi:[1,0]
+; GFX11-NEXT:    ; return to shader part epilog
   %add = fadd <4 x half> %arg0, <half 1.0, half 1.0, half 1.0, half 1.0>
   ret <4 x half> %add
 }
 
-; GCN-LABEL: {{^}}ps_mesa_inreg_v3i32:
-; GCN-DAG: s_add_i32 s0, s0, 1
-; GCN-DAG: s_add_i32 s{{[0-9]*}}, s1, 2
-; GCN-DAG: s_add_i32 s{{[0-9]*}}, s2, 3
 define amdgpu_ps void @ps_mesa_inreg_v3i32(<3 x i32> inreg %arg0) {
+; SI-LABEL: ps_mesa_inreg_v3i32:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_add_i32 s1, s1, 2
+; SI-NEXT:    s_add_i32 s0, s0, 1
+; SI-NEXT:    s_add_i32 s4, s2, 3
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: ps_mesa_inreg_v3i32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_add_i32 s2, s2, 3
+; VI-NEXT:    s_add_i32 s1, s1, 2
+; VI-NEXT:    s_add_i32 s0, s0, 1
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    flat_store_dwordx3 v[0:1], v[0:2]
+; VI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: ps_mesa_inreg_v3i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_add_i32 s2, s2, 3
+; GFX11-NEXT:    s_add_i32 s0, s0, 1
+; GFX11-NEXT:    s_add_i32 s1, s1, 2
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-NEXT:    global_store_b96 v[0:1], v[0:2], off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   %add = add <3 x i32> %arg0, <i32 1, i32 2, i32 3>
   store <3 x i32> %add, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}ps_mesa_inreg_v3f32:
-; GCN-DAG: v_add_f32{{.*}}, s0, 1.0
-; GCN-DAG: v_add_f32{{.*}}, s1, 2.0
-; GCN-DAG: v_add_f32{{.*}}, s2, 4.0
 define amdgpu_ps void @ps_mesa_inreg_v3f32(<3 x float> inreg %arg0) {
+; SI-LABEL: ps_mesa_inreg_v3f32:
+; SI:       ; %bb.0:
+; SI-NEXT:    v_add_f32_e64 v1, s1, 2.0
+; SI-NEXT:    v_add_f32_e64 v0, s0, 1.0
+; SI-NEXT:    v_add_f32_e64 v2, s2, 4.0
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], 0
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: ps_mesa_inreg_v3f32:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_add_f32_e64 v2, s2, 4.0
+; VI-NEXT:    v_add_f32_e64 v1, s1, 2.0
+; VI-NEXT:    v_add_f32_e64 v0, s0, 1.0
+; VI-NEXT:    flat_store_dwordx3 v[0:1], v[0:2]
+; VI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: ps_mesa_inreg_v3f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_add_f32_e64 v2, s2, 4.0
+; GFX11-NEXT:    v_add_f32_e64 v1, s1, 2.0
+; GFX11-NEXT:    v_add_f32_e64 v0, s0, 1.0
+; GFX11-NEXT:    global_store_b96 v[0:1], v[0:2], off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   %add = fadd <3 x float> %arg0, <float 1.0, float 2.0, float 4.0>
   store <3 x float> %add, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}ps_mesa_inreg_v5i32:
-; GCN-DAG: s_add_i32 s0, s0, 1
-; GCN-DAG: s_add_i32 s{{[0-9]*}}, s1, 2
-; GCN-DAG: s_add_i32 s{{[0-9]*}}, s2, 3
-; GCN-DAG: s_add_i32 s{{[0-9]*}}, s3, 4
-; GCN-DAG: s_add_i32 s{{[0-9]*}}, s4, 5
 define amdgpu_ps void @ps_mesa_inreg_v5i32(<5 x i32> inreg %arg0) {
+; SI-LABEL: ps_mesa_inreg_v5i32:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_add_i32 s5, s3, 4
+; SI-NEXT:    s_add_i32 s6, s2, 3
+; SI-NEXT:    s_add_i32 s1, s1, 2
+; SI-NEXT:    s_add_i32 s0, s0, 1
+; SI-NEXT:    s_add_i32 s4, s4, 5
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    v_mov_b32_e32 v1, s1
+; SI-NEXT:    v_mov_b32_e32 v2, s6
+; SI-NEXT:    v_mov_b32_e32 v3, s5
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: ps_mesa_inreg_v5i32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_add_i32 s4, s4, 5
+; VI-NEXT:    s_add_i32 s3, s3, 4
+; VI-NEXT:    s_add_i32 s2, s2, 3
+; VI-NEXT:    s_add_i32 s1, s1, 2
+; VI-NEXT:    s_add_i32 s0, s0, 1
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    flat_store_dword v[0:1], v0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v3, s3
+; VI-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
+; VI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: ps_mesa_inreg_v5i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_add_i32 s3, s3, 4
+; GFX11-NEXT:    s_add_i32 s2, s2, 3
+; GFX11-NEXT:    s_add_i32 s1, s1, 2
+; GFX11-NEXT:    s_add_i32 s4, s4, 5
+; GFX11-NEXT:    s_add_i32 s0, s0, 1
+; GFX11-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:    v_mov_b32_e32 v2, s2
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_store_b32 v[0:1], v4, off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   %add = add <5 x i32> %arg0, <i32 1, i32 2, i32 3, i32 4, i32 5>
   store <5 x i32> %add, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}ps_mesa_inreg_v5f32:
-; GCN-DAG: v_add_f32{{.*}}, s0, 1.0
-; GCN-DAG: v_add_f32{{.*}}, s1, 2.0
-; GCN-DAG: v_add_f32{{.*}}, s2, 4.0
-; GCN-DAG: v_add_f32{{.*}}, s3, -1.0
-; GCN-DAG: v_add_f32{{.*}}, s4, 0.5
 define amdgpu_ps void @ps_mesa_inreg_v5f32(<5 x float> inreg %arg0) {
+; SI-LABEL: ps_mesa_inreg_v5f32:
+; SI:       ; %bb.0:
+; SI-NEXT:    v_add_f32_e64 v3, s3, -1.0
+; SI-NEXT:    v_add_f32_e64 v2, s2, 4.0
+; SI-NEXT:    v_add_f32_e64 v1, s1, 2.0
+; SI-NEXT:    v_add_f32_e64 v0, s0, 1.0
+; SI-NEXT:    v_add_f32_e64 v4, s4, 0.5
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], 0
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: ps_mesa_inreg_v5f32:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_add_f32_e64 v3, s3, -1.0
+; VI-NEXT:    v_add_f32_e64 v2, s2, 4.0
+; VI-NEXT:    v_add_f32_e64 v1, s1, 2.0
+; VI-NEXT:    v_add_f32_e64 v0, s0, 1.0
+; VI-NEXT:    v_add_f32_e64 v4, s4, 0.5
+; VI-NEXT:    flat_store_dword v[0:1], v4
+; VI-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
+; VI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: ps_mesa_inreg_v5f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_add_f32_e64 v3, s3, -1.0
+; GFX11-NEXT:    v_add_f32_e64 v2, s2, 4.0
+; GFX11-NEXT:    v_add_f32_e64 v1, s1, 2.0
+; GFX11-NEXT:    v_add_f32_e64 v4, s4, 0.5
+; GFX11-NEXT:    v_add_f32_e64 v0, s0, 1.0
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_store_b32 v[0:1], v4, off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   %add = fadd <5 x float> %arg0, <float 1.0, float 2.0, float 4.0, float -1.0, float 0.5>
   store <5 x float> %add, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}ps_mesa_v3i32:
-; GCN-DAG: v_add_{{.*}}, 1, v0
-; GCN-DAG: v_add_{{.*}}, 2, v1
-; GCN-DAG: v_add_{{.*}}, 3, v2
 define amdgpu_ps void @ps_mesa_v3i32(<3 x i32> %arg0) {
+; SI-LABEL: ps_mesa_v3i32:
+; SI:       ; %bb.0:
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 2, v1
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v2
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], 0
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: ps_mesa_v3i32:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 3, v2
+; VI-NEXT:    v_add_u32_e32 v1, vcc, 2, v1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
+; VI-NEXT:    flat_store_dwordx3 v[0:1], v[0:2]
+; VI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: ps_mesa_v3i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 2, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 1, v0
+; GFX11-NEXT:    global_store_b96 v[0:1], v[0:2], off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   %add = add <3 x i32> %arg0, <i32 1, i32 2, i32 3>
   store <3 x i32> %add, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}ps_mesa_v3f32:
-; GCN-DAG: v_add_{{.*}}, 1.0, v0
-; GCN-DAG: v_add_{{.*}}, 2.0, v1
-; GCN-DAG: v_add_{{.*}}, 4.0, v2
 define amdgpu_ps void @ps_mesa_v3f32(<3 x float> %arg0) {
+; SI-LABEL: ps_mesa_v3f32:
+; SI:       ; %bb.0:
+; SI-NEXT:    v_add_f32_e32 v1, 2.0, v1
+; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_add_f32_e32 v2, 4.0, v2
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    buffer_store_dword v2, off, s[0:3], 0
+; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: ps_mesa_v3f32:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_add_f32_e32 v2, 4.0, v2
+; VI-NEXT:    v_add_f32_e32 v1, 2.0, v1
+; VI-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; VI-NEXT:    flat_store_dwordx3 v[0:1], v[0:2]
+; VI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: ps_mesa_v3f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_dual_add_f32 v2, 4.0, v2 :: v_dual_add_f32 v1, 2.0, v1
+; GFX11-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; GFX11-NEXT:    global_store_b96 v[0:1], v[0:2], off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   %add = fadd <3 x float> %arg0, <float 1.0, float 2.0, float 4.0>
   store <3 x float> %add, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}ps_mesa_v5i32:
-; GCN-DAG: v_add_{{.*}}, 1, v0
-; GCN-DAG: v_add_{{.*}}, 2, v1
-; GCN-DAG: v_add_{{.*}}, 3, v2
-; GCN-DAG: v_add_{{.*}}, 4, v3
-; GCN-DAG: v_add_{{.*}}, 5, v4
 define amdgpu_ps void @ps_mesa_v5i32(<5 x i32> %arg0) {
+; SI-LABEL: ps_mesa_v5i32:
+; SI:       ; %bb.0:
+; SI-NEXT:    v_add_i32_e32 v3, vcc, 4, v3
+; SI-NEXT:    v_add_i32_e32 v2, vcc, 3, v2
+; SI-NEXT:    v_add_i32_e32 v1, vcc, 2, v1
+; SI-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 5, v4
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], 0
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: ps_mesa_v5i32:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_add_u32_e32 v3, vcc, 4, v3
+; VI-NEXT:    v_add_u32_e32 v2, vcc, 3, v2
+; VI-NEXT:    v_add_u32_e32 v1, vcc, 2, v1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 1, v0
+; VI-NEXT:    v_add_u32_e32 v4, vcc, 5, v4
+; VI-NEXT:    flat_store_dword v[0:1], v4
+; VI-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
+; VI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: ps_mesa_v5i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_add_nc_u32_e32 v3, 4, v3
+; GFX11-NEXT:    v_add_nc_u32_e32 v2, 3, v2
+; GFX11-NEXT:    v_add_nc_u32_e32 v1, 2, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v4, 5, v4
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 1, v0
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_store_b32 v[0:1], v4, off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   %add = add <5 x i32> %arg0, <i32 1, i32 2, i32 3, i32 4, i32 5>
   store <5 x i32> %add, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}ps_mesa_v5f32:
-; GCN-DAG: v_add_f32{{.*}}, 1.0, v0
-; GCN-DAG: v_add_f32{{.*}}, 2.0, v1
-; GCN-DAG: v_add_f32{{.*}}, 4.0, v2
-; GCN-DAG: v_add_f32{{.*}}, -1.0, v3
-; GCN-DAG: v_add_f32{{.*}}, 0.5, v4
 define amdgpu_ps void @ps_mesa_v5f32(<5 x float> %arg0) {
+; SI-LABEL: ps_mesa_v5f32:
+; SI:       ; %bb.0:
+; SI-NEXT:    v_add_f32_e32 v3, -1.0, v3
+; SI-NEXT:    v_add_f32_e32 v2, 4.0, v2
+; SI-NEXT:    v_add_f32_e32 v1, 2.0, v1
+; SI-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_add_f32_e32 v4, 0.5, v4
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    buffer_store_dword v4, off, s[0:3], 0
+; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: ps_mesa_v5f32:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_add_f32_e32 v3, -1.0, v3
+; VI-NEXT:    v_add_f32_e32 v2, 4.0, v2
+; VI-NEXT:    v_add_f32_e32 v1, 2.0, v1
+; VI-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; VI-NEXT:    v_add_f32_e32 v4, 0.5, v4
+; VI-NEXT:    flat_store_dword v[0:1], v4
+; VI-NEXT:    flat_store_dwordx4 v[0:1], v[0:3]
+; VI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: ps_mesa_v5f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_dual_add_f32 v3, -1.0, v3 :: v_dual_add_f32 v2, 4.0, v2
+; GFX11-NEXT:    v_dual_add_f32 v1, 2.0, v1 :: v_dual_add_f32 v4, 0.5, v4
+; GFX11-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_store_b32 v[0:1], v4, off
+; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   %add = fadd <5 x float> %arg0, <float 1.0, float 2.0, float 4.0, float -1.0, float 0.5>
   store <5 x float> %add, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}ps_mesa_i16:
-; SI: v_add_i32_e32 v{{[0-9]+}}, vcc, v0, v0
-; VI: v_add_u16_e32 v{{[0-9]+}}, v0, v0
 define amdgpu_ps void @ps_mesa_i16(i16 %arg0) {
+; SI-LABEL: ps_mesa_i16:
+; SI:       ; %bb.0:
+; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v0
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: ps_mesa_i16:
+; VI:       ; %bb.0:
+; VI-NEXT:    v_add_u16_e32 v0, v0, v0
+; VI-NEXT:    flat_store_short v[0:1], v0
+; VI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: ps_mesa_i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_add_nc_u16 v0, v0, v0
+; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   %add = add i16 %arg0, %arg0
   store i16 %add, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}ps_mesa_inreg_i16:
-; GCN: s_add_i32 s{{[0-9]+}}, s0, s0
 define amdgpu_ps void @ps_mesa_inreg_i16(i16 inreg %arg0) {
+; SI-LABEL: ps_mesa_inreg_i16:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_add_i32 s0, s0, s0
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: ps_mesa_inreg_i16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_and_b32 s0, 0xffff, s0
+; VI-NEXT:    s_add_i32 s0, s0, s0
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    flat_store_short v[0:1], v0
+; VI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: ps_mesa_inreg_i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_i32 s0, s0, s0
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   %add = add i16 %arg0, %arg0
   store i16 %add, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}ret_ps_mesa_i16:
-; GCN: s_movk_i32 s0, 0x7b
 define amdgpu_ps i16 @ret_ps_mesa_i16() {
+; GCN-LABEL: ret_ps_mesa_i16:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_movk_i32 s0, 0x7b
+; GCN-NEXT:    ; return to shader part epilog
   ret i16 123
 }
 

diff  --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
index 9b91b06fddeff..3a0ad1d56be45 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
@@ -1,14 +1,54 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
 
-; GCN-LABEL: {{^}}extract_vector_elt_v2f16:
-; GCN: s_load_dword [[VEC:s[0-9]+]]
-; GCN: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16
-; GCN-DAG: v_mov_b32_e32 [[VELT0:v[0-9]+]], [[VEC]]
-; GCN-DAG: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]
-; GCN-DAG: buffer_store_short [[VELT0]]
-; GCN-DAG: buffer_store_short [[VELT1]]
 define amdgpu_kernel void @extract_vector_elt_v2f16(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 {
+; SI-LABEL: extract_vector_elt_v2f16:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_load_dword s4, s[2:3], 0x0
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_lshr_b32 s5, s4, 16
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s4
+; SI-NEXT:    v_mov_b32_e32 v1, s5
+; SI-NEXT:    buffer_store_short v1, off, s[0:3], 0
+; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:20
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: extract_vector_elt_v2f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_load_dword s4, s[2:3], 0x0
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_lshr_b32 s5, s4, 16
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    buffer_store_short v1, off, s[0:3], 0
+; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0 offset:20
+; VI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: extract_vector_elt_v2f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, s2
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_store_d16_hi_b16 v0, v1, s[0:1]
+; GFX11-NEXT:    buffer_store_b16 v1, off, s[0:3], 0 offset:20
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   %vec = load <2 x half>, ptr addrspace(4) %vec.ptr
   %p0 = extractelement <2 x half> %vec, i32 0
   %p1 = extractelement <2 x half> %vec, i32 1
@@ -18,34 +58,124 @@ define amdgpu_kernel void @extract_vector_elt_v2f16(ptr addrspace(1) %out, ptr a
   ret void
 }
 
-; GCN-LABEL: {{^}}extract_vector_elt_v2f16_dynamic_sgpr:
-; GCN: s_load_dword [[IDX:s[0-9]+]]
-; GCN: s_load_dword [[VEC:s[0-9]+]]
-; GCN: s_lshl_b32 [[IDX_SCALED:s[0-9]+]], [[IDX]], 4
-; GCN: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], [[IDX_SCALED]]
-; GCN: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]
-; GCN: buffer_store_short [[VELT1]]
-; GCN: ScratchSize: 0
 define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, i32 %idx) #0 {
+; SI-LABEL: extract_vector_elt_v2f16_dynamic_sgpr:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_load_dword s0, s[0:1], 0xd
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_load_dword s1, s[6:7], 0x0
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_lshl_b32 s0, s0, 4
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_lshr_b32 s0, s1, s0
+; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: extract_vector_elt_v2f16_dynamic_sgpr:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dword s8, s[0:1], 0x34
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_load_dword s6, s[6:7], 0x0
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_lshl_b32 s4, s8, 4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_lshr_b32 s4, s6, s4
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: extract_vector_elt_v2f16_dynamic_sgpr:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x34
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_load_b32 s1, s[6:7], 0x0
+; GFX11-NEXT:    s_lshl_b32 s0, s0, 4
+; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s6, -1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_lshr_b32 s0, s1, s0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mov_b32_e32 v0, s0
+; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   %vec = load <2 x half>, ptr addrspace(4) %vec.ptr
   %elt = extractelement <2 x half> %vec, i32 %idx
   store half %elt, ptr addrspace(1) %out, align 2
   ret void
 }
 
-; GCN-LABEL: {{^}}extract_vector_elt_v2f16_dynamic_vgpr:
-; GCN-DAG: s_load_dword [[VEC:s[0-9]+]]
-; GCN-DAG: {{flat|buffer}}_load_dword [[IDX:v[0-9]+]]
-; GCN: v_lshlrev_b32_e32 [[IDX_SCALED:v[0-9]+]], 4, [[IDX]]
-
-; SI: v_lshr_b32_e32 [[ELT:v[0-9]+]], [[VEC]], [[IDX_SCALED]]
-; VI: v_lshrrev_b32_e64 [[ELT:v[0-9]+]], [[IDX_SCALED]], [[VEC]]
-
-
-; SI: buffer_store_short [[ELT]]
-; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[ELT]]
-; GCN: ScratchSize: 0{{$}}
 define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, ptr addrspace(1) %idx.ptr) #0 {
+; SI-LABEL: extract_vector_elt_v2f16_dynamic_vgpr:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, 0
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; SI-NEXT:    v_mov_b32_e32 v2, 0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    buffer_load_dword v3, v[1:2], s[0:3], 0 addr64
+; SI-NEXT:    s_load_dword s6, s[6:7], 0x0
+; SI-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 4, v3
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    v_lshr_b32_e32 v0, s6, v0
+; SI-NEXT:    buffer_store_short v0, v[1:2], s[0:3], 0 addr64
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: extract_vector_elt_v2f16_dynamic_vgpr:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x34
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v2, s3
+; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v1
+; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; VI-NEXT:    flat_load_dword v2, v[1:2]
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    s_load_dword s1, s[2:3], 0x0
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_lshrrev_b32_e64 v2, v2, s1
+; VI-NEXT:    flat_store_short v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: extract_vector_elt_v2f16_dynamic_vgpr:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[2:3], s[0:1], 0x34
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v1, v1, s[2:3]
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e64 v1, v1, s2
+; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds i32, ptr addrspace(1) %idx.ptr, i64 %tid.ext
@@ -57,12 +187,50 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(ptr addrspace(1
   ret void
 }
 
-; GCN-LABEL: {{^}}extract_vector_elt_v3f16:
-; GCN: s_load_dwordx4
-
-; GCN: buffer_store_short
-; GCN: buffer_store_short
 define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x half> %foo) #0 {
+; SI-LABEL: extract_vector_elt_v3f16:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    v_mov_b32_e32 v0, s3
+; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; SI-NEXT:    s_waitcnt expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:2
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: extract_vector_elt_v3f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s3
+; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:2
+; VI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: extract_vector_elt_v3f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s6, -1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s2
+; GFX11-NEXT:    s_mov_b32 s4, s0
+; GFX11-NEXT:    s_mov_b32 s5, s1
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-NEXT:    buffer_store_b16 v1, off, s[4:7], 0 offset:2
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   %p0 = extractelement <3 x half> %foo, i32 0
   %p1 = extractelement <3 x half> %foo, i32 2
   %out1 = getelementptr half, ptr addrspace(1) %out, i32 1
@@ -72,35 +240,106 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x
 }
 
 ; FIXME: Why sometimes vector shift?
-; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3f16:
-; SI: s_load_dword s
-; SI: s_load_dwordx4 s
-
-; GFX89: s_load_dwordx4 s
-; GFX89: s_load_dword s
-
-
-; GCN-DAG: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 4
-; GCN: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-
-; GCN: {{buffer|global}}_store_short
 define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(ptr addrspace(1) %out, <3 x half> %foo, i32 %idx) #0 {
+; SI-LABEL: dynamic_extract_vector_elt_v3f16:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dword s4, s[0:1], 0xd
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_lshl_b32 s4, s4, 4
+; SI-NEXT:    s_lshr_b64 s[2:3], s[2:3], s4
+; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    v_mov_b32_e32 v0, s2
+; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: dynamic_extract_vector_elt_v3f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dword s8, s[0:1], 0x34
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s0, s4
+; VI-NEXT:    s_lshl_b32 s4, s8, 4
+; VI-NEXT:    s_mov_b32 s1, s5
+; VI-NEXT:    s_lshr_b64 s[4:5], s[6:7], s4
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: dynamic_extract_vector_elt_v3f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    s_load_b32 s4, s[0:1], 0x34
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_lshl_b32 s4, s4, 4
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_lshr_b64 s[2:3], s[2:3], s4
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    v_mov_b32_e32 v0, s2
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   %p0 = extractelement <3 x half> %foo, i32 %idx
   %out1 = getelementptr half, ptr addrspace(1) %out, i32 1
   store half %p0, ptr addrspace(1) %out
   ret void
 }
 
-; GCN-LABEL: {{^}}v_extractelement_v4f16_2:
-; SI: buffer_load_dword [[LOAD:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; SI: buffer_store_short [[LOAD]]
-
-; VI: flat_load_dword v
-; VI: flat_store_short
-
-; GFX9: global_load_dword [[LOAD:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, off offset:4
-; GFX9: global_store_short_d16_hi v{{\[[0-9]+:[0-9]+\]}}, [[LOAD]]
 define amdgpu_kernel void @v_extractelement_v4f16_2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-LABEL: v_extractelement_v4f16_2:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    v_mov_b32_e32 v2, 0
+; SI-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; SI-NEXT:    buffer_load_dword v3, v[1:2], s[8:11], 0 addr64 offset:4
+; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    buffer_store_short v3, v[1:2], s[4:7], 0 addr64
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: v_extractelement_v4f16_2:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v2, s3
+; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v1
+; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; VI-NEXT:    v_add_u32_e32 v1, vcc, 4, v1
+; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; VI-NEXT:    flat_load_dword v2, v[1:2]
+; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
+; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    flat_store_short v[0:1], v2
+; VI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: v_extractelement_v4f16_2:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v1, v1, s[2:3] offset:4
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext
@@ -111,17 +350,69 @@ define amdgpu_kernel void @v_extractelement_v4f16_2(ptr addrspace(1) %out, ptr a
   ret void
 }
 
-; GCN-LABEL: {{^}}v_insertelement_v4f16_dynamic_vgpr:
-; GCN-DAG: {{flat|global|buffer}}_load_dword [[IDX:v[0-9]+]],
-; GCN-DAG: {{flat|global|buffer}}_load_dwordx2 v[[[LO:[0-9]+]]:[[HI:[0-9]+]]]
-; GCN-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]]
-
-; GFX89: v_lshrrev_b64 v[[[SHIFT_LO:[0-9]+]]:[[SHIFT_HI:[0-9]+]]], [[SCALED_IDX]], v[[[LO]]:[[HI]]]
-; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[SHIFT_LO]]
-
-; SI: v_lshr_b64 v[[[SHIFT_LO:[0-9]+]]:[[SHIFT_HI:[0-9]+]]], v[[[LO]]:[[HI]]], [[SCALED_IDX]]
-; SI: buffer_store_short v[[SHIFT_LO]]
 define amdgpu_kernel void @v_insertelement_v4f16_dynamic_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
+; SI-LABEL: v_insertelement_v4f16_dynamic_vgpr:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; SI-NEXT:    s_mov_b32 s6, 0
+; SI-NEXT:    s_mov_b32 s7, 0xf000
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; SI-NEXT:    v_mov_b32_e32 v2, 0
+; SI-NEXT:    s_mov_b32 s10, -1
+; SI-NEXT:    s_mov_b32 s11, s7
+; SI-NEXT:    buffer_load_dword v5, off, s[8:11], 0 glc
+; SI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; SI-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
+; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 1, v0
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 4, v5
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_lshr_b64 v[3:4], v[3:4], v0
+; SI-NEXT:    buffer_store_short v3, v[1:2], s[0:3], 0 addr64
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: v_insertelement_v4f16_dynamic_vgpr:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; VI-NEXT:    v_lshlrev_b32_e32 v4, 1, v0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v2, s3
+; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v1
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], 0 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    flat_load_dwordx2 v[1:2], v[1:2]
+; VI-NEXT:    v_mov_b32_e32 v5, s1
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 4, v3
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_lshrrev_b64 v[0:1], v0, v[1:2]
+; VI-NEXT:    v_add_u32_e32 v1, vcc, s0, v4
+; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v5, vcc
+; VI-NEXT:    flat_store_short v[1:2], v0
+; VI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: v_insertelement_v4f16_dynamic_vgpr:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s6, -1
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
+; GFX11-NEXT:    buffer_load_b32 v3, off, s[4:7], 0 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_load_b64 v[1:2], v1, s[2:3]
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b64 v[1:2], v3, v[1:2]
+; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <4 x half>, ptr addrspace(1) %in, i64 %tid.ext
@@ -133,13 +424,58 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_vgpr(ptr addrspace(1) %
   ret void
 }
 
-; GCN-LABEL: {{^}}reduce_load_vector_v8f16_extract_01:
-; GCN: s_load_dwordx2 [[PTR:s\[[0-9]+:[0-9]+\]]],
-; GCN-NOT: {{s|buffer|flat|global}}_load_
-; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], 0x0
-; GCN-NOT: {{s|buffer|flat|global}}_load_
-; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
 define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(ptr addrspace(4) %ptr) #0 {
+; SI-LABEL: reduce_load_vector_v8f16_extract_01:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_load_dword s0, s[0:1], 0x0
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_lshr_b32 s1, s0, 16
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: reduce_load_vector_v8f16_extract_01:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_load_dword s0, s[0:1], 0x0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_lshr_b32 s1, s0, 16
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: reduce_load_vector_v8f16_extract_01:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b16 v1, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   %load = load <16 x half>, ptr addrspace(4) %ptr
   %elt0 = extractelement <16 x half> %load, i32 0
   %elt1 = extractelement <16 x half> %load, i32 1
@@ -148,13 +484,58 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_01(ptr addrspace(4)
   ret void
 }
 
-; GCN-LABEL: {{^}}reduce_load_vector_v8f16_extract_23:
-; GCN: s_load_dwordx2 [[PTR:s\[[0-9]+:[0-9]+\]]],
-; GCN-NOT: {{s|buffer|flat|global}}_load_
-; GCN: s_load_dword s{{[0-9]+}}, [[PTR]], {{0x1|0x4}}
-; GCN-NOT: {{s|buffer|flat|global}}_load_
-; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
 define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(ptr addrspace(4) %ptr) #0 {
+; SI-LABEL: reduce_load_vector_v8f16_extract_23:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_load_dword s0, s[0:1], 0x1
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_lshr_b32 s1, s0, 16
+; SI-NEXT:    v_mov_b32_e32 v0, s0
+; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0)
+; SI-NEXT:    v_mov_b32_e32 v0, s1
+; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: reduce_load_vector_v8f16_extract_23:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; VI-NEXT:    s_mov_b32 s3, 0xf000
+; VI-NEXT:    s_mov_b32 s2, -1
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_load_dword s0, s[0:1], 0x4
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_lshr_b32 s1, s0, 16
+; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v0, s1
+; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: reduce_load_vector_v8f16_extract_23:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x4
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b16 v1, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   %load = load <16 x half>, ptr addrspace(4) %ptr
   %elt2 = extractelement <16 x half> %load, i32 2
   %elt3 = extractelement <16 x half> %load, i32 3
@@ -163,9 +544,143 @@ define amdgpu_kernel void @reduce_load_vector_v8f16_extract_23(ptr addrspace(4)
   ret void
 }
 
-; GCN-LABEL: {{^}}v_extractelement_v8f16_dynamic_sgpr:
-; GCN-COUNT-7: v_cndmask_b32_e32
 define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %n) #0 {
+; SI-LABEL: v_extractelement_v8f16_dynamic_sgpr:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_load_dword s8, s[0:1], 0xd
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, 0
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; SI-NEXT:    v_mov_b32_e32 v5, 0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b64 s[0:1], s[6:7]
+; SI-NEXT:    buffer_load_dwordx4 v[1:4], v[4:5], s[0:3], 0 addr64
+; SI-NEXT:    v_lshlrev_b32_e32 v6, 1, v0
+; SI-NEXT:    v_mov_b32_e32 v7, v5
+; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; SI-NEXT:    s_cmp_eq_u32 s8, 1
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v3
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v9, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    s_cmp_eq_u32 s8, 2
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    s_cmp_eq_u32 s8, 3
+; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    s_cmp_eq_u32 s8, 4
+; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    s_cmp_eq_u32 s8, 5
+; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    s_cmp_eq_u32 s8, 6
+; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    s_cmp_eq_u32 s8, 7
+; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    buffer_store_short v0, v[6:7], s[4:7], 0 addr64
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: v_extractelement_v8f16_dynamic_sgpr:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dword s0, s[0:1], 0x34
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 4, v0
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v2, s7
+; VI-NEXT:    v_add_u32_e32 v1, vcc, s6, v1
+; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; VI-NEXT:    flat_load_dwordx4 v[1:4], v[1:2]
+; VI-NEXT:    v_mov_b32_e32 v6, s5
+; VI-NEXT:    v_add_u32_e32 v5, vcc, s4, v0
+; VI-NEXT:    s_cmp_eq_u32 s0, 1
+; VI-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    s_cmp_eq_u32 s0, 2
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; VI-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    s_cmp_eq_u32 s0, 3
+; VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    s_cmp_eq_u32 s0, 4
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    s_cmp_eq_u32 s0, 5
+; VI-NEXT:    v_lshrrev_b32_e32 v8, 16, v3
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    s_cmp_eq_u32 s0, 6
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    s_cmp_eq_u32 s0, 7
+; VI-NEXT:    v_lshrrev_b32_e32 v9, 16, v4
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc
+; VI-NEXT:    flat_store_short v[5:6], v0
+; VI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: v_extractelement_v8f16_dynamic_sgpr:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 4, v0
+; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x34
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    global_load_b128 v[1:4], v1, s[6:7]
+; GFX11-NEXT:    s_cmp_eq_u32 s0, 1
+; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    s_cmp_eq_u32 s0, 2
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX11-NEXT:    s_cmp_eq_u32 s0, 3
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    s_cmp_eq_u32 s0, 4
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    s_cmp_eq_u32 s0, 5
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    s_cmp_eq_u32 s0, 6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
+; GFX11-NEXT:    s_cmp_eq_u32 s0, 7
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX11-NEXT:    global_store_b16 v0, v1, s[4:5]
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <8 x half>, ptr addrspace(1) %in, i64 %tid.ext
@@ -176,9 +691,248 @@ define amdgpu_kernel void @v_extractelement_v8f16_dynamic_sgpr(ptr addrspace(1)
   ret void
 }
 
-; GCN-LABEL: {{^}}v_extractelement_v16f16_dynamic_sgpr:
-; GCN-COUNT-15: v_cndmask_b32_e32
 define amdgpu_kernel void @v_extractelement_v16f16_dynamic_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %n) #0 {
+; SI-LABEL: v_extractelement_v16f16_dynamic_sgpr:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; SI-NEXT:    s_load_dword s8, s[0:1], 0xd
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, 0
+; SI-NEXT:    v_lshlrev_b32_e32 v5, 5, v0
+; SI-NEXT:    v_mov_b32_e32 v6, 0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_mov_b64 s[0:1], s[6:7]
+; SI-NEXT:    buffer_load_dwordx4 v[1:4], v[5:6], s[0:3], 0 addr64
+; SI-NEXT:    v_lshlrev_b32_e32 v9, 1, v0
+; SI-NEXT:    v_mov_b32_e32 v10, v6
+; SI-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; SI-NEXT:    buffer_load_dwordx4 v[5:8], v[5:6], s[0:3], 0 addr64 offset:16
+; SI-NEXT:    s_cmp_eq_u32 s8, 1
+; SI-NEXT:    s_waitcnt vmcnt(1)
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v1
+; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_cvt_f32_f16_e32 v11, v2
+; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v12, v3
+; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v13, v4
+; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; SI-NEXT:    s_waitcnt vmcnt(0)
+; SI-NEXT:    v_cvt_f32_f16_e32 v14, v5
+; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v15, v6
+; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; SI-NEXT:    v_cvt_f32_f16_e32 v16, v7
+; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
+; SI-NEXT:    v_cvt_f32_f16_e32 v17, v8
+; SI-NEXT:    v_lshrrev_b32_e32 v8, 16, v8
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    s_cmp_eq_u32 s8, 2
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; SI-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    s_cmp_eq_u32 s8, 3
+; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v11, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    s_cmp_eq_u32 s8, 4
+; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    s_cmp_eq_u32 s8, 5
+; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v12, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    s_cmp_eq_u32 s8, 6
+; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    s_cmp_eq_u32 s8, 7
+; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v13, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    s_cmp_eq_u32 s8, 8
+; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    s_cmp_eq_u32 s8, 9
+; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    s_cmp_eq_u32 s8, 10
+; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    s_cmp_eq_u32 s8, 11
+; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v15, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    s_cmp_eq_u32 s8, 12
+; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    s_cmp_eq_u32 s8, 13
+; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    s_cmp_eq_u32 s8, 14
+; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    s_cmp_eq_u32 s8, 15
+; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v17, vcc
+; SI-NEXT:    s_cselect_b64 vcc, -1, 0
+; SI-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT:    buffer_store_short v0, v[9:10], s[4:7], 0 addr64
+; SI-NEXT:    s_endpgm
+;
+; VI-LABEL: v_extractelement_v16f16_dynamic_sgpr:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; VI-NEXT:    s_load_dword s0, s[0:1], 0x34
+; VI-NEXT:    v_lshlrev_b32_e32 v1, 5, v0
+; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v2, s7
+; VI-NEXT:    v_add_u32_e32 v5, vcc, s6, v1
+; VI-NEXT:    v_addc_u32_e32 v6, vcc, 0, v2, vcc
+; VI-NEXT:    flat_load_dwordx4 v[1:4], v[5:6]
+; VI-NEXT:    v_add_u32_e32 v5, vcc, 16, v5
+; VI-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
+; VI-NEXT:    flat_load_dwordx4 v[5:8], v[5:6]
+; VI-NEXT:    v_mov_b32_e32 v10, s5
+; VI-NEXT:    v_add_u32_e32 v9, vcc, s4, v0
+; VI-NEXT:    s_cmp_eq_u32 s0, 1
+; VI-NEXT:    v_addc_u32_e32 v10, vcc, 0, v10, vcc
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    s_cmp_eq_u32 s0, 2
+; VI-NEXT:    s_waitcnt vmcnt(1)
+; VI-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
+; VI-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    s_cmp_eq_u32 s0, 3
+; VI-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    s_cmp_eq_u32 s0, 4
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v11, vcc
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    s_cmp_eq_u32 s0, 5
+; VI-NEXT:    v_lshrrev_b32_e32 v12, 16, v3
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    s_cmp_eq_u32 s0, 6
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v12, vcc
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    s_cmp_eq_u32 s0, 7
+; VI-NEXT:    v_lshrrev_b32_e32 v13, 16, v4
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    s_cmp_eq_u32 s0, 8
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v13, vcc
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    s_cmp_eq_u32 s0, 9
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v5
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    s_cmp_eq_u32 s0, 10
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    s_cmp_eq_u32 s0, 11
+; VI-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    s_cmp_eq_u32 s0, 12
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    s_cmp_eq_u32 s0, 13
+; VI-NEXT:    v_lshrrev_b32_e32 v15, 16, v7
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    s_cmp_eq_u32 s0, 14
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v15, vcc
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    s_cmp_eq_u32 s0, 15
+; VI-NEXT:    v_lshrrev_b32_e32 v16, 16, v8
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc
+; VI-NEXT:    s_cselect_b64 vcc, -1, 0
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v16, vcc
+; VI-NEXT:    flat_store_short v[9:10], v0
+; VI-NEXT:    s_endpgm
+;
+; GFX11-LABEL: v_extractelement_v16f16_dynamic_sgpr:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 5, v0
+; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x34
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    global_load_b128 v[1:4], v5, s[6:7]
+; GFX11-NEXT:    global_load_b128 v[5:8], v5, s[6:7] offset:16
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_cmp_eq_u32 s0, 1
+; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    s_cmp_eq_u32 s0, 2
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
+; GFX11-NEXT:    s_cmp_eq_u32 s0, 3
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    s_cmp_eq_u32 s0, 4
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v9, vcc_lo
+; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    s_cmp_eq_u32 s0, 5
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    s_cmp_eq_u32 s0, 6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v4
+; GFX11-NEXT:    s_cmp_eq_u32 s0, 7
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc_lo
+; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    s_cmp_eq_u32 s0, 8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v5
+; GFX11-NEXT:    s_cmp_eq_u32 s0, 9
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    s_cmp_eq_u32 s0, 10
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v6
+; GFX11-NEXT:    s_cmp_eq_u32 s0, 11
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc_lo
+; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    s_cmp_eq_u32 s0, 12
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v7
+; GFX11-NEXT:    s_cmp_eq_u32 s0, 13
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc_lo
+; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    s_cmp_eq_u32 s0, 14
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
+; GFX11-NEXT:    s_cmp_eq_u32 s0, 15
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc_lo
+; GFX11-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX11-NEXT:    global_store_b16 v0, v1, s[4:5]
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <16 x half>, ptr addrspace(1) %in, i64 %tid.ext


        


More information about the llvm-commits mailing list