[llvm] 813f6a4 - [AMDGPU][GFX11] Add test coverage for 16-bit conversions, part 12.

Ivan Kosarev via llvm-commits llvm-commits at lists.llvm.org
Fri Jun 23 05:33:12 PDT 2023


Author: Ivan Kosarev
Date: 2023-06-23T13:33:06+01:00
New Revision: 813f6a495bb695c46d5d18a35dd9b6db3e83af22

URL: https://github.com/llvm/llvm-project/commit/813f6a495bb695c46d5d18a35dd9b6db3e83af22
DIFF: https://github.com/llvm/llvm-project/commit/813f6a495bb695c46d5d18a35dd9b6db3e83af22.diff

LOG: [AMDGPU][GFX11] Add test coverage for 16-bit conversions, part 12.

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D152905

Added: 
    

Modified: 
    llvm/test/CodeGen/AMDGPU/function-args.ll
    llvm/test/CodeGen/AMDGPU/function-returns.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index 78065579fc79b..1c7d28a6580c8 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -1,44 +1,148 @@
-; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89 %s
-; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89 %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89,CI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89,GFX89,VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CIGFX89,GFX89,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
 
-; GCN-LABEL: {{^}}void_func_i1:
-; GCN: v_and_b32_e32 v0, 1, v0
-; GCN: buffer_store_byte v0, off
 define void @void_func_i1(i1 %arg0) #0 {
+; CIGFX89-LABEL: void_func_i1:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    v_and_b32_e32 v0, 1, v0
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i1:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store i1 %arg0, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_i1_zeroext:
-; GCN: s_waitcnt
-; GCN-NEXT: v_or_b32_e32 v0, 12, v0
-; GCN-NOT: v0
-; GCN: buffer_store_dword v0, off
 define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 {
+; CIGFX89-LABEL: void_func_i1_zeroext:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    v_or_b32_e32 v0, 12, v0
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i1_zeroext:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_or_b32_e32 v0, 12, v0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ext = zext i1 %arg0 to i32
   %add = add i32 %ext, 12
   store i32 %add, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_i1_signext:
-; GCN: s_waitcnt
-; GCN-NEXT: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}12, v0
-; GCN-NOT: v0
-; GCN: buffer_store_dword v0, off
 define void @void_func_i1_signext(i1 signext %arg0) #0 {
+; CI-LABEL: void_func_i1_signext:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_add_i32_e32 v0, vcc, 12, v0
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: void_func_i1_signext:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 12, v0
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: void_func_i1_signext:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v0, 12, v0
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i1_signext:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 12, v0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ext = sext i1 %arg0 to i32
   %add = add i32 %ext, 12
   store i32 %add, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}i1_arg_i1_use:
-; GCN: v_and_b32_e32 v0, 1, v0
-; GCN: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc, -1
 define void @i1_arg_i1_use(i1 %arg) #0 {
+; CIGFX89-LABEL: i1_arg_i1_use:
+; CIGFX89:       ; %bb.0: ; %bb
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    v_and_b32_e32 v0, 1, v0
+; CIGFX89-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; CIGFX89-NEXT:    s_xor_b64 s[6:7], vcc, -1
+; CIGFX89-NEXT:    s_and_saveexec_b64 s[4:5], s[6:7]
+; CIGFX89-NEXT:    s_cbranch_execz .LBB3_2
+; CIGFX89-NEXT:  ; %bb.1: ; %bb1
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    v_mov_b32_e32 v0, 0
+; CIGFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:  .LBB3_2: ; %bb2
+; CIGFX89-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: i1_arg_i1_use:
+; GFX11:       ; %bb.0: ; %bb
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-NEXT:    s_xor_b32 s1, vcc_lo, -1
+; GFX11-NEXT:    s_and_saveexec_b32 s0, s1
+; GFX11-NEXT:    s_cbranch_execz .LBB3_2
+; GFX11-NEXT:  ; %bb.1: ; %bb1
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:  .LBB3_2: ; %bb2
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
 bb:
   br i1 %arg, label %bb2, label %bb1
 
@@ -50,304 +154,1139 @@ bb2:
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_i8:
-; GCN-NOT: v0
-; GCN: buffer_store_byte v0, off
 define void @void_func_i8(i8 %arg0) #0 {
+; CIGFX89-LABEL: void_func_i8:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i8:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store i8 %arg0, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_i8_zeroext:
-; GCN-NOT: and_b32
-; GCN: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}12, v0
 define void @void_func_i8_zeroext(i8 zeroext %arg0) #0 {
+; CI-LABEL: void_func_i8_zeroext:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_add_i32_e32 v0, vcc, 12, v0
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: void_func_i8_zeroext:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 12, v0
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: void_func_i8_zeroext:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v0, 12, v0
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i8_zeroext:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 12, v0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ext = zext i8 %arg0 to i32
   %add = add i32 %ext, 12
   store i32 %add, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_i8_signext:
-; GCN-NOT: v_bfe_i32
-; GCN: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}12, v0
 define void @void_func_i8_signext(i8 signext %arg0) #0 {
+; CI-LABEL: void_func_i8_signext:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_add_i32_e32 v0, vcc, 12, v0
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: void_func_i8_signext:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 12, v0
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: void_func_i8_signext:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v0, 12, v0
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i8_signext:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 12, v0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ext = sext i8 %arg0 to i32
   %add = add i32 %ext, 12
   store i32 %add, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_i16:
-; GCN: buffer_store_short v0, off
 define void @void_func_i16(i16 %arg0) #0 {
+; CIGFX89-LABEL: void_func_i16:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store i16 %arg0, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_i16_zeroext:
-; GCN-NOT: v0
-; GCN: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}12, v0
 define void @void_func_i16_zeroext(i16 zeroext %arg0) #0 {
+; CI-LABEL: void_func_i16_zeroext:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_add_i32_e32 v0, vcc, 12, v0
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: void_func_i16_zeroext:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 12, v0
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: void_func_i16_zeroext:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v0, 12, v0
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i16_zeroext:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 12, v0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ext = zext i16 %arg0 to i32
   %add = add i32 %ext, 12
   store i32 %add, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_i16_signext:
-; GCN-NOT: v0
-; GCN: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}12, v0
 define void @void_func_i16_signext(i16 signext %arg0) #0 {
+; CI-LABEL: void_func_i16_signext:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_add_i32_e32 v0, vcc, 12, v0
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: void_func_i16_signext:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_add_u32_e32 v0, vcc, 12, v0
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: void_func_i16_signext:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v0, 12, v0
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i16_signext:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, 12, v0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ext = sext i16 %arg0 to i32
   %add = add i32 %ext, 12
   store i32 %add, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_i32:
-; GCN-NOT: v0
-; GCN: buffer_store_dword v0, off
 define void @void_func_i32(i32 %arg0) #0 {
+; CIGFX89-LABEL: void_func_i32:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store i32 %arg0, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_i64:
-; GCN-NOT: v[0:1]
-; GCN-NOT: v0
-; GCN-NOT: v1
-; GCN: buffer_store_dwordx2 v[0:1], off
 define void @void_func_i64(i64 %arg0) #0 {
+; CIGFX89-LABEL: void_func_i64:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store i64 %arg0, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_f16:
-; VI-NOT: v0
-; CI: v_cvt_f16_f32_e32 v0, v0
-; GCN: buffer_store_short v0, off
 define void @void_func_f16(half %arg0) #0 {
+; CI-LABEL: void_func_f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: void_func_f16:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store half %arg0, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_f32
-; GCN-NOT: v0
-; GCN: buffer_store_dword v0, off
 define void @void_func_f32(float %arg0) #0 {
+; CIGFX89-LABEL: void_func_f32:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store float %arg0, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_f64:
-; GCN-NOT: v[0:1]
-; GCN-NOT: v0
-; GCN-NOT: v1
-; GCN: buffer_store_dwordx2 v[0:1], off
 define void @void_func_f64(double %arg0) #0 {
+; CIGFX89-LABEL: void_func_f64:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_f64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store double %arg0, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_v2i32:
-; GCN-NOT: v[0:1]
-; GCN-NOT: v0
-; GCN-NOT: v1
-; GCN: buffer_store_dwordx2 v[0:1], off
 define void @void_func_v2i32(<2 x i32> %arg0) #0 {
+; CIGFX89-LABEL: void_func_v2i32:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v2i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <2 x i32> %arg0, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_v3i32:
-; GCN-DAG: buffer_store_dwordx3 v[0:2], off
 define void @void_func_v3i32(<3 x i32> %arg0) #0 {
+; CIGFX89-LABEL: void_func_v3i32:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v3i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b96 v[0:2], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <3 x i32> %arg0, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_v4i32:
-; GCN: buffer_store_dwordx4 v[0:3], off
 define void @void_func_v4i32(<4 x i32> %arg0) #0 {
+; CIGFX89-LABEL: void_func_v4i32:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v4i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <4 x i32> %arg0, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_v5i32:
-; GCN-DAG: buffer_store_dwordx4 v[0:3], off
-; GCN-DAG: buffer_store_dword v4, off
 define void @void_func_v5i32(<5 x i32> %arg0) #0 {
+; CIGFX89-LABEL: void_func_v5i32:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_dword v4, off, s[4:7], 0
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v5i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    buffer_store_b32 v4, off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <5 x i32> %arg0, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_v8i32:
-; GCN-DAG: buffer_store_dwordx4 v[0:3], off
-; GCN-DAG: buffer_store_dwordx4 v[4:7], off
 define void @void_func_v8i32(<8 x i32> %arg0) #0 {
+; CIGFX89-LABEL: void_func_v8i32:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v8i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <8 x i32> %arg0, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_v16i32:
-; GCN-DAG: buffer_store_dwordx4 v[0:3], off
-; GCN-DAG: buffer_store_dwordx4 v[4:7], off
-; GCN-DAG: buffer_store_dwordx4 v[8:11], off
-; GCN-DAG: buffer_store_dwordx4 v[12:15], off
 define void @void_func_v16i32(<16 x i32> %arg0) #0 {
+; CIGFX89-LABEL: void_func_v16i32:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v16i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    buffer_store_b128 v[12:15], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <16 x i32> %arg0, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_v32i32:
-; GCN-DAG: buffer_store_dwordx4 v[0:3], off
-; GCN-DAG: buffer_store_dwordx4 v[4:7], off
-; GCN-DAG: buffer_store_dwordx4 v[8:11], off
-; GCN-DAG: buffer_store_dwordx4 v[12:15], off
-; GCN-DAG: buffer_store_dwordx4 v[16:19], off
-; GCN-DAG: buffer_store_dwordx4 v[20:23], off
-; GCN-DAG: buffer_store_dwordx4 v[24:27], off
-; GCN-DAG: buffer_store_dwordx4 v[28:31], off
 define void @void_func_v32i32(<32 x i32> %arg0) #0 {
+; CIGFX89-LABEL: void_func_v32i32:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(6)
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v32i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    buffer_store_b128 v[20:23], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[16:19], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[12:15], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    buffer_store_b128 v[28:31], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[24:27], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <32 x i32> %arg0, ptr addrspace(1) undef
   ret void
 }
 
 ; 1 over register limit
-; GCN-LABEL: {{^}}void_func_v33i32:
-; GCN-DAG: buffer_store_dwordx4 v[0:3], off
-; GCN-DAG: buffer_store_dwordx4 v[4:7], off
-; GCN-DAG: buffer_store_dwordx4 v[8:11], off
-; GCN-DAG: buffer_store_dwordx4 v[12:15], off
-; GCN-DAG: buffer_load_dword v31, off, s[0:3], s32{{$}}
-; GCN-DAG: buffer_load_dword [[STACKLOAD:v[0-9]+]], off, s[0:3], s32 offset:4
-; GCN-DAG: buffer_store_dwordx4 v[16:19], off
-; GCN-DAG: buffer_store_dwordx4 v[20:23], off
-; GCN-DAG: buffer_store_dwordx4 v[24:27], off
-; GCN-DAG: buffer_store_dwordx4 v[28:31], off
-; GCN: buffer_store_dword [[STACKLOAD]], off
 define void @void_func_v33i32(<33 x i32> %arg0) #0 {
+; CI-LABEL: void_func_v33i32:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; CI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
+; CI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; CI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:4
+; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; CI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; CI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(5)
+; CI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(5)
+; CI-NEXT:    buffer_store_dword v16, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: void_func_v33i32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
+; VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:4
+; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(5)
+; VI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(5)
+; VI-NEXT:    buffer_store_dword v16, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: void_func_v33i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(5)
+; GFX9-NEXT:    buffer_store_dword v16, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v33i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:4
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_clause 0x5
+; GFX11-NEXT:    buffer_store_b128 v[20:23], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[16:19], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[12:15], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    buffer_store_b128 v[28:31], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[24:27], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_store_b32 v32, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <33 x i32> %arg0, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_v2i64:
-; GCN: buffer_store_dwordx4 v[0:3], off
 define void @void_func_v2i64(<2 x i64> %arg0) #0 {
+; CIGFX89-LABEL: void_func_v2i64:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v2i64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <2 x i64> %arg0, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_v3i64:
-; GCN-DAG: buffer_store_dwordx4 v[0:3], off
-; GCN-DAG: buffer_store_dwordx2 v[4:5], off
 define void @void_func_v3i64(<3 x i64> %arg0) #0 {
+; CIGFX89-LABEL: void_func_v3i64:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_dwordx2 v[4:5], off, s[4:7], 0
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v3i64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    buffer_store_b64 v[4:5], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <3 x i64> %arg0, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_v4i64:
-; GCN-DAG: buffer_store_dwordx4 v[0:3], off
-; GCN-DAG: buffer_store_dwordx4 v[4:7], off
 define void @void_func_v4i64(<4 x i64> %arg0) #0 {
+; CIGFX89-LABEL: void_func_v4i64:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v4i64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <4 x i64> %arg0, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_v5i64:
-; GCN-DAG: buffer_store_dwordx4 v[0:3], off
-; GCN-DAG: buffer_store_dwordx4 v[4:7], off
-; GCN-DAG: buffer_store_dwordx2 v[8:9], off
 define void @void_func_v5i64(<5 x i64> %arg0) #0 {
+; CIGFX89-LABEL: void_func_v5i64:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; CIGFX89-NEXT:    buffer_store_dwordx2 v[8:9], off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v5i64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b64 v[8:9], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <5 x i64> %arg0, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_v8i64:
-; GCN-DAG: buffer_store_dwordx4 v[0:3], off
-; GCN-DAG: buffer_store_dwordx4 v[4:7], off
-; GCN-DAG: buffer_store_dwordx4 v[8:11], off
-; GCN-DAG: buffer_store_dwordx4 v[12:15], off
 define void @void_func_v8i64(<8 x i64> %arg0) #0 {
+; CIGFX89-LABEL: void_func_v8i64:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v8i64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    buffer_store_b128 v[12:15], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <8 x i64> %arg0, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_v16i64:
-; GCN-DAG: buffer_store_dwordx4 v[0:3], off
-; GCN-DAG: buffer_store_dwordx4 v[4:7], off
-; GCN-DAG: buffer_store_dwordx4 v[8:11], off
-; GCN-DAG: buffer_store_dwordx4 v[12:15], off
-; GCN-DAG: buffer_store_dwordx4 v[16:19], off
-; GCN-DAG: buffer_store_dwordx4 v[20:23], off
-; GCN-DAG: buffer_store_dwordx4 v[24:27], off
-; GCN-DAG: buffer_store_dwordx4 v[28:31], off
 define void @void_func_v16i64(<16 x i64> %arg0) #0 {
+; CIGFX89-LABEL: void_func_v16i64:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(6)
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v16i64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    buffer_store_b128 v[20:23], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[16:19], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[12:15], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    buffer_store_b128 v[28:31], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[24:27], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <16 x i64> %arg0, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_v2i16:
-; GFX9-NOT: v0
-; GFX9: buffer_store_dword v0, off
 define void @void_func_v2i16(<2 x i16> %arg0) #0 {
+; CI-LABEL: void_func_v2i16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; CI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; CI-NEXT:    v_or_b32_e32 v0, v0, v1
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: void_func_v2i16:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v2i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <2 x i16> %arg0, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_v3i16:
-; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off
-; GCN-DAG: buffer_store_short v{{[0-9]+}}, off
 define void @void_func_v3i16(<3 x i16> %arg0) #0 {
+; CI-LABEL: void_func_v3i16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; CI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    v_or_b32_e32 v0, v0, v1
+; CI-NEXT:    buffer_store_short v2, off, s[4:7], 0
+; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: void_func_v3i16:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    buffer_store_short v1, off, s[4:7], 0
+; GFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v3i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    buffer_store_b16 v1, off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <3 x i16> %arg0, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_v4i16:
-; GFX9-NOT: v0
-; GFX9-NOT: v1
-; GFX9: buffer_store_dwordx2 v[0:1], off
 define void @void_func_v4i16(<4 x i16> %arg0) #0 {
+; CI-LABEL: void_func_v4i16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; CI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; CI-NEXT:    v_or_b32_e32 v2, v2, v3
+; CI-NEXT:    v_or_b32_e32 v1, v0, v1
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    buffer_store_dwordx2 v[1:2], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: void_func_v4i16:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v4i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <4 x i16> %arg0, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_v5i16:
-; CI: v_lshlrev_b32
-; CI: v_and_b32
-; CI: v_lshlrev_b32
-; CI: v_or_b32
-; CI: v_or_b32
-; CI-DAG: buffer_store_short v
-; CI-DAG: buffer_store_dwordx2 v
-
-; GFX89-DAG: buffer_store_short v2, off,
-; GFX89-DAG: buffer_store_dwordx2 v[0:1], off
-
 define void @void_func_v5i16(<5 x i16> %arg0) #0 {
+; CI-LABEL: void_func_v5i16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; CI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    v_or_b32_e32 v2, v2, v3
+; CI-NEXT:    v_or_b32_e32 v1, v0, v1
+; CI-NEXT:    buffer_store_short v4, off, s[4:7], 0
+; CI-NEXT:    buffer_store_dwordx2 v[1:2], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: void_func_v5i16:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    buffer_store_short v2, off, s[4:7], 0
+; GFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v5i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    buffer_store_b16 v2, off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <5 x i16> %arg0, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_v8i16:
-; GFX9-DAG: buffer_store_dwordx4 v[0:3], off
 define void @void_func_v8i16(<8 x i16> %arg0) #0 {
+; CI-LABEL: void_func_v8i16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; CI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; CI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; CI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; CI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; CI-NEXT:    v_or_b32_e32 v6, v6, v7
+; CI-NEXT:    v_or_b32_e32 v5, v4, v5
+; CI-NEXT:    v_or_b32_e32 v4, v2, v3
+; CI-NEXT:    v_or_b32_e32 v3, v0, v1
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    buffer_store_dwordx4 v[3:6], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: void_func_v8i16:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v8i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <8 x i16> %arg0, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_v16i16:
-; GFX9-DAG: buffer_store_dwordx4 v[0:3], off
-; GFX9-DAG: buffer_store_dwordx4 v[4:7], off
 define void @void_func_v16i16(<16 x i16> %arg0) #0 {
+; CI-LABEL: void_func_v16i16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; CI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; CI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; CI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; CI-NEXT:    v_or_b32_e32 v5, v4, v5
+; CI-NEXT:    v_or_b32_e32 v4, v2, v3
+; CI-NEXT:    v_or_b32_e32 v3, v0, v1
+; CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v15
+; CI-NEXT:    v_and_b32_e32 v1, 0xffff, v14
+; CI-NEXT:    v_or_b32_e32 v14, v1, v0
+; CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v13
+; CI-NEXT:    v_and_b32_e32 v1, 0xffff, v12
+; CI-NEXT:    v_or_b32_e32 v13, v1, v0
+; CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v11
+; CI-NEXT:    v_and_b32_e32 v1, 0xffff, v10
+; CI-NEXT:    v_or_b32_e32 v12, v1, v0
+; CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v9
+; CI-NEXT:    v_and_b32_e32 v1, 0xffff, v8
+; CI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
+; CI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
+; CI-NEXT:    v_or_b32_e32 v11, v1, v0
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    v_or_b32_e32 v6, v6, v7
+; CI-NEXT:    buffer_store_dwordx4 v[11:14], off, s[4:7], 0
+; CI-NEXT:    buffer_store_dwordx4 v[3:6], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: void_func_v16i16:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v16i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <16 x i16> %arg0, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_v2i24:
-; GCN: v_add_{{i|u}}32_e32 v0, {{(vcc, )?}}v0, v1
 define void @void_func_v2i24(<2 x i24> %arg0) #0 {
+; CI-LABEL: void_func_v2i24:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    buffer_store_byte v1, off, s[4:7], 0
+; CI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: void_func_v2i24:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    buffer_store_byte v1, off, s[4:7], 0
+; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: void_func_v2i24:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    buffer_store_byte v1, off, s[4:7], 0
+; GFX9-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v2i24:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, v0, v1
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    buffer_store_b8 v1, off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %elt0 = extractelement <2 x i24> %arg0, i32 0
   %elt1 = extractelement <2 x i24> %arg0, i32 1
   %add = add i24 %elt0, %elt1
@@ -355,197 +1294,734 @@ define void @void_func_v2i24(<2 x i24> %arg0) #0 {
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_v2f32:
-; GCN-NOT: v[0:1]
-; GCN-NOT: v0
-; GCN-NOT: v1
-; GCN: buffer_store_dwordx2 v[0:1], off
 define void @void_func_v2f32(<2 x float> %arg0) #0 {
+; CIGFX89-LABEL: void_func_v2f32:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v2f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <2 x float> %arg0, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_v3f32:
-; GCN-DAG: buffer_store_dwordx3 v[0:2], off
 define void @void_func_v3f32(<3 x float> %arg0) #0 {
+; CIGFX89-LABEL: void_func_v3f32:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_dwordx3 v[0:2], off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v3f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b96 v[0:2], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <3 x float> %arg0, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_v4f32:
-; GCN: buffer_store_dwordx4 v[0:3], off
 define void @void_func_v4f32(<4 x float> %arg0) #0 {
+; CIGFX89-LABEL: void_func_v4f32:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v4f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <4 x float> %arg0, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_v8f32:
-; GCN-DAG: buffer_store_dwordx4 v[0:3], off
-; GCN-DAG: buffer_store_dwordx4 v[4:7], off
 define void @void_func_v8f32(<8 x float> %arg0) #0 {
+; CIGFX89-LABEL: void_func_v8f32:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v8f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <8 x float> %arg0, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_v16f32:
-; GCN-DAG: buffer_store_dwordx4 v[0:3], off
-; GCN-DAG: buffer_store_dwordx4 v[4:7], off
-; GCN-DAG: buffer_store_dwordx4 v[8:11], off
-; GCN-DAG: buffer_store_dwordx4 v[12:15], off
 define void @void_func_v16f32(<16 x float> %arg0) #0 {
+; CIGFX89-LABEL: void_func_v16f32:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v16f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    buffer_store_b128 v[12:15], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <16 x float> %arg0, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_v2f64:
-; GCN: buffer_store_dwordx4 v[0:3], off
 define void @void_func_v2f64(<2 x double> %arg0) #0 {
+; CIGFX89-LABEL: void_func_v2f64:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v2f64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <2 x double> %arg0, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_v3f64:
-; GCN-DAG: buffer_store_dwordx4 v[0:3], off
-; GCN-DAG: buffer_store_dwordx2 v[4:5], off
 define void @void_func_v3f64(<3 x double> %arg0) #0 {
+; CIGFX89-LABEL: void_func_v3f64:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_dwordx2 v[4:5], off, s[4:7], 0
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v3f64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    buffer_store_b64 v[4:5], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <3 x double> %arg0, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_v4f64:
-; GCN-DAG: buffer_store_dwordx4 v[0:3], off
-; GCN-DAG: buffer_store_dwordx4 v[4:7], off
 define void @void_func_v4f64(<4 x double> %arg0) #0 {
+; CIGFX89-LABEL: void_func_v4f64:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v4f64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <4 x double> %arg0, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_v8f64:
-; GCN-DAG: buffer_store_dwordx4 v[0:3], off
-; GCN-DAG: buffer_store_dwordx4 v[4:7], off
-; GCN-DAG: buffer_store_dwordx4 v[8:11], off
-; GCN-DAG: buffer_store_dwordx4 v[12:15], off
 define void @void_func_v8f64(<8 x double> %arg0) #0 {
+; CIGFX89-LABEL: void_func_v8f64:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v8f64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    buffer_store_b128 v[12:15], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <8 x double> %arg0, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_v16f64:
-; GCN-DAG: buffer_store_dwordx4 v[0:3], off
-; GCN-DAG: buffer_store_dwordx4 v[4:7], off
-; GCN-DAG: buffer_store_dwordx4 v[8:11], off
-; GCN-DAG: buffer_store_dwordx4 v[12:15], off
-; GCN-DAG: buffer_store_dwordx4 v[16:19], off
-; GCN-DAG: buffer_store_dwordx4 v[20:23], off
-; GCN-DAG: buffer_store_dwordx4 v[24:27], off
-; GCN-DAG: buffer_store_dwordx4 v[28:31], off
 define void @void_func_v16f64(<16 x double> %arg0) #0 {
+; CIGFX89-LABEL: void_func_v16f64:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(6)
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
+; CIGFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v16f64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    buffer_store_b128 v[20:23], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[16:19], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[12:15], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    buffer_store_b128 v[28:31], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[24:27], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <16 x double> %arg0, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_v2f16:
-; GFX9-NOT: v0
-; GFX9: buffer_store_dword v0, off
 define void @void_func_v2f16(<2 x half> %arg0) #0 {
+; CI-LABEL: void_func_v2f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; CI-NEXT:    v_or_b32_e32 v0, v0, v1
+; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: void_func_v2f16:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v2f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <2 x half> %arg0, ptr addrspace(1) undef
   ret void
 }
 
 ; FIXME: Different abi if f16 legal
-; GCN-LABEL: {{^}}void_func_v3f16:
-; CI-DAG: v_cvt_f16_f32_e32 v{{[0-9]+}}, v0
-; CI-DAG: v_cvt_f16_f32_e32 v{{[0-9]+}}, v1
-; CI-DAG: v_cvt_f16_f32_e32 v{{[0-9]+}}, v2
-
-; GFX89-DAG: v0
-; GFX89-DAG: v1
-
-; GCN-DAG: buffer_store_short
-; GCN-DAG: buffer_store_dword
 define void @void_func_v3f16(<3 x half> %arg0) #0 {
+; CI-LABEL: void_func_v3f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    v_or_b32_e32 v0, v0, v1
+; CI-NEXT:    buffer_store_short v2, off, s[4:7], 0
+; CI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: void_func_v3f16:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    buffer_store_short v1, off, s[4:7], 0
+; GFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v3f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    buffer_store_b16 v1, off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <3 x half> %arg0, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_v4f16:
-; GFX9-NOT: v0
-; GFX9-NOT: v1
-; GFX9-NOT: v[0:1]
-; GFX9: buffer_store_dwordx2 v[0:1], off
 define void @void_func_v4f16(<4 x half> %arg0) #0 {
+; CI-LABEL: void_func_v4f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; CI-NEXT:    v_cvt_f16_f32_e32 v4, v1
+; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; CI-NEXT:    v_or_b32_e32 v1, v2, v1
+; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
+; CI-NEXT:    v_or_b32_e32 v0, v0, v2
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: void_func_v4f16:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v4f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <4 x half> %arg0, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_v8f16:
-; GFX9-NOT: v0
-; GFX9-NOT: v1
-; GFX9: buffer_store_dwordx4 v[0:3], off
 define void @void_func_v8f16(<8 x half> %arg0) #0 {
+; CI-LABEL: void_func_v8f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; CI-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; CI-NEXT:    v_cvt_f16_f32_e32 v8, v5
+; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
+; CI-NEXT:    v_or_b32_e32 v5, v6, v5
+; CI-NEXT:    v_lshlrev_b32_e32 v6, 16, v8
+; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; CI-NEXT:    v_or_b32_e32 v4, v4, v6
+; CI-NEXT:    v_or_b32_e32 v3, v2, v3
+; CI-NEXT:    v_or_b32_e32 v2, v0, v1
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    buffer_store_dwordx4 v[2:5], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: void_func_v8f16:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v8f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <8 x half> %arg0, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_v16f16:
-; GFX9-NOT: v0
-; GFX9-NOT: v1
-; GFX9-DAG: buffer_store_dwordx4 v[0:3], off
-; GFX9-DAG: buffer_store_dwordx4 v[4:7], off
 define void @void_func_v16f16(<16 x half> %arg0) #0 {
+; CI-LABEL: void_func_v16f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; CI-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; CI-NEXT:    v_cvt_f16_f32_e32 v16, v5
+; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; CI-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
+; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; CI-NEXT:    v_or_b32_e32 v5, v6, v5
+; CI-NEXT:    v_lshlrev_b32_e32 v6, 16, v16
+; CI-NEXT:    v_or_b32_e32 v3, v2, v3
+; CI-NEXT:    v_or_b32_e32 v2, v0, v1
+; CI-NEXT:    v_cvt_f16_f32_e32 v0, v15
+; CI-NEXT:    v_or_b32_e32 v4, v4, v6
+; CI-NEXT:    v_cvt_f16_f32_e32 v1, v14
+; CI-NEXT:    v_cvt_f16_f32_e32 v6, v13
+; CI-NEXT:    v_cvt_f16_f32_e32 v7, v12
+; CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; CI-NEXT:    v_or_b32_e32 v13, v1, v0
+; CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
+; CI-NEXT:    v_or_b32_e32 v12, v7, v0
+; CI-NEXT:    v_cvt_f16_f32_e32 v0, v11
+; CI-NEXT:    v_cvt_f16_f32_e32 v1, v10
+; CI-NEXT:    v_cvt_f16_f32_e32 v6, v9
+; CI-NEXT:    v_cvt_f16_f32_e32 v7, v8
+; CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; CI-NEXT:    v_or_b32_e32 v11, v1, v0
+; CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
+; CI-NEXT:    v_or_b32_e32 v10, v7, v0
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    buffer_store_dwordx4 v[10:13], off, s[4:7], 0
+; CI-NEXT:    buffer_store_dwordx4 v[2:5], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: void_func_v16f16:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GFX89-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v16f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store <16 x half> %arg0, ptr addrspace(1) undef
   ret void
 }
 
 ; Make sure there is no alignment requirement for passed vgprs.
-; GCN-LABEL: {{^}}void_func_i32_i64_i32:
-; GCN-NOT: v0
-; GCN: buffer_store_dword v0, off
-; GCN: buffer_store_dwordx2 v[1:2]
-; GCN: buffer_store_dword v3
 define void @void_func_i32_i64_i32(i32 %arg0, i64 %arg1, i32 %arg2) #0 {
+; CIGFX89-LABEL: void_func_i32_i64_i32:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    buffer_store_dwordx2 v[1:2], off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    buffer_store_dword v3, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_i32_i64_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b64 v[1:2], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b32 v3, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store volatile i32 %arg0, ptr addrspace(1) undef
   store volatile i64 %arg1, ptr addrspace(1) undef
   store volatile i32 %arg2, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_struct_i32:
-; GCN-NOT: v0
-; GCN: buffer_store_dword v0, off
 define void @void_func_struct_i32({ i32 } %arg0) #0 {
+; CIGFX89-LABEL: void_func_struct_i32:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_struct_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store { i32 } %arg0, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_struct_i8_i32:
-; GCN-DAG: buffer_store_byte v0, off
-; GCN-DAG: buffer_store_dword v1, off
 define void @void_func_struct_i8_i32({ i8, i32 } %arg0) #0 {
+; CIGFX89-LABEL: void_func_struct_i8_i32:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_dword v1, off, s[4:7], 0
+; CIGFX89-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_struct_i8_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    buffer_store_b32 v1, off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store { i8, i32 } %arg0, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32:
-; GCN-DAG: buffer_load_ubyte v[[ELT0:[0-9]+]], off, s[0:3], s32{{$}}
-; GCN-DAG: buffer_load_dword v[[ELT1:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
-; GCN-DAG: buffer_store_dword v[[ELT1]]
-; GCN-DAG: buffer_store_byte v[[ELT0]]
 define void @void_func_byval_struct_i8_i32(ptr addrspace(5) byval({ i8, i32 }) %arg0) #0 {
+; CIGFX89-LABEL: void_func_byval_struct_i8_i32:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:4
+; CIGFX89-NEXT:    buffer_load_ubyte v1, off, s[0:3], s32
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    s_waitcnt vmcnt(1)
+; CIGFX89-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(1)
+; CIGFX89-NEXT:    buffer_store_byte v1, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_byval_struct_i8_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:4
+; GFX11-NEXT:    scratch_load_u8 v1, off, s32
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_store_b8 v1, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %arg0.load = load { i8, i32 }, ptr addrspace(5) %arg0
   store { i8, i32 } %arg0.load, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_x2:
-; GCN: buffer_load_ubyte v[[ELT0_0:[0-9]+]], off, s[0:3], s32 glc{{$}}
-; GCN: buffer_load_dword v[[ELT1_0:[0-9]+]], off, s[0:3], s32 offset:4 glc{{$}}
-; GCN: buffer_load_ubyte v[[ELT0_1:[0-9]+]], off, s[0:3], s32 offset:8 glc{{$}}
-; GCN: buffer_load_dword v[[ELT1_1:[0-9]+]], off, s[0:3], s32 offset:12 glc{{$}}
-
-; GCN: ds_write_b32 v0, v0
-; GCN: s_setpc_b64
 define void @void_func_byval_struct_i8_i32_x2(ptr addrspace(5) byval({ i8, i32 }) %arg0, ptr addrspace(5) byval({ i8, i32 }) %arg1, i32 %arg2) #0 {
+; CI-LABEL: void_func_byval_struct_i8_i32_x2:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    buffer_load_ubyte v1, off, s[0:3], s32 glc
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4 glc
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_ubyte v3, off, s[0:3], s32 offset:8 glc
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:12 glc
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_byte v1, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dword v4, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_byte v3, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    ds_write_b32 v0, v0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: void_func_byval_struct_i8_i32_x2:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    buffer_load_ubyte v1, off, s[0:3], s32 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_ubyte v3, off, s[0:3], s32 offset:8 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:12 glc
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    buffer_store_dword v2, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_byte v1, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dword v4, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_byte v3, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    ds_write_b32 v0, v0
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: void_func_byval_struct_i8_i32_x2:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_load_ubyte v1, off, s[0:3], s32 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:4 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_ubyte v3, off, s[0:3], s32 offset:8 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:12 glc
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    buffer_store_dword v2, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_byte v1, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v4, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_byte v3, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    ds_write_b32 v0, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_byval_struct_i8_i32_x2:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    scratch_load_u8 v1, off, s32 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    scratch_load_b32 v2, off, s32 offset:4 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    scratch_load_u8 v3, off, s32 offset:8 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    scratch_load_b32 v4, off, s32 offset:12 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b32 v2, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v1, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b32 v4, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v3, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    ds_store_b32 v0, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %arg0.load = load volatile { i8, i32 }, ptr addrspace(5) %arg0
   %arg1.load = load volatile { i8, i32 }, ptr addrspace(5) %arg1
   store volatile { i8, i32 } %arg0.load, ptr addrspace(1) undef
@@ -554,13 +2030,37 @@ define void @void_func_byval_struct_i8_i32_x2(ptr addrspace(5) byval({ i8, i32 }
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_byval_i32_byval_i64:
-; GCN-DAG: buffer_load_dword v[[ARG0_LOAD:[0-9]+]], off, s[0:3], s32{{$}}
-; GCN-DAG: buffer_load_dword v[[ARG1_LOAD0:[0-9]+]], off, s[0:3], s32 offset:8{{$}}
-; GCN-DAG: buffer_load_dword v[[ARG1_LOAD1:[0-9]+]], off, s[0:3], s32 offset:12{{$}}
-; GCN-DAG: buffer_store_dword v[[ARG0_LOAD]], off
-; GCN-DAG: buffer_store_dwordx2 v[[[ARG1_LOAD0]]:[[ARG1_LOAD1]]], off
 define void @void_func_byval_i32_byval_i64(ptr addrspace(5) byval(i32) %arg0, ptr addrspace(5) byval(i64) %arg1) #0 {
+; CIGFX89-LABEL: void_func_byval_i32_byval_i64:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    buffer_load_dword v2, off, s[0:3], s32
+; CIGFX89-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:8
+; CIGFX89-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:12
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    s_waitcnt vmcnt(2)
+; CIGFX89-NEXT:    buffer_store_dword v2, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(1)
+; CIGFX89-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_byval_i32_byval_i64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_load_b32 v2, off, s32
+; GFX11-NEXT:    scratch_load_b64 v[0:1], off, s32 offset:8
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    buffer_store_b32 v2, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_store_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %arg0.load = load i32, ptr addrspace(5) %arg0
   %arg1.load = load i64, ptr addrspace(5) %arg1
   store i32 %arg0.load, ptr addrspace(1) undef
@@ -568,23 +2068,139 @@ define void @void_func_byval_i32_byval_i64(ptr addrspace(5) byval(i32) %arg0, pt
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_v32i32_i32_i64:
-; GCN-DAG: buffer_store_dwordx4 v[0:3], off
-; GCN-DAG: buffer_store_dwordx4 v[4:7], off
-; GCN-DAG: buffer_store_dwordx4 v[8:11], off
-; GCN-DAG: buffer_store_dwordx4 v[12:15], off
-; GCN-DAG: buffer_store_dwordx4 v[16:19], off
-; GCN-DAG: buffer_store_dwordx4 v[20:23], off
-; GCN-DAG: buffer_store_dwordx4 v[24:27], off
-; GCN-DAG: buffer_store_dwordx4 v[28:31], off
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG0_31:[0-9]+]], off, s[0:3], s32{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:8
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:12
-
-; GCN: buffer_store_dword v[[LOAD_ARG1]]
-; GCN: buffer_store_dwordx2 v[[[LOAD_ARG2_0]]:[[LOAD_ARG2_1]]], off
 define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0 {
+; CI-LABEL: void_func_v32i32_i32_i64:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:12
+; CI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:8
+; CI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:4
+; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dword v20, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx2 v[16:17], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: void_func_v32i32_i32_i64:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:12
+; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:8
+; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:4
+; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dword v20, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx2 v[16:17], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: void_func_v32i32_i32_i64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v20, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx2 v[16:17], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v32i32_i32_i64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:4
+; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:8
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-NEXT:    buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[20:23], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[16:19], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[12:15], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    buffer_store_b32 v34, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_store_b64 v[32:33], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store volatile <32 x i32> %arg0, ptr addrspace(1) undef
   store volatile i32 %arg1, ptr addrspace(1) undef
   store volatile i64 %arg2, ptr addrspace(1) undef
@@ -592,26 +2208,167 @@ define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0
 }
 
 ; FIXME: Different ext load types on CI vs. VI
-; GCN-LABEL: {{^}}void_func_v32i32_i1_i8_i16:
-; GCN-DAG: buffer_load_ubyte [[LOAD_ARG1:v[0-9]+]], off, s[0:3], s32 offset:4{{$}}
-; VI-DAG: buffer_load_ushort [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s32 offset:8{{$}}
-; VI-DAG: buffer_load_ushort [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s32 offset:12{{$}}
-; VI-DAG: buffer_load_ushort [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s32 offset:16{{$}}
-
-; CI-DAG: buffer_load_dword [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s32 offset:8{{$}}
-; CI-DAG: buffer_load_dword [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s32 offset:12{{$}}
-; CI-DAG: buffer_load_dword [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s32 offset:16{{$}}
-
-; GCN-DAG: v_and_b32_e32 [[TRUNC_ARG1_I1:v[0-9]+]], 1, [[LOAD_ARG1]]
-; CI-DAG: v_cvt_f16_f32_e32 [[CVT_ARG4:v[0-9]+]], [[LOAD_ARG4]]
-
-; GCN: buffer_store_byte [[TRUNC_ARG1_I1]], off
-; GCN: buffer_store_byte [[LOAD_ARG2]], off
-; GCN: buffer_store_short [[LOAD_ARG3]], off
-; GFX89: buffer_store_short [[LOAD_ARG4]], off
-
-; CI: buffer_store_short [[CVT_ARG4]], off
 define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i16 %arg3, half %arg4) #0 {
+; CI-LABEL: void_func_v32i32_i1_i8_i16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:16
+; CI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_ubyte v16, off, s[0:3], s32 offset:4
+; CI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:8
+; CI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:12
+; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_cvt_f16_f32_e32 v19, v20
+; CI-NEXT:    v_and_b32_e32 v0, 1, v16
+; CI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_byte v17, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_short v18, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_short v19, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: void_func_v32i32_i1_i8_i16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_ubyte v20, off, s[0:3], s32 offset:4
+; VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_ushort v16, off, s[0:3], s32 offset:8
+; VI-NEXT:    buffer_load_ushort v17, off, s[0:3], s32 offset:12
+; VI-NEXT:    buffer_load_ushort v18, off, s[0:3], s32 offset:16
+; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    v_and_b32_e32 v0, 1, v20
+; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_byte v16, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_short v17, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_short v18, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: void_func_v32i32_i1_i8_i16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_ubyte v20, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_ushort v16, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    buffer_load_ushort v17, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    buffer_load_ushort v18, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v20
+; GFX9-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_byte v16, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_short v17, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_short v18, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v32i32_i1_i8_i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_clause 0x4
+; GFX11-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-NEXT:    scratch_load_u8 v32, off, s32 offset:4
+; GFX11-NEXT:    scratch_load_u16 v33, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_u16 v34, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_u16 v35, off, s32 offset:16
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-NEXT:    buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[20:23], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[16:19], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-NEXT:    v_and_b32_e32 v16, 1, v32
+; GFX11-NEXT:    buffer_store_b128 v[12:15], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v16, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-NEXT:    buffer_store_b8 v33, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    buffer_store_b16 v34, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_store_b16 v35, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store volatile <32 x i32> %arg0, ptr addrspace(1) undef
   store volatile i1 %arg1, ptr addrspace(1) undef
   store volatile i8 %arg2, ptr addrspace(1) undef
@@ -620,138 +2377,1136 @@ define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i1
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_v32i32_v2i32_v2f32:
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:12{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:16{{$}}
-
-; GCN: buffer_store_dwordx2 v[[[LOAD_ARG1_0]]:[[LOAD_ARG1_1]]], off
-; GCN: buffer_store_dwordx2 v[[[LOAD_ARG2_0]]:[[LOAD_ARG2_1]]], off
 define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2 x float> %arg2) #0 {
+; CI-LABEL: void_func_v32i32_v2i32_v2f32:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:8
+; CI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:4
+; CI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:16
+; CI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:12
+; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx2 v[16:17], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx2 v[18:19], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: void_func_v32i32_v2i32_v2f32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:8
+; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:4
+; VI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:16
+; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:12
+; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx2 v[16:17], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx2 v[18:19], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: void_func_v32i32_v2i32_v2f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx2 v[16:17], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx2 v[18:19], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v32i32_v2i32_v2f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_clause 0x4
+; GFX11-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:4
+; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:16
+; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:12
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-NEXT:    buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[20:23], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[16:19], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[12:15], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-NEXT:    buffer_store_b64 v[32:33], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_store_b64 v[34:35], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store volatile <32 x i32> %arg0, ptr addrspace(1) undef
   store volatile <2 x i32> %arg1, ptr addrspace(1) undef
   store volatile <2 x float> %arg2, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_v32i32_v2i16_v2f16:
-; GFX9-DAG: buffer_load_dword [[LOAD_ARG1:v[0-9]+]], off, s[0:3], s32 offset:4{{$}}
-; GFX9-DAG: buffer_load_dword [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s32 offset:8{{$}}
-; GFX9: buffer_store_dword [[LOAD_ARG1]], off
-; GFX9: buffer_store_short [[LOAD_ARG2]], off
 define void @void_func_v32i32_v2i16_v2f16(<32 x i32> %arg0, <2 x i16> %arg1, <2 x half> %arg2) #0 {
+; CI-LABEL: void_func_v32i32_v2i16_v2f16:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:12
+; CI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:16
+; CI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:8
+; CI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:4
+; CI-NEXT:    s_waitcnt vmcnt(2)
+; CI-NEXT:    v_cvt_f16_f32_e32 v16, v16
+; CI-NEXT:    v_cvt_f16_f32_e32 v19, v20
+; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_short v17, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_short v18, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_short v16, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_short v19, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: void_func_v32i32_v2i16_v2f16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:4
+; VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:8
+; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dword v20, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dword v16, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: void_func_v32i32_v2i16_v2f16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v20, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dword v16, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v32i32_v2i16_v2f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:4
+; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:8
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-NEXT:    buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[20:23], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[16:19], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[12:15], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    buffer_store_b32 v32, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_store_b32 v33, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store volatile <32 x i32> %arg0, ptr addrspace(1) undef
   store volatile <2 x i16> %arg1, ptr addrspace(1) undef
   store volatile <2 x half> %arg2, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_v32i32_v2i64_v2f64:
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:12{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s32 offset:16{{$}}
-
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:20{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:24{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:28{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:32{{$}}
-
-; GCN: buffer_store_dwordx4 v[[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]], off
-; GCN: buffer_store_dwordx4 v[[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]], off
 define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2 x double> %arg2) #0 {
+; CI-LABEL: void_func_v32i32_v2i64_v2f64:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:16
+; CI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:12
+; CI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:8
+; CI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:4
+; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:32
+; CI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:28
+; CI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:24
+; CI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:20
+; CI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: void_func_v32i32_v2i64_v2f64:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:16
+; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:12
+; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:8
+; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:4
+; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:32
+; VI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:28
+; VI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:24
+; VI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:20
+; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: void_func_v32i32_v2i64_v2f64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:32
+; GFX9-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:28
+; GFX9-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:24
+; GFX9-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v32i32_v2i64_v2f64:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_clause 0x8
+; GFX11-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:32
+; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:28
+; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:24
+; GFX11-NEXT:    scratch_load_b32 v39, off, s32 offset:16
+; GFX11-NEXT:    scratch_load_b32 v38, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_b32 v37, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_b32 v36, off, s32 offset:4
+; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:20
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-NEXT:    buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[20:23], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[16:19], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[12:15], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    buffer_store_b128 v[36:39], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_store_b128 v[32:35], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store volatile <32 x i32> %arg0, ptr addrspace(1) undef
   store volatile <2 x i64> %arg1, ptr addrspace(1) undef
   store volatile <2 x double> %arg2, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_v32i32_v4i32_v4f32:
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:12{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s32 offset:16{{$}}
-
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:20{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:24{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:28{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:32{{$}}
-
-; GCN: buffer_store_dwordx4 v[[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]], off
-; GCN: buffer_store_dwordx4 v[[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]], off
 define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) #0 {
+; CI-LABEL: void_func_v32i32_v4i32_v4f32:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:16
+; CI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:12
+; CI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:8
+; CI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:4
+; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:32
+; CI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:28
+; CI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:24
+; CI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:20
+; CI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: void_func_v32i32_v4i32_v4f32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:16
+; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:12
+; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:8
+; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:4
+; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:32
+; VI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:28
+; VI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:24
+; VI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:20
+; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: void_func_v32i32_v4i32_v4f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:32
+; GFX9-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:28
+; GFX9-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:24
+; GFX9-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v32i32_v4i32_v4f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_clause 0x8
+; GFX11-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:16
+; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:4
+; GFX11-NEXT:    scratch_load_b32 v39, off, s32 offset:32
+; GFX11-NEXT:    scratch_load_b32 v38, off, s32 offset:28
+; GFX11-NEXT:    scratch_load_b32 v37, off, s32 offset:24
+; GFX11-NEXT:    scratch_load_b32 v36, off, s32 offset:20
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-NEXT:    buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[20:23], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[16:19], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[12:15], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-NEXT:    buffer_store_b128 v[32:35], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_store_b128 v[36:39], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store volatile <32 x i32> %arg0, ptr addrspace(1) undef
   store volatile <4 x i32> %arg1, ptr addrspace(1) undef
   store volatile <4 x float> %arg2, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_v32i32_v8i32_v8f32:
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:8{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:12{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s32 offset:16{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_4:[0-9]+]], off, s[0:3], s32 offset:20{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_5:[0-9]+]], off, s[0:3], s32 offset:24{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_6:[0-9]+]], off, s[0:3], s32 offset:28{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_7:[0-9]+]], off, s[0:3], s32 offset:32{{$}}
-
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:36{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:40{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:44{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:48{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_4:[0-9]+]], off, s[0:3], s32 offset:52{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_5:[0-9]+]], off, s[0:3], s32 offset:56{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_6:[0-9]+]], off, s[0:3], s32 offset:60{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_7:[0-9]+]], off, s[0:3], s32 offset:64{{$}}
-
-; GCN: buffer_store_dwordx4 v[[[LOAD_ARG1_4]]:[[LOAD_ARG1_7]]], off
-; GCN: buffer_store_dwordx4 v[[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]], off
-; GCN: buffer_store_dwordx4 v[[[LOAD_ARG2_4]]:[[LOAD_ARG2_7]]], off
-; GCN: buffer_store_dwordx4 v[[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]], off
 define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 x float> %arg2) #0 {
+; CI-LABEL: void_func_v32i32_v8i32_v8f32:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:64
+; CI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:60
+; CI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:56
+; CI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:52
+; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:16
+; CI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:12
+; CI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:8
+; CI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:4
+; CI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:32
+; CI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:28
+; CI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:24
+; CI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:20
+; CI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:48
+; CI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:44
+; CI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:40
+; CI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:36
+; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: void_func_v32i32_v8i32_v8f32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:64
+; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:60
+; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:56
+; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:52
+; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:16
+; VI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:12
+; VI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:8
+; VI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:4
+; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:32
+; VI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:28
+; VI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:24
+; VI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:20
+; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:48
+; VI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:44
+; VI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:40
+; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:36
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: void_func_v32i32_v8i32_v8f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:64
+; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:60
+; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:56
+; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:52
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:32
+; GFX9-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:28
+; GFX9-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:24
+; GFX9-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:48
+; GFX9-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:44
+; GFX9-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:40
+; GFX9-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:36
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v32i32_v8i32_v8f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_clause 0x10
+; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:48
+; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:44
+; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:40
+; GFX11-NEXT:    scratch_load_b32 v39, off, s32 offset:64
+; GFX11-NEXT:    scratch_load_b32 v38, off, s32 offset:60
+; GFX11-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-NEXT:    scratch_load_b32 v37, off, s32 offset:56
+; GFX11-NEXT:    scratch_load_b32 v51, off, s32 offset:16
+; GFX11-NEXT:    scratch_load_b32 v50, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_b32 v49, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_b32 v55, off, s32 offset:32
+; GFX11-NEXT:    scratch_load_b32 v54, off, s32 offset:28
+; GFX11-NEXT:    scratch_load_b32 v53, off, s32 offset:24
+; GFX11-NEXT:    scratch_load_b32 v52, off, s32 offset:20
+; GFX11-NEXT:    scratch_load_b32 v48, off, s32 offset:4
+; GFX11-NEXT:    scratch_load_b32 v36, off, s32 offset:52
+; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:36
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-NEXT:    buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[20:23], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[16:19], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[12:15], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-NEXT:    buffer_store_b128 v[52:55], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-NEXT:    buffer_store_b128 v[48:51], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    buffer_store_b128 v[36:39], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_store_b128 v[32:35], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store volatile <32 x i32> %arg0, ptr addrspace(1) undef
   store volatile <8 x i32> %arg1, ptr addrspace(1) undef
   store volatile <8 x float> %arg2, ptr addrspace(1) undef
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_v32i32_v16i32_v16f32:
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s32{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s32 offset:4{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s32 offset:8{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_:[0-9]+]], off, s[0:3], s32 offset:12{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_4:[0-9]+]], off, s[0:3], s32 offset:16{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_5:[0-9]+]], off, s[0:3], s32 offset:20{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_6:[0-9]+]], off, s[0:3], s32 offset:24{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_7:[0-9]+]], off, s[0:3], s32 offset:28{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_8:[0-9]+]], off, s[0:3], s32 offset:32{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_9:[0-9]+]], off, s[0:3], s32 offset:36{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_10:[0-9]+]], off, s[0:3], s32 offset:40{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_11:[0-9]+]], off, s[0:3], s32 offset:44{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_12:[0-9]+]], off, s[0:3], s32 offset:48{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_13:[0-9]+]], off, s[0:3], s32 offset:52{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_14:[0-9]+]], off, s[0:3], s32 offset:56{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_15:[0-9]+]], off, s[0:3], s32 offset:60{{$}}
-
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s32 offset:64{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s32 offset:68{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s32 offset:72{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s32 offset:76{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_4:[0-9]+]], off, s[0:3], s32 offset:80{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_5:[0-9]+]], off, s[0:3], s32 offset:84{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_6:[0-9]+]], off, s[0:3], s32 offset:88{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_7:[0-9]+]], off, s[0:3], s32 offset:92{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_8:[0-9]+]], off, s[0:3], s32 offset:96{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_9:[0-9]+]], off, s[0:3], s32 offset:100{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_10:[0-9]+]], off, s[0:3], s32 offset:104{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_11:[0-9]+]], off, s[0:3], s32 offset:108{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_12:[0-9]+]], off, s[0:3], s32 offset:112{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_13:[0-9]+]], off, s[0:3], s32 offset:116{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_14:[0-9]+]], off, s[0:3], s32 offset:120{{$}}
-; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_15:[0-9]+]], off, s[0:3], s32 offset:124{{$}}
 define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, <16 x float> %arg2) #0 {
+; CI-LABEL: void_func_v32i32_v16i32_v16f32:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:64
+; CI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:60
+; CI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:56
+; CI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:52
+; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:48
+; CI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:44
+; CI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:40
+; CI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:36
+; CI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:32
+; CI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:28
+; CI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:24
+; CI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:20
+; CI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:96
+; CI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:92
+; CI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:88
+; CI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:84
+; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:112
+; CI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:108
+; CI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:104
+; CI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:100
+; CI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:128
+; CI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:124
+; CI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:120
+; CI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:116
+; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:16
+; CI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:12
+; CI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:8
+; CI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:4
+; CI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:80
+; CI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:76
+; CI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:72
+; CI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:68
+; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: void_func_v32i32_v16i32_v16f32:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:64
+; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:60
+; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:56
+; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:52
+; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:48
+; VI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:44
+; VI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:40
+; VI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:36
+; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:32
+; VI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:28
+; VI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:24
+; VI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:20
+; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:96
+; VI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:92
+; VI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:88
+; VI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:84
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:112
+; VI-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:108
+; VI-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:104
+; VI-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:100
+; VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:128
+; VI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:124
+; VI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:120
+; VI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:116
+; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:16
+; VI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:12
+; VI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:8
+; VI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:4
+; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:80
+; VI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:76
+; VI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:72
+; VI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:68
+; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: void_func_v32i32_v16i32_v16f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:64
+; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:60
+; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:56
+; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:52
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:48
+; GFX9-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:44
+; GFX9-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:40
+; GFX9-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:36
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:32
+; GFX9-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:28
+; GFX9-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:24
+; GFX9-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:96
+; GFX9-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:92
+; GFX9-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:88
+; GFX9-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:84
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v3, off, s[0:3], s32 offset:112
+; GFX9-NEXT:    buffer_load_dword v2, off, s[0:3], s32 offset:108
+; GFX9-NEXT:    buffer_load_dword v1, off, s[0:3], s32 offset:104
+; GFX9-NEXT:    buffer_load_dword v0, off, s[0:3], s32 offset:100
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:128
+; GFX9-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:124
+; GFX9-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:120
+; GFX9-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:116
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:80
+; GFX9-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:76
+; GFX9-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:72
+; GFX9-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:68
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v32i32_v16i32_v16f32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_clause 0x20
+; GFX11-NEXT:    scratch_load_b32 v35, off, s32 offset:80
+; GFX11-NEXT:    scratch_load_b32 v34, off, s32 offset:76
+; GFX11-NEXT:    scratch_load_b32 v33, off, s32 offset:72
+; GFX11-NEXT:    scratch_load_b32 v39, off, s32 offset:96
+; GFX11-NEXT:    scratch_load_b32 v38, off, s32 offset:92
+; GFX11-NEXT:    scratch_load_b32 v37, off, s32 offset:88
+; GFX11-NEXT:    scratch_load_b32 v51, off, s32 offset:112
+; GFX11-NEXT:    scratch_load_b32 v50, off, s32 offset:108
+; GFX11-NEXT:    scratch_load_b32 v49, off, s32 offset:104
+; GFX11-NEXT:    scratch_load_b32 v55, off, s32 offset:128
+; GFX11-NEXT:    scratch_load_b32 v54, off, s32 offset:124
+; GFX11-NEXT:    scratch_load_b32 v53, off, s32 offset:120
+; GFX11-NEXT:    scratch_load_b32 v67, off, s32 offset:16
+; GFX11-NEXT:    scratch_load_b32 v66, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_b32 v65, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_b32 v71, off, s32 offset:32
+; GFX11-NEXT:    scratch_load_b32 v70, off, s32 offset:28
+; GFX11-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-NEXT:    scratch_load_b32 v69, off, s32 offset:24
+; GFX11-NEXT:    scratch_load_b32 v83, off, s32 offset:48
+; GFX11-NEXT:    scratch_load_b32 v82, off, s32 offset:44
+; GFX11-NEXT:    scratch_load_b32 v81, off, s32 offset:40
+; GFX11-NEXT:    scratch_load_b32 v87, off, s32 offset:64
+; GFX11-NEXT:    scratch_load_b32 v86, off, s32 offset:60
+; GFX11-NEXT:    scratch_load_b32 v85, off, s32 offset:56
+; GFX11-NEXT:    scratch_load_b32 v84, off, s32 offset:52
+; GFX11-NEXT:    scratch_load_b32 v80, off, s32 offset:36
+; GFX11-NEXT:    scratch_load_b32 v68, off, s32 offset:20
+; GFX11-NEXT:    scratch_load_b32 v64, off, s32 offset:4
+; GFX11-NEXT:    scratch_load_b32 v52, off, s32 offset:116
+; GFX11-NEXT:    scratch_load_b32 v48, off, s32 offset:100
+; GFX11-NEXT:    scratch_load_b32 v36, off, s32 offset:84
+; GFX11-NEXT:    scratch_load_b32 v32, off, s32 offset:68
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_waitcnt vmcnt(15)
+; GFX11-NEXT:    buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[20:23], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[16:19], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[12:15], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-NEXT:    buffer_store_b128 v[84:87], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-NEXT:    buffer_store_b128 v[80:83], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-NEXT:    buffer_store_b128 v[68:71], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-NEXT:    buffer_store_b128 v[64:67], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-NEXT:    buffer_store_b128 v[52:55], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-NEXT:    buffer_store_b128 v[48:51], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    buffer_store_b128 v[36:39], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_store_b128 v[32:35], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store volatile <32 x i32> %arg0, ptr addrspace(1) undef
   store volatile <16 x i32> %arg1, ptr addrspace(1) undef
   store volatile <16 x float> %arg2, ptr addrspace(1) undef
@@ -759,15 +3514,49 @@ define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1,
 }
 
 ; Make sure v3 isn't a wasted register because of v3 types being promoted to v4
-; GCN-LABEL: {{^}}void_func_v3f32_wasted_reg:
-; GCN: s_waitcnt
-; GCN: ds_write_b32 v{{[0-9]+}}, v0
-; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v1
-; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v2
-; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v3
-; GCN-NEXT: s_waitcnt
-; GCN-NEXT: s_setpc_b64
 define void @void_func_v3f32_wasted_reg(<3 x float> %arg0, i32 %arg1) #0 {
+; CI-LABEL: void_func_v3f32_wasted_reg:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    ds_write_b32 v0, v0
+; CI-NEXT:    ds_write_b32 v0, v1
+; CI-NEXT:    ds_write_b32 v0, v2
+; CI-NEXT:    ds_write_b32 v0, v3
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: void_func_v3f32_wasted_reg:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    ds_write_b32 v0, v0
+; VI-NEXT:    ds_write_b32 v0, v1
+; VI-NEXT:    ds_write_b32 v0, v2
+; VI-NEXT:    ds_write_b32 v0, v3
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: void_func_v3f32_wasted_reg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ds_write_b32 v0, v0
+; GFX9-NEXT:    ds_write_b32 v0, v1
+; GFX9-NEXT:    ds_write_b32 v0, v2
+; GFX9-NEXT:    ds_write_b32 v0, v3
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v3f32_wasted_reg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    ds_store_b32 v0, v0
+; GFX11-NEXT:    ds_store_b32 v0, v1
+; GFX11-NEXT:    ds_store_b32 v0, v2
+; GFX11-NEXT:    ds_store_b32 v0, v3
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %arg0.0 = extractelement <3 x float> %arg0, i32 0
   %arg0.1 = extractelement <3 x float> %arg0, i32 1
   %arg0.2 = extractelement <3 x float> %arg0, i32 2
@@ -778,15 +3567,49 @@ define void @void_func_v3f32_wasted_reg(<3 x float> %arg0, i32 %arg1) #0 {
   ret void
 }
 
-; GCN-LABEL: {{^}}void_func_v3i32_wasted_reg:
-; GCN: s_waitcnt
-; GCN: ds_write_b32 v{{[0-9]+}}, v0
-; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v1
-; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v2
-; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v3
-; GCN-NEXT: s_waitcnt
-; GCN-NEXT: s_setpc_b64
 define void @void_func_v3i32_wasted_reg(<3 x i32> %arg0, i32 %arg1) #0 {
+; CI-LABEL: void_func_v3i32_wasted_reg:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    ds_write_b32 v0, v0
+; CI-NEXT:    ds_write_b32 v0, v1
+; CI-NEXT:    ds_write_b32 v0, v2
+; CI-NEXT:    ds_write_b32 v0, v3
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: void_func_v3i32_wasted_reg:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    ds_write_b32 v0, v0
+; VI-NEXT:    ds_write_b32 v0, v1
+; VI-NEXT:    ds_write_b32 v0, v2
+; VI-NEXT:    ds_write_b32 v0, v3
+; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: void_func_v3i32_wasted_reg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ds_write_b32 v0, v0
+; GFX9-NEXT:    ds_write_b32 v0, v1
+; GFX9-NEXT:    ds_write_b32 v0, v2
+; GFX9-NEXT:    ds_write_b32 v0, v3
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v3i32_wasted_reg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    ds_store_b32 v0, v0
+; GFX11-NEXT:    ds_store_b32 v0, v1
+; GFX11-NEXT:    ds_store_b32 v0, v2
+; GFX11-NEXT:    ds_store_b32 v0, v3
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %arg0.0 = extractelement <3 x i32> %arg0, i32 0
   %arg0.1 = extractelement <3 x i32> %arg0, i32 1
   %arg0.2 = extractelement <3 x i32> %arg0, i32 2
@@ -798,15 +3621,404 @@ define void @void_func_v3i32_wasted_reg(<3 x i32> %arg0, i32 %arg1) #0 {
 }
 
 ; Check there is no crash.
-; GCN-LABEL: {{^}}void_func_v16i8:
 define void @void_func_v16i8(<16 x i8> %arg0) #0 {
+; CIGFX89-LABEL: void_func_v16i8:
+; CIGFX89:       ; %bb.0:
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT:    s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT:    s_mov_b32 s6, -1
+; CIGFX89-NEXT:    buffer_store_byte v15, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    buffer_store_byte v14, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    buffer_store_byte v13, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    buffer_store_byte v12, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    buffer_store_byte v11, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    buffer_store_byte v10, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    buffer_store_byte v9, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    buffer_store_byte v8, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    buffer_store_byte v7, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    buffer_store_byte v6, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    buffer_store_byte v5, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    buffer_store_byte v4, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    buffer_store_byte v3, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    buffer_store_byte v2, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    buffer_store_byte v1, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    buffer_store_byte v0, off, s[4:7], 0
+; CIGFX89-NEXT:    s_waitcnt vmcnt(0)
+; CIGFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v16i8:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_store_b8 v15, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v14, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v13, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v12, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v11, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v10, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v9, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v8, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v7, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v6, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v5, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v4, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v3, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v2, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v1, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b8 v0, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store volatile <16 x i8> %arg0, ptr addrspace(1) undef
   ret void
 }
 
 ; Check there is no crash.
-; GCN-LABEL: {{^}}void_func_v32i32_v16i8:
 define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {
+; CI-LABEL: void_func_v32i32_v16i8:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:64
+; CI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:48
+; CI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:52
+; CI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:56
+; CI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_dword v12, off, s[0:3], s32 offset:36
+; CI-NEXT:    buffer_load_dword v13, off, s[0:3], s32 offset:40
+; CI-NEXT:    buffer_load_dword v14, off, s[0:3], s32 offset:44
+; CI-NEXT:    buffer_load_dword v15, off, s[0:3], s32 offset:28
+; CI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:32
+; CI-NEXT:    buffer_load_dword v9, off, s[0:3], s32 offset:20
+; CI-NEXT:    buffer_load_dword v10, off, s[0:3], s32 offset:24
+; CI-NEXT:    buffer_load_dword v11, off, s[0:3], s32 offset:16
+; CI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_load_dword v4, off, s[0:3], s32 offset:12
+; CI-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:8
+; CI-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:4
+; CI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:60
+; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_byte v16, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_byte v20, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_byte v19, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_byte v18, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_byte v17, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_byte v14, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_byte v13, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_byte v12, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_byte v8, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_byte v15, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_byte v10, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_byte v9, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_byte v11, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_byte v4, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_byte v5, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    buffer_store_byte v6, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: void_func_v32i32_v16i8:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; VI-NEXT:    s_mov_b32 s7, 0xf000
+; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_ubyte v16, off, s[0:3], s32 offset:64
+; VI-NEXT:    buffer_load_ubyte v17, off, s[0:3], s32 offset:48
+; VI-NEXT:    buffer_load_ubyte v18, off, s[0:3], s32 offset:52
+; VI-NEXT:    buffer_load_ubyte v19, off, s[0:3], s32 offset:56
+; VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_ubyte v12, off, s[0:3], s32 offset:36
+; VI-NEXT:    buffer_load_ubyte v13, off, s[0:3], s32 offset:40
+; VI-NEXT:    buffer_load_ubyte v14, off, s[0:3], s32 offset:44
+; VI-NEXT:    buffer_load_ubyte v15, off, s[0:3], s32 offset:28
+; VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_ubyte v8, off, s[0:3], s32 offset:32
+; VI-NEXT:    buffer_load_ubyte v9, off, s[0:3], s32 offset:20
+; VI-NEXT:    buffer_load_ubyte v10, off, s[0:3], s32 offset:24
+; VI-NEXT:    buffer_load_ubyte v11, off, s[0:3], s32 offset:16
+; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_load_ubyte v4, off, s[0:3], s32 offset:12
+; VI-NEXT:    buffer_load_ubyte v5, off, s[0:3], s32 offset:8
+; VI-NEXT:    buffer_load_ubyte v6, off, s[0:3], s32 offset:4
+; VI-NEXT:    buffer_load_ubyte v20, off, s[0:3], s32 offset:60
+; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_byte v16, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_byte v20, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_byte v19, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_byte v18, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_byte v17, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_byte v14, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_byte v13, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_byte v12, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_byte v8, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_byte v15, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_byte v10, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_byte v9, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_byte v11, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_byte v4, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_byte v5, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    buffer_store_byte v6, off, s[4:7], 0
+; VI-NEXT:    s_waitcnt vmcnt(0)
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: void_func_v32i32_v16i8:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[28:31], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[24:27], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[20:23], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_dwordx4 v[16:19], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_ubyte v16, off, s[0:3], s32 offset:64
+; GFX9-NEXT:    buffer_load_ubyte v17, off, s[0:3], s32 offset:48
+; GFX9-NEXT:    buffer_load_ubyte v18, off, s[0:3], s32 offset:52
+; GFX9-NEXT:    buffer_load_ubyte v19, off, s[0:3], s32 offset:56
+; GFX9-NEXT:    buffer_load_ubyte v20, off, s[0:3], s32 offset:60
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[12:15], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_ubyte v12, off, s[0:3], s32 offset:36
+; GFX9-NEXT:    buffer_load_ubyte v13, off, s[0:3], s32 offset:40
+; GFX9-NEXT:    buffer_load_ubyte v14, off, s[0:3], s32 offset:44
+; GFX9-NEXT:    buffer_load_ubyte v15, off, s[0:3], s32 offset:28
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[8:11], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_ubyte v8, off, s[0:3], s32 offset:32
+; GFX9-NEXT:    buffer_load_ubyte v9, off, s[0:3], s32 offset:20
+; GFX9-NEXT:    buffer_load_ubyte v10, off, s[0:3], s32 offset:24
+; GFX9-NEXT:    buffer_load_ubyte v11, off, s[0:3], s32 offset:16
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_load_ubyte v4, off, s[0:3], s32 offset:12
+; GFX9-NEXT:    buffer_load_ubyte v5, off, s[0:3], s32 offset:8
+; GFX9-NEXT:    buffer_load_ubyte v6, off, s[0:3], s32 offset:4
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_byte v16, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_byte v20, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_byte v19, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_byte v18, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_byte v17, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_byte v14, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_byte v13, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_byte v12, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_byte v8, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_byte v15, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_byte v10, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_byte v9, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_byte v11, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_byte v4, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_byte v5, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    buffer_store_byte v6, off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_v32i32_v16i8:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_clause 0x10
+; GFX11-NEXT:    scratch_load_b32 v31, off, s32
+; GFX11-NEXT:    scratch_load_u8 v32, off, s32 offset:64
+; GFX11-NEXT:    scratch_load_u8 v33, off, s32 offset:60
+; GFX11-NEXT:    scratch_load_u8 v34, off, s32 offset:56
+; GFX11-NEXT:    scratch_load_u8 v35, off, s32 offset:52
+; GFX11-NEXT:    scratch_load_u8 v36, off, s32 offset:48
+; GFX11-NEXT:    scratch_load_u8 v37, off, s32 offset:44
+; GFX11-NEXT:    scratch_load_u8 v38, off, s32 offset:40
+; GFX11-NEXT:    scratch_load_u8 v39, off, s32 offset:36
+; GFX11-NEXT:    scratch_load_u8 v48, off, s32 offset:32
+; GFX11-NEXT:    scratch_load_u8 v49, off, s32 offset:28
+; GFX11-NEXT:    scratch_load_u8 v50, off, s32 offset:24
+; GFX11-NEXT:    scratch_load_u8 v51, off, s32 offset:20
+; GFX11-NEXT:    scratch_load_u8 v52, off, s32 offset:16
+; GFX11-NEXT:    scratch_load_u8 v53, off, s32 offset:12
+; GFX11-NEXT:    scratch_load_u8 v54, off, s32 offset:8
+; GFX11-NEXT:    scratch_load_u8 v55, off, s32 offset:4
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_waitcnt vmcnt(16)
+; GFX11-NEXT:    buffer_store_b128 v[28:31], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[24:27], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[20:23], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[16:19], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[12:15], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(15)
+; GFX11-NEXT:    buffer_store_b8 v32, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(14)
+; GFX11-NEXT:    buffer_store_b8 v33, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(13)
+; GFX11-NEXT:    buffer_store_b8 v34, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(12)
+; GFX11-NEXT:    buffer_store_b8 v35, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(11)
+; GFX11-NEXT:    buffer_store_b8 v36, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(10)
+; GFX11-NEXT:    buffer_store_b8 v37, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(9)
+; GFX11-NEXT:    buffer_store_b8 v38, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-NEXT:    buffer_store_b8 v39, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-NEXT:    buffer_store_b8 v48, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-NEXT:    buffer_store_b8 v49, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-NEXT:    buffer_store_b8 v50, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-NEXT:    buffer_store_b8 v51, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-NEXT:    buffer_store_b8 v52, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-NEXT:    buffer_store_b8 v53, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    buffer_store_b8 v54, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_store_b8 v55, off, s[0:3], 0 dlc
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   store volatile <32 x i32> %arg0, ptr addrspace(1) undef
   store volatile <16 x i8> %arg1, ptr addrspace(1) undef
   ret void

diff  --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll
index 5dc2d127379d1..e82fb7df0c08b 100644
--- a/llvm/test/CodeGen/AMDGPU/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -1,464 +1,1345 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=hawaii -verify-machineinstrs < %s | FileCheck  -enable-var-scope -check-prefixes=GCN,CI %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89 %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9 %s
-
-; GCN-LABEL: {{^}}i1_func_void:
-; GCN: buffer_load_ubyte v0, off
-; GCN-NEXT: s_waitcnt
-; GCN-NEXT: s_setpc_b64
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX789,CI %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX789,GFX89,GFX8 %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX789,GFX89,GFX9 %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+
 define i1 @i1_func_void() #0 {
+; GFX789-LABEL: i1_func_void:
+; GFX789:       ; %bb.0:
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX789-NEXT:    s_mov_b32 s7, 0xf000
+; GFX789-NEXT:    s_mov_b32 s6, -1
+; GFX789-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
+; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: i1_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_load_u8 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load i1, ptr addrspace(1) undef
   ret i1 %val
 }
 
 ; FIXME: Missing and?
-; GCN-LABEL: {{^}}i1_zeroext_func_void:
-; GCN: buffer_load_ubyte v0, off
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64
 define zeroext i1 @i1_zeroext_func_void() #0 {
+; GFX789-LABEL: i1_zeroext_func_void:
+; GFX789:       ; %bb.0:
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX789-NEXT:    s_mov_b32 s7, 0xf000
+; GFX789-NEXT:    s_mov_b32 s6, -1
+; GFX789-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
+; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: i1_zeroext_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_load_u8 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load i1, ptr addrspace(1) undef
   ret i1 %val
 }
 
-; GCN-LABEL: {{^}}i1_signext_func_void:
-; GCN: buffer_load_ubyte v0, off
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_bfe_i32 v0, v0, 0, 1{{$}}
-; GCN-NEXT: s_setpc_b64
 define signext i1 @i1_signext_func_void() #0 {
+; GFX789-LABEL: i1_signext_func_void:
+; GFX789:       ; %bb.0:
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX789-NEXT:    s_mov_b32 s7, 0xf000
+; GFX789-NEXT:    s_mov_b32 s6, -1
+; GFX789-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
+; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    v_bfe_i32 v0, v0, 0, 1
+; GFX789-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: i1_signext_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_load_u8 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_bfe_i32 v0, v0, 0, 1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load i1, ptr addrspace(1) undef
   ret i1 %val
 }
 
-; GCN-LABEL: {{^}}i8_func_void:
-; GCN: buffer_load_ubyte v0, off
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64
 define i8 @i8_func_void() #0 {
+; GFX789-LABEL: i8_func_void:
+; GFX789:       ; %bb.0:
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX789-NEXT:    s_mov_b32 s7, 0xf000
+; GFX789-NEXT:    s_mov_b32 s6, -1
+; GFX789-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
+; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: i8_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_load_u8 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load i8, ptr addrspace(1) undef
   ret i8 %val
 }
 
-; GCN-LABEL: {{^}}i8_zeroext_func_void:
-; GCN: buffer_load_ubyte v0, off
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64
 define zeroext i8 @i8_zeroext_func_void() #0 {
+; GFX789-LABEL: i8_zeroext_func_void:
+; GFX789:       ; %bb.0:
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX789-NEXT:    s_mov_b32 s7, 0xf000
+; GFX789-NEXT:    s_mov_b32 s6, -1
+; GFX789-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
+; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: i8_zeroext_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_load_u8 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load i8, ptr addrspace(1) undef
   ret i8 %val
 }
 
-; GCN-LABEL: {{^}}i8_signext_func_void:
-; GCN: buffer_load_sbyte v0, off
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64
 define signext i8 @i8_signext_func_void() #0 {
+; GFX789-LABEL: i8_signext_func_void:
+; GFX789:       ; %bb.0:
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX789-NEXT:    s_mov_b32 s7, 0xf000
+; GFX789-NEXT:    s_mov_b32 s6, -1
+; GFX789-NEXT:    buffer_load_sbyte v0, off, s[4:7], 0
+; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: i8_signext_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_load_i8 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load i8, ptr addrspace(1) undef
   ret i8 %val
 }
 
-; GCN-LABEL: {{^}}i16_func_void:
-; GCN: buffer_load_ushort v0, off
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64
 define i16 @i16_func_void() #0 {
+; GFX789-LABEL: i16_func_void:
+; GFX789:       ; %bb.0:
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX789-NEXT:    s_mov_b32 s7, 0xf000
+; GFX789-NEXT:    s_mov_b32 s6, -1
+; GFX789-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: i16_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_load_u16 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load i16, ptr addrspace(1) undef
   ret i16 %val
 }
 
-; GCN-LABEL: {{^}}i16_zeroext_func_void:
-; GCN: buffer_load_ushort v0, off
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64
 define zeroext i16 @i16_zeroext_func_void() #0 {
+; GFX789-LABEL: i16_zeroext_func_void:
+; GFX789:       ; %bb.0:
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX789-NEXT:    s_mov_b32 s7, 0xf000
+; GFX789-NEXT:    s_mov_b32 s6, -1
+; GFX789-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
+; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: i16_zeroext_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_load_u16 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load i16, ptr addrspace(1) undef
   ret i16 %val
 }
 
-; GCN-LABEL: {{^}}i16_signext_func_void:
-; GCN: buffer_load_sshort v0, off
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64
 define signext i16 @i16_signext_func_void() #0 {
+; GFX789-LABEL: i16_signext_func_void:
+; GFX789:       ; %bb.0:
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX789-NEXT:    s_mov_b32 s7, 0xf000
+; GFX789-NEXT:    s_mov_b32 s6, -1
+; GFX789-NEXT:    buffer_load_sshort v0, off, s[4:7], 0
+; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: i16_signext_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_load_i16 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load i16, ptr addrspace(1) undef
   ret i16 %val
 }
 
-; GCN-LABEL: {{^}}i32_func_void:
-; GCN: buffer_load_dword v0, off
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64
 define i32 @i32_func_void() #0 {
+; GFX789-LABEL: i32_func_void:
+; GFX789:       ; %bb.0:
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX789-NEXT:    s_mov_b32 s7, 0xf000
+; GFX789-NEXT:    s_mov_b32 s6, -1
+; GFX789-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: i32_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load i32, ptr addrspace(1) undef
   ret i32 %val
 }
 
-; GCN-LABEL: {{^}}i48_func_void:
-; GCN: buffer_load_dword v0, off
-; GCN-NEXT: buffer_load_ushort v1, off
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64
 define i48 @i48_func_void() #0 {
+; GFX789-LABEL: i48_func_void:
+; GFX789:       ; %bb.0:
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX789-NEXT:    s_mov_b32 s7, 0xf000
+; GFX789-NEXT:    s_mov_b32 s6, -1
+; GFX789-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GFX789-NEXT:    buffer_load_ushort v1, off, s[4:7], 0
+; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: i48_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    buffer_load_u16 v1, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load i48, ptr addrspace(1) undef, align 8
   ret i48 %val
 }
 
-; GCN-LABEL: {{^}}i48_zeroext_func_void:
-; GCN: buffer_load_dword v0, off
-; GCN-NEXT: buffer_load_ushort v1, off
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64
 define zeroext i48 @i48_zeroext_func_void() #0 {
+; GFX789-LABEL: i48_zeroext_func_void:
+; GFX789:       ; %bb.0:
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX789-NEXT:    s_mov_b32 s7, 0xf000
+; GFX789-NEXT:    s_mov_b32 s6, -1
+; GFX789-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GFX789-NEXT:    buffer_load_ushort v1, off, s[4:7], 0
+; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: i48_zeroext_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    buffer_load_u16 v1, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load i48, ptr addrspace(1) undef, align 8
   ret i48 %val
 }
 
-; GCN-LABEL: {{^}}i48_signext_func_void:
-; GCN: buffer_load_dword v0, off
-; GCN-NEXT: buffer_load_sshort v1, off
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64
 define signext i48 @i48_signext_func_void() #0 {
+; GFX789-LABEL: i48_signext_func_void:
+; GFX789:       ; %bb.0:
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX789-NEXT:    s_mov_b32 s7, 0xf000
+; GFX789-NEXT:    s_mov_b32 s6, -1
+; GFX789-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GFX789-NEXT:    buffer_load_sshort v1, off, s[4:7], 0
+; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: i48_signext_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    buffer_load_i16 v1, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load i48, ptr addrspace(1) undef, align 8
   ret i48 %val
 }
 
-; GCN-LABEL: {{^}}i63_func_void:
-; GCN: s_waitcnt
-; GCN-NEXT: s_setpc_b64
 define i63 @i63_func_void(i63 %val) #0 {
+; GFX789-LABEL: i63_func_void:
+; GFX789:       ; %bb.0:
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX789-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: i63_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   ret i63 %val
 }
 
-; GCN-LABEL: {{^}}i63_zeroext_func_void:
-; GCN: s_waitcnt
-; GCN-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1
-; GCN-NEXT: s_setpc_b64
 define zeroext i63 @i63_zeroext_func_void(i63 %val) #0 {
+; GFX789-LABEL: i63_zeroext_func_void:
+; GFX789:       ; %bb.0:
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX789-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
+; GFX789-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: i63_zeroext_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   ret i63 %val
 }
 
-; GCN-LABEL: {{^}}i63_signext_func_void:
-; GCN: s_waitcnt
-; CI-NEXT:	v_lshl_b64 v[0:1], v[0:1], 1
-; CI-NEXT: v_ashr_i64 v[0:1], v[0:1], 1
-
-; GFX89-NEXT:	v_lshlrev_b64 v[0:1], 1, v[0:1]
-; GFX89-NEXT: v_ashrrev_i64 v[0:1], 1, v[0:1]
-
-; GCN-NEXT: s_setpc_b64
 define signext i63 @i63_signext_func_void(i63 %val) #0 {
+; CI-LABEL: i63_signext_func_void:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_lshl_b64 v[0:1], v[0:1], 1
+; CI-NEXT:    v_ashr_i64 v[0:1], v[0:1], 1
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: i63_signext_func_void:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GFX89-NEXT:    v_ashrrev_i64 v[0:1], 1, v[0:1]
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: i63_signext_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_ashrrev_i64 v[0:1], 1, v[0:1]
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   ret i63 %val
 }
 
-; GCN-LABEL: {{^}}i64_func_void:
-; GCN: buffer_load_dwordx2 v[0:1], off
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64
 define i64 @i64_func_void() #0 {
+; GFX789-LABEL: i64_func_void:
+; GFX789:       ; %bb.0:
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX789-NEXT:    s_mov_b32 s7, 0xf000
+; GFX789-NEXT:    s_mov_b32 s6, -1
+; GFX789-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: i64_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load i64, ptr addrspace(1) undef
   ret i64 %val
 }
 
-; GCN-LABEL: {{^}}i65_func_void:
-; GCN-DAG: buffer_load_dwordx2 v[0:1], off
-; GCN-DAG: buffer_load_ubyte v2, off
-; GCN: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64
 define i65 @i65_func_void() #0 {
+; GFX789-LABEL: i65_func_void:
+; GFX789:       ; %bb.0:
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX789-NEXT:    s_mov_b32 s7, 0xf000
+; GFX789-NEXT:    s_mov_b32 s6, -1
+; GFX789-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GFX789-NEXT:    buffer_load_ubyte v2, off, s[4:7], 0
+; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: i65_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT:    buffer_load_u8 v2, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load i65, ptr addrspace(1) undef
   ret i65 %val
 }
 
-; GCN-LABEL: {{^}}f32_func_void:
-; GCN: buffer_load_dword v0, off, s[4:7], 0
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64
 define float @f32_func_void() #0 {
+; GFX789-LABEL: f32_func_void:
+; GFX789:       ; %bb.0:
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX789-NEXT:    s_mov_b32 s7, 0xf000
+; GFX789-NEXT:    s_mov_b32 s6, -1
+; GFX789-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: f32_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load float, ptr addrspace(1) undef
   ret float %val
 }
 
-; GCN-LABEL: {{^}}f64_func_void:
-; GCN: buffer_load_dwordx2 v[0:1], off
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64
 define double @f64_func_void() #0 {
+; GFX789-LABEL: f64_func_void:
+; GFX789:       ; %bb.0:
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX789-NEXT:    s_mov_b32 s7, 0xf000
+; GFX789-NEXT:    s_mov_b32 s6, -1
+; GFX789-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: f64_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load double, ptr addrspace(1) undef
   ret double %val
 }
 
-; GCN-LABEL: {{^}}v2f64_func_void:
-; GCN: buffer_load_dwordx4 v[0:3], off
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64
 define <2 x double> @v2f64_func_void() #0 {
+; GFX789-LABEL: v2f64_func_void:
+; GFX789:       ; %bb.0:
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX789-NEXT:    s_mov_b32 s7, 0xf000
+; GFX789-NEXT:    s_mov_b32 s6, -1
+; GFX789-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v2f64_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load <2 x double>, ptr addrspace(1) undef
   ret <2 x double> %val
 }
 
-; GCN-LABEL: {{^}}v2i32_func_void:
-; GCN: buffer_load_dwordx2 v[0:1], off
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64
 define <2 x i32> @v2i32_func_void() #0 {
+; GFX789-LABEL: v2i32_func_void:
+; GFX789:       ; %bb.0:
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX789-NEXT:    s_mov_b32 s7, 0xf000
+; GFX789-NEXT:    s_mov_b32 s6, -1
+; GFX789-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v2i32_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load <2 x i32>, ptr addrspace(1) undef
   ret <2 x i32> %val
 }
 
-; GCN-LABEL: {{^}}v3i32_func_void:
-; GCN: buffer_load_dwordx3 v[0:2], off
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64
 define <3 x i32> @v3i32_func_void() #0 {
+; GFX789-LABEL: v3i32_func_void:
+; GFX789:       ; %bb.0:
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX789-NEXT:    s_mov_b32 s7, 0xf000
+; GFX789-NEXT:    s_mov_b32 s6, -1
+; GFX789-NEXT:    buffer_load_dwordx3 v[0:2], off, s[4:7], 0
+; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v3i32_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_load_b96 v[0:2], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load <3 x i32>, ptr addrspace(1) undef
   ret <3 x i32> %val
 }
 
-; GCN-LABEL: {{^}}v4i32_func_void:
-; GCN: buffer_load_dwordx4 v[0:3], off
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64
 define <4 x i32> @v4i32_func_void() #0 {
+; GFX789-LABEL: v4i32_func_void:
+; GFX789:       ; %bb.0:
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX789-NEXT:    s_mov_b32 s7, 0xf000
+; GFX789-NEXT:    s_mov_b32 s6, -1
+; GFX789-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v4i32_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load <4 x i32>, ptr addrspace(1) undef
   ret <4 x i32> %val
 }
 
-; GCN-LABEL: {{^}}v5i32_func_void:
-; GCN-DAG: buffer_load_dword v4, off
-; GCN-DAG: buffer_load_dwordx4 v[0:3], off
-; GCN: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64
 define <5 x i32> @v5i32_func_void() #0 {
+; GFX789-LABEL: v5i32_func_void:
+; GFX789:       ; %bb.0:
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX789-NEXT:    s_mov_b32 s7, 0xf000
+; GFX789-NEXT:    s_mov_b32 s6, -1
+; GFX789-NEXT:    buffer_load_dword v4, off, s[4:7], 0 glc
+; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0 glc
+; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v5i32_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_load_b32 v4, off, s[0:3], 0 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load volatile <5 x i32>, ptr addrspace(1) undef
   ret <5 x i32> %val
 }
 
-; GCN-LABEL: {{^}}v8i32_func_void:
-; GCN-DAG: buffer_load_dwordx4 v[0:3], off
-; GCN-DAG: buffer_load_dwordx4 v[4:7], off
-; GCN: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64
 define <8 x i32> @v8i32_func_void() #0 {
+; GFX789-LABEL: v8i32_func_void:
+; GFX789:       ; %bb.0:
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX789-NEXT:    s_mov_b32 s7, 0xf000
+; GFX789-NEXT:    s_mov_b32 s6, -1
+; GFX789-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX789-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GFX789-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v8i32_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    buffer_load_b128 v[4:7], off, s[0:3], 0 offset:16
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
   %val = load <8 x i32>, ptr addrspace(1) %ptr
   ret <8 x i32> %val
 }
 
-; GCN-LABEL: {{^}}v16i32_func_void:
-; GCN-DAG: buffer_load_dwordx4 v[0:3], off
-; GCN-DAG: buffer_load_dwordx4 v[4:7], off
-; GCN-DAG: buffer_load_dwordx4 v[8:11], off
-; GCN-DAG: buffer_load_dwordx4 v[12:15], off
-; GCN: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64
 define <16 x i32> @v16i32_func_void() #0 {
+; GFX789-LABEL: v16i32_func_void:
+; GFX789:       ; %bb.0:
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX789-NEXT:    s_mov_b32 s7, 0xf000
+; GFX789-NEXT:    s_mov_b32 s6, -1
+; GFX789-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX789-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GFX789-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GFX789-NEXT:    buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32
+; GFX789-NEXT:    buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
+; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v16i32_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    buffer_load_b128 v[4:7], off, s[0:3], 0 offset:16
+; GFX11-NEXT:    buffer_load_b128 v[8:11], off, s[0:3], 0 offset:32
+; GFX11-NEXT:    buffer_load_b128 v[12:15], off, s[0:3], 0 offset:48
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
   %val = load <16 x i32>, ptr addrspace(1) %ptr
   ret <16 x i32> %val
 }
 
-; GCN-LABEL: {{^}}v32i32_func_void:
-; GCN-DAG: buffer_load_dwordx4 v[0:3], off
-; GCN-DAG: buffer_load_dwordx4 v[4:7], off
-; GCN-DAG: buffer_load_dwordx4 v[8:11], off
-; GCN-DAG: buffer_load_dwordx4 v[12:15], off
-; GCN-DAG: buffer_load_dwordx4 v[16:19], off
-; GCN-DAG: buffer_load_dwordx4 v[20:23], off
-; GCN-DAG: buffer_load_dwordx4 v[24:27], off
-; GCN-DAG: buffer_load_dwordx4 v[28:31], off
-; GCN: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64
 define <32 x i32> @v32i32_func_void() #0 {
+; GFX789-LABEL: v32i32_func_void:
+; GFX789:       ; %bb.0:
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX789-NEXT:    s_mov_b32 s7, 0xf000
+; GFX789-NEXT:    s_mov_b32 s6, -1
+; GFX789-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX789-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GFX789-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GFX789-NEXT:    buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32
+; GFX789-NEXT:    buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
+; GFX789-NEXT:    buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
+; GFX789-NEXT:    buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
+; GFX789-NEXT:    buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
+; GFX789-NEXT:    buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112
+; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v32i32_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x7
+; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    buffer_load_b128 v[4:7], off, s[0:3], 0 offset:16
+; GFX11-NEXT:    buffer_load_b128 v[8:11], off, s[0:3], 0 offset:32
+; GFX11-NEXT:    buffer_load_b128 v[12:15], off, s[0:3], 0 offset:48
+; GFX11-NEXT:    buffer_load_b128 v[16:19], off, s[0:3], 0 offset:64
+; GFX11-NEXT:    buffer_load_b128 v[20:23], off, s[0:3], 0 offset:80
+; GFX11-NEXT:    buffer_load_b128 v[24:27], off, s[0:3], 0 offset:96
+; GFX11-NEXT:    buffer_load_b128 v[28:31], off, s[0:3], 0 offset:112
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
   %val = load <32 x i32>, ptr addrspace(1) %ptr
   ret <32 x i32> %val
 }
 
-; GCN-LABEL: {{^}}v2i64_func_void:
-; GCN: buffer_load_dwordx4 v[0:3], off
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64
 define <2 x i64> @v2i64_func_void() #0 {
+; GFX789-LABEL: v2i64_func_void:
+; GFX789:       ; %bb.0:
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX789-NEXT:    s_mov_b32 s7, 0xf000
+; GFX789-NEXT:    s_mov_b32 s6, -1
+; GFX789-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v2i64_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load <2 x i64>, ptr addrspace(1) undef
   ret <2 x i64> %val
 }
 
-; GCN-LABEL: {{^}}v3i64_func_void:
-; GCN-DAG: buffer_load_dwordx4 v[0:3], off
-; GCN-DAG: buffer_load_dwordx2 v[4:5], off
-; GCN: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64
 define <3 x i64> @v3i64_func_void() #0 {
+; GFX789-LABEL: v3i64_func_void:
+; GFX789:       ; %bb.0:
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX789-NEXT:    s_mov_b32 s7, 0xf000
+; GFX789-NEXT:    s_mov_b32 s6, -1
+; GFX789-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX789-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GFX789-NEXT:    buffer_load_dwordx2 v[4:5], off, s[4:7], 0 offset:16
+; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v3i64_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    buffer_load_b64 v[4:5], off, s[0:3], 0 offset:16
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
   %val = load <3 x i64>, ptr addrspace(1) %ptr
   ret <3 x i64> %val
 }
 
-; GCN-LABEL: {{^}}v4i64_func_void:
-; GCN: buffer_load_dwordx4 v[0:3], off
-; GCN: buffer_load_dwordx4 v[4:7], off
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64
 define <4 x i64> @v4i64_func_void() #0 {
+; GFX789-LABEL: v4i64_func_void:
+; GFX789:       ; %bb.0:
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX789-NEXT:    s_mov_b32 s7, 0xf000
+; GFX789-NEXT:    s_mov_b32 s6, -1
+; GFX789-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX789-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GFX789-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v4i64_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    buffer_load_b128 v[4:7], off, s[0:3], 0 offset:16
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
   %val = load <4 x i64>, ptr addrspace(1) %ptr
   ret <4 x i64> %val
 }
 
-; GCN-LABEL: {{^}}v5i64_func_void:
-; GCN-DAG: buffer_load_dwordx4 v[0:3], off
-; GCN-DAG: buffer_load_dwordx4 v[4:7], off
-; GCN-DAG: buffer_load_dwordx2 v[8:9], off
-; GCN: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64
 define <5 x i64> @v5i64_func_void() #0 {
+; GFX789-LABEL: v5i64_func_void:
+; GFX789:       ; %bb.0:
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX789-NEXT:    s_mov_b32 s7, 0xf000
+; GFX789-NEXT:    s_mov_b32 s6, -1
+; GFX789-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX789-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GFX789-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GFX789-NEXT:    buffer_load_dwordx2 v[8:9], off, s[4:7], 0 offset:32
+; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v5i64_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    buffer_load_b128 v[4:7], off, s[0:3], 0 offset:16
+; GFX11-NEXT:    buffer_load_b64 v[8:9], off, s[0:3], 0 offset:32
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
   %val = load <5 x i64>, ptr addrspace(1) %ptr
   ret <5 x i64> %val
 }
 
-; GCN-LABEL: {{^}}v8i64_func_void:
-; GCN-DAG: buffer_load_dwordx4 v[0:3], off
-; GCN-DAG: buffer_load_dwordx4 v[4:7], off
-; GCN-DAG: buffer_load_dwordx4 v[8:11], off
-; GCN-DAG: buffer_load_dwordx4 v[12:15], off
-; GCN: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64
 define <8 x i64> @v8i64_func_void() #0 {
+; GFX789-LABEL: v8i64_func_void:
+; GFX789:       ; %bb.0:
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX789-NEXT:    s_mov_b32 s7, 0xf000
+; GFX789-NEXT:    s_mov_b32 s6, -1
+; GFX789-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX789-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GFX789-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GFX789-NEXT:    buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32
+; GFX789-NEXT:    buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
+; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v8i64_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x3
+; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    buffer_load_b128 v[4:7], off, s[0:3], 0 offset:16
+; GFX11-NEXT:    buffer_load_b128 v[8:11], off, s[0:3], 0 offset:32
+; GFX11-NEXT:    buffer_load_b128 v[12:15], off, s[0:3], 0 offset:48
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
   %val = load <8 x i64>, ptr addrspace(1) %ptr
   ret <8 x i64> %val
 }
 
-; GCN-LABEL: {{^}}v16i64_func_void:
-; GCN-DAG: buffer_load_dwordx4 v[0:3], off
-; GCN-DAG: buffer_load_dwordx4 v[4:7], off
-; GCN-DAG: buffer_load_dwordx4 v[8:11], off
-; GCN-DAG: buffer_load_dwordx4 v[12:15], off
-; GCN-DAG: buffer_load_dwordx4 v[16:19], off
-; GCN-DAG: buffer_load_dwordx4 v[20:23], off
-; GCN-DAG: buffer_load_dwordx4 v[24:27], off
-; GCN-DAG: buffer_load_dwordx4 v[28:31], off
-; GCN: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64
 define <16 x i64> @v16i64_func_void() #0 {
+; GFX789-LABEL: v16i64_func_void:
+; GFX789:       ; %bb.0:
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX789-NEXT:    s_mov_b32 s7, 0xf000
+; GFX789-NEXT:    s_mov_b32 s6, -1
+; GFX789-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX789-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GFX789-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GFX789-NEXT:    buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32
+; GFX789-NEXT:    buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
+; GFX789-NEXT:    buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
+; GFX789-NEXT:    buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
+; GFX789-NEXT:    buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
+; GFX789-NEXT:    buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112
+; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v16i64_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x7
+; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    buffer_load_b128 v[4:7], off, s[0:3], 0 offset:16
+; GFX11-NEXT:    buffer_load_b128 v[8:11], off, s[0:3], 0 offset:32
+; GFX11-NEXT:    buffer_load_b128 v[12:15], off, s[0:3], 0 offset:48
+; GFX11-NEXT:    buffer_load_b128 v[16:19], off, s[0:3], 0 offset:64
+; GFX11-NEXT:    buffer_load_b128 v[20:23], off, s[0:3], 0 offset:80
+; GFX11-NEXT:    buffer_load_b128 v[24:27], off, s[0:3], 0 offset:96
+; GFX11-NEXT:    buffer_load_b128 v[28:31], off, s[0:3], 0 offset:112
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
   %val = load <16 x i64>, ptr addrspace(1) %ptr
   ret <16 x i64> %val
 }
 
-; GCN-LABEL: {{^}}v2i16_func_void:
-; GFX9: buffer_load_dword v0, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64
 define <2 x i16> @v2i16_func_void() #0 {
+; CI-LABEL: v2i16_func_void:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v2i16_func_void:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v2i16_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load <2 x i16>, ptr addrspace(1) undef
   ret <2 x i16> %val
 }
 
-; GCN-LABEL: {{^}}v3i16_func_void:
-; GFX9: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64
 define <3 x i16> @v3i16_func_void() #0 {
+; CI-LABEL: v3i16_func_void:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    buffer_load_dwordx2 v[2:3], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_alignbit_b32 v1, v3, v2, 16
+; CI-NEXT:    v_mov_b32_e32 v0, v2
+; CI-NEXT:    v_mov_b32_e32 v2, v3
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v3i16_func_void:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v3i16_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load <3 x i16>, ptr addrspace(1) undef
   ret <3 x i16> %val
 }
 
-; GCN-LABEL: {{^}}v4i16_func_void:
-; GFX9: buffer_load_dwordx2 v[0:1], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64
 define <4 x i16> @v4i16_func_void() #0 {
+; CI-LABEL: v4i16_func_void:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; CI-NEXT:    v_mov_b32_e32 v2, v1
+; CI-NEXT:    v_mov_b32_e32 v1, v4
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v4i16_func_void:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v4i16_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load <4 x i16>, ptr addrspace(1) undef
   ret <4 x i16> %val
 }
 
-; GCN-LABEL: {{^}}v4f16_func_void:
-; GFX9: buffer_load_dwordx2 v[0:1], off
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64
 define <4 x half> @v4f16_func_void() #0 {
+; CI-LABEL: v4f16_func_void:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    buffer_load_dwordx2 v[3:4], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_cvt_f32_f16_e32 v0, v3
+; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
+; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v4
+; CI-NEXT:    v_cvt_f32_f16_e32 v2, v4
+; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v4f16_func_void:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v4f16_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_load_b64 v[0:1], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load <4 x half>, ptr addrspace(1) undef
   ret <4 x half> %val
 }
 
 ; FIXME: Mixing buffer and global
 ; FIXME: Should not scalarize
-; GCN-LABEL: {{^}}v5i16_func_void:
-; GFX9: buffer_load_dwordx4 v[0:3]
-; GFX9-NEXT: s_waitcnt
-; GFX9-NEXT: s_setpc_b64
 define <5 x i16> @v5i16_func_void() #0 {
+; CI-LABEL: v5i16_func_void:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_load_dwordx2 v[0:1], off, s[4:7], 0
+; CI-NEXT:    buffer_load_sshort v4, off, s[4:7], 0 offset:8
+; CI-NEXT:    s_waitcnt vmcnt(1)
+; CI-NEXT:    v_alignbit_b32 v5, v1, v0, 16
+; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; CI-NEXT:    v_mov_b32_e32 v2, v1
+; CI-NEXT:    v_mov_b32_e32 v1, v5
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v5i16_func_void:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX89-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v5i16_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
   %val = load <5 x i16>, ptr addrspace(1) %ptr
   ret <5 x i16> %val
 }
 
-; GCN-LABEL: {{^}}v8i16_func_void:
-; GFX9-DAG: buffer_load_dwordx4 v[0:3], off
-; GFX9: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64
 define <8 x i16> @v8i16_func_void() #0 {
+; CI-LABEL: v8i16_func_void:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_load_dwordx4 v[8:11], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v8
+; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v9
+; CI-NEXT:    v_lshrrev_b32_e32 v5, 16, v10
+; CI-NEXT:    v_lshrrev_b32_e32 v7, 16, v11
+; CI-NEXT:    v_mov_b32_e32 v0, v8
+; CI-NEXT:    v_mov_b32_e32 v2, v9
+; CI-NEXT:    v_mov_b32_e32 v4, v10
+; CI-NEXT:    v_mov_b32_e32 v6, v11
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v8i16_func_void:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX89-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v8i16_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
   %val = load <8 x i16>, ptr addrspace(1) %ptr
   ret <8 x i16> %val
 }
 
-; GCN-LABEL: {{^}}v16i16_func_void:
-; GFX9: buffer_load_dwordx4 v[0:3], off
-; GFX9: buffer_load_dwordx4 v[4:7], off
-; GFX9: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64
 define <16 x i16> @v16i16_func_void() #0 {
+; CI-LABEL: v16i16_func_void:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_load_dwordx4 v[22:25], off, s[4:7], 0
+; CI-NEXT:    buffer_load_dwordx4 v[18:21], off, s[4:7], 0 offset:16
+; CI-NEXT:    s_waitcnt vmcnt(1)
+; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v22
+; CI-NEXT:    v_lshrrev_b32_e32 v3, 16, v23
+; CI-NEXT:    v_lshrrev_b32_e32 v5, 16, v24
+; CI-NEXT:    v_lshrrev_b32_e32 v7, 16, v25
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_lshrrev_b32_e32 v9, 16, v18
+; CI-NEXT:    v_lshrrev_b32_e32 v11, 16, v19
+; CI-NEXT:    v_lshrrev_b32_e32 v13, 16, v20
+; CI-NEXT:    v_lshrrev_b32_e32 v15, 16, v21
+; CI-NEXT:    v_mov_b32_e32 v0, v22
+; CI-NEXT:    v_mov_b32_e32 v2, v23
+; CI-NEXT:    v_mov_b32_e32 v4, v24
+; CI-NEXT:    v_mov_b32_e32 v6, v25
+; CI-NEXT:    v_mov_b32_e32 v8, v18
+; CI-NEXT:    v_mov_b32_e32 v10, v19
+; CI-NEXT:    v_mov_b32_e32 v12, v20
+; CI-NEXT:    v_mov_b32_e32 v14, v21
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v16i16_func_void:
+; GFX89:       ; %bb.0:
+; GFX89-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX89-NEXT:    s_mov_b32 s7, 0xf000
+; GFX89-NEXT:    s_mov_b32 s6, -1
+; GFX89-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX89-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GFX89-NEXT:    buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; GFX89-NEXT:    s_waitcnt vmcnt(0)
+; GFX89-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v16i16_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    buffer_load_b128 v[4:7], off, s[0:3], 0 offset:16
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
   %val = load <16 x i16>, ptr addrspace(1) %ptr
   ret <16 x i16> %val
 }
 
 ; FIXME: Should pack
-; GCN-LABEL: {{^}}v16i8_func_void:
-; GCN-DAG: v12
-; GCN-DAG: v13
-; GCN-DAG: v14
-; GCN-DAG: v15
 define <16 x i8> @v16i8_func_void() #0 {
+; GFX789-LABEL: v16i8_func_void:
+; GFX789:       ; %bb.0:
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX789-NEXT:    s_mov_b32 s7, 0xf000
+; GFX789-NEXT:    s_mov_b32 s6, -1
+; GFX789-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX789-NEXT:    buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
+; GFX789-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
+; GFX789-NEXT:    v_lshrrev_b32_e32 v18, 24, v0
+; GFX789-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX789-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX789-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX789-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; GFX789-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX789-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
+; GFX789-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
+; GFX789-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
+; GFX789-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
+; GFX789-NEXT:    v_mov_b32_e32 v4, v1
+; GFX789-NEXT:    v_mov_b32_e32 v8, v2
+; GFX789-NEXT:    v_mov_b32_e32 v12, v3
+; GFX789-NEXT:    v_mov_b32_e32 v1, v16
+; GFX789-NEXT:    v_mov_b32_e32 v2, v17
+; GFX789-NEXT:    v_mov_b32_e32 v3, v18
+; GFX789-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v16i8_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    buffer_load_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v16, 8, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v18, 24, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 8, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v15, 24, v3
+; GFX11-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v16
+; GFX11-NEXT:    v_mov_b32_e32 v8, v2
+; GFX11-NEXT:    v_dual_mov_b32 v12, v3 :: v_dual_mov_b32 v3, v18
+; GFX11-NEXT:    v_mov_b32_e32 v2, v17
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
   %val = load <16 x i8>, ptr addrspace(1) %ptr
   ret <16 x i8> %val
 }
 
 ; FIXME: Should pack
-; GCN-LABEL: {{^}}v4i8_func_void:
-; GCN: buffer_load_dword v0
-; GCN-DAG: v_lshrrev_b32_e32 v1, 8, v0
-; GCN-DAG: v_lshrrev_b32_e32 v2, 16, v0
-; GCN-DAG: v_lshrrev_b32_e32 v3, 24, v0
-; GCN: s_setpc_b64
 define <4  x i8> @v4i8_func_void() #0 {
+; GFX789-LABEL: v4i8_func_void:
+; GFX789:       ; %bb.0:
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX789-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX789-NEXT:    s_mov_b32 s7, 0xf000
+; GFX789-NEXT:    s_mov_b32 s6, -1
+; GFX789-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX789-NEXT:    buffer_load_dword v0, off, s[4:7], 0
+; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX789-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX789-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX789-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v4i8_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    buffer_load_b32 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 24, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
   %val = load <4  x i8>, ptr addrspace(1) %ptr
   ret <4  x i8> %val
 }
 
-; GCN-LABEL: {{^}}struct_i8_i32_func_void:
-; GCN-DAG: buffer_load_dword v1
-; GCN-DAG: buffer_load_ubyte v0
-; GCN: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64
 define {i8, i32} @struct_i8_i32_func_void() #0 {
+; GFX789-LABEL: struct_i8_i32_func_void:
+; GFX789:       ; %bb.0:
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX789-NEXT:    s_mov_b32 s7, 0xf000
+; GFX789-NEXT:    s_mov_b32 s6, -1
+; GFX789-NEXT:    buffer_load_ubyte v0, off, s[4:7], 0
+; GFX789-NEXT:    buffer_load_dword v1, off, s[4:7], 0
+; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: struct_i8_i32_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    buffer_load_u8 v0, off, s[0:3], 0
+; GFX11-NEXT:    buffer_load_b32 v1, off, s[0:3], 0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val = load { i8, i32 }, ptr addrspace(1) undef
   ret { i8, i32 } %val
 }
 
-; GCN-LABEL: {{^}}void_func_sret_struct_i8_i32:
-; GCN: buffer_load_ubyte [[VAL0:v[0-9]+]]
-; GCN: buffer_load_dword [[VAL1:v[0-9]+]]
-; GCN: buffer_store_byte [[VAL0]], v0, s[0:3], 0 offen{{$}}
-; GCN: buffer_store_dword [[VAL1]], v0, s[0:3], 0 offen offset:4{{$}}
 define void @void_func_sret_struct_i8_i32(ptr addrspace(5) sret({ i8, i32 }) %arg0) #0 {
+; GFX789-LABEL: void_func_sret_struct_i8_i32:
+; GFX789:       ; %bb.0:
+; GFX789-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX789-NEXT:    s_mov_b32 s7, 0xf000
+; GFX789-NEXT:    s_mov_b32 s6, -1
+; GFX789-NEXT:    buffer_load_ubyte v1, off, s[4:7], 0 glc
+; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    buffer_load_dword v2, off, s[4:7], 0 glc
+; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    buffer_store_byte v1, v0, s[0:3], 0 offen
+; GFX789-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:4
+; GFX789-NEXT:    s_waitcnt vmcnt(0)
+; GFX789-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_sret_struct_i8_i32:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    buffer_load_u8 v1, off, s[0:3], 0 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_load_b32 v2, off, s[0:3], 0 glc dlc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    scratch_store_b8 v0, v1, off
+; GFX11-NEXT:    scratch_store_b32 v0, v2, off offset:4
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %val0 = load volatile i8, ptr addrspace(1) undef
   %val1 = load volatile i32, ptr addrspace(1) undef
   %gep0 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %arg0, i32 0, i32 0
@@ -471,140 +1352,939 @@ define void @void_func_sret_struct_i8_i32(ptr addrspace(5) sret({ i8, i32 }) %ar
 ; FIXME: Should be able to fold offsets in all of these pre-gfx9. Call
 ; lowering introduces an extra CopyToReg/CopyFromReg obscuring the
 ; AssertZext inserted. Not using it introduces the spills.
-
-; GCN-LABEL: {{^}}v33i32_func_void:
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:4{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:8{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:12{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:16{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:20{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:24{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:28{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:32{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:36{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:40{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:44{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:48{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:52{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:56{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:60{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:64{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:68{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:72{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:76{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:80{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:84{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:88{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:92{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:96{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:100{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:104{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:108{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:112{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:116{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:120{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:124{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:128{{$}}
-; GFX9: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64
 define <33 x i32> @v33i32_func_void() #0 {
+; CI-LABEL: v33i32_func_void:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    v_add_i32_e32 v34, vcc, 0x80, v0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_load_dword v33, off, s[4:7], 0 offset:128
+; CI-NEXT:    buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:112
+; CI-NEXT:    buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:96
+; CI-NEXT:    buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:80
+; CI-NEXT:    buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:64
+; CI-NEXT:    buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:48
+; CI-NEXT:    buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:32
+; CI-NEXT:    buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:16
+; CI-NEXT:    buffer_load_dwordx4 v[29:32], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(8)
+; CI-NEXT:    buffer_store_dword v33, v34, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v33, vcc, 0x7c, v0
+; CI-NEXT:    s_waitcnt vmcnt(8)
+; CI-NEXT:    buffer_store_dword v4, v33, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v4, vcc, 0x78, v0
+; CI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v3, vcc, 0x74, v0
+; CI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 0x70, v0
+; CI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v1, vcc, 0x6c, v0
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 0x68, v0
+; CI-NEXT:    v_add_i32_e32 v3, vcc, 0x64, v0
+; CI-NEXT:    s_waitcnt vmcnt(11)
+; CI-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v1, vcc, 0x60, v0
+; CI-NEXT:    buffer_store_dword v7, v2, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 0x5c, v0
+; CI-NEXT:    buffer_store_dword v6, v3, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v3, vcc, 0x58, v0
+; CI-NEXT:    buffer_store_dword v5, v1, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v1, vcc, 0x54, v0
+; CI-NEXT:    v_add_i32_e32 v4, vcc, 0x50, v0
+; CI-NEXT:    v_add_i32_e32 v5, vcc, 0x4c, v0
+; CI-NEXT:    s_waitcnt vmcnt(14)
+; CI-NEXT:    buffer_store_dword v12, v2, s[0:3], 0 offen
+; CI-NEXT:    buffer_store_dword v11, v3, s[0:3], 0 offen
+; CI-NEXT:    buffer_store_dword v10, v1, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v1, vcc, 56, v0
+; CI-NEXT:    v_add_i32_e32 v6, vcc, 0x48, v0
+; CI-NEXT:    v_add_i32_e32 v7, vcc, 0x44, v0
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 64, v0
+; CI-NEXT:    v_add_i32_e32 v3, vcc, 60, v0
+; CI-NEXT:    buffer_store_dword v9, v4, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v4, vcc, 52, v0
+; CI-NEXT:    v_add_i32_e32 v8, vcc, 48, v0
+; CI-NEXT:    v_add_i32_e32 v9, vcc, 44, v0
+; CI-NEXT:    v_add_i32_e32 v10, vcc, 40, v0
+; CI-NEXT:    v_add_i32_e32 v11, vcc, 36, v0
+; CI-NEXT:    s_waitcnt vmcnt(14)
+; CI-NEXT:    buffer_store_dword v16, v5, s[0:3], 0 offen
+; CI-NEXT:    buffer_store_dword v15, v6, s[0:3], 0 offen
+; CI-NEXT:    buffer_store_dword v14, v7, s[0:3], 0 offen
+; CI-NEXT:    buffer_store_dword v13, v2, s[0:3], 0 offen
+; CI-NEXT:    buffer_store_dword v20, v3, s[0:3], 0 offen
+; CI-NEXT:    buffer_store_dword v19, v1, s[0:3], 0 offen
+; CI-NEXT:    buffer_store_dword v18, v4, s[0:3], 0 offen
+; CI-NEXT:    buffer_store_dword v17, v8, s[0:3], 0 offen
+; CI-NEXT:    buffer_store_dword v24, v9, s[0:3], 0 offen
+; CI-NEXT:    buffer_store_dword v23, v10, s[0:3], 0 offen
+; CI-NEXT:    buffer_store_dword v22, v11, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v1, vcc, 32, v0
+; CI-NEXT:    buffer_store_dword v21, v1, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v1, vcc, 28, v0
+; CI-NEXT:    buffer_store_dword v28, v1, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v1, vcc, 24, v0
+; CI-NEXT:    buffer_store_dword v27, v1, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v1, vcc, 20, v0
+; CI-NEXT:    buffer_store_dword v26, v1, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v1, vcc, 16, v0
+; CI-NEXT:    buffer_store_dword v25, v1, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v1, vcc, 12, v0
+; CI-NEXT:    s_waitcnt vmcnt(14)
+; CI-NEXT:    buffer_store_dword v32, v1, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v1, vcc, 8, v0
+; CI-NEXT:    buffer_store_dword v31, v1, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
+; CI-NEXT:    buffer_store_dword v30, v1, s[0:3], 0 offen
+; CI-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v33i32_func_void:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX8-NEXT:    s_mov_b32 s7, 0xf000
+; GFX8-NEXT:    s_mov_b32 s6, -1
+; GFX8-NEXT:    v_add_u32_e32 v34, vcc, 0x80, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    buffer_load_dword v33, off, s[4:7], 0 offset:128
+; GFX8-NEXT:    buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:112
+; GFX8-NEXT:    buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:96
+; GFX8-NEXT:    buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:80
+; GFX8-NEXT:    buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:64
+; GFX8-NEXT:    buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:48
+; GFX8-NEXT:    buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:32
+; GFX8-NEXT:    buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:16
+; GFX8-NEXT:    buffer_load_dwordx4 v[29:32], off, s[4:7], 0
+; GFX8-NEXT:    s_waitcnt vmcnt(8)
+; GFX8-NEXT:    buffer_store_dword v33, v34, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v33, vcc, 0x7c, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(8)
+; GFX8-NEXT:    buffer_store_dword v4, v33, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x78, v0
+; GFX8-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x74, v0
+; GFX8-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x70, v0
+; GFX8-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x6c, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x68, v0
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x64, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(11)
+; GFX8-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x60, v0
+; GFX8-NEXT:    buffer_store_dword v7, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x5c, v0
+; GFX8-NEXT:    buffer_store_dword v6, v3, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x58, v0
+; GFX8-NEXT:    buffer_store_dword v5, v1, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x54, v0
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x50, v0
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x4c, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(14)
+; GFX8-NEXT:    buffer_store_dword v12, v2, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v11, v3, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v10, v1, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 56, v0
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x48, v0
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x44, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 64, v0
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 60, v0
+; GFX8-NEXT:    buffer_store_dword v9, v4, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 52, v0
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 48, v0
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 44, v0
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 40, v0
+; GFX8-NEXT:    v_add_u32_e32 v11, vcc, 36, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(14)
+; GFX8-NEXT:    buffer_store_dword v16, v5, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v15, v6, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v14, v7, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v13, v2, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v20, v3, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v19, v1, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v18, v4, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v17, v8, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v24, v9, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v23, v10, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v22, v11, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 32, v0
+; GFX8-NEXT:    buffer_store_dword v21, v1, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 28, v0
+; GFX8-NEXT:    buffer_store_dword v28, v1, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 24, v0
+; GFX8-NEXT:    buffer_store_dword v27, v1, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 20, v0
+; GFX8-NEXT:    buffer_store_dword v26, v1, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 16, v0
+; GFX8-NEXT:    buffer_store_dword v25, v1, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 12, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(14)
+; GFX8-NEXT:    buffer_store_dword v32, v1, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 8, v0
+; GFX8-NEXT:    buffer_store_dword v31, v1, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 4, v0
+; GFX8-NEXT:    buffer_store_dword v30, v1, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v33i32_func_void:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:112
+; GFX9-NEXT:    buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:96
+; GFX9-NEXT:    buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:80
+; GFX9-NEXT:    buffer_load_dword v33, off, s[4:7], 0 offset:128
+; GFX9-NEXT:    buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:64
+; GFX9-NEXT:    buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:48
+; GFX9-NEXT:    buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:32
+; GFX9-NEXT:    buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:16
+; GFX9-NEXT:    buffer_load_dwordx4 v[29:32], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(8)
+; GFX9-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:124
+; GFX9-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:120
+; GFX9-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:116
+; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:112
+; GFX9-NEXT:    s_waitcnt vmcnt(11)
+; GFX9-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:108
+; GFX9-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:104
+; GFX9-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:100
+; GFX9-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:96
+; GFX9-NEXT:    s_waitcnt vmcnt(14)
+; GFX9-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen offset:92
+; GFX9-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:88
+; GFX9-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:84
+; GFX9-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:80
+; GFX9-NEXT:    s_waitcnt vmcnt(17)
+; GFX9-NEXT:    buffer_store_dword v33, v0, s[0:3], 0 offen offset:128
+; GFX9-NEXT:    s_waitcnt vmcnt(17)
+; GFX9-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen offset:76
+; GFX9-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen offset:72
+; GFX9-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen offset:68
+; GFX9-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen offset:64
+; GFX9-NEXT:    s_waitcnt vmcnt(20)
+; GFX9-NEXT:    buffer_store_dword v20, v0, s[0:3], 0 offen offset:60
+; GFX9-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:56
+; GFX9-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen offset:52
+; GFX9-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen offset:48
+; GFX9-NEXT:    s_waitcnt vmcnt(23)
+; GFX9-NEXT:    buffer_store_dword v24, v0, s[0:3], 0 offen offset:44
+; GFX9-NEXT:    buffer_store_dword v23, v0, s[0:3], 0 offen offset:40
+; GFX9-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen offset:36
+; GFX9-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen offset:32
+; GFX9-NEXT:    s_waitcnt vmcnt(26)
+; GFX9-NEXT:    buffer_store_dword v28, v0, s[0:3], 0 offen offset:28
+; GFX9-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen offset:24
+; GFX9-NEXT:    buffer_store_dword v26, v0, s[0:3], 0 offen offset:20
+; GFX9-NEXT:    buffer_store_dword v25, v0, s[0:3], 0 offen offset:16
+; GFX9-NEXT:    s_waitcnt vmcnt(29)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:12
+; GFX9-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen offset:8
+; GFX9-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:4
+; GFX9-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v33i32_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x8
+; GFX11-NEXT:    buffer_load_b128 v[1:4], off, s[0:3], 0 offset:112
+; GFX11-NEXT:    buffer_load_b128 v[5:8], off, s[0:3], 0 offset:96
+; GFX11-NEXT:    buffer_load_b128 v[9:12], off, s[0:3], 0 offset:80
+; GFX11-NEXT:    buffer_load_b128 v[13:16], off, s[0:3], 0 offset:64
+; GFX11-NEXT:    buffer_load_b128 v[17:20], off, s[0:3], 0 offset:48
+; GFX11-NEXT:    buffer_load_b128 v[21:24], off, s[0:3], 0 offset:32
+; GFX11-NEXT:    buffer_load_b128 v[25:28], off, s[0:3], 0 offset:16
+; GFX11-NEXT:    buffer_load_b128 v[29:32], off, s[0:3], 0
+; GFX11-NEXT:    buffer_load_b32 v33, off, s[0:3], 0 offset:128
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x70
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x60
+; GFX11-NEXT:    s_add_i32 s3, s0, 0x50
+; GFX11-NEXT:    s_add_i32 s4, s0, 64
+; GFX11-NEXT:    s_add_i32 s5, s0, 48
+; GFX11-NEXT:    s_add_i32 s6, s0, 32
+; GFX11-NEXT:    s_add_i32 s7, s0, 16
+; GFX11-NEXT:    s_add_i32 s8, s0, 0x80
+; GFX11-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-NEXT:    scratch_store_b128 off, v[5:8], s2
+; GFX11-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-NEXT:    scratch_store_b128 off, v[9:12], s3
+; GFX11-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-NEXT:    scratch_store_b128 off, v[13:16], s4
+; GFX11-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-NEXT:    scratch_store_b128 off, v[17:20], s5
+; GFX11-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-NEXT:    scratch_store_b128 off, v[21:24], s6
+; GFX11-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-NEXT:    scratch_store_b128 off, v[25:28], s7
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    scratch_store_b128 off, v[29:32], s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    scratch_store_b32 off, v33, s8
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
   %val = load <33 x i32>, ptr addrspace(1) %ptr
   ret <33 x i32> %val
 }
 
-; GCN-LABEL: {{^}}struct_v32i32_i32_func_void:
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:4{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:8{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:12{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:16{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:20{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:24{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:28{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:32{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:36{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:40{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:44{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:48{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:52{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:56{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:60{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:64{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:68{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:72{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:76{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:80{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:84{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:88{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:92{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:96{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:100{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:104{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:108{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:112{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:116{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:120{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:124{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:128{{$}}
-; GFX9: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64
 define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
+; CI-LABEL: struct_v32i32_i32_func_void:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    v_add_i32_e32 v34, vcc, 0x80, v0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_load_dword v33, off, s[4:7], 0 offset:128
+; CI-NEXT:    buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:112
+; CI-NEXT:    buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:96
+; CI-NEXT:    buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:80
+; CI-NEXT:    buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:64
+; CI-NEXT:    buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:48
+; CI-NEXT:    buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:32
+; CI-NEXT:    buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:16
+; CI-NEXT:    buffer_load_dwordx4 v[29:32], off, s[4:7], 0
+; CI-NEXT:    s_waitcnt vmcnt(8)
+; CI-NEXT:    buffer_store_dword v33, v34, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v33, vcc, 0x7c, v0
+; CI-NEXT:    s_waitcnt vmcnt(8)
+; CI-NEXT:    buffer_store_dword v4, v33, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v4, vcc, 0x78, v0
+; CI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v3, vcc, 0x74, v0
+; CI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 0x70, v0
+; CI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v1, vcc, 0x6c, v0
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 0x68, v0
+; CI-NEXT:    v_add_i32_e32 v3, vcc, 0x64, v0
+; CI-NEXT:    s_waitcnt vmcnt(11)
+; CI-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v1, vcc, 0x60, v0
+; CI-NEXT:    buffer_store_dword v7, v2, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 0x5c, v0
+; CI-NEXT:    buffer_store_dword v6, v3, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v3, vcc, 0x58, v0
+; CI-NEXT:    buffer_store_dword v5, v1, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v1, vcc, 0x54, v0
+; CI-NEXT:    v_add_i32_e32 v4, vcc, 0x50, v0
+; CI-NEXT:    v_add_i32_e32 v5, vcc, 0x4c, v0
+; CI-NEXT:    s_waitcnt vmcnt(14)
+; CI-NEXT:    buffer_store_dword v12, v2, s[0:3], 0 offen
+; CI-NEXT:    buffer_store_dword v11, v3, s[0:3], 0 offen
+; CI-NEXT:    buffer_store_dword v10, v1, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v1, vcc, 56, v0
+; CI-NEXT:    v_add_i32_e32 v6, vcc, 0x48, v0
+; CI-NEXT:    v_add_i32_e32 v7, vcc, 0x44, v0
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 64, v0
+; CI-NEXT:    v_add_i32_e32 v3, vcc, 60, v0
+; CI-NEXT:    buffer_store_dword v9, v4, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v4, vcc, 52, v0
+; CI-NEXT:    v_add_i32_e32 v8, vcc, 48, v0
+; CI-NEXT:    v_add_i32_e32 v9, vcc, 44, v0
+; CI-NEXT:    v_add_i32_e32 v10, vcc, 40, v0
+; CI-NEXT:    v_add_i32_e32 v11, vcc, 36, v0
+; CI-NEXT:    s_waitcnt vmcnt(14)
+; CI-NEXT:    buffer_store_dword v16, v5, s[0:3], 0 offen
+; CI-NEXT:    buffer_store_dword v15, v6, s[0:3], 0 offen
+; CI-NEXT:    buffer_store_dword v14, v7, s[0:3], 0 offen
+; CI-NEXT:    buffer_store_dword v13, v2, s[0:3], 0 offen
+; CI-NEXT:    buffer_store_dword v20, v3, s[0:3], 0 offen
+; CI-NEXT:    buffer_store_dword v19, v1, s[0:3], 0 offen
+; CI-NEXT:    buffer_store_dword v18, v4, s[0:3], 0 offen
+; CI-NEXT:    buffer_store_dword v17, v8, s[0:3], 0 offen
+; CI-NEXT:    buffer_store_dword v24, v9, s[0:3], 0 offen
+; CI-NEXT:    buffer_store_dword v23, v10, s[0:3], 0 offen
+; CI-NEXT:    buffer_store_dword v22, v11, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v1, vcc, 32, v0
+; CI-NEXT:    buffer_store_dword v21, v1, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v1, vcc, 28, v0
+; CI-NEXT:    buffer_store_dword v28, v1, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v1, vcc, 24, v0
+; CI-NEXT:    buffer_store_dword v27, v1, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v1, vcc, 20, v0
+; CI-NEXT:    buffer_store_dword v26, v1, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v1, vcc, 16, v0
+; CI-NEXT:    buffer_store_dword v25, v1, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v1, vcc, 12, v0
+; CI-NEXT:    s_waitcnt vmcnt(14)
+; CI-NEXT:    buffer_store_dword v32, v1, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v1, vcc, 8, v0
+; CI-NEXT:    buffer_store_dword v31, v1, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v1, vcc, 4, v0
+; CI-NEXT:    buffer_store_dword v30, v1, s[0:3], 0 offen
+; CI-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: struct_v32i32_i32_func_void:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX8-NEXT:    s_mov_b32 s7, 0xf000
+; GFX8-NEXT:    s_mov_b32 s6, -1
+; GFX8-NEXT:    v_add_u32_e32 v34, vcc, 0x80, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    buffer_load_dword v33, off, s[4:7], 0 offset:128
+; GFX8-NEXT:    buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:112
+; GFX8-NEXT:    buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:96
+; GFX8-NEXT:    buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:80
+; GFX8-NEXT:    buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:64
+; GFX8-NEXT:    buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:48
+; GFX8-NEXT:    buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:32
+; GFX8-NEXT:    buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:16
+; GFX8-NEXT:    buffer_load_dwordx4 v[29:32], off, s[4:7], 0
+; GFX8-NEXT:    s_waitcnt vmcnt(8)
+; GFX8-NEXT:    buffer_store_dword v33, v34, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v33, vcc, 0x7c, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(8)
+; GFX8-NEXT:    buffer_store_dword v4, v33, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x78, v0
+; GFX8-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x74, v0
+; GFX8-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x70, v0
+; GFX8-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x6c, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x68, v0
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x64, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(11)
+; GFX8-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x60, v0
+; GFX8-NEXT:    buffer_store_dword v7, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x5c, v0
+; GFX8-NEXT:    buffer_store_dword v6, v3, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x58, v0
+; GFX8-NEXT:    buffer_store_dword v5, v1, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x54, v0
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0x50, v0
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0x4c, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(14)
+; GFX8-NEXT:    buffer_store_dword v12, v2, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v11, v3, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v10, v1, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 56, v0
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0x48, v0
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x44, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 64, v0
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 60, v0
+; GFX8-NEXT:    buffer_store_dword v9, v4, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 52, v0
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 48, v0
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 44, v0
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 40, v0
+; GFX8-NEXT:    v_add_u32_e32 v11, vcc, 36, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(14)
+; GFX8-NEXT:    buffer_store_dword v16, v5, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v15, v6, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v14, v7, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v13, v2, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v20, v3, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v19, v1, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v18, v4, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v17, v8, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v24, v9, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v23, v10, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v22, v11, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 32, v0
+; GFX8-NEXT:    buffer_store_dword v21, v1, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 28, v0
+; GFX8-NEXT:    buffer_store_dword v28, v1, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 24, v0
+; GFX8-NEXT:    buffer_store_dword v27, v1, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 20, v0
+; GFX8-NEXT:    buffer_store_dword v26, v1, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 16, v0
+; GFX8-NEXT:    buffer_store_dword v25, v1, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 12, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(14)
+; GFX8-NEXT:    buffer_store_dword v32, v1, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 8, v0
+; GFX8-NEXT:    buffer_store_dword v31, v1, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 4, v0
+; GFX8-NEXT:    buffer_store_dword v30, v1, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: struct_v32i32_i32_func_void:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:112
+; GFX9-NEXT:    buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:96
+; GFX9-NEXT:    buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:80
+; GFX9-NEXT:    buffer_load_dword v33, off, s[4:7], 0 offset:128
+; GFX9-NEXT:    buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:64
+; GFX9-NEXT:    buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:48
+; GFX9-NEXT:    buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:32
+; GFX9-NEXT:    buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:16
+; GFX9-NEXT:    buffer_load_dwordx4 v[29:32], off, s[4:7], 0
+; GFX9-NEXT:    s_waitcnt vmcnt(8)
+; GFX9-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:124
+; GFX9-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:120
+; GFX9-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:116
+; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:112
+; GFX9-NEXT:    s_waitcnt vmcnt(11)
+; GFX9-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:108
+; GFX9-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:104
+; GFX9-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:100
+; GFX9-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:96
+; GFX9-NEXT:    s_waitcnt vmcnt(14)
+; GFX9-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen offset:92
+; GFX9-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:88
+; GFX9-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:84
+; GFX9-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:80
+; GFX9-NEXT:    s_waitcnt vmcnt(17)
+; GFX9-NEXT:    buffer_store_dword v33, v0, s[0:3], 0 offen offset:128
+; GFX9-NEXT:    s_waitcnt vmcnt(17)
+; GFX9-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen offset:76
+; GFX9-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen offset:72
+; GFX9-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen offset:68
+; GFX9-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen offset:64
+; GFX9-NEXT:    s_waitcnt vmcnt(20)
+; GFX9-NEXT:    buffer_store_dword v20, v0, s[0:3], 0 offen offset:60
+; GFX9-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:56
+; GFX9-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen offset:52
+; GFX9-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen offset:48
+; GFX9-NEXT:    s_waitcnt vmcnt(23)
+; GFX9-NEXT:    buffer_store_dword v24, v0, s[0:3], 0 offen offset:44
+; GFX9-NEXT:    buffer_store_dword v23, v0, s[0:3], 0 offen offset:40
+; GFX9-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen offset:36
+; GFX9-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen offset:32
+; GFX9-NEXT:    s_waitcnt vmcnt(26)
+; GFX9-NEXT:    buffer_store_dword v28, v0, s[0:3], 0 offen offset:28
+; GFX9-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen offset:24
+; GFX9-NEXT:    buffer_store_dword v26, v0, s[0:3], 0 offen offset:20
+; GFX9-NEXT:    buffer_store_dword v25, v0, s[0:3], 0 offen offset:16
+; GFX9-NEXT:    s_waitcnt vmcnt(29)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:12
+; GFX9-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen offset:8
+; GFX9-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:4
+; GFX9-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: struct_v32i32_i32_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x8
+; GFX11-NEXT:    buffer_load_b128 v[1:4], off, s[0:3], 0 offset:112
+; GFX11-NEXT:    buffer_load_b128 v[5:8], off, s[0:3], 0 offset:96
+; GFX11-NEXT:    buffer_load_b128 v[9:12], off, s[0:3], 0 offset:80
+; GFX11-NEXT:    buffer_load_b128 v[13:16], off, s[0:3], 0 offset:64
+; GFX11-NEXT:    buffer_load_b128 v[17:20], off, s[0:3], 0 offset:48
+; GFX11-NEXT:    buffer_load_b128 v[21:24], off, s[0:3], 0 offset:32
+; GFX11-NEXT:    buffer_load_b128 v[25:28], off, s[0:3], 0 offset:16
+; GFX11-NEXT:    buffer_load_b128 v[29:32], off, s[0:3], 0
+; GFX11-NEXT:    buffer_load_b32 v33, off, s[0:3], 0 offset:128
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    s_add_i32 s1, s0, 0x70
+; GFX11-NEXT:    s_add_i32 s2, s0, 0x60
+; GFX11-NEXT:    s_add_i32 s3, s0, 0x50
+; GFX11-NEXT:    s_add_i32 s4, s0, 64
+; GFX11-NEXT:    s_add_i32 s5, s0, 48
+; GFX11-NEXT:    s_add_i32 s6, s0, 32
+; GFX11-NEXT:    s_add_i32 s7, s0, 16
+; GFX11-NEXT:    s_add_i32 s8, s0, 0x80
+; GFX11-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-NEXT:    scratch_store_b128 off, v[5:8], s2
+; GFX11-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-NEXT:    scratch_store_b128 off, v[9:12], s3
+; GFX11-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-NEXT:    scratch_store_b128 off, v[13:16], s4
+; GFX11-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-NEXT:    scratch_store_b128 off, v[17:20], s5
+; GFX11-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-NEXT:    scratch_store_b128 off, v[21:24], s6
+; GFX11-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-NEXT:    scratch_store_b128 off, v[25:28], s7
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    scratch_store_b128 off, v[29:32], s0
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    scratch_store_b32 off, v33, s8
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
   %val = load { <32 x i32>, i32 }, ptr addrspace(1) %ptr
   ret { <32 x i32>, i32 }%val
 }
 
-; GCN-LABEL: {{^}}struct_i32_v32i32_func_void:
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:128{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:132{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:136{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:140{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:144{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:148{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:152{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:156{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:160{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:164{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:168{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:172{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:176{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:180{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:184{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:188{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:192{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:196{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:200{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:204{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:208{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:212{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:216{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:220{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:224{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:228{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:232{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:236{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:240{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:244{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:248{{$}}
-; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:252{{$}}
-; GFX9: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64
 define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
+; CI-LABEL: struct_i32_v32i32_func_void:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CI-NEXT:    s_mov_b32 s7, 0xf000
+; CI-NEXT:    s_mov_b32 s6, -1
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    buffer_load_dword v33, off, s[4:7], 0
+; CI-NEXT:    buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:240
+; CI-NEXT:    buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:224
+; CI-NEXT:    buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:208
+; CI-NEXT:    buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:192
+; CI-NEXT:    buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:176
+; CI-NEXT:    buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:160
+; CI-NEXT:    buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:144
+; CI-NEXT:    buffer_load_dwordx4 v[29:32], off, s[4:7], 0 offset:128
+; CI-NEXT:    s_waitcnt vmcnt(8)
+; CI-NEXT:    buffer_store_dword v33, v0, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v33, vcc, 0xfc, v0
+; CI-NEXT:    s_waitcnt vmcnt(8)
+; CI-NEXT:    buffer_store_dword v4, v33, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v4, vcc, 0xf8, v0
+; CI-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v3, vcc, 0xf4, v0
+; CI-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 0xf0, v0
+; CI-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v1, vcc, 0xec, v0
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 0xe8, v0
+; CI-NEXT:    v_add_i32_e32 v3, vcc, 0xe4, v0
+; CI-NEXT:    s_waitcnt vmcnt(11)
+; CI-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v1, vcc, 0xe0, v0
+; CI-NEXT:    buffer_store_dword v7, v2, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 0xdc, v0
+; CI-NEXT:    buffer_store_dword v6, v3, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v3, vcc, 0xd8, v0
+; CI-NEXT:    buffer_store_dword v5, v1, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v1, vcc, 0xd4, v0
+; CI-NEXT:    v_add_i32_e32 v4, vcc, 0xd0, v0
+; CI-NEXT:    v_add_i32_e32 v5, vcc, 0xcc, v0
+; CI-NEXT:    v_add_i32_e32 v6, vcc, 0xc8, v0
+; CI-NEXT:    s_waitcnt vmcnt(14)
+; CI-NEXT:    buffer_store_dword v12, v2, s[0:3], 0 offen
+; CI-NEXT:    buffer_store_dword v11, v3, s[0:3], 0 offen
+; CI-NEXT:    buffer_store_dword v10, v1, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v1, vcc, 0xb8, v0
+; CI-NEXT:    v_add_i32_e32 v7, vcc, 0xc4, v0
+; CI-NEXT:    v_add_i32_e32 v2, vcc, 0xc0, v0
+; CI-NEXT:    v_add_i32_e32 v3, vcc, 0xbc, v0
+; CI-NEXT:    buffer_store_dword v9, v4, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v4, vcc, 0xb4, v0
+; CI-NEXT:    v_add_i32_e32 v8, vcc, 0xb0, v0
+; CI-NEXT:    v_add_i32_e32 v9, vcc, 0xac, v0
+; CI-NEXT:    v_add_i32_e32 v10, vcc, 0xa8, v0
+; CI-NEXT:    v_add_i32_e32 v11, vcc, 0xa4, v0
+; CI-NEXT:    s_waitcnt vmcnt(14)
+; CI-NEXT:    buffer_store_dword v16, v5, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v5, vcc, 0xa0, v0
+; CI-NEXT:    buffer_store_dword v15, v6, s[0:3], 0 offen
+; CI-NEXT:    buffer_store_dword v14, v7, s[0:3], 0 offen
+; CI-NEXT:    buffer_store_dword v13, v2, s[0:3], 0 offen
+; CI-NEXT:    buffer_store_dword v20, v3, s[0:3], 0 offen
+; CI-NEXT:    buffer_store_dword v19, v1, s[0:3], 0 offen
+; CI-NEXT:    buffer_store_dword v18, v4, s[0:3], 0 offen
+; CI-NEXT:    buffer_store_dword v17, v8, s[0:3], 0 offen
+; CI-NEXT:    buffer_store_dword v24, v9, s[0:3], 0 offen
+; CI-NEXT:    buffer_store_dword v23, v10, s[0:3], 0 offen
+; CI-NEXT:    buffer_store_dword v22, v11, s[0:3], 0 offen
+; CI-NEXT:    buffer_store_dword v21, v5, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v1, vcc, 0x9c, v0
+; CI-NEXT:    buffer_store_dword v28, v1, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v1, vcc, 0x98, v0
+; CI-NEXT:    buffer_store_dword v27, v1, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v1, vcc, 0x94, v0
+; CI-NEXT:    buffer_store_dword v26, v1, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v1, vcc, 0x90, v0
+; CI-NEXT:    buffer_store_dword v25, v1, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v1, vcc, 0x8c, v0
+; CI-NEXT:    s_waitcnt vmcnt(14)
+; CI-NEXT:    buffer_store_dword v32, v1, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v1, vcc, 0x88, v0
+; CI-NEXT:    buffer_store_dword v31, v1, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v1, vcc, 0x84, v0
+; CI-NEXT:    v_add_i32_e32 v0, vcc, 0x80, v0
+; CI-NEXT:    buffer_store_dword v30, v1, s[0:3], 0 offen
+; CI-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen
+; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: struct_i32_v32i32_func_void:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX8-NEXT:    s_mov_b32 s7, 0xf000
+; GFX8-NEXT:    s_mov_b32 s6, -1
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    buffer_load_dword v33, off, s[4:7], 0
+; GFX8-NEXT:    buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:240
+; GFX8-NEXT:    buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:224
+; GFX8-NEXT:    buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:208
+; GFX8-NEXT:    buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:192
+; GFX8-NEXT:    buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:176
+; GFX8-NEXT:    buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:160
+; GFX8-NEXT:    buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:144
+; GFX8-NEXT:    buffer_load_dwordx4 v[29:32], off, s[4:7], 0 offset:128
+; GFX8-NEXT:    s_waitcnt vmcnt(8)
+; GFX8-NEXT:    buffer_store_dword v33, v0, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v33, vcc, 0xfc, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(8)
+; GFX8-NEXT:    buffer_store_dword v4, v33, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xf8, v0
+; GFX8-NEXT:    buffer_store_dword v3, v4, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xf4, v0
+; GFX8-NEXT:    buffer_store_dword v2, v3, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0xf0, v0
+; GFX8-NEXT:    buffer_store_dword v1, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0xec, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0xe8, v0
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xe4, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(11)
+; GFX8-NEXT:    buffer_store_dword v8, v1, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0xe0, v0
+; GFX8-NEXT:    buffer_store_dword v7, v2, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0xdc, v0
+; GFX8-NEXT:    buffer_store_dword v6, v3, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xd8, v0
+; GFX8-NEXT:    buffer_store_dword v5, v1, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0xd4, v0
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xd0, v0
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0xcc, v0
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, 0xc8, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(14)
+; GFX8-NEXT:    buffer_store_dword v12, v2, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v11, v3, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v10, v1, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0xb8, v0
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0xc4, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0xc0, v0
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xbc, v0
+; GFX8-NEXT:    buffer_store_dword v9, v4, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 0xb4, v0
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, 0xb0, v0
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0xac, v0
+; GFX8-NEXT:    v_add_u32_e32 v10, vcc, 0xa8, v0
+; GFX8-NEXT:    v_add_u32_e32 v11, vcc, 0xa4, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(14)
+; GFX8-NEXT:    buffer_store_dword v16, v5, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 0xa0, v0
+; GFX8-NEXT:    buffer_store_dword v15, v6, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v14, v7, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v13, v2, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v20, v3, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v19, v1, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v18, v4, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v17, v8, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v24, v9, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v23, v10, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v22, v11, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v21, v5, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x9c, v0
+; GFX8-NEXT:    buffer_store_dword v28, v1, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x98, v0
+; GFX8-NEXT:    buffer_store_dword v27, v1, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x94, v0
+; GFX8-NEXT:    buffer_store_dword v26, v1, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x90, v0
+; GFX8-NEXT:    buffer_store_dword v25, v1, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x8c, v0
+; GFX8-NEXT:    s_waitcnt vmcnt(14)
+; GFX8-NEXT:    buffer_store_dword v32, v1, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x88, v0
+; GFX8-NEXT:    buffer_store_dword v31, v1, s[0:3], 0 offen
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, 0x84, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x80, v0
+; GFX8-NEXT:    buffer_store_dword v30, v1, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: struct_i32_v32i32_func_void:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GFX9-NEXT:    s_mov_b32 s7, 0xf000
+; GFX9-NEXT:    s_mov_b32 s6, -1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    buffer_load_dwordx4 v[1:4], off, s[4:7], 0 offset:240
+; GFX9-NEXT:    buffer_load_dwordx4 v[5:8], off, s[4:7], 0 offset:224
+; GFX9-NEXT:    buffer_load_dwordx4 v[9:12], off, s[4:7], 0 offset:208
+; GFX9-NEXT:    buffer_load_dword v33, off, s[4:7], 0
+; GFX9-NEXT:    buffer_load_dwordx4 v[13:16], off, s[4:7], 0 offset:192
+; GFX9-NEXT:    buffer_load_dwordx4 v[17:20], off, s[4:7], 0 offset:176
+; GFX9-NEXT:    buffer_load_dwordx4 v[21:24], off, s[4:7], 0 offset:160
+; GFX9-NEXT:    buffer_load_dwordx4 v[25:28], off, s[4:7], 0 offset:144
+; GFX9-NEXT:    buffer_load_dwordx4 v[29:32], off, s[4:7], 0 offset:128
+; GFX9-NEXT:    s_waitcnt vmcnt(8)
+; GFX9-NEXT:    buffer_store_dword v4, v0, s[0:3], 0 offen offset:252
+; GFX9-NEXT:    buffer_store_dword v3, v0, s[0:3], 0 offen offset:248
+; GFX9-NEXT:    buffer_store_dword v2, v0, s[0:3], 0 offen offset:244
+; GFX9-NEXT:    buffer_store_dword v1, v0, s[0:3], 0 offen offset:240
+; GFX9-NEXT:    s_waitcnt vmcnt(11)
+; GFX9-NEXT:    buffer_store_dword v8, v0, s[0:3], 0 offen offset:236
+; GFX9-NEXT:    buffer_store_dword v7, v0, s[0:3], 0 offen offset:232
+; GFX9-NEXT:    buffer_store_dword v6, v0, s[0:3], 0 offen offset:228
+; GFX9-NEXT:    buffer_store_dword v5, v0, s[0:3], 0 offen offset:224
+; GFX9-NEXT:    s_waitcnt vmcnt(14)
+; GFX9-NEXT:    buffer_store_dword v12, v0, s[0:3], 0 offen offset:220
+; GFX9-NEXT:    buffer_store_dword v11, v0, s[0:3], 0 offen offset:216
+; GFX9-NEXT:    buffer_store_dword v10, v0, s[0:3], 0 offen offset:212
+; GFX9-NEXT:    buffer_store_dword v9, v0, s[0:3], 0 offen offset:208
+; GFX9-NEXT:    s_waitcnt vmcnt(17)
+; GFX9-NEXT:    buffer_store_dword v33, v0, s[0:3], 0 offen
+; GFX9-NEXT:    s_waitcnt vmcnt(17)
+; GFX9-NEXT:    buffer_store_dword v16, v0, s[0:3], 0 offen offset:204
+; GFX9-NEXT:    buffer_store_dword v15, v0, s[0:3], 0 offen offset:200
+; GFX9-NEXT:    buffer_store_dword v14, v0, s[0:3], 0 offen offset:196
+; GFX9-NEXT:    buffer_store_dword v13, v0, s[0:3], 0 offen offset:192
+; GFX9-NEXT:    s_waitcnt vmcnt(20)
+; GFX9-NEXT:    buffer_store_dword v20, v0, s[0:3], 0 offen offset:188
+; GFX9-NEXT:    buffer_store_dword v19, v0, s[0:3], 0 offen offset:184
+; GFX9-NEXT:    buffer_store_dword v18, v0, s[0:3], 0 offen offset:180
+; GFX9-NEXT:    buffer_store_dword v17, v0, s[0:3], 0 offen offset:176
+; GFX9-NEXT:    s_waitcnt vmcnt(23)
+; GFX9-NEXT:    buffer_store_dword v24, v0, s[0:3], 0 offen offset:172
+; GFX9-NEXT:    buffer_store_dword v23, v0, s[0:3], 0 offen offset:168
+; GFX9-NEXT:    buffer_store_dword v22, v0, s[0:3], 0 offen offset:164
+; GFX9-NEXT:    buffer_store_dword v21, v0, s[0:3], 0 offen offset:160
+; GFX9-NEXT:    s_waitcnt vmcnt(26)
+; GFX9-NEXT:    buffer_store_dword v28, v0, s[0:3], 0 offen offset:156
+; GFX9-NEXT:    buffer_store_dword v27, v0, s[0:3], 0 offen offset:152
+; GFX9-NEXT:    buffer_store_dword v26, v0, s[0:3], 0 offen offset:148
+; GFX9-NEXT:    buffer_store_dword v25, v0, s[0:3], 0 offen offset:144
+; GFX9-NEXT:    s_waitcnt vmcnt(29)
+; GFX9-NEXT:    buffer_store_dword v32, v0, s[0:3], 0 offen offset:140
+; GFX9-NEXT:    buffer_store_dword v31, v0, s[0:3], 0 offen offset:136
+; GFX9-NEXT:    buffer_store_dword v30, v0, s[0:3], 0 offen offset:132
+; GFX9-NEXT:    buffer_store_dword v29, v0, s[0:3], 0 offen offset:128
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: struct_i32_v32i32_func_void:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX11-NEXT:    s_mov_b32 s2, -1
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_clause 0x8
+; GFX11-NEXT:    buffer_load_b128 v[1:4], off, s[0:3], 0 offset:240
+; GFX11-NEXT:    buffer_load_b128 v[5:8], off, s[0:3], 0 offset:224
+; GFX11-NEXT:    buffer_load_b128 v[9:12], off, s[0:3], 0 offset:208
+; GFX11-NEXT:    buffer_load_b128 v[13:16], off, s[0:3], 0 offset:192
+; GFX11-NEXT:    buffer_load_b128 v[17:20], off, s[0:3], 0 offset:176
+; GFX11-NEXT:    buffer_load_b128 v[21:24], off, s[0:3], 0 offset:160
+; GFX11-NEXT:    buffer_load_b128 v[25:28], off, s[0:3], 0 offset:144
+; GFX11-NEXT:    buffer_load_b128 v[29:32], off, s[0:3], 0 offset:128
+; GFX11-NEXT:    buffer_load_b32 v33, off, s[0:3], 0
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    s_add_i32 s1, s0, 0xf0
+; GFX11-NEXT:    s_add_i32 s2, s0, 0xe0
+; GFX11-NEXT:    s_add_i32 s3, s0, 0xd0
+; GFX11-NEXT:    s_add_i32 s4, s0, 0xc0
+; GFX11-NEXT:    s_add_i32 s5, s0, 0xb0
+; GFX11-NEXT:    s_add_i32 s6, s0, 0xa0
+; GFX11-NEXT:    s_add_i32 s7, s0, 0x90
+; GFX11-NEXT:    s_add_i32 s8, s0, 0x80
+; GFX11-NEXT:    s_waitcnt vmcnt(8)
+; GFX11-NEXT:    scratch_store_b128 off, v[1:4], s1
+; GFX11-NEXT:    s_waitcnt vmcnt(7)
+; GFX11-NEXT:    scratch_store_b128 off, v[5:8], s2
+; GFX11-NEXT:    s_waitcnt vmcnt(6)
+; GFX11-NEXT:    scratch_store_b128 off, v[9:12], s3
+; GFX11-NEXT:    s_waitcnt vmcnt(5)
+; GFX11-NEXT:    scratch_store_b128 off, v[13:16], s4
+; GFX11-NEXT:    s_waitcnt vmcnt(4)
+; GFX11-NEXT:    scratch_store_b128 off, v[17:20], s5
+; GFX11-NEXT:    s_waitcnt vmcnt(3)
+; GFX11-NEXT:    scratch_store_b128 off, v[21:24], s6
+; GFX11-NEXT:    s_waitcnt vmcnt(2)
+; GFX11-NEXT:    scratch_store_b128 off, v[25:28], s7
+; GFX11-NEXT:    s_waitcnt vmcnt(1)
+; GFX11-NEXT:    scratch_store_b128 off, v[29:32], s8
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    scratch_store_b32 off, v33, s0
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ptr = load volatile ptr addrspace(1), ptr addrspace(4) undef
   %val = load { i32, <32 x i32> }, ptr addrspace(1) %ptr
   ret { i32, <32 x i32> }%val
 }
 
 ; Make sure the last struct component is returned in v3, not v4.
-; GCN-LABEL: {{^}}v3i32_struct_func_void_wasted_reg:
-; GCN: ds_read_b32 v0,
-; GCN: ds_read_b32 v1,
-; GCN: ds_read_b32 v2,
-; GCN: ds_read_b32 v3,
 define { <3 x i32>, i32 } @v3i32_struct_func_void_wasted_reg() #0 {
+; CI-LABEL: v3i32_struct_func_void_wasted_reg:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    ds_read_b32 v0, v0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    ds_read_b32 v1, v0
+; CI-NEXT:    ds_read_b32 v2, v0
+; CI-NEXT:    ds_read_b32 v3, v0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v3i32_struct_func_void_wasted_reg:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 m0, -1
+; GFX8-NEXT:    ds_read_b32 v0, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    ds_read_b32 v1, v0
+; GFX8-NEXT:    ds_read_b32 v2, v0
+; GFX8-NEXT:    ds_read_b32 v3, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v3i32_struct_func_void_wasted_reg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ds_read_b32 v0, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    ds_read_b32 v1, v0
+; GFX9-NEXT:    ds_read_b32 v2, v0
+; GFX9-NEXT:    ds_read_b32 v3, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v3i32_struct_func_void_wasted_reg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    ds_load_b32 v0, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    ds_load_b32 v1, v0
+; GFX11-NEXT:    ds_load_b32 v2, v0
+; GFX11-NEXT:    ds_load_b32 v3, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load0 = load volatile i32, ptr addrspace(3) undef
   %load1 = load volatile i32, ptr addrspace(3) undef
   %load2 = load volatile i32, ptr addrspace(3) undef
@@ -618,12 +2298,53 @@ define { <3 x i32>, i32 } @v3i32_struct_func_void_wasted_reg() #0 {
   ret { <3 x i32>, i32 } %insert.4
 }
 
-; GCN-LABEL: {{^}}v3f32_struct_func_void_wasted_reg:
-; GCN: ds_read_b32 v0,
-; GCN: ds_read_b32 v1,
-; GCN: ds_read_b32 v2,
-; GCN: ds_read_b32 v3,
 define { <3 x float>, i32 } @v3f32_struct_func_void_wasted_reg() #0 {
+; CI-LABEL: v3f32_struct_func_void_wasted_reg:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    ds_read_b32 v0, v0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    ds_read_b32 v1, v0
+; CI-NEXT:    ds_read_b32 v2, v0
+; CI-NEXT:    ds_read_b32 v3, v0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v3f32_struct_func_void_wasted_reg:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    s_mov_b32 m0, -1
+; GFX8-NEXT:    ds_read_b32 v0, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    ds_read_b32 v1, v0
+; GFX8-NEXT:    ds_read_b32 v2, v0
+; GFX8-NEXT:    ds_read_b32 v3, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v3f32_struct_func_void_wasted_reg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    ds_read_b32 v0, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    ds_read_b32 v1, v0
+; GFX9-NEXT:    ds_read_b32 v2, v0
+; GFX9-NEXT:    ds_read_b32 v3, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v3f32_struct_func_void_wasted_reg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    ds_load_b32 v0, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    ds_load_b32 v1, v0
+; GFX11-NEXT:    ds_load_b32 v2, v0
+; GFX11-NEXT:    ds_load_b32 v3, v0
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %load0 = load volatile float, ptr addrspace(3) undef
   %load1 = load volatile float, ptr addrspace(3) undef
   %load2 = load volatile float, ptr addrspace(3) undef
@@ -637,14 +2358,54 @@ define { <3 x float>, i32 } @v3f32_struct_func_void_wasted_reg() #0 {
   ret { <3 x float>, i32 } %insert.4
 }
 
-; GCN-LABEL: {{^}}void_func_sret_max_known_zero_bits:
-; GCN: v_lshrrev_b32_e32 [[LSHR16:v[0-9]+]], 16, v0
-; GCN: ds_write_b32 {{v[0-9]+}}, [[LSHR16]]
-
-; GCN: v_mov_b32_e32 [[HIGH_BITS:v[0-9]+]], 0
-; GCN: ds_write_b32 {{v[0-9]+}}, [[HIGH_BITS]]
-; GCN-NEXT: ds_write_b32 {{v[0-9]+}}, [[HIGH_BITS]]
 define void @void_func_sret_max_known_zero_bits(ptr addrspace(5) sret(i8) %arg0) #0 {
+; CI-LABEL: void_func_sret_max_known_zero_bits:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    ds_write_b32 v0, v0
+; CI-NEXT:    v_mov_b32_e32 v0, 0
+; CI-NEXT:    ds_write_b32 v0, v0
+; CI-NEXT:    ds_write_b32 v0, v0
+; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: void_func_sret_max_known_zero_bits:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    s_mov_b32 m0, -1
+; GFX8-NEXT:    ds_write_b32 v0, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0
+; GFX8-NEXT:    ds_write_b32 v0, v0
+; GFX8-NEXT:    ds_write_b32 v0, v0
+; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: void_func_sret_max_known_zero_bits:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    ds_write_b32 v0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9-NEXT:    ds_write_b32 v0, v0
+; GFX9-NEXT:    ds_write_b32 v0, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: void_func_sret_max_known_zero_bits:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 17, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    ds_store_b32 v0, v1
+; GFX11-NEXT:    ds_store_b32 v0, v0
+; GFX11-NEXT:    ds_store_b32 v0, v2
+; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %arg0.int = ptrtoint ptr addrspace(5) %arg0 to i32
 
   %lshr0 = lshr i32 %arg0.int, 16

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
index 1bfe0aa4086e7..7edcc93e55f89 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
@@ -1,12 +1,24 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=VERDE %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefixes=GFX68,VERDE %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=GFX68,GFX8 %s
+; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefixes=GFX11 %s
 
-;CHECK-LABEL: {{^}}buffer_store:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-;CHECK: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 glc
-;CHECK: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 slc
 define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
+; GFX68-LABEL: buffer_store:
+; GFX68:       ; %bb.0: ; %main_body
+; GFX68-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX68-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 glc
+; GFX68-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 slc
+; GFX68-NEXT:    s_endpgm
+;
+; GFX11-LABEL: buffer_store:
+; GFX11:       ; %bb.0: ; %main_body
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0 glc
+; GFX11-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0 slc
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i32 0)
   call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 1)
@@ -14,34 +26,65 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: {{^}}buffer_store_immoffs:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:42
 define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
+; GFX68-LABEL: buffer_store_immoffs:
+; GFX68:       ; %bb.0: ; %main_body
+; GFX68-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:42
+; GFX68-NEXT:    s_endpgm
+;
+; GFX11-LABEL: buffer_store_immoffs:
+; GFX11:       ; %bb.0: ; %main_body
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 offset:42
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 42, i32 0, i32 0)
   ret void
 }
 
-;CHECK-LABEL: {{^}}buffer_store_ofs:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
 define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) {
+; GFX68-LABEL: buffer_store_ofs:
+; GFX68:       ; %bb.0: ; %main_body
+; GFX68-NEXT:    buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
+; GFX68-NEXT:    s_endpgm
+;
+; GFX11-LABEL: buffer_store_ofs:
+; GFX11:       ; %bb.0: ; %main_body
+; GFX11-NEXT:    buffer_store_b128 v[0:3], v4, s[0:3], 0 offen
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0)
   ret void
 }
 
 ; Ideally, the register allocator would avoid the wait here
-;
-;CHECK-LABEL: {{^}}buffer_store_wait:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
-;VERDE: s_waitcnt expcnt(0)
-;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 offen
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 offen
 define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) {
+; VERDE-LABEL: buffer_store_wait:
+; VERDE:       ; %bb.0: ; %main_body
+; VERDE-NEXT:    buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
+; VERDE-NEXT:    s_waitcnt expcnt(0)
+; VERDE-NEXT:    buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 offen
+; VERDE-NEXT:    s_waitcnt vmcnt(0)
+; VERDE-NEXT:    buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 offen
+; VERDE-NEXT:    s_endpgm
+;
+; GFX8-LABEL: buffer_store_wait:
+; GFX8:       ; %bb.0: ; %main_body
+; GFX8-NEXT:    buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
+; GFX8-NEXT:    buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 offen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 offen
+; GFX8-NEXT:    s_endpgm
+;
+; GFX11-LABEL: buffer_store_wait:
+; GFX11:       ; %bb.0: ; %main_body
+; GFX11-NEXT:    buffer_store_b128 v[0:3], v4, s[0:3], 0 offen
+; GFX11-NEXT:    buffer_load_b128 v[0:3], v5, s[0:3], 0 offen
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_store_b128 v[0:3], v6, s[0:3], 0 offen
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0)
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 %3, i32 0, i32 0)
@@ -49,29 +92,52 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: {{^}}buffer_store_x1:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dword v0, v1, s[0:3], 0 offen
 define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %offset) {
+; GFX68-LABEL: buffer_store_x1:
+; GFX68:       ; %bb.0: ; %main_body
+; GFX68-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX68-NEXT:    s_endpgm
+;
+; GFX11-LABEL: buffer_store_x1:
+; GFX11:       ; %bb.0: ; %main_body
+; GFX11-NEXT:    buffer_store_b32 v0, v1, s[0:3], 0 offen
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0)
   ret void
 }
 
-;CHECK-LABEL: {{^}}buffer_store_x2:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen
 define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %offset) #0 {
+; GFX68-LABEL: buffer_store_x2:
+; GFX68:       ; %bb.0: ; %main_body
+; GFX68-NEXT:    buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen
+; GFX68-NEXT:    s_endpgm
+;
+; GFX11-LABEL: buffer_store_x2:
+; GFX11:       ; %bb.0: ; %main_body
+; GFX11-NEXT:    buffer_store_b64 v[0:1], v2, s[0:3], 0 offen
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0)
   ret void
 }
 
-;CHECK-LABEL: {{^}}buffer_store_x1_offen_merged_and:
-;CHECK-NOT: s_waitcnt
-;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
-;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28
 define amdgpu_ps void @buffer_store_x1_offen_merged_and(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
+; GFX68-LABEL: buffer_store_x1_offen_merged_and:
+; GFX68:       ; %bb.0:
+; GFX68-NEXT:    buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4
+; GFX68-NEXT:    buffer_store_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28
+; GFX68-NEXT:    s_endpgm
+;
+; GFX11-LABEL: buffer_store_x1_offen_merged_and:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    buffer_store_b128 v[1:4], v0, s[0:3], 0 offen offset:4
+; GFX11-NEXT:    buffer_store_b64 v[5:6], v0, s[0:3], 0 offen offset:28
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   %a1 = add i32 %a, 4
   %a2 = add i32 %a, 8
   %a3 = add i32 %a, 12
@@ -87,11 +153,22 @@ define amdgpu_ps void @buffer_store_x1_offen_merged_and(<4 x i32> inreg %rsrc, i
   ret void
 }
 
-;CHECK-LABEL: {{^}}buffer_store_x1_offen_merged_or:
-;CHECK-NOT: s_waitcnt
-;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:4
-;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:28
 define amdgpu_ps void @buffer_store_x1_offen_merged_or(<4 x i32> inreg %rsrc, i32 %inp, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
+; GFX68-LABEL: buffer_store_x1_offen_merged_or:
+; GFX68:       ; %bb.0:
+; GFX68-NEXT:    v_lshlrev_b32_e32 v0, 6, v0
+; GFX68-NEXT:    buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4
+; GFX68-NEXT:    buffer_store_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28
+; GFX68-NEXT:    s_endpgm
+;
+; GFX11-LABEL: buffer_store_x1_offen_merged_or:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 6, v0
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    buffer_store_b128 v[1:4], v0, s[0:3], 0 offen offset:4
+; GFX11-NEXT:    buffer_store_b64 v[5:6], v0, s[0:3], 0 offen offset:28
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   %a = shl i32 %inp, 6
   %a1 = add i32 %a, 4
   %a2 = add i32 %a, 8
@@ -108,13 +185,22 @@ define amdgpu_ps void @buffer_store_x1_offen_merged_or(<4 x i32> inreg %rsrc, i3
   ret void
 }
 
-
-;CHECK-LABEL: {{^}}buffer_store_x1_offen_merged_glc_slc:
-;CHECK-NOT: s_waitcnt
-;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4{{$}}
-;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:12 glc{{$}}
-;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 glc slc{{$}}
 define amdgpu_ps void @buffer_store_x1_offen_merged_glc_slc(<4 x i32> inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
+; GFX68-LABEL: buffer_store_x1_offen_merged_glc_slc:
+; GFX68:       ; %bb.0:
+; GFX68-NEXT:    buffer_store_dwordx2 v[1:2], v0, s[0:3], 0 offen offset:4
+; GFX68-NEXT:    buffer_store_dwordx2 v[3:4], v0, s[0:3], 0 offen offset:12 glc
+; GFX68-NEXT:    buffer_store_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28 glc slc
+; GFX68-NEXT:    s_endpgm
+;
+; GFX11-LABEL: buffer_store_x1_offen_merged_glc_slc:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    buffer_store_b64 v[1:2], v0, s[0:3], 0 offen offset:4
+; GFX11-NEXT:    buffer_store_b64 v[3:4], v0, s[0:3], 0 offen offset:12 glc
+; GFX11-NEXT:    buffer_store_b64 v[5:6], v0, s[0:3], 0 offen offset:28 glc slc
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   %a1 = add i32 %a, 4
   %a2 = add i32 %a, 8
   %a3 = add i32 %a, 12
@@ -130,10 +216,17 @@ define amdgpu_ps void @buffer_store_x1_offen_merged_glc_slc(<4 x i32> inreg %rsr
   ret void
 }
 
-;CHECK-LABEL: {{^}}buffer_store_x2_offen_merged_and:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4
 define amdgpu_ps void @buffer_store_x2_offen_merged_and(<4 x i32> inreg %rsrc, i32 %a, <2 x float> %v1, <2 x float> %v2) {
+; GFX68-LABEL: buffer_store_x2_offen_merged_and:
+; GFX68:       ; %bb.0:
+; GFX68-NEXT:    buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4
+; GFX68-NEXT:    s_endpgm
+;
+; GFX11-LABEL: buffer_store_x2_offen_merged_and:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    buffer_store_b128 v[1:4], v0, s[0:3], 0 offen offset:4
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   %a1 = add i32 %a, 4
   %a2 = add i32 %a, 12
   call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 %a1, i32 0, i32 0)
@@ -141,10 +234,19 @@ define amdgpu_ps void @buffer_store_x2_offen_merged_and(<4 x i32> inreg %rsrc, i
   ret void
 }
 
-;CHECK-LABEL: {{^}}buffer_store_x2_offen_merged_or:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:4
 define amdgpu_ps void @buffer_store_x2_offen_merged_or(<4 x i32> inreg %rsrc, i32 %inp, <2 x float> %v1, <2 x float> %v2) {
+; GFX68-LABEL: buffer_store_x2_offen_merged_or:
+; GFX68:       ; %bb.0:
+; GFX68-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX68-NEXT:    buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4
+; GFX68-NEXT:    s_endpgm
+;
+; GFX11-LABEL: buffer_store_x2_offen_merged_or:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX11-NEXT:    buffer_store_b128 v[1:4], v0, s[0:3], 0 offen offset:4
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   %a = shl i32 %inp, 4
   %a1 = add i32 %a, 4
   %a2 = add i32 %a, 12
@@ -153,11 +255,20 @@ define amdgpu_ps void @buffer_store_x2_offen_merged_or(<4 x i32> inreg %rsrc, i3
   ret void
 }
 
-;CHECK-LABEL: {{^}}buffer_store_x1_offset_merged:
-;CHECK-NOT: s_waitcnt
-;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
-;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28
 define amdgpu_ps void @buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
+; GFX68-LABEL: buffer_store_x1_offset_merged:
+; GFX68:       ; %bb.0:
+; GFX68-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:4
+; GFX68-NEXT:    buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:28
+; GFX68-NEXT:    s_endpgm
+;
+; GFX11-LABEL: buffer_store_x1_offset_merged:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 offset:4
+; GFX11-NEXT:    buffer_store_b64 v[4:5], off, s[0:3], 0 offset:28
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0)
   call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 0)
   call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 0)
@@ -167,21 +278,38 @@ define amdgpu_ps void @buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, floa
   ret void
 }
 
-;CHECK-LABEL: {{^}}buffer_store_x2_offset_merged:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
 define amdgpu_ps void @buffer_store_x2_offset_merged(<4 x i32> inreg %rsrc, <2 x float> %v1,<2 x float> %v2) {
+; GFX68-LABEL: buffer_store_x2_offset_merged:
+; GFX68:       ; %bb.0:
+; GFX68-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:4
+; GFX68-NEXT:    s_endpgm
+;
+; GFX11-LABEL: buffer_store_x2_offset_merged:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 offset:4
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0)
   call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 12, i32 0, i32 0)
   ret void
 }
 
-;CHECK-LABEL: {{^}}buffer_store_int:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0
-;CHECK: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 glc
-;CHECK: buffer_store_dword v6, off, s[0:3], 0 slc
 define amdgpu_ps void @buffer_store_int(<4 x i32> inreg, <4 x i32>, <2 x i32>, i32) {
+; GFX68-LABEL: buffer_store_int:
+; GFX68:       ; %bb.0: ; %main_body
+; GFX68-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GFX68-NEXT:    buffer_store_dwordx2 v[4:5], off, s[0:3], 0 glc
+; GFX68-NEXT:    buffer_store_dword v6, off, s[0:3], 0 slc
+; GFX68-NEXT:    s_endpgm
+;
+; GFX11-LABEL: buffer_store_int:
+; GFX11:       ; %bb.0: ; %main_body
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0
+; GFX11-NEXT:    buffer_store_b64 v[4:5], off, s[0:3], 0 glc
+; GFX11-NEXT:    buffer_store_b32 v6, off, s[0:3], 0 slc
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %1, <4 x i32> %0, i32 0, i32 0, i32 0)
   call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> %2, <4 x i32> %0, i32 0, i32 0, i32 1)
@@ -189,12 +317,19 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: {{^}}raw_buffer_store_byte:
-;CHECK-NEXT: %bb.
-;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}}
-;CHECK-NEXT: buffer_store_byte v{{[0-9]}}, off, s[0:3], 0
-;CHECK-NEXT: s_endpgm
 define amdgpu_ps void @raw_buffer_store_byte(<4 x i32> inreg %rsrc, float %v1) {
+; GFX68-LABEL: raw_buffer_store_byte:
+; GFX68:       ; %bb.0: ; %main_body
+; GFX68-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX68-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; GFX68-NEXT:    s_endpgm
+;
+; GFX11-LABEL: raw_buffer_store_byte:
+; GFX11:       ; %bb.0: ; %main_body
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX11-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
 main_body:
   %v2 = fptoui float %v1 to i32
   %v3 = trunc i32 %v2 to i8
@@ -202,12 +337,19 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: {{^}}raw_buffer_store_short:
-;CHECK-NEXT: %bb.
-;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}}
-;CHECK-NEXT: buffer_store_short v{{[0-9]}}, off, s[0:3], 0
-;CHECK-NEXT: s_endpgm
 define amdgpu_ps void @raw_buffer_store_short(<4 x i32> inreg %rsrc, float %v1) {
+; GFX68-LABEL: raw_buffer_store_short:
+; GFX68:       ; %bb.0: ; %main_body
+; GFX68-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX68-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX68-NEXT:    s_endpgm
+;
+; GFX11-LABEL: raw_buffer_store_short:
+; GFX11:       ; %bb.0: ; %main_body
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
 main_body:
   %v2 = fptoui float %v1 to i32
   %v3 = trunc i32 %v2 to i16
@@ -215,12 +357,17 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: {{^}}raw_buffer_store_f16:
-;CHECK-NEXT: %bb.
-;CHECK-NOT: v0
-;CHECK-NEXT: buffer_store_short v0, off, s[0:3], 0
-;CHECK-NEXT: s_endpgm
 define amdgpu_ps void @raw_buffer_store_f16(<4 x i32> inreg %rsrc, i32 %v1) {
+; GFX68-LABEL: raw_buffer_store_f16:
+; GFX68:       ; %bb.0: ; %main_body
+; GFX68-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX68-NEXT:    s_endpgm
+;
+; GFX11-LABEL: raw_buffer_store_f16:
+; GFX11:       ; %bb.0: ; %main_body
+; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
 main_body:
   %trunc = trunc i32 %v1 to i16
   %cast = bitcast i16 %trunc to half
@@ -228,59 +375,142 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: {{^}}buffer_store_v2f16:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dword v0, v1, s[0:3], 0 offen
 define amdgpu_ps void @buffer_store_v2f16(<4 x i32> inreg %rsrc, <2 x half> %data, i32 %offset) {
+; VERDE-LABEL: buffer_store_v2f16:
+; VERDE:       ; %bb.0: ; %main_body
+; VERDE-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; VERDE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; VERDE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; VERDE-NEXT:    v_or_b32_e32 v0, v0, v1
+; VERDE-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
+; VERDE-NEXT:    s_endpgm
+;
+; GFX8-LABEL: buffer_store_v2f16:
+; GFX8:       ; %bb.0: ; %main_body
+; GFX8-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX8-NEXT:    s_endpgm
+;
+; GFX11-LABEL: buffer_store_v2f16:
+; GFX11:       ; %bb.0: ; %main_body
+; GFX11-NEXT:    buffer_store_b32 v0, v1, s[0:3], 0 offen
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0)
   ret void
 }
 
-;CHECK-LABEL: {{^}}buffer_store_v4f16:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen
 define amdgpu_ps void @buffer_store_v4f16(<4 x i32> inreg %rsrc, <4 x half> %data, i32 %offset) #0 {
+; VERDE-LABEL: buffer_store_v4f16:
+; VERDE:       ; %bb.0: ; %main_body
+; VERDE-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; VERDE-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; VERDE-NEXT:    v_cvt_f16_f32_e32 v5, v1
+; VERDE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; VERDE-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; VERDE-NEXT:    v_or_b32_e32 v1, v2, v1
+; VERDE-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; VERDE-NEXT:    v_or_b32_e32 v0, v0, v2
+; VERDE-NEXT:    buffer_store_dwordx2 v[0:1], v4, s[0:3], 0 offen
+; VERDE-NEXT:    s_endpgm
+;
+; GFX8-LABEL: buffer_store_v4f16:
+; GFX8:       ; %bb.0: ; %main_body
+; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen
+; GFX8-NEXT:    s_endpgm
+;
+; GFX11-LABEL: buffer_store_v4f16:
+; GFX11:       ; %bb.0: ; %main_body
+; GFX11-NEXT:    buffer_store_b64 v[0:1], v2, s[0:3], 0 offen
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0)
   ret void
 }
 
-;CHECK-LABEL: {{^}}raw_buffer_store_i16:
-;CHECK-NEXT: %bb.
-;CHECK-NOT: v0
-;CHECK-NEXT: buffer_store_short v0, off, s[0:3], 0
-;CHECK-NEXT: s_endpgm
 define amdgpu_ps void @raw_buffer_store_i16(<4 x i32> inreg %rsrc, i32 %v1) {
+; GFX68-LABEL: raw_buffer_store_i16:
+; GFX68:       ; %bb.0: ; %main_body
+; GFX68-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX68-NEXT:    s_endpgm
+;
+; GFX11-LABEL: raw_buffer_store_i16:
+; GFX11:       ; %bb.0: ; %main_body
+; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
 main_body:
   %trunc = trunc i32 %v1 to i16
   call void @llvm.amdgcn.raw.buffer.store.i16(i16 %trunc, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
   ret void
 }
 
-;CHECK-LABEL: {{^}}buffer_store_v2i16:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dword v0, v1, s[0:3], 0 offen
 define amdgpu_ps void @buffer_store_v2i16(<4 x i32> inreg %rsrc, <2 x i16> %data, i32 %offset) {
+; VERDE-LABEL: buffer_store_v2i16:
+; VERDE:       ; %bb.0: ; %main_body
+; VERDE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; VERDE-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; VERDE-NEXT:    v_or_b32_e32 v0, v0, v1
+; VERDE-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 offen
+; VERDE-NEXT:    s_endpgm
+;
+; GFX8-LABEL: buffer_store_v2i16:
+; GFX8:       ; %bb.0: ; %main_body
+; GFX8-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX8-NEXT:    s_endpgm
+;
+; GFX11-LABEL: buffer_store_v2i16:
+; GFX11:       ; %bb.0: ; %main_body
+; GFX11-NEXT:    buffer_store_b32 v0, v1, s[0:3], 0 offen
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.buffer.store.v2i16(<2 x i16> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0)
   ret void
 }
 
-;CHECK-LABEL: {{^}}buffer_store_v4i16:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen
 define amdgpu_ps void @buffer_store_v4i16(<4 x i32> inreg %rsrc, <4 x i16> %data, i32 %offset) #0 {
+; VERDE-LABEL: buffer_store_v4i16:
+; VERDE:       ; %bb.0: ; %main_body
+; VERDE-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; VERDE-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; VERDE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; VERDE-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; VERDE-NEXT:    v_or_b32_e32 v2, v2, v3
+; VERDE-NEXT:    v_or_b32_e32 v1, v0, v1
+; VERDE-NEXT:    buffer_store_dwordx2 v[1:2], v4, s[0:3], 0 offen
+; VERDE-NEXT:    s_endpgm
+;
+; GFX8-LABEL: buffer_store_v4i16:
+; GFX8:       ; %bb.0: ; %main_body
+; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen
+; GFX8-NEXT:    s_endpgm
+;
+; GFX11-LABEL: buffer_store_v4i16:
+; GFX11:       ; %bb.0: ; %main_body
+; GFX11-NEXT:    buffer_store_b64 v[0:1], v2, s[0:3], 0 offen
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.buffer.store.v4i16(<4 x i16> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0)
   ret void
 }
 
-;CHECK-LABEL: {{^}}raw_buffer_store_x1_offset_merged:
-;CHECK-NOT: s_waitcnt
-;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
-;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28
 define amdgpu_ps void @raw_buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
+; GFX68-LABEL: raw_buffer_store_x1_offset_merged:
+; GFX68:       ; %bb.0:
+; GFX68-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:4
+; GFX68-NEXT:    buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:28
+; GFX68-NEXT:    s_endpgm
+;
+; GFX11-LABEL: raw_buffer_store_x1_offset_merged:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 offset:4
+; GFX11-NEXT:    buffer_store_b64 v[4:5], off, s[0:3], 0 offset:28
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0)
   call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 0)
   call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 0)
@@ -290,14 +520,28 @@ define amdgpu_ps void @raw_buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc,
   ret void
 }
 
-;CHECK-LABEL: {{^}}raw_buffer_store_x1_offset_swizzled_not_merged:
-;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:4
-;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:8
-;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:12
-;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:16
-;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:28
-;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:32
 define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
+; GFX68-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged:
+; GFX68:       ; %bb.0:
+; GFX68-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:4
+; GFX68-NEXT:    buffer_store_dword v1, off, s[0:3], 0 offset:8
+; GFX68-NEXT:    buffer_store_dword v2, off, s[0:3], 0 offset:12
+; GFX68-NEXT:    buffer_store_dword v3, off, s[0:3], 0 offset:16
+; GFX68-NEXT:    buffer_store_dword v4, off, s[0:3], 0 offset:28
+; GFX68-NEXT:    buffer_store_dword v5, off, s[0:3], 0 offset:32
+; GFX68-NEXT:    s_endpgm
+;
+; GFX11-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_clause 0x5
+; GFX11-NEXT:    buffer_store_b32 v0, off, s[0:3], 0 offset:4
+; GFX11-NEXT:    buffer_store_b32 v1, off, s[0:3], 0 offset:8
+; GFX11-NEXT:    buffer_store_b32 v2, off, s[0:3], 0 offset:12
+; GFX11-NEXT:    buffer_store_b32 v3, off, s[0:3], 0 offset:16
+; GFX11-NEXT:    buffer_store_b32 v4, off, s[0:3], 0 offset:28
+; GFX11-NEXT:    buffer_store_b32 v5, off, s[0:3], 0 offset:32
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 8)
   call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 8)
   call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 8)

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll
index 4c629e227e1f6..a9839005ca7b6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll
@@ -1,12 +1,26 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=VERDE %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
-
-;CHECK-LABEL: {{^}}buffer_store:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen
-;CHECK: buffer_store_dwordx4 v[4:7], {{v[0-9]+}}, s[0:3], 0 idxen glc
-;CHECK: buffer_store_dwordx4 v[8:11], {{v[0-9]+}}, s[0:3], 0 idxen slc
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefixes=GFX68,VERDE %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=GFX68,GFX8 %s
+; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefixes=GFX11 %s
+
 define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
+; GFX68-LABEL: buffer_store:
+; GFX68:       ; %bb.0: ; %main_body
+; GFX68-NEXT:    v_mov_b32_e32 v12, 0
+; GFX68-NEXT:    buffer_store_dwordx4 v[0:3], v12, s[0:3], 0 idxen
+; GFX68-NEXT:    buffer_store_dwordx4 v[4:7], v12, s[0:3], 0 idxen glc
+; GFX68-NEXT:    buffer_store_dwordx4 v[8:11], v12, s[0:3], 0 idxen slc
+; GFX68-NEXT:    s_endpgm
+;
+; GFX11-LABEL: buffer_store:
+; GFX11:       ; %bb.0: ; %main_body
+; GFX11-NEXT:    v_mov_b32_e32 v12, 0
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    buffer_store_b128 v[0:3], v12, s[0:3], 0 idxen
+; GFX11-NEXT:    buffer_store_b128 v[4:7], v12, s[0:3], 0 idxen glc
+; GFX11-NEXT:    buffer_store_b128 v[8:11], v12, s[0:3], 0 idxen slc
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0)
   call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 0, i32 1)
@@ -14,62 +28,123 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: {{^}}buffer_store_immoffs:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen offset:42
 define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
+; GFX68-LABEL: buffer_store_immoffs:
+; GFX68:       ; %bb.0: ; %main_body
+; GFX68-NEXT:    v_mov_b32_e32 v4, 0
+; GFX68-NEXT:    buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen offset:42
+; GFX68-NEXT:    s_endpgm
+;
+; GFX11-LABEL: buffer_store_immoffs:
+; GFX11:       ; %bb.0: ; %main_body
+; GFX11-NEXT:    v_mov_b32_e32 v4, 0
+; GFX11-NEXT:    buffer_store_b128 v[0:3], v4, s[0:3], 0 idxen offset:42
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i32 0, i32 0)
   ret void
 }
 
-;CHECK-LABEL: {{^}}buffer_store_idx:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
 define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) {
+; GFX68-LABEL: buffer_store_idx:
+; GFX68:       ; %bb.0: ; %main_body
+; GFX68-NEXT:    buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
+; GFX68-NEXT:    s_endpgm
+;
+; GFX11-LABEL: buffer_store_idx:
+; GFX11:       ; %bb.0: ; %main_body
+; GFX11-NEXT:    buffer_store_b128 v[0:3], v4, s[0:3], 0 idxen
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0, i32 0)
   ret void
 }
 
-;CHECK-LABEL: {{^}}buffer_store_ofs:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen
 define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) {
+; GFX68-LABEL: buffer_store_ofs:
+; GFX68:       ; %bb.0: ; %main_body
+; GFX68-NEXT:    s_mov_b32 s4, 0
+; GFX68-NEXT:    v_mov_b32_e32 v5, v4
+; GFX68-NEXT:    v_mov_b32_e32 v4, s4
+; GFX68-NEXT:    buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen
+; GFX68-NEXT:    s_endpgm
+;
+; GFX11-LABEL: buffer_store_ofs:
+; GFX11:       ; %bb.0: ; %main_body
+; GFX11-NEXT:    s_mov_b32 s4, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, s4
+; GFX11-NEXT:    buffer_store_b128 v[0:3], v[4:5], s[0:3], 0 idxen offen
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i32 0, i32 0)
   ret void
 }
 
-;CHECK-LABEL: {{^}}buffer_store_both:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen
 define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) {
+; GFX68-LABEL: buffer_store_both:
+; GFX68:       ; %bb.0: ; %main_body
+; GFX68-NEXT:    buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen
+; GFX68-NEXT:    s_endpgm
+;
+; GFX11-LABEL: buffer_store_both:
+; GFX11:       ; %bb.0: ; %main_body
+; GFX11-NEXT:    buffer_store_b128 v[0:3], v[4:5], s[0:3], 0 idxen offen
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i32 0, i32 0)
   ret void
 }
 
-;CHECK-LABEL: {{^}}buffer_store_both_reversed:
-;CHECK: v_mov_b32_e32 v6, v4
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 idxen offen
 define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) {
+; GFX68-LABEL: buffer_store_both_reversed:
+; GFX68:       ; %bb.0: ; %main_body
+; GFX68-NEXT:    v_mov_b32_e32 v6, v4
+; GFX68-NEXT:    buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 idxen offen
+; GFX68-NEXT:    s_endpgm
+;
+; GFX11-LABEL: buffer_store_both_reversed:
+; GFX11:       ; %bb.0: ; %main_body
+; GFX11-NEXT:    v_mov_b32_e32 v6, v4
+; GFX11-NEXT:    buffer_store_b128 v[0:3], v[5:6], s[0:3], 0 idxen offen
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i32 0, i32 0)
   ret void
 }
 
 ; Ideally, the register allocator would avoid the wait here
-;
-;CHECK-LABEL: {{^}}buffer_store_wait:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
-;VERDE: s_waitcnt expcnt(0)
-;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen
 define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) {
+; VERDE-LABEL: buffer_store_wait:
+; VERDE:       ; %bb.0: ; %main_body
+; VERDE-NEXT:    buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
+; VERDE-NEXT:    s_waitcnt expcnt(0)
+; VERDE-NEXT:    buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen
+; VERDE-NEXT:    s_waitcnt vmcnt(0)
+; VERDE-NEXT:    buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen
+; VERDE-NEXT:    s_endpgm
+;
+; GFX8-LABEL: buffer_store_wait:
+; GFX8:       ; %bb.0: ; %main_body
+; GFX8-NEXT:    buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
+; GFX8-NEXT:    buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen
+; GFX8-NEXT:    s_endpgm
+;
+; GFX11-LABEL: buffer_store_wait:
+; GFX11:       ; %bb.0: ; %main_body
+; GFX11-NEXT:    buffer_store_b128 v[0:3], v4, s[0:3], 0 idxen
+; GFX11-NEXT:    buffer_load_b128 v[0:3], v5, s[0:3], 0 idxen
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_store_b128 v[0:3], v6, s[0:3], 0 idxen
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0, i32 0)
   %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %0, i32 %3, i32 0, i32 0, i32 0)
@@ -77,30 +152,56 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: {{^}}buffer_store_x1:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dword v0, v1, s[0:3], 0 idxen
 define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) {
+; GFX68-LABEL: buffer_store_x1:
+; GFX68:       ; %bb.0: ; %main_body
+; GFX68-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 idxen
+; GFX68-NEXT:    s_endpgm
+;
+; GFX11-LABEL: buffer_store_x1:
+; GFX11:       ; %bb.0: ; %main_body
+; GFX11-NEXT:    buffer_store_b32 v0, v1, s[0:3], 0 idxen
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.struct.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
   ret void
 }
 
-;CHECK-LABEL: {{^}}buffer_store_x2:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen
 define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) #0 {
+; GFX68-LABEL: buffer_store_x2:
+; GFX68:       ; %bb.0: ; %main_body
+; GFX68-NEXT:    buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen
+; GFX68-NEXT:    s_endpgm
+;
+; GFX11-LABEL: buffer_store_x2:
+; GFX11:       ; %bb.0: ; %main_body
+; GFX11-NEXT:    buffer_store_b64 v[0:1], v2, s[0:3], 0 idxen
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.struct.buffer.store.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
   ret void
 }
 
-;CHECK-LABEL: {{^}}buffer_store_int:
-;CHECK-NOT: s_waitcnt
-;CHECK: buffer_store_dwordx4 v[0:3], {{v[0-9]+}}, s[0:3], 0 idxen
-;CHECK: buffer_store_dwordx2 v[4:5], {{v[0-9]+}}, s[0:3], 0 idxen glc
-;CHECK: buffer_store_dword v6, {{v[0-9]+}}, s[0:3], 0 idxen slc
 define amdgpu_ps void @buffer_store_int(<4 x i32> inreg, <4 x i32>, <2 x i32>, i32) {
+; GFX68-LABEL: buffer_store_int:
+; GFX68:       ; %bb.0: ; %main_body
+; GFX68-NEXT:    v_mov_b32_e32 v7, 0
+; GFX68-NEXT:    buffer_store_dwordx4 v[0:3], v7, s[0:3], 0 idxen
+; GFX68-NEXT:    buffer_store_dwordx2 v[4:5], v7, s[0:3], 0 idxen glc
+; GFX68-NEXT:    buffer_store_dword v6, v7, s[0:3], 0 idxen slc
+; GFX68-NEXT:    s_endpgm
+;
+; GFX11-LABEL: buffer_store_int:
+; GFX11:       ; %bb.0: ; %main_body
+; GFX11-NEXT:    v_mov_b32_e32 v7, 0
+; GFX11-NEXT:    s_clause 0x2
+; GFX11-NEXT:    buffer_store_b128 v[0:3], v7, s[0:3], 0 idxen
+; GFX11-NEXT:    buffer_store_b64 v[4:5], v7, s[0:3], 0 idxen glc
+; GFX11-NEXT:    buffer_store_b32 v6, v7, s[0:3], 0 idxen slc
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.struct.buffer.store.v4i32(<4 x i32> %1, <4 x i32> %0, i32 0, i32 0, i32 0, i32 0)
   call void @llvm.amdgcn.struct.buffer.store.v2i32(<2 x i32> %2, <4 x i32> %0, i32 0, i32 0, i32 0, i32 1)
@@ -108,12 +209,19 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: {{^}}struct_buffer_store_byte:
-;CHECK-NEXT: %bb.
-;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}}
-;CHECK-NEXT: buffer_store_byte v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen
-;CHECK-NEXT: s_endpgm
 define amdgpu_ps void @struct_buffer_store_byte(<4 x i32> inreg %rsrc, float %v1, i32 %index) {
+; GFX68-LABEL: struct_buffer_store_byte:
+; GFX68:       ; %bb.0: ; %main_body
+; GFX68-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX68-NEXT:    buffer_store_byte v0, v1, s[0:3], 0 idxen
+; GFX68-NEXT:    s_endpgm
+;
+; GFX11-LABEL: struct_buffer_store_byte:
+; GFX11:       ; %bb.0: ; %main_body
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX11-NEXT:    buffer_store_b8 v0, v1, s[0:3], 0 idxen
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
 main_body:
   %v2 = fptoui float %v1 to i32
   %v3 = trunc i32 %v2 to i8
@@ -121,39 +229,89 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: {{^}}struct_buffer_store_f16:
-;CHECK-NEXT: %bb.
-;CHECK-NEXT: v_cvt_f16_f32_e32 v{{[0-9]}}, v{{[0-9]}}
-;CHECK-NEXT: buffer_store_short v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen
-;CHECK-NEXT: s_endpgm
 define amdgpu_ps void @struct_buffer_store_f16(<4 x i32> inreg %rsrc, float %v1, i32 %index) {
+; GFX68-LABEL: struct_buffer_store_f16:
+; GFX68:       ; %bb.0:
+; GFX68-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX68-NEXT:    buffer_store_short v0, v1, s[0:3], 0 idxen
+; GFX68-NEXT:    s_endpgm
+;
+; GFX11-LABEL: struct_buffer_store_f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX11-NEXT:    buffer_store_b16 v0, v1, s[0:3], 0 idxen
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   %v2 = fptrunc float %v1 to half
   call void @llvm.amdgcn.struct.buffer.store.f16(half %v2, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
   ret void
 }
 
-;CHECK-LABEL: {{^}}struct_buffer_store_v2f16:
-;CHECK-NEXT: %bb.
-;CHECK: buffer_store_dword v0, {{v[0-9]+}}, s[0:3], 0 idxen
 define amdgpu_ps void @struct_buffer_store_v2f16(<4 x i32> inreg %rsrc, <2 x half> %v1, i32 %index) {
+; VERDE-LABEL: struct_buffer_store_v2f16:
+; VERDE:       ; %bb.0:
+; VERDE-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; VERDE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; VERDE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; VERDE-NEXT:    v_or_b32_e32 v0, v0, v1
+; VERDE-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 idxen
+; VERDE-NEXT:    s_endpgm
+;
+; GFX8-LABEL: struct_buffer_store_v2f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 idxen
+; GFX8-NEXT:    s_endpgm
+;
+; GFX11-LABEL: struct_buffer_store_v2f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    buffer_store_b32 v0, v1, s[0:3], 0 idxen
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   call void @llvm.amdgcn.struct.buffer.store.v2f16(<2 x half> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
   ret void
 }
 
-;CHECK-LABEL: {{^}}struct_buffer_store_v4f16:
-;CHECK-NEXT: %bb.
-;CHECK: buffer_store_dwordx2 v[0:1], {{v[0-9]+}}, s[0:3], 0 idxen
 define amdgpu_ps void @struct_buffer_store_v4f16(<4 x i32> inreg %rsrc, <4 x half> %v1, i32 %index) {
+; VERDE-LABEL: struct_buffer_store_v4f16:
+; VERDE:       ; %bb.0:
+; VERDE-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; VERDE-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; VERDE-NEXT:    v_cvt_f16_f32_e32 v5, v1
+; VERDE-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; VERDE-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
+; VERDE-NEXT:    v_or_b32_e32 v1, v2, v1
+; VERDE-NEXT:    v_lshlrev_b32_e32 v2, 16, v5
+; VERDE-NEXT:    v_or_b32_e32 v0, v0, v2
+; VERDE-NEXT:    buffer_store_dwordx2 v[0:1], v4, s[0:3], 0 idxen
+; VERDE-NEXT:    s_endpgm
+;
+; GFX8-LABEL: struct_buffer_store_v4f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen
+; GFX8-NEXT:    s_endpgm
+;
+; GFX11-LABEL: struct_buffer_store_v4f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    buffer_store_b64 v[0:1], v2, s[0:3], 0 idxen
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   call void @llvm.amdgcn.struct.buffer.store.v4f16(<4 x half> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
   ret void
 }
 
-;CHECK-LABEL: {{^}}struct_buffer_store_i16:
-;CHECK-NEXT: %bb.
-;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}}
-;CHECK-NEXT: buffer_store_short v{{[0-9]}}, v{{[0-9]}}, s[0:3], 0 idxen
-;CHECK-NEXT: s_endpgm
 define amdgpu_ps void @struct_buffer_store_i16(<4 x i32> inreg %rsrc, float %v1, i32 %index) {
+; GFX68-LABEL: struct_buffer_store_i16:
+; GFX68:       ; %bb.0: ; %main_body
+; GFX68-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX68-NEXT:    buffer_store_short v0, v1, s[0:3], 0 idxen
+; GFX68-NEXT:    s_endpgm
+;
+; GFX11-LABEL: struct_buffer_store_i16:
+; GFX11:       ; %bb.0: ; %main_body
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX11-NEXT:    buffer_store_b16 v0, v1, s[0:3], 0 idxen
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
 main_body:
   %v2 = fptoui float %v1 to i32
   %v3 = trunc i32 %v2 to i16
@@ -161,18 +319,51 @@ main_body:
   ret void
 }
 
-;CHECK-LABEL: {{^}}struct_buffer_store_vif16:
-;CHECK-NEXT: %bb.
-;CHECK: buffer_store_dword v0, {{v[0-9]+}}, s[0:3], 0 idxen
 define amdgpu_ps void @struct_buffer_store_vif16(<4 x i32> inreg %rsrc, <2 x i16> %v1, i32 %index) {
+; VERDE-LABEL: struct_buffer_store_vif16:
+; VERDE:       ; %bb.0:
+; VERDE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; VERDE-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; VERDE-NEXT:    v_or_b32_e32 v0, v0, v1
+; VERDE-NEXT:    buffer_store_dword v0, v2, s[0:3], 0 idxen
+; VERDE-NEXT:    s_endpgm
+;
+; GFX8-LABEL: struct_buffer_store_vif16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    buffer_store_dword v0, v1, s[0:3], 0 idxen
+; GFX8-NEXT:    s_endpgm
+;
+; GFX11-LABEL: struct_buffer_store_vif16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    buffer_store_b32 v0, v1, s[0:3], 0 idxen
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   call void @llvm.amdgcn.struct.buffer.store.v2i16(<2 x i16> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
   ret void
 }
 
-;CHECK-LABEL: {{^}}struct_buffer_store_v4i16:
-;CHECK-NEXT: %bb.
-;CHECK: buffer_store_dwordx2 v[0:1], {{v[0-9]+}}, s[0:3], 0 idxen
 define amdgpu_ps void @struct_buffer_store_v4i16(<4 x i32> inreg %rsrc, <4 x i16> %v1, i32 %index) {
+; VERDE-LABEL: struct_buffer_store_v4i16:
+; VERDE:       ; %bb.0:
+; VERDE-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; VERDE-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; VERDE-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; VERDE-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; VERDE-NEXT:    v_or_b32_e32 v2, v2, v3
+; VERDE-NEXT:    v_or_b32_e32 v1, v0, v1
+; VERDE-NEXT:    buffer_store_dwordx2 v[1:2], v4, s[0:3], 0 idxen
+; VERDE-NEXT:    s_endpgm
+;
+; GFX8-LABEL: struct_buffer_store_v4i16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen
+; GFX8-NEXT:    s_endpgm
+;
+; GFX11-LABEL: struct_buffer_store_v4i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    buffer_store_b64 v[0:1], v2, s[0:3], 0 idxen
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
   call void @llvm.amdgcn.struct.buffer.store.v4i16(<4 x i16> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
   ret void
 }
@@ -192,6 +383,5 @@ declare void @llvm.amdgcn.struct.buffer.store.f16(half, <4 x i32>, i32, i32, i32
 declare void @llvm.amdgcn.struct.buffer.store.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32) #0
 declare void @llvm.amdgcn.struct.buffer.store.v4f16(<4 x half>, <4 x i32>, i32, i32, i32, i32) #0
 
-
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readonly }


        


More information about the llvm-commits mailing list