[llvm] 11c2c16 - [NFC][AMDGPU] Rewrite two test files to avoid completely simplifying away after next patch

Roman Lebedev via llvm-commits llvm-commits at lists.llvm.org
Tue Nov 22 15:38:41 PST 2022


Author: Roman Lebedev
Date: 2022-11-23T02:38:25+03:00
New Revision: 11c2c163de4b7f65ce29cae4f2f36c73392115ae

URL: https://github.com/llvm/llvm-project/commit/11c2c163de4b7f65ce29cae4f2f36c73392115ae
DIFF: https://github.com/llvm/llvm-project/commit/11c2c163de4b7f65ce29cae4f2f36c73392115ae.diff

LOG: [NFC][AMDGPU] Rewrite two test files to avoid completely simplifying away after next patch

Added: 
    

Modified: 
    llvm/test/CodeGen/AMDGPU/flat-scratch.ll
    llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index d00a446ae0397..c0590ce38f28c 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -29,6 +29,11 @@ define amdgpu_kernel void @zero_init_kernel() {
 ; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:36
 ; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:20
 ; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:4
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 4
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: zero_init_kernel:
@@ -38,6 +43,7 @@ define amdgpu_kernel void @zero_init_kernel() {
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
 ; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
 ; GFX10-NEXT:    s_mov_b32 s0, 0
+; GFX10-NEXT:    v_mov_b32_e32 v4, 4
 ; GFX10-NEXT:    s_mov_b32 s1, s0
 ; GFX10-NEXT:    s_mov_b32 s2, s0
 ; GFX10-NEXT:    s_mov_b32 s3, s0
@@ -49,12 +55,15 @@ define amdgpu_kernel void @zero_init_kernel() {
 ; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:36
 ; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:20
 ; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:4
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v4
+; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: zero_init_kernel:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mov_b32_e32 v4, 4
 ; GFX11-NEXT:    s_mov_b32 s1, s0
 ; GFX11-NEXT:    s_mov_b32 s2, s0
 ; GFX11-NEXT:    s_mov_b32 s3, s0
@@ -65,6 +74,9 @@ define amdgpu_kernel void @zero_init_kernel() {
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:36
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:20
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:4
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use v4
+; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
@@ -93,6 +105,11 @@ define amdgpu_kernel void @zero_init_kernel() {
 ; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:36
 ; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:20
 ; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:4
+; GFX9-PAL-NEXT:    s_nop 0
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 4
+; GFX9-PAL-NEXT:    ;;#ASMSTART
+; GFX9-PAL-NEXT:    ; use v0
+; GFX9-PAL-NEXT:    ;;#ASMEND
 ; GFX9-PAL-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: zero_init_kernel:
@@ -107,6 +124,11 @@ define amdgpu_kernel void @zero_init_kernel() {
 ; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:36
 ; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:20
 ; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_mov_b32_e32 v0, 4
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use v0
+; GFX940-NEXT:    ;;#ASMEND
 ; GFX940-NEXT:    s_endpgm
 ;
 ; GFX1010-PAL-LABEL: zero_init_kernel:
@@ -132,10 +154,14 @@ define amdgpu_kernel void @zero_init_kernel() {
 ; GFX1010-PAL-NEXT:    s_mov_b32 s2, 0
 ; GFX1010-PAL-NEXT:    s_mov_b32 s1, 0
 ; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v4, 4
 ; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s2 offset:52
 ; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s1 offset:36
 ; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:20
 ; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:4
+; GFX1010-PAL-NEXT:    ;;#ASMSTART
+; GFX1010-PAL-NEXT:    ; use v4
+; GFX1010-PAL-NEXT:    ;;#ASMEND
 ; GFX1010-PAL-NEXT:    s_endpgm
 ;
 ; GFX1030-PAL-LABEL: zero_init_kernel:
@@ -150,6 +176,7 @@ define amdgpu_kernel void @zero_init_kernel() {
 ; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
 ; GFX1030-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
 ; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v4, 4
 ; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
 ; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
 ; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
@@ -161,12 +188,15 @@ define amdgpu_kernel void @zero_init_kernel() {
 ; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:36
 ; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:20
 ; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:4
+; GFX1030-PAL-NEXT:    ;;#ASMSTART
+; GFX1030-PAL-NEXT:    ; use v4
+; GFX1030-PAL-NEXT:    ;;#ASMEND
 ; GFX1030-PAL-NEXT:    s_endpgm
 ;
 ; GFX11-PAL-LABEL: zero_init_kernel:
 ; GFX11-PAL:       ; %bb.0:
 ; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
-; GFX11-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-PAL-NEXT:    v_mov_b32_e32 v4, 4
 ; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
 ; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
 ; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
@@ -177,11 +207,15 @@ define amdgpu_kernel void @zero_init_kernel() {
 ; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:36
 ; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:20
 ; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:4
+; GFX11-PAL-NEXT:    ;;#ASMSTART
+; GFX11-PAL-NEXT:    ; use v4
+; GFX11-PAL-NEXT:    ;;#ASMEND
 ; GFX11-PAL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-PAL-NEXT:    s_endpgm
   %alloca = alloca [32 x i16], align 2, addrspace(5)
   %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
   call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
+  call void asm sideeffect "; use $0", "s"([32 x i16] addrspace(5)* %alloca) #0
   ret void
 }
 
@@ -201,6 +235,11 @@ define void @zero_init_foo() {
 ; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
 ; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
 ; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s32
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -209,6 +248,7 @@ define void @zero_init_foo() {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_mov_b32 s0, 0
+; GFX10-NEXT:    v_mov_b32_e32 v4, s32
 ; GFX10-NEXT:    s_mov_b32 s1, s0
 ; GFX10-NEXT:    s_mov_b32 s2, s0
 ; GFX10-NEXT:    s_mov_b32 s3, s0
@@ -220,6 +260,9 @@ define void @zero_init_foo() {
 ; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
 ; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
 ; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v4
+; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -228,7 +271,7 @@ define void @zero_init_foo() {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_mov_b32_e32 v4, s32
 ; GFX11-NEXT:    s_mov_b32 s1, s0
 ; GFX11-NEXT:    s_mov_b32 s2, s0
 ; GFX11-NEXT:    s_mov_b32 s3, s0
@@ -239,6 +282,9 @@ define void @zero_init_foo() {
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:32
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use v4
+; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -257,6 +303,11 @@ define void @zero_init_foo() {
 ; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
 ; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
 ; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
+; GFX9-PAL-NEXT:    s_nop 0
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s32
+; GFX9-PAL-NEXT:    ;;#ASMSTART
+; GFX9-PAL-NEXT:    ; use v0
+; GFX9-PAL-NEXT:    ;;#ASMEND
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -273,6 +324,11 @@ define void @zero_init_foo() {
 ; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
 ; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
 ; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_mov_b32_e32 v0, s32
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use v0
+; GFX940-NEXT:    ;;#ASMEND
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -281,6 +337,7 @@ define void @zero_init_foo() {
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
+; GFX10-PAL-NEXT:    v_mov_b32_e32 v4, s32
 ; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
 ; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
 ; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
@@ -292,6 +349,9 @@ define void @zero_init_foo() {
 ; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
 ; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
 ; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
+; GFX10-PAL-NEXT:    ;;#ASMSTART
+; GFX10-PAL-NEXT:    ; use v4
+; GFX10-PAL-NEXT:    ;;#ASMEND
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -300,7 +360,7 @@ define void @zero_init_foo() {
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
-; GFX11-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-PAL-NEXT:    v_mov_b32_e32 v4, s32
 ; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
 ; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
 ; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
@@ -311,26 +371,15 @@ define void @zero_init_foo() {
 ; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:32
 ; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:16
 ; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32
+; GFX11-PAL-NEXT:    ;;#ASMSTART
+; GFX11-PAL-NEXT:    ; use v4
+; GFX11-PAL-NEXT:    ;;#ASMEND
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
-; GCN-LABEL: zero_init_foo:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s0, 0
-; GCN-NEXT:    s_mov_b32 s1, s0
-; GCN-NEXT:    s_mov_b32 s2, s0
-; GCN-NEXT:    s_mov_b32 s3, s0
-; GCN-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GCN-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GCN-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48
-; GCN-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:32
-; GCN-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16
-; GCN-NEXT:    scratch_store_dwordx4 off, v[0:3], s32
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca [32 x i16], align 2, addrspace(5)
   %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
   call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
+  call void asm sideeffect "; use $0", "s"([32 x i16] addrspace(5)* %alloca) #0
   ret void
 }
 
@@ -351,6 +400,10 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
 ; GFX9-NEXT:    s_add_i32 s0, s0, 4
 ; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, 4
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: store_load_sindex_kernel:
@@ -371,6 +424,10 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dword v0, off, s1 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, 4
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v0
+; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: store_load_sindex_kernel:
@@ -387,6 +444,10 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, 4
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use v0
+; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-PAL-LABEL: store_load_sindex_kernel:
@@ -409,6 +470,10 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
 ; GFX9-PAL-NEXT:    s_add_i32 s0, s0, 4
 ; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 4
+; GFX9-PAL-NEXT:    ;;#ASMSTART
+; GFX9-PAL-NEXT:    ; use v0
+; GFX9-PAL-NEXT:    ;;#ASMEND
 ; GFX9-PAL-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: store_load_sindex_kernel:
@@ -425,6 +490,10 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
 ; GFX940-NEXT:    s_add_i32 s0, s0, 4
 ; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v0, 4
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use v0
+; GFX940-NEXT:    ;;#ASMEND
 ; GFX940-NEXT:    s_endpgm
 ;
 ; GFX10-PAL-LABEL: store_load_sindex_kernel:
@@ -450,6 +519,10 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 4
+; GFX10-PAL-NEXT:    ;;#ASMSTART
+; GFX10-PAL-NEXT:    ; use v0
+; GFX10-PAL-NEXT:    ;;#ASMEND
 ; GFX10-PAL-NEXT:    s_endpgm
 ;
 ; GFX11-PAL-LABEL: store_load_sindex_kernel:
@@ -466,22 +539,11 @@ define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 4
+; GFX11-PAL-NEXT:    ;;#ASMSTART
+; GFX11-PAL-NEXT:    ; use v0
+; GFX11-PAL-NEXT:    ;;#ASMEND
 ; GFX11-PAL-NEXT:    s_endpgm
-; GCN-LABEL: store_load_sindex_kernel:
-; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dword s0, s[0:1], 0x24
-; GCN-NEXT:    v_mov_b32_e32 v0, 15
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshl_b32 s1, s0, 2
-; GCN-NEXT:    s_and_b32 s0, s0, 15
-; GCN-NEXT:    s_lshl_b32 s0, s0, 2
-; GCN-NEXT:    s_add_u32 s1, 4, s1
-; GCN-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_add_u32 s0, 4, s0
-; GCN-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_endpgm
 bb:
   %i = alloca [32 x float], align 4, addrspace(5)
   %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
@@ -492,6 +554,7 @@ bb:
   %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
   %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
   %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
+  call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0
   ret void
 }
 
@@ -510,6 +573,10 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
 ; GFX9-NEXT:    s_add_i32 s0, s0, 4
 ; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, 4
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: store_load_sindex_foo:
@@ -528,6 +595,10 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dword v0, off, s0 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, 4
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v0
+; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: store_load_sindex_foo:
@@ -542,6 +613,10 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, 4
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use v0
+; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-PAL-LABEL: store_load_sindex_foo:
@@ -563,6 +638,10 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
 ; GFX9-PAL-NEXT:    s_add_i32 s0, s0, 4
 ; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 4
+; GFX9-PAL-NEXT:    ;;#ASMSTART
+; GFX9-PAL-NEXT:    ; use v0
+; GFX9-PAL-NEXT:    ;;#ASMEND
 ; GFX9-PAL-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: store_load_sindex_foo:
@@ -577,6 +656,10 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
 ; GFX940-NEXT:    s_add_i32 s0, s0, 4
 ; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v0, 4
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use v0
+; GFX940-NEXT:    ;;#ASMEND
 ; GFX940-NEXT:    s_endpgm
 ;
 ; GFX10-PAL-LABEL: store_load_sindex_foo:
@@ -600,6 +683,10 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 4
+; GFX10-PAL-NEXT:    ;;#ASMSTART
+; GFX10-PAL-NEXT:    ; use v0
+; GFX10-PAL-NEXT:    ;;#ASMEND
 ; GFX10-PAL-NEXT:    s_endpgm
 ;
 ; GFX11-PAL-LABEL: store_load_sindex_foo:
@@ -614,20 +701,11 @@ define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 4
+; GFX11-PAL-NEXT:    ;;#ASMSTART
+; GFX11-PAL-NEXT:    ; use v0
+; GFX11-PAL-NEXT:    ;;#ASMEND
 ; GFX11-PAL-NEXT:    s_endpgm
-; GCN-LABEL: store_load_sindex_foo:
-; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_lshl_b32 s1, s0, 2
-; GCN-NEXT:    s_and_b32 s0, s0, 15
-; GCN-NEXT:    s_lshl_b32 s0, s0, 2
-; GCN-NEXT:    s_add_u32 s1, 4, s1
-; GCN-NEXT:    v_mov_b32_e32 v0, 15
-; GCN-NEXT:    scratch_store_dword off, v0, s1 sc0 sc1
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_add_u32 s0, 4, s0
-; GCN-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_endpgm
 bb:
   %i = alloca [32 x float], align 4, addrspace(5)
   %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
@@ -638,6 +716,7 @@ bb:
   %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
   %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
   %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
+  call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0
   ret void
 }
 
@@ -654,6 +733,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX9-NEXT:    v_sub_u32_e32 v0, 4, v0
 ; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, 4
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: store_load_vindex_kernel:
@@ -670,6 +753,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, 4
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v0
+; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: store_load_vindex_kernel:
@@ -681,6 +768,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, 4
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use v0
+; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-PAL-LABEL: store_load_vindex_kernel:
@@ -700,6 +791,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 4
+; GFX9-PAL-NEXT:    ;;#ASMSTART
+; GFX9-PAL-NEXT:    ; use v0
+; GFX9-PAL-NEXT:    ;;#ASMEND
 ; GFX9-PAL-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: store_load_vindex_kernel:
@@ -711,6 +806,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX940-NEXT:    v_sub_u32_e32 v0, 4, v0
 ; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v0, 4
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use v0
+; GFX940-NEXT:    ;;#ASMEND
 ; GFX940-NEXT:    s_endpgm
 ;
 ; GFX10-PAL-LABEL: store_load_vindex_kernel:
@@ -732,6 +831,10 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 4
+; GFX10-PAL-NEXT:    ;;#ASMSTART
+; GFX10-PAL-NEXT:    ; use v0
+; GFX10-PAL-NEXT:    ;;#ASMEND
 ; GFX10-PAL-NEXT:    s_endpgm
 ;
 ; GFX11-PAL-LABEL: store_load_vindex_kernel:
@@ -743,17 +846,11 @@ define amdgpu_kernel void @store_load_vindex_kernel() {
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 4
+; GFX11-PAL-NEXT:    ;;#ASMSTART
+; GFX11-PAL-NEXT:    ; use v0
+; GFX11-PAL-NEXT:    ;;#ASMEND
 ; GFX11-PAL-NEXT:    s_endpgm
-; GCN-LABEL: store_load_vindex_kernel:
-; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GCN-NEXT:    v_mov_b32_e32 v1, 15
-; GCN-NEXT:    scratch_store_dword v0, v1, off offset:4 sc0 sc1
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_sub_u32_e32 v0, 4, v0
-; GCN-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_endpgm
 bb:
   %i = alloca [32 x float], align 4, addrspace(5)
   %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
@@ -766,6 +863,7 @@ bb:
   %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
   %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
   %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
+  call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0
   ret void
 }
 
@@ -782,6 +880,9 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
 ; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v1
+; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: store_load_vindex_foo:
@@ -796,6 +897,10 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, s32
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v0
+; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: store_load_vindex_foo:
@@ -810,6 +915,10 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_load_b32 v0, v1, s32 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, s32
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use v0
+; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-PAL-LABEL: store_load_vindex_foo:
@@ -824,6 +933,9 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
 ; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT:    ;;#ASMSTART
+; GFX9-PAL-NEXT:    ; use v1
+; GFX9-PAL-NEXT:    ;;#ASMEND
 ; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: store_load_vindex_foo:
@@ -837,6 +949,10 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX940-NEXT:    scratch_load_dword v0, v0, s32 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v0, s32
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use v0
+; GFX940-NEXT:    ;;#ASMEND
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-PAL-LABEL: store_load_vindex_foo:
@@ -851,6 +967,10 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s32
+; GFX10-PAL-NEXT:    ;;#ASMSTART
+; GFX10-PAL-NEXT:    ; use v0
+; GFX10-PAL-NEXT:    ;;#ASMEND
 ; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-PAL-LABEL: store_load_vindex_foo:
@@ -865,19 +985,11 @@ define void @store_load_vindex_foo(i32 %idx) {
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, s32 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, s32
+; GFX11-PAL-NEXT:    ;;#ASMSTART
+; GFX11-PAL-NEXT:    ; use v0
+; GFX11-PAL-NEXT:    ;;#ASMEND
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
-; GCN-LABEL: store_load_vindex_foo:
-; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v2, 15
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GCN-NEXT:    v_and_b32_e32 v0, v0, v2
-; GCN-NEXT:    scratch_store_dword v1, v2, s32 sc0 sc1
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GCN-NEXT:    scratch_load_dword v0, v0, s32 sc0 sc1
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = alloca [32 x float], align 4, addrspace(5)
   %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)*
@@ -888,6 +1000,7 @@ bb:
   %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
   %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
   %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
+  call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0
   ret void
 }
 
@@ -951,13 +1064,6 @@ define void @private_ptr_foo(float addrspace(5)* nocapture %arg) {
 ; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:4
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
-; GCN-LABEL: private_ptr_foo:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v1, 0x41200000
-; GCN-NEXT:    scratch_store_dword v0, v1, off offset:4
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
   %gep = getelementptr inbounds float, float addrspace(5)* %arg, i32 1
   store float 1.000000e+01, float addrspace(5)* %gep, align 4
   ret void
@@ -987,6 +1093,15 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
 ; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:276
 ; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:292
 ; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:308
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 4
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x104
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: zero_init_small_offset_kernel:
@@ -998,6 +1113,7 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
 ; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s0, 0
+; GFX10-NEXT:    v_mov_b32_e32 v4, 4
 ; GFX10-NEXT:    s_mov_b32 s1, s0
 ; GFX10-NEXT:    s_mov_b32 s2, s0
 ; GFX10-NEXT:    s_mov_b32 s3, s0
@@ -1005,10 +1121,17 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0x104
 ; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:260
 ; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:276
 ; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:292
 ; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:308
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v4
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v5
+; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: zero_init_small_offset_kernel:
@@ -1016,7 +1139,7 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
 ; GFX11-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    v_dual_mov_b32 v4, 4 :: v_dual_mov_b32 v5, 0x104
 ; GFX11-NEXT:    s_mov_b32 s1, s0
 ; GFX11-NEXT:    s_mov_b32 s2, s0
 ; GFX11-NEXT:    s_mov_b32 s3, s0
@@ -1027,6 +1150,12 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:276
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:292
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], off offset:308
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use v4
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use v5
+; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
@@ -1058,6 +1187,15 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
 ; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:276
 ; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:292
 ; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:308
+; GFX9-PAL-NEXT:    s_nop 0
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 4
+; GFX9-PAL-NEXT:    ;;#ASMSTART
+; GFX9-PAL-NEXT:    ; use v0
+; GFX9-PAL-NEXT:    ;;#ASMEND
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 0x104
+; GFX9-PAL-NEXT:    ;;#ASMSTART
+; GFX9-PAL-NEXT:    ; use v0
+; GFX9-PAL-NEXT:    ;;#ASMEND
 ; GFX9-PAL-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: zero_init_small_offset_kernel:
@@ -1074,6 +1212,15 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
 ; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:276
 ; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:292
 ; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:308
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_mov_b32_e32 v0, 4
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use v0
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    v_mov_b32_e32 v0, 0x104
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use v0
+; GFX940-NEXT:    ;;#ASMEND
 ; GFX940-NEXT:    s_endpgm
 ;
 ; GFX1010-PAL-LABEL: zero_init_small_offset_kernel:
@@ -1103,9 +1250,17 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
 ; GFX1010-PAL-NEXT:    s_mov_b32 s0, 0
 ; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s2 offset:260
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v4, 4
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v5, 0x104
 ; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s1 offset:276
 ; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:292
 ; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:308
+; GFX1010-PAL-NEXT:    ;;#ASMSTART
+; GFX1010-PAL-NEXT:    ; use v4
+; GFX1010-PAL-NEXT:    ;;#ASMEND
+; GFX1010-PAL-NEXT:    ;;#ASMSTART
+; GFX1010-PAL-NEXT:    ; use v5
+; GFX1010-PAL-NEXT:    ;;#ASMEND
 ; GFX1010-PAL-NEXT:    s_endpgm
 ;
 ; GFX1030-PAL-LABEL: zero_init_small_offset_kernel:
@@ -1122,6 +1277,7 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
 ; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
 ; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1030-PAL-NEXT:    s_mov_b32 s0, 0
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v4, 4
 ; GFX1030-PAL-NEXT:    s_mov_b32 s1, s0
 ; GFX1030-PAL-NEXT:    s_mov_b32 s2, s0
 ; GFX1030-PAL-NEXT:    s_mov_b32 s3, s0
@@ -1129,10 +1285,17 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
 ; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1030-PAL-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX1030-PAL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v5, 0x104
 ; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:260
 ; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:276
 ; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:292
 ; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], off offset:308
+; GFX1030-PAL-NEXT:    ;;#ASMSTART
+; GFX1030-PAL-NEXT:    ; use v4
+; GFX1030-PAL-NEXT:    ;;#ASMEND
+; GFX1030-PAL-NEXT:    ;;#ASMSTART
+; GFX1030-PAL-NEXT:    ; use v5
+; GFX1030-PAL-NEXT:    ;;#ASMEND
 ; GFX1030-PAL-NEXT:    s_endpgm
 ;
 ; GFX11-PAL-LABEL: zero_init_small_offset_kernel:
@@ -1140,7 +1303,7 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
-; GFX11-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-PAL-NEXT:    v_dual_mov_b32 v4, 4 :: v_dual_mov_b32 v5, 0x104
 ; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
 ; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
 ; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
@@ -1151,6 +1314,12 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
 ; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:276
 ; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:292
 ; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], off offset:308
+; GFX11-PAL-NEXT:    ;;#ASMSTART
+; GFX11-PAL-NEXT:    ; use v4
+; GFX11-PAL-NEXT:    ;;#ASMEND
+; GFX11-PAL-NEXT:    ;;#ASMSTART
+; GFX11-PAL-NEXT:    ; use v5
+; GFX11-PAL-NEXT:    ;;#ASMEND
 ; GFX11-PAL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-PAL-NEXT:    s_endpgm
   %padding = alloca [64 x i32], align 4, addrspace(5)
@@ -1159,6 +1328,8 @@ define amdgpu_kernel void @zero_init_small_offset_kernel() {
   %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
   %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
   call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
+  call void asm sideeffect "; use $0", "s"([64 x i32] addrspace(5)* %padding) #0
+  call void asm sideeffect "; use $0", "s"([32 x i16] addrspace(5)* %alloca) #0
   ret void
 }
 
@@ -1180,6 +1351,15 @@ define void @zero_init_small_offset_foo() {
 ; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
 ; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
 ; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
+; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x100
+; GFX9-NEXT:    v_mov_b32_e32 v0, s32
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    v_mov_b32_e32 v0, vcc_hi
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1190,6 +1370,7 @@ define void @zero_init_small_offset_foo() {
 ; GFX10-NEXT:    scratch_load_dword v0, off, s32 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s0, 0
+; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x100
 ; GFX10-NEXT:    s_mov_b32 s1, s0
 ; GFX10-NEXT:    s_mov_b32 s2, s0
 ; GFX10-NEXT:    s_mov_b32 s3, s0
@@ -1197,10 +1378,18 @@ define void @zero_init_small_offset_foo() {
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-NEXT:    v_mov_b32_e32 v4, s32
+; GFX10-NEXT:    v_mov_b32_e32 v5, vcc_lo
 ; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
 ; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
 ; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
 ; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v4
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v5
+; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1211,17 +1400,24 @@ define void @zero_init_small_offset_foo() {
 ; GFX11-NEXT:    scratch_load_b32 v0, off, s32 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_i32 vcc_lo, s32, 0x100
 ; GFX11-NEXT:    s_mov_b32 s1, s0
 ; GFX11-NEXT:    s_mov_b32 s2, s0
 ; GFX11-NEXT:    s_mov_b32 s3, s0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:    v_dual_mov_b32 v4, s32 :: v_dual_mov_b32 v5, vcc_lo
 ; GFX11-NEXT:    s_clause 0x3
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:256
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:272
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:288
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:304
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use v4
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use v5
+; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1242,6 +1438,15 @@ define void @zero_init_small_offset_foo() {
 ; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
 ; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
 ; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
+; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x100
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s32
+; GFX9-PAL-NEXT:    ;;#ASMSTART
+; GFX9-PAL-NEXT:    ; use v0
+; GFX9-PAL-NEXT:    ;;#ASMEND
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, vcc_hi
+; GFX9-PAL-NEXT:    ;;#ASMSTART
+; GFX9-PAL-NEXT:    ; use v0
+; GFX9-PAL-NEXT:    ;;#ASMEND
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1260,6 +1465,16 @@ define void @zero_init_small_offset_foo() {
 ; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
 ; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
 ; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
+; GFX940-NEXT:    s_add_i32 vcc_hi, s32, 0x100
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_mov_b32_e32 v0, s32
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use v0
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    v_mov_b32_e32 v0, vcc_hi
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use v0
+; GFX940-NEXT:    ;;#ASMEND
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1270,6 +1485,7 @@ define void @zero_init_small_offset_foo() {
 ; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s32 glc dlc
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
+; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x100
 ; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
 ; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
 ; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
@@ -1277,10 +1493,18 @@ define void @zero_init_small_offset_foo() {
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-PAL-NEXT:    v_mov_b32_e32 v4, s32
+; GFX10-PAL-NEXT:    v_mov_b32_e32 v5, vcc_lo
 ; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
 ; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
 ; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
 ; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
+; GFX10-PAL-NEXT:    ;;#ASMSTART
+; GFX10-PAL-NEXT:    ; use v4
+; GFX10-PAL-NEXT:    ;;#ASMEND
+; GFX10-PAL-NEXT:    ;;#ASMSTART
+; GFX10-PAL-NEXT:    ; use v5
+; GFX10-PAL-NEXT:    ;;#ASMEND
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1291,42 +1515,34 @@ define void @zero_init_small_offset_foo() {
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s32 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
-; GFX11-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x100
 ; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
 ; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
 ; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
 ; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-PAL-NEXT:    v_dual_mov_b32 v4, s32 :: v_dual_mov_b32 v5, vcc_lo
 ; GFX11-PAL-NEXT:    s_clause 0x3
 ; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:256
 ; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:272
 ; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:288
 ; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s32 offset:304
+; GFX11-PAL-NEXT:    ;;#ASMSTART
+; GFX11-PAL-NEXT:    ; use v4
+; GFX11-PAL-NEXT:    ;;#ASMEND
+; GFX11-PAL-NEXT:    ;;#ASMSTART
+; GFX11-PAL-NEXT:    ; use v5
+; GFX11-PAL-NEXT:    ;;#ASMEND
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
-; GCN-LABEL: zero_init_small_offset_foo:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    scratch_load_dword v0, off, s32 sc0 sc1
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_mov_b32 s0, 0
-; GCN-NEXT:    s_mov_b32 s1, s0
-; GCN-NEXT:    s_mov_b32 s2, s0
-; GCN-NEXT:    s_mov_b32 s3, s0
-; GCN-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
-; GCN-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
-; GCN-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:256
-; GCN-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:272
-; GCN-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:288
-; GCN-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:304
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
   %padding = alloca [64 x i32], align 4, addrspace(5)
   %alloca = alloca [32 x i16], align 2, addrspace(5)
   %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
   %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
   %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
   call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
+  call void asm sideeffect "; use $0", "s"([64 x i32] addrspace(5)* %padding) #0
+  call void asm sideeffect "; use $0", "s"([32 x i16] addrspace(5)* %alloca) #0
   ret void
 }
 
@@ -1349,6 +1565,14 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
 ; GFX9-NEXT:    s_addk_i32 s0, 0x104
 ; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, 4
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x104
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: store_load_sindex_small_offset_kernel:
@@ -1361,6 +1585,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
 ; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 15
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0x104
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_and_b32 s1, s0, 15
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
@@ -1371,6 +1596,13 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dword v0, off, s1 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, 4
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v0
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v1
+; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: store_load_sindex_small_offset_kernel:
@@ -1378,7 +1610,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
 ; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x24
 ; GFX11-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v0, 15
+; GFX11-NEXT:    v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, 0x104
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_and_b32 s1, s0, 15
 ; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
@@ -1389,6 +1621,13 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, 4
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use v0
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use v1
+; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-PAL-LABEL: store_load_sindex_small_offset_kernel:
@@ -1414,6 +1653,14 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
 ; GFX9-PAL-NEXT:    s_addk_i32 s0, 0x104
 ; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 4
+; GFX9-PAL-NEXT:    ;;#ASMSTART
+; GFX9-PAL-NEXT:    ; use v0
+; GFX9-PAL-NEXT:    ;;#ASMEND
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 0x104
+; GFX9-PAL-NEXT:    ;;#ASMSTART
+; GFX9-PAL-NEXT:    ; use v0
+; GFX9-PAL-NEXT:    ;;#ASMEND
 ; GFX9-PAL-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: store_load_sindex_small_offset_kernel:
@@ -1432,6 +1679,14 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
 ; GFX940-NEXT:    s_addk_i32 s0, 0x104
 ; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v0, 4
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use v0
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    v_mov_b32_e32 v0, 0x104
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use v0
+; GFX940-NEXT:    ;;#ASMEND
 ; GFX940-NEXT:    s_endpgm
 ;
 ; GFX1010-PAL-LABEL: store_load_sindex_small_offset_kernel:
@@ -1447,6 +1702,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
 ; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
 ; GFX1010-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 0x104
 ; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
 ; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 15
@@ -1460,6 +1716,13 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
 ; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
 ; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 4
+; GFX1010-PAL-NEXT:    ;;#ASMSTART
+; GFX1010-PAL-NEXT:    ; use v0
+; GFX1010-PAL-NEXT:    ;;#ASMEND
+; GFX1010-PAL-NEXT:    ;;#ASMSTART
+; GFX1010-PAL-NEXT:    ; use v1
+; GFX1010-PAL-NEXT:    ;;#ASMEND
 ; GFX1010-PAL-NEXT:    s_endpgm
 ;
 ; GFX1030-PAL-LABEL: store_load_sindex_small_offset_kernel:
@@ -1477,6 +1740,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
 ; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
 ; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 15
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 0x104
 ; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1030-PAL-NEXT:    s_and_b32 s1, s0, 15
 ; GFX1030-PAL-NEXT:    s_lshl_b32 s0, s0, 2
@@ -1487,6 +1751,13 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
 ; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
 ; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 4
+; GFX1030-PAL-NEXT:    ;;#ASMSTART
+; GFX1030-PAL-NEXT:    ; use v0
+; GFX1030-PAL-NEXT:    ;;#ASMEND
+; GFX1030-PAL-NEXT:    ;;#ASMSTART
+; GFX1030-PAL-NEXT:    ; use v1
+; GFX1030-PAL-NEXT:    ;;#ASMEND
 ; GFX1030-PAL-NEXT:    s_endpgm
 ;
 ; GFX11-PAL-LABEL: store_load_sindex_small_offset_kernel:
@@ -1494,7 +1765,7 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
 ; GFX11-PAL-NEXT:    s_load_b32 s0, s[0:1], 0x0
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 15
+; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, 0x104
 ; GFX11-PAL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-PAL-NEXT:    s_and_b32 s1, s0, 15
 ; GFX11-PAL-NEXT:    s_lshl_b32 s0, s0, 2
@@ -1505,6 +1776,13 @@ define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 4
+; GFX11-PAL-NEXT:    ;;#ASMSTART
+; GFX11-PAL-NEXT:    ; use v0
+; GFX11-PAL-NEXT:    ;;#ASMEND
+; GFX11-PAL-NEXT:    ;;#ASMSTART
+; GFX11-PAL-NEXT:    ; use v1
+; GFX11-PAL-NEXT:    ;;#ASMEND
 ; GFX11-PAL-NEXT:    s_endpgm
 bb:
   %padding = alloca [64 x i32], align 4, addrspace(5)
@@ -1519,6 +1797,8 @@ bb:
   %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
   %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
   %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
+  call void asm sideeffect "; use $0", "s"([64 x i32] addrspace(5)* %padding) #0
+  call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0
   ret void
 }
 
@@ -1540,6 +1820,14 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
 ; GFX9-NEXT:    s_addk_i32 s0, 0x104
 ; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, 4
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x104
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: store_load_sindex_small_offset_foo:
@@ -1560,13 +1848,21 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dword v0, off, s0 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, 4
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0x104
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v0
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v1
+; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: store_load_sindex_small_offset_foo:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v0, 15
+; GFX11-NEXT:    v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, 0x104
 ; GFX11-NEXT:    s_and_b32 s1, s0, 15
 ; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX11-NEXT:    s_lshl_b32 s1, s1, 2
@@ -1576,6 +1872,13 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, 4
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use v0
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use v1
+; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-PAL-LABEL: store_load_sindex_small_offset_foo:
@@ -1600,6 +1903,14 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
 ; GFX9-PAL-NEXT:    s_addk_i32 s0, 0x104
 ; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 4
+; GFX9-PAL-NEXT:    ;;#ASMSTART
+; GFX9-PAL-NEXT:    ; use v0
+; GFX9-PAL-NEXT:    ;;#ASMEND
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 0x104
+; GFX9-PAL-NEXT:    ;;#ASMSTART
+; GFX9-PAL-NEXT:    ; use v0
+; GFX9-PAL-NEXT:    ;;#ASMEND
 ; GFX9-PAL-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: store_load_sindex_small_offset_foo:
@@ -1616,6 +1927,14 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
 ; GFX940-NEXT:    s_addk_i32 s0, 0x104
 ; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v0, 4
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use v0
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    v_mov_b32_e32 v0, 0x104
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use v0
+; GFX940-NEXT:    ;;#ASMEND
 ; GFX940-NEXT:    s_endpgm
 ;
 ; GFX1010-PAL-LABEL: store_load_sindex_small_offset_foo:
@@ -1642,6 +1961,14 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
 ; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
 ; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 4
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 0x104
+; GFX1010-PAL-NEXT:    ;;#ASMSTART
+; GFX1010-PAL-NEXT:    ; use v0
+; GFX1010-PAL-NEXT:    ;;#ASMEND
+; GFX1010-PAL-NEXT:    ;;#ASMSTART
+; GFX1010-PAL-NEXT:    ; use v1
+; GFX1010-PAL-NEXT:    ;;#ASMEND
 ; GFX1010-PAL-NEXT:    s_endpgm
 ;
 ; GFX1030-PAL-LABEL: store_load_sindex_small_offset_foo:
@@ -1667,13 +1994,21 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
 ; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
 ; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 4
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 0x104
+; GFX1030-PAL-NEXT:    ;;#ASMSTART
+; GFX1030-PAL-NEXT:    ; use v0
+; GFX1030-PAL-NEXT:    ;;#ASMEND
+; GFX1030-PAL-NEXT:    ;;#ASMSTART
+; GFX1030-PAL-NEXT:    ; use v1
+; GFX1030-PAL-NEXT:    ;;#ASMEND
 ; GFX1030-PAL-NEXT:    s_endpgm
 ;
 ; GFX11-PAL-LABEL: store_load_sindex_small_offset_foo:
 ; GFX11-PAL:       ; %bb.0: ; %bb
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 15
+; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, 0x104
 ; GFX11-PAL-NEXT:    s_and_b32 s1, s0, 15
 ; GFX11-PAL-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX11-PAL-NEXT:    s_lshl_b32 s1, s1, 2
@@ -1683,8 +2018,15 @@ define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-PAL-NEXT:    s_endpgm
-bb:
+; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 4
+; GFX11-PAL-NEXT:    ;;#ASMSTART
+; GFX11-PAL-NEXT:    ; use v0
+; GFX11-PAL-NEXT:    ;;#ASMEND
+; GFX11-PAL-NEXT:    ;;#ASMSTART
+; GFX11-PAL-NEXT:    ; use v1
+; GFX11-PAL-NEXT:    ;;#ASMEND
+; GFX11-PAL-NEXT:    s_endpgm
+bb:
   %padding = alloca [64 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
   %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef
@@ -1697,6 +2039,8 @@ bb:
   %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
   %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
   %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
+  call void asm sideeffect "; use $0", "s"([64 x i32] addrspace(5)* %padding) #0
+  call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0
   ret void
 }
 
@@ -1716,6 +2060,14 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX9-NEXT:    v_sub_u32_e32 v0, 0x104, v0
 ; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x104
+; GFX9-NEXT:    v_mov_b32_e32 v1, 4
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v1
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: store_load_vindex_small_offset_kernel:
@@ -1734,6 +2086,14 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, 4
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0x104
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v0
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v1
+; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: store_load_vindex_small_offset_kernel:
@@ -1744,8 +2104,16 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 0x104, v0
 ; GFX11-NEXT:    scratch_store_b32 v0, v1, off offset:260 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0x104
 ; GFX11-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, 4
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use v0
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use v1
+; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-PAL-LABEL: store_load_vindex_small_offset_kernel:
@@ -1768,6 +2136,14 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, 0x104, v0
 ; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 0x104
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 4
+; GFX9-PAL-NEXT:    ;;#ASMSTART
+; GFX9-PAL-NEXT:    ; use v1
+; GFX9-PAL-NEXT:    ;;#ASMEND
+; GFX9-PAL-NEXT:    ;;#ASMSTART
+; GFX9-PAL-NEXT:    ; use v0
+; GFX9-PAL-NEXT:    ;;#ASMEND
 ; GFX9-PAL-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: store_load_vindex_small_offset_kernel:
@@ -1781,6 +2157,14 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX940-NEXT:    v_sub_u32_e32 v0, 0x104, v0
 ; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v0, 0x104
+; GFX940-NEXT:    v_mov_b32_e32 v1, 4
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use v1
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use v0
+; GFX940-NEXT:    ;;#ASMEND
 ; GFX940-NEXT:    s_endpgm
 ;
 ; GFX1010-PAL-LABEL: store_load_vindex_small_offset_kernel:
@@ -1805,6 +2189,14 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1010-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
 ; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 4
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 0x104
+; GFX1010-PAL-NEXT:    ;;#ASMSTART
+; GFX1010-PAL-NEXT:    ; use v0
+; GFX1010-PAL-NEXT:    ;;#ASMEND
+; GFX1010-PAL-NEXT:    ;;#ASMSTART
+; GFX1010-PAL-NEXT:    ; use v1
+; GFX1010-PAL-NEXT:    ;;#ASMEND
 ; GFX1010-PAL-NEXT:    s_endpgm
 ;
 ; GFX1030-PAL-LABEL: store_load_vindex_small_offset_kernel:
@@ -1828,6 +2220,14 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1030-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
 ; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 4
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 0x104
+; GFX1030-PAL-NEXT:    ;;#ASMSTART
+; GFX1030-PAL-NEXT:    ; use v0
+; GFX1030-PAL-NEXT:    ;;#ASMEND
+; GFX1030-PAL-NEXT:    ;;#ASMSTART
+; GFX1030-PAL-NEXT:    ; use v1
+; GFX1030-PAL-NEXT:    ;;#ASMEND
 ; GFX1030-PAL-NEXT:    s_endpgm
 ;
 ; GFX11-PAL-LABEL: store_load_vindex_small_offset_kernel:
@@ -1838,8 +2238,16 @@ define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
 ; GFX11-PAL-NEXT:    v_sub_nc_u32_e32 v2, 0x104, v0
 ; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, off offset:260 dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 0x104
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 4
+; GFX11-PAL-NEXT:    ;;#ASMSTART
+; GFX11-PAL-NEXT:    ; use v0
+; GFX11-PAL-NEXT:    ;;#ASMEND
+; GFX11-PAL-NEXT:    ;;#ASMSTART
+; GFX11-PAL-NEXT:    ; use v1
+; GFX11-PAL-NEXT:    ;;#ASMEND
 ; GFX11-PAL-NEXT:    s_endpgm
 bb:
   %padding = alloca [64 x i32], align 4, addrspace(5)
@@ -1856,6 +2264,8 @@ bb:
   %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
   %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
   %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
+  call void asm sideeffect "; use $0", "s"([64 x i32] addrspace(5)* %padding) #0
+  call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0
   ret void
 }
 
@@ -1875,6 +2285,13 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
 ; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, s32
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v1
+; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: store_load_vindex_small_offset_foo:
@@ -1882,17 +2299,26 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
+; GFX10-NEXT:    s_add_i32 s1, s32, 0x100
 ; GFX10-NEXT:    s_add_i32 s0, s32, 0x100
-; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x100
-; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 15
-; GFX10-NEXT:    v_lshl_add_u32 v1, v1, 2, vcc_lo
+; GFX10-NEXT:    v_lshl_add_u32 v1, v1, 2, s0
 ; GFX10-NEXT:    scratch_load_dword v3, off, s32 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x100
 ; GFX10-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, s32
+; GFX10-NEXT:    v_mov_b32_e32 v1, vcc_lo
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v0
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v1
+; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: store_load_vindex_small_offset_foo:
@@ -1900,6 +2326,7 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
+; GFX11-NEXT:    s_add_i32 vcc_lo, s32, 0x100
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-NEXT:    scratch_load_b32 v3, off, s32 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
@@ -1908,6 +2335,13 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_load_b32 v0, v1, s32 offset:256 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s32 :: v_dual_mov_b32 v1, vcc_lo
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use v0
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use v1
+; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-PAL-LABEL: store_load_vindex_small_offset_foo:
@@ -1925,6 +2359,13 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
 ; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, s32
+; GFX9-PAL-NEXT:    ;;#ASMSTART
+; GFX9-PAL-NEXT:    ; use v0
+; GFX9-PAL-NEXT:    ;;#ASMEND
+; GFX9-PAL-NEXT:    ;;#ASMSTART
+; GFX9-PAL-NEXT:    ; use v1
+; GFX9-PAL-NEXT:    ;;#ASMEND
 ; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: store_load_vindex_small_offset_foo:
@@ -1940,6 +2381,15 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX940-NEXT:    scratch_load_dword v0, v0, s32 offset:256 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v0, s32
+; GFX940-NEXT:    s_add_i32 vcc_hi, s32, 0x100
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use v0
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    v_mov_b32_e32 v0, vcc_hi
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use v0
+; GFX940-NEXT:    ;;#ASMEND
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-PAL-LABEL: store_load_vindex_small_offset_foo:
@@ -1947,17 +2397,26 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-PAL-NEXT:    v_and_b32_e32 v1, 15, v0
+; GFX10-PAL-NEXT:    s_add_i32 s1, s32, 0x100
 ; GFX10-PAL-NEXT:    s_add_i32 s0, s32, 0x100
-; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x100
-; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, s1
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
-; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v1, 2, vcc_lo
+; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v1, 2, s0
 ; GFX10-PAL-NEXT:    scratch_load_dword v3, off, s32 glc dlc
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x100
 ; GFX10-PAL-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s32
+; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, vcc_lo
+; GFX10-PAL-NEXT:    ;;#ASMSTART
+; GFX10-PAL-NEXT:    ; use v0
+; GFX10-PAL-NEXT:    ;;#ASMEND
+; GFX10-PAL-NEXT:    ;;#ASMSTART
+; GFX10-PAL-NEXT:    ; use v1
+; GFX10-PAL-NEXT:    ;;#ASMEND
 ; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-PAL-LABEL: store_load_vindex_small_offset_foo:
@@ -1965,6 +2424,7 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
+; GFX11-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x100
 ; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-PAL-NEXT:    scratch_load_b32 v3, off, s32 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
@@ -1973,21 +2433,14 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, s32 offset:256 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, s32 :: v_dual_mov_b32 v1, vcc_lo
+; GFX11-PAL-NEXT:    ;;#ASMSTART
+; GFX11-PAL-NEXT:    ; use v0
+; GFX11-PAL-NEXT:    ;;#ASMEND
+; GFX11-PAL-NEXT:    ;;#ASMSTART
+; GFX11-PAL-NEXT:    ; use v1
+; GFX11-PAL-NEXT:    ;;#ASMEND
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
-; GCN-LABEL: store_load_vindex_small_offset_foo:
-; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    scratch_load_dword v1, off, s32 sc0 sc1
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v2, 15
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GCN-NEXT:    v_and_b32_e32 v0, v0, v2
-; GCN-NEXT:    scratch_store_dword v1, v2, s32 offset:256 sc0 sc1
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GCN-NEXT:    scratch_load_dword v0, v0, s32 offset:256 sc0 sc1
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %padding = alloca [64 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
@@ -2001,6 +2454,8 @@ bb:
   %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
   %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
   %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
+  call void asm sideeffect "; use $0", "s"([64 x i32] addrspace(5)* %padding) #0
+  call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0
   ret void
 }
 
@@ -2028,6 +2483,15 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:16
 ; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
 ; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 4
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4004
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: zero_init_large_offset_kernel:
@@ -2050,10 +2514,18 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ; GFX10-NEXT:    s_movk_i32 s2, 0x4004
 ; GFX10-NEXT:    s_movk_i32 s1, 0x4004
 ; GFX10-NEXT:    s_movk_i32 s0, 0x4004
+; GFX10-NEXT:    v_mov_b32_e32 v4, 4
 ; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s2
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0x4004
 ; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s1 offset:16
 ; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:32
 ; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v4
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v5
+; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: zero_init_large_offset_kernel:
@@ -2070,11 +2542,18 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ; GFX11-NEXT:    s_movk_i32 s2, 0x4004
 ; GFX11-NEXT:    s_movk_i32 s1, 0x4004
 ; GFX11-NEXT:    s_movk_i32 s0, 0x4004
+; GFX11-NEXT:    v_dual_mov_b32 v4, 4 :: v_dual_mov_b32 v5, 0x4004
 ; GFX11-NEXT:    s_clause 0x3
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s2
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s1 offset:16
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s0 offset:32
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo offset:48
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use v4
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use v5
+; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
 ;
@@ -2106,6 +2585,15 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:16
 ; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
 ; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
+; GFX9-PAL-NEXT:    s_nop 0
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 4
+; GFX9-PAL-NEXT:    ;;#ASMSTART
+; GFX9-PAL-NEXT:    ; use v0
+; GFX9-PAL-NEXT:    ;;#ASMEND
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 0x4004
+; GFX9-PAL-NEXT:    ;;#ASMSTART
+; GFX9-PAL-NEXT:    ; use v0
+; GFX9-PAL-NEXT:    ;;#ASMEND
 ; GFX9-PAL-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: zero_init_large_offset_kernel:
@@ -2126,6 +2614,15 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:16
 ; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
 ; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_mov_b32_e32 v0, 4
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use v0
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    v_mov_b32_e32 v0, 0x4004
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use v0
+; GFX940-NEXT:    ;;#ASMEND
 ; GFX940-NEXT:    s_endpgm
 ;
 ; GFX1010-PAL-LABEL: zero_init_large_offset_kernel:
@@ -2155,9 +2652,17 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ; GFX1010-PAL-NEXT:    s_movk_i32 s0, 0x4004
 ; GFX1010-PAL-NEXT:    s_movk_i32 vcc_lo, 0x4004
 ; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s2
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v4, 4
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v5, 0x4004
 ; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s1 offset:16
 ; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:32
 ; GFX1010-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
+; GFX1010-PAL-NEXT:    ;;#ASMSTART
+; GFX1010-PAL-NEXT:    ; use v4
+; GFX1010-PAL-NEXT:    ;;#ASMEND
+; GFX1010-PAL-NEXT:    ;;#ASMSTART
+; GFX1010-PAL-NEXT:    ; use v5
+; GFX1010-PAL-NEXT:    ;;#ASMEND
 ; GFX1010-PAL-NEXT:    s_endpgm
 ;
 ; GFX1030-PAL-LABEL: zero_init_large_offset_kernel:
@@ -2185,10 +2690,18 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ; GFX1030-PAL-NEXT:    s_movk_i32 s2, 0x4004
 ; GFX1030-PAL-NEXT:    s_movk_i32 s1, 0x4004
 ; GFX1030-PAL-NEXT:    s_movk_i32 s0, 0x4004
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v4, 4
 ; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s2
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v5, 0x4004
 ; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s1 offset:16
 ; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:32
 ; GFX1030-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
+; GFX1030-PAL-NEXT:    ;;#ASMSTART
+; GFX1030-PAL-NEXT:    ; use v4
+; GFX1030-PAL-NEXT:    ;;#ASMEND
+; GFX1030-PAL-NEXT:    ;;#ASMSTART
+; GFX1030-PAL-NEXT:    ; use v5
+; GFX1030-PAL-NEXT:    ;;#ASMEND
 ; GFX1030-PAL-NEXT:    s_endpgm
 ;
 ; GFX11-PAL-LABEL: zero_init_large_offset_kernel:
@@ -2205,11 +2718,18 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
 ; GFX11-PAL-NEXT:    s_movk_i32 s2, 0x4004
 ; GFX11-PAL-NEXT:    s_movk_i32 s1, 0x4004
 ; GFX11-PAL-NEXT:    s_movk_i32 s0, 0x4004
+; GFX11-PAL-NEXT:    v_dual_mov_b32 v4, 4 :: v_dual_mov_b32 v5, 0x4004
 ; GFX11-PAL-NEXT:    s_clause 0x3
 ; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s2
 ; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s1 offset:16
 ; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s0 offset:32
 ; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo offset:48
+; GFX11-PAL-NEXT:    ;;#ASMSTART
+; GFX11-PAL-NEXT:    ; use v4
+; GFX11-PAL-NEXT:    ;;#ASMEND
+; GFX11-PAL-NEXT:    ;;#ASMSTART
+; GFX11-PAL-NEXT:    ; use v5
+; GFX11-PAL-NEXT:    ;;#ASMEND
 ; GFX11-PAL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-PAL-NEXT:    s_endpgm
   %padding = alloca [4096 x i32], align 4, addrspace(5)
@@ -2218,6 +2738,8 @@ define amdgpu_kernel void @zero_init_large_offset_kernel() {
   %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
   %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
   call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
+  call void asm sideeffect "; use $0", "s"([4096 x i32] addrspace(5)* %padding) #0
+  call void asm sideeffect "; use $0", "s"([32 x i16] addrspace(5)* %alloca) #0
   ret void
 }
 
@@ -2235,14 +2757,24 @@ define void @zero_init_large_offset_foo() {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    s_add_i32 s3, s32, 0x4004
+; GFX9-NEXT:    s_add_i32 s2, s32, 0x4004
 ; GFX9-NEXT:    s_add_i32 s1, s32, 0x4004
 ; GFX9-NEXT:    s_add_i32 s0, s32, 0x4004
-; GFX9-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
+; GFX9-NEXT:    s_add_i32 vcc_lo, s32, 4
+; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s3
+; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s2 offset:16
+; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s1 offset:32
+; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:48
 ; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
-; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s1
-; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:16
-; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
-; GFX9-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
+; GFX9-NEXT:    v_mov_b32_e32 v0, vcc_lo
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    v_mov_b32_e32 v0, vcc_hi
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2253,6 +2785,7 @@ define void @zero_init_large_offset_foo() {
 ; GFX10-NEXT:    scratch_load_dword v0, off, s32 offset:4 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s0, 0
+; GFX10-NEXT:    s_add_i32 s4, s32, 0x4004
 ; GFX10-NEXT:    s_mov_b32 s1, s0
 ; GFX10-NEXT:    s_mov_b32 s2, s0
 ; GFX10-NEXT:    s_mov_b32 s3, s0
@@ -2260,14 +2793,23 @@ define void @zero_init_large_offset_foo() {
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-NEXT:    s_add_i32 s3, s32, 4
 ; GFX10-NEXT:    s_add_i32 s2, s32, 0x4004
 ; GFX10-NEXT:    s_add_i32 s1, s32, 0x4004
 ; GFX10-NEXT:    s_add_i32 s0, s32, 0x4004
 ; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
-; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s2
+; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s4
+; GFX10-NEXT:    v_mov_b32_e32 v4, s3
+; GFX10-NEXT:    v_mov_b32_e32 v5, s2
 ; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s1 offset:16
 ; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:32
 ; GFX10-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v4
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v5
+; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2278,21 +2820,29 @@ define void @zero_init_large_offset_foo() {
 ; GFX11-NEXT:    scratch_load_b32 v0, off, s32 offset:4 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s0, 0
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_i32 s4, s32, 4
 ; GFX11-NEXT:    s_mov_b32 s1, s0
 ; GFX11-NEXT:    s_mov_b32 s2, s0
 ; GFX11-NEXT:    s_mov_b32 s3, s0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:    s_add_i32 s3, s32, 0x4004
 ; GFX11-NEXT:    s_add_i32 s2, s32, 0x4004
 ; GFX11-NEXT:    s_add_i32 s1, s32, 0x4004
 ; GFX11-NEXT:    s_add_i32 s0, s32, 0x4004
 ; GFX11-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
+; GFX11-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s3
 ; GFX11-NEXT:    s_clause 0x3
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s2
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s1 offset:16
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], s0 offset:32
 ; GFX11-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo offset:48
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use v4
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use v5
+; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2309,14 +2859,24 @@ define void @zero_init_large_offset_foo() {
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-PAL-NEXT:    s_add_i32 s3, s32, 0x4004
+; GFX9-PAL-NEXT:    s_add_i32 s2, s32, 0x4004
 ; GFX9-PAL-NEXT:    s_add_i32 s1, s32, 0x4004
 ; GFX9-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
-; GFX9-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
+; GFX9-PAL-NEXT:    s_add_i32 vcc_lo, s32, 4
+; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s3
+; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s2 offset:16
+; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s1 offset:32
+; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:48
 ; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
-; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s1
-; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:16
-; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
-; GFX9-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, vcc_lo
+; GFX9-PAL-NEXT:    ;;#ASMSTART
+; GFX9-PAL-NEXT:    ; use v0
+; GFX9-PAL-NEXT:    ;;#ASMEND
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, vcc_hi
+; GFX9-PAL-NEXT:    ;;#ASMSTART
+; GFX9-PAL-NEXT:    ; use v0
+; GFX9-PAL-NEXT:    ;;#ASMEND
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2331,14 +2891,25 @@ define void @zero_init_large_offset_foo() {
 ; GFX940-NEXT:    s_mov_b32 s3, s0
 ; GFX940-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
 ; GFX940-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX940-NEXT:    s_add_i32 s3, s32, 0x4004
+; GFX940-NEXT:    s_add_i32 s2, s32, 0x4004
 ; GFX940-NEXT:    s_add_i32 s1, s32, 0x4004
 ; GFX940-NEXT:    s_add_i32 s0, s32, 0x4004
-; GFX940-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
+; GFX940-NEXT:    s_add_i32 vcc_lo, s32, 4
+; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s3
+; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s2 offset:16
+; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s1 offset:32
+; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:48
 ; GFX940-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
-; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s1
-; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:16
-; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:32
-; GFX940-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_hi offset:48
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_mov_b32_e32 v0, vcc_lo
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use v0
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    v_mov_b32_e32 v0, vcc_hi
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use v0
+; GFX940-NEXT:    ;;#ASMEND
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2349,6 +2920,7 @@ define void @zero_init_large_offset_foo() {
 ; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s32 offset:4 glc dlc
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-PAL-NEXT:    s_mov_b32 s0, 0
+; GFX10-PAL-NEXT:    s_add_i32 s4, s32, 0x4004
 ; GFX10-PAL-NEXT:    s_mov_b32 s1, s0
 ; GFX10-PAL-NEXT:    s_mov_b32 s2, s0
 ; GFX10-PAL-NEXT:    s_mov_b32 s3, s0
@@ -2356,14 +2928,23 @@ define void @zero_init_large_offset_foo() {
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX10-PAL-NEXT:    s_add_i32 s3, s32, 4
 ; GFX10-PAL-NEXT:    s_add_i32 s2, s32, 0x4004
 ; GFX10-PAL-NEXT:    s_add_i32 s1, s32, 0x4004
 ; GFX10-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
 ; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
-; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s2
+; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s4
+; GFX10-PAL-NEXT:    v_mov_b32_e32 v4, s3
+; GFX10-PAL-NEXT:    v_mov_b32_e32 v5, s2
 ; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s1 offset:16
 ; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], s0 offset:32
 ; GFX10-PAL-NEXT:    scratch_store_dwordx4 off, v[0:3], vcc_lo offset:48
+; GFX10-PAL-NEXT:    ;;#ASMSTART
+; GFX10-PAL-NEXT:    ; use v4
+; GFX10-PAL-NEXT:    ;;#ASMEND
+; GFX10-PAL-NEXT:    ;;#ASMSTART
+; GFX10-PAL-NEXT:    ; use v5
+; GFX10-PAL-NEXT:    ;;#ASMEND
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2374,21 +2955,29 @@ define void @zero_init_large_offset_foo() {
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s32 offset:4 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_mov_b32 s0, 0
-; GFX11-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-PAL-NEXT:    s_add_i32 s4, s32, 4
 ; GFX11-PAL-NEXT:    s_mov_b32 s1, s0
 ; GFX11-PAL-NEXT:    s_mov_b32 s2, s0
 ; GFX11-PAL-NEXT:    s_mov_b32 s3, s0
 ; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX11-PAL-NEXT:    s_add_i32 s3, s32, 0x4004
 ; GFX11-PAL-NEXT:    s_add_i32 s2, s32, 0x4004
 ; GFX11-PAL-NEXT:    s_add_i32 s1, s32, 0x4004
 ; GFX11-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
 ; GFX11-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
+; GFX11-PAL-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s3
 ; GFX11-PAL-NEXT:    s_clause 0x3
 ; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s2
 ; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s1 offset:16
 ; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], s0 offset:32
 ; GFX11-PAL-NEXT:    scratch_store_b128 off, v[0:3], vcc_lo offset:48
+; GFX11-PAL-NEXT:    ;;#ASMSTART
+; GFX11-PAL-NEXT:    ; use v4
+; GFX11-PAL-NEXT:    ;;#ASMEND
+; GFX11-PAL-NEXT:    ;;#ASMSTART
+; GFX11-PAL-NEXT:    ; use v5
+; GFX11-PAL-NEXT:    ;;#ASMEND
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
   %padding = alloca [4096 x i32], align 4, addrspace(5)
@@ -2397,6 +2986,8 @@ define void @zero_init_large_offset_foo() {
   %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4
   %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)*
   call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false)
+  call void asm sideeffect "; use $0", "s"([4096 x i32] addrspace(5)* %padding) #0
+  call void asm sideeffect "; use $0", "s"([32 x i16] addrspace(5)* %alloca) #0
   ret void
 }
 
@@ -2419,6 +3010,14 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
 ; GFX9-NEXT:    s_addk_i32 s0, 0x4004
 ; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, 4
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4004
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: store_load_sindex_large_offset_kernel:
@@ -2431,6 +3030,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
 ; GFX10-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 15
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0x4004
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_and_b32 s1, s0, 15
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
@@ -2441,6 +3041,13 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dword v0, off, s1 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, 4
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v0
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v1
+; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: store_load_sindex_large_offset_kernel:
@@ -2448,7 +3055,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
 ; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x24
 ; GFX11-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v0, 15
+; GFX11-NEXT:    v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, 0x4004
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_and_b32 s1, s0, 15
 ; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
@@ -2459,6 +3066,13 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, 4
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use v0
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use v1
+; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-PAL-LABEL: store_load_sindex_large_offset_kernel:
@@ -2484,6 +3098,14 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
 ; GFX9-PAL-NEXT:    s_addk_i32 s0, 0x4004
 ; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 4
+; GFX9-PAL-NEXT:    ;;#ASMSTART
+; GFX9-PAL-NEXT:    ; use v0
+; GFX9-PAL-NEXT:    ;;#ASMEND
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 0x4004
+; GFX9-PAL-NEXT:    ;;#ASMSTART
+; GFX9-PAL-NEXT:    ; use v0
+; GFX9-PAL-NEXT:    ;;#ASMEND
 ; GFX9-PAL-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: store_load_sindex_large_offset_kernel:
@@ -2502,6 +3124,14 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
 ; GFX940-NEXT:    s_addk_i32 s0, 0x4004
 ; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v0, 4
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use v0
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    v_mov_b32_e32 v0, 0x4004
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use v0
+; GFX940-NEXT:    ;;#ASMEND
 ; GFX940-NEXT:    s_endpgm
 ;
 ; GFX1010-PAL-LABEL: store_load_sindex_large_offset_kernel:
@@ -2517,6 +3147,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
 ; GFX1010-PAL-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
 ; GFX1010-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
 ; GFX1010-PAL-NEXT:    s_mov_b32 vcc_lo, 0
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 0x4004
 ; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, vcc_lo offset:4 glc dlc
 ; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 15
@@ -2530,6 +3161,13 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
 ; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
 ; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 4
+; GFX1010-PAL-NEXT:    ;;#ASMSTART
+; GFX1010-PAL-NEXT:    ; use v0
+; GFX1010-PAL-NEXT:    ;;#ASMEND
+; GFX1010-PAL-NEXT:    ;;#ASMSTART
+; GFX1010-PAL-NEXT:    ; use v1
+; GFX1010-PAL-NEXT:    ;;#ASMEND
 ; GFX1010-PAL-NEXT:    s_endpgm
 ;
 ; GFX1030-PAL-LABEL: store_load_sindex_large_offset_kernel:
@@ -2547,6 +3185,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
 ; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, off offset:4 glc dlc
 ; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 15
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 0x4004
 ; GFX1030-PAL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1030-PAL-NEXT:    s_and_b32 s1, s0, 15
 ; GFX1030-PAL-NEXT:    s_lshl_b32 s0, s0, 2
@@ -2557,6 +3196,13 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
 ; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
 ; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 4
+; GFX1030-PAL-NEXT:    ;;#ASMSTART
+; GFX1030-PAL-NEXT:    ; use v0
+; GFX1030-PAL-NEXT:    ;;#ASMEND
+; GFX1030-PAL-NEXT:    ;;#ASMSTART
+; GFX1030-PAL-NEXT:    ; use v1
+; GFX1030-PAL-NEXT:    ;;#ASMEND
 ; GFX1030-PAL-NEXT:    s_endpgm
 ;
 ; GFX11-PAL-LABEL: store_load_sindex_large_offset_kernel:
@@ -2564,7 +3210,7 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
 ; GFX11-PAL-NEXT:    s_load_b32 s0, s[0:1], 0x0
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 15
+; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, 0x4004
 ; GFX11-PAL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-PAL-NEXT:    s_and_b32 s1, s0, 15
 ; GFX11-PAL-NEXT:    s_lshl_b32 s0, s0, 2
@@ -2575,6 +3221,13 @@ define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 4
+; GFX11-PAL-NEXT:    ;;#ASMSTART
+; GFX11-PAL-NEXT:    ; use v0
+; GFX11-PAL-NEXT:    ;;#ASMEND
+; GFX11-PAL-NEXT:    ;;#ASMSTART
+; GFX11-PAL-NEXT:    ; use v1
+; GFX11-PAL-NEXT:    ;;#ASMEND
 ; GFX11-PAL-NEXT:    s_endpgm
 bb:
   %padding = alloca [4096 x i32], align 4, addrspace(5)
@@ -2589,6 +3242,8 @@ bb:
   %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
   %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
   %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
+  call void asm sideeffect "; use $0", "s"([4096 x i32] addrspace(5)* %padding) #0
+  call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0
   ret void
 }
 
@@ -2610,6 +3265,14 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
 ; GFX9-NEXT:    s_addk_i32 s0, 0x4004
 ; GFX9-NEXT:    scratch_load_dword v0, off, s0 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, 4
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4004
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: store_load_sindex_large_offset_foo:
@@ -2630,13 +3293,21 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dword v0, off, s0 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, 4
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0x4004
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v0
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v1
+; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: store_load_sindex_large_offset_foo:
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v0, 15
+; GFX11-NEXT:    v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, 0x4004
 ; GFX11-NEXT:    s_and_b32 s1, s0, 15
 ; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX11-NEXT:    s_lshl_b32 s1, s1, 2
@@ -2646,6 +3317,13 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, 4
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use v0
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use v1
+; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-PAL-LABEL: store_load_sindex_large_offset_foo:
@@ -2670,6 +3348,14 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
 ; GFX9-PAL-NEXT:    s_addk_i32 s0, 0x4004
 ; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 4
+; GFX9-PAL-NEXT:    ;;#ASMSTART
+; GFX9-PAL-NEXT:    ; use v0
+; GFX9-PAL-NEXT:    ;;#ASMEND
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 0x4004
+; GFX9-PAL-NEXT:    ;;#ASMSTART
+; GFX9-PAL-NEXT:    ; use v0
+; GFX9-PAL-NEXT:    ;;#ASMEND
 ; GFX9-PAL-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: store_load_sindex_large_offset_foo:
@@ -2686,6 +3372,14 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
 ; GFX940-NEXT:    s_addk_i32 s0, 0x4004
 ; GFX940-NEXT:    scratch_load_dword v0, off, s0 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v0, 4
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use v0
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    v_mov_b32_e32 v0, 0x4004
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use v0
+; GFX940-NEXT:    ;;#ASMEND
 ; GFX940-NEXT:    s_endpgm
 ;
 ; GFX1010-PAL-LABEL: store_load_sindex_large_offset_foo:
@@ -2712,6 +3406,14 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
 ; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
 ; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 4
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 0x4004
+; GFX1010-PAL-NEXT:    ;;#ASMSTART
+; GFX1010-PAL-NEXT:    ; use v0
+; GFX1010-PAL-NEXT:    ;;#ASMEND
+; GFX1010-PAL-NEXT:    ;;#ASMSTART
+; GFX1010-PAL-NEXT:    ; use v1
+; GFX1010-PAL-NEXT:    ;;#ASMEND
 ; GFX1010-PAL-NEXT:    s_endpgm
 ;
 ; GFX1030-PAL-LABEL: store_load_sindex_large_offset_foo:
@@ -2737,13 +3439,21 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
 ; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s1 glc dlc
 ; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 4
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 0x4004
+; GFX1030-PAL-NEXT:    ;;#ASMSTART
+; GFX1030-PAL-NEXT:    ; use v0
+; GFX1030-PAL-NEXT:    ;;#ASMEND
+; GFX1030-PAL-NEXT:    ;;#ASMSTART
+; GFX1030-PAL-NEXT:    ; use v1
+; GFX1030-PAL-NEXT:    ;;#ASMEND
 ; GFX1030-PAL-NEXT:    s_endpgm
 ;
 ; GFX11-PAL-LABEL: store_load_sindex_large_offset_foo:
 ; GFX11-PAL:       ; %bb.0: ; %bb
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, off offset:4 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 15
+; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, 0x4004
 ; GFX11-PAL-NEXT:    s_and_b32 s1, s0, 15
 ; GFX11-PAL-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX11-PAL-NEXT:    s_lshl_b32 s1, s1, 2
@@ -2753,6 +3463,13 @@ define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, off, s1 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 4
+; GFX11-PAL-NEXT:    ;;#ASMSTART
+; GFX11-PAL-NEXT:    ; use v0
+; GFX11-PAL-NEXT:    ;;#ASMEND
+; GFX11-PAL-NEXT:    ;;#ASMSTART
+; GFX11-PAL-NEXT:    ; use v1
+; GFX11-PAL-NEXT:    ;;#ASMEND
 ; GFX11-PAL-NEXT:    s_endpgm
 bb:
   %padding = alloca [4096 x i32], align 4, addrspace(5)
@@ -2767,6 +3484,8 @@ bb:
   %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
   %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
   %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
+  call void asm sideeffect "; use $0", "s"([4096 x i32] addrspace(5)* %padding) #0
+  call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0
   ret void
 }
 
@@ -2786,6 +3505,14 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX9-NEXT:    v_sub_u32_e32 v0, 0x4004, v0
 ; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4004
+; GFX9-NEXT:    v_mov_b32_e32 v1, 4
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v1
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: store_load_vindex_large_offset_kernel:
@@ -2804,6 +3531,14 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, 4
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0x4004
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v0
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v1
+; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: store_load_vindex_large_offset_kernel:
@@ -2815,8 +3550,16 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 0x4004, v0
 ; GFX11-NEXT:    scratch_store_b32 v0, v1, vcc_lo dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0x4004
 ; GFX11-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, 4
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use v0
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use v1
+; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-PAL-LABEL: store_load_vindex_large_offset_kernel:
@@ -2839,6 +3582,14 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX9-PAL-NEXT:    v_sub_u32_e32 v0, 0x4004, v0
 ; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 0x4004
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 4
+; GFX9-PAL-NEXT:    ;;#ASMSTART
+; GFX9-PAL-NEXT:    ; use v1
+; GFX9-PAL-NEXT:    ;;#ASMEND
+; GFX9-PAL-NEXT:    ;;#ASMSTART
+; GFX9-PAL-NEXT:    ; use v0
+; GFX9-PAL-NEXT:    ;;#ASMEND
 ; GFX9-PAL-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: store_load_vindex_large_offset_kernel:
@@ -2853,6 +3604,14 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX940-NEXT:    v_sub_u32_e32 v0, 0x4004, v0
 ; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:124 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v0, 0x4004
+; GFX940-NEXT:    v_mov_b32_e32 v1, 4
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use v1
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use v0
+; GFX940-NEXT:    ;;#ASMEND
 ; GFX940-NEXT:    s_endpgm
 ;
 ; GFX1010-PAL-LABEL: store_load_vindex_large_offset_kernel:
@@ -2877,6 +3636,14 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1010-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
 ; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 4
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v1, 0x4004
+; GFX1010-PAL-NEXT:    ;;#ASMSTART
+; GFX1010-PAL-NEXT:    ; use v0
+; GFX1010-PAL-NEXT:    ;;#ASMEND
+; GFX1010-PAL-NEXT:    ;;#ASMSTART
+; GFX1010-PAL-NEXT:    ; use v1
+; GFX1010-PAL-NEXT:    ;;#ASMEND
 ; GFX1010-PAL-NEXT:    s_endpgm
 ;
 ; GFX1030-PAL-LABEL: store_load_vindex_large_offset_kernel:
@@ -2900,6 +3667,14 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1030-PAL-NEXT:    scratch_load_dword v0, v0, off offset:124 glc dlc
 ; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 4
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v1, 0x4004
+; GFX1030-PAL-NEXT:    ;;#ASMSTART
+; GFX1030-PAL-NEXT:    ; use v0
+; GFX1030-PAL-NEXT:    ;;#ASMEND
+; GFX1030-PAL-NEXT:    ;;#ASMSTART
+; GFX1030-PAL-NEXT:    ; use v1
+; GFX1030-PAL-NEXT:    ;;#ASMEND
 ; GFX1030-PAL-NEXT:    s_endpgm
 ;
 ; GFX11-PAL-LABEL: store_load_vindex_large_offset_kernel:
@@ -2911,8 +3686,16 @@ define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
 ; GFX11-PAL-NEXT:    v_sub_nc_u32_e32 v2, 0x4004, v0
 ; GFX11-PAL-NEXT:    scratch_store_b32 v0, v1, vcc_lo dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-PAL-NEXT:    v_mov_b32_e32 v1, 0x4004
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, v2, off offset:124 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 4
+; GFX11-PAL-NEXT:    ;;#ASMSTART
+; GFX11-PAL-NEXT:    ; use v0
+; GFX11-PAL-NEXT:    ;;#ASMEND
+; GFX11-PAL-NEXT:    ;;#ASMSTART
+; GFX11-PAL-NEXT:    ; use v1
+; GFX11-PAL-NEXT:    ;;#ASMEND
 ; GFX11-PAL-NEXT:    s_endpgm
 bb:
   %padding = alloca [4096 x i32], align 4, addrspace(5)
@@ -2929,6 +3712,8 @@ bb:
   %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
   %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
   %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
+  call void asm sideeffect "; use $0", "s"([4096 x i32] addrspace(5)* %padding) #0
+  call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0
   ret void
 }
 
@@ -2938,8 +3723,8 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    scratch_load_dword v1, off, s32 offset:4 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
-; GFX9-NEXT:    v_mov_b32_e32 v1, vcc_hi
+; GFX9-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
+; GFX9-NEXT:    v_mov_b32_e32 v1, vcc_lo
 ; GFX9-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 15
 ; GFX9-NEXT:    v_and_b32_e32 v0, 15, v0
@@ -2948,6 +3733,14 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
 ; GFX9-NEXT:    scratch_load_dword v0, v0, off glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 4
+; GFX9-NEXT:    v_mov_b32_e32 v0, vcc_hi
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v1
+; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: store_load_vindex_large_offset_foo:
@@ -2955,17 +3748,27 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 15, v0
-; GFX10-NEXT:    s_add_i32 s0, s32, 0x4004
-; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
-; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX10-NEXT:    s_add_i32 s2, s32, 0x4004
+; GFX10-NEXT:    s_add_i32 s1, s32, 0x4004
+; GFX10-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 15
-; GFX10-NEXT:    v_lshl_add_u32 v1, v1, 2, vcc_lo
+; GFX10-NEXT:    v_lshl_add_u32 v1, v1, 2, s1
 ; GFX10-NEXT:    scratch_load_dword v3, off, s32 offset:4 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_add_i32 s0, s32, 4
+; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
 ; GFX10-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dword v0, v1, off glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-NEXT:    v_mov_b32_e32 v1, vcc_lo
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v0
+; GFX10-NEXT:    ;;#ASMEND
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v1
+; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: store_load_vindex_large_offset_foo:
@@ -2973,17 +3776,25 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT:    s_add_i32 s0, s32, 0x4004
+; GFX11-NEXT:    s_add_i32 s2, s32, 0x4004
+; GFX11-NEXT:    s_add_i32 s1, s32, 0x4004
+; GFX11-NEXT:    s_add_i32 s0, s32, 4
 ; GFX11-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; GFX11-NEXT:    scratch_load_b32 v3, off, s32 offset:4 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    scratch_store_b32 v0, v2, s0 dlc
+; GFX11-NEXT:    scratch_store_b32 v0, v2, s2 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    scratch_load_b32 v0, v1, vcc_lo glc dlc
+; GFX11-NEXT:    scratch_load_b32 v0, v1, s1 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, vcc_lo
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use v0
+; GFX11-NEXT:    ;;#ASMEND
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use v1
+; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-PAL-LABEL: store_load_vindex_large_offset_foo:
@@ -2991,8 +3802,8 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-PAL-NEXT:    scratch_load_dword v1, off, s32 offset:4 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
-; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, vcc_hi
+; GFX9-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, vcc_lo
 ; GFX9-PAL-NEXT:    v_lshl_add_u32 v2, v0, 2, v1
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v3, 15
 ; GFX9-PAL-NEXT:    v_and_b32_e32 v0, 15, v0
@@ -3001,6 +3812,14 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
 ; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 4
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, vcc_hi
+; GFX9-PAL-NEXT:    ;;#ASMSTART
+; GFX9-PAL-NEXT:    ; use v0
+; GFX9-PAL-NEXT:    ;;#ASMEND
+; GFX9-PAL-NEXT:    ;;#ASMSTART
+; GFX9-PAL-NEXT:    ; use v1
+; GFX9-PAL-NEXT:    ;;#ASMEND
 ; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: store_load_vindex_large_offset_foo:
@@ -3010,14 +3829,24 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
 ; GFX940-NEXT:    v_mov_b32_e32 v2, 15
-; GFX940-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
+; GFX940-NEXT:    s_add_i32 s1, s32, 0x4004
 ; GFX940-NEXT:    v_and_b32_e32 v0, 15, v0
-; GFX940-NEXT:    scratch_store_dword v1, v2, vcc_lo sc0 sc1
+; GFX940-NEXT:    scratch_store_dword v1, v2, s1 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX940-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
-; GFX940-NEXT:    scratch_load_dword v0, v0, vcc_hi sc0 sc1
+; GFX940-NEXT:    s_add_i32 s0, s32, 0x4004
+; GFX940-NEXT:    scratch_load_dword v0, v0, s0 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_add_i32 vcc_lo, s32, 4
+; GFX940-NEXT:    v_mov_b32_e32 v0, vcc_lo
+; GFX940-NEXT:    s_add_i32 vcc_hi, s32, 0x4004
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use v0
+; GFX940-NEXT:    ;;#ASMEND
+; GFX940-NEXT:    v_mov_b32_e32 v0, vcc_hi
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use v0
+; GFX940-NEXT:    ;;#ASMEND
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-PAL-LABEL: store_load_vindex_large_offset_foo:
@@ -3025,17 +3854,27 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-PAL-NEXT:    v_and_b32_e32 v1, 15, v0
-; GFX10-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
-; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
-; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, s0
+; GFX10-PAL-NEXT:    s_add_i32 s2, s32, 0x4004
+; GFX10-PAL-NEXT:    s_add_i32 s1, s32, 0x4004
+; GFX10-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, s2
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v2, 15
-; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v1, 2, vcc_lo
+; GFX10-PAL-NEXT:    v_lshl_add_u32 v1, v1, 2, s1
 ; GFX10-PAL-NEXT:    scratch_load_dword v3, off, s32 offset:4 glc dlc
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT:    s_add_i32 s0, s32, 4
+; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
 ; GFX10-PAL-NEXT:    scratch_store_dword v0, v2, off
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-PAL-NEXT:    scratch_load_dword v0, v1, off glc dlc
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, vcc_lo
+; GFX10-PAL-NEXT:    ;;#ASMSTART
+; GFX10-PAL-NEXT:    ; use v0
+; GFX10-PAL-NEXT:    ;;#ASMEND
+; GFX10-PAL-NEXT:    ;;#ASMSTART
+; GFX10-PAL-NEXT:    ; use v1
+; GFX10-PAL-NEXT:    ;;#ASMEND
 ; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-PAL-LABEL: store_load_vindex_large_offset_foo:
@@ -3043,34 +3882,26 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-PAL-NEXT:    v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
-; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-PAL-NEXT:    s_add_i32 s0, s32, 0x4004
+; GFX11-PAL-NEXT:    s_add_i32 s2, s32, 0x4004
+; GFX11-PAL-NEXT:    s_add_i32 s1, s32, 0x4004
+; GFX11-PAL-NEXT:    s_add_i32 s0, s32, 4
 ; GFX11-PAL-NEXT:    s_add_i32 vcc_lo, s32, 0x4004
-; GFX11-PAL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-PAL-NEXT:    v_lshlrev_b32_e32 v1, 2, v1
 ; GFX11-PAL-NEXT:    scratch_load_b32 v3, off, s32 offset:4 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-PAL-NEXT:    scratch_store_b32 v0, v2, s0 dlc
+; GFX11-PAL-NEXT:    scratch_store_b32 v0, v2, s2 dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, vcc_lo glc dlc
+; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, s1 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, vcc_lo
+; GFX11-PAL-NEXT:    ;;#ASMSTART
+; GFX11-PAL-NEXT:    ; use v0
+; GFX11-PAL-NEXT:    ;;#ASMEND
+; GFX11-PAL-NEXT:    ;;#ASMSTART
+; GFX11-PAL-NEXT:    ; use v1
+; GFX11-PAL-NEXT:    ;;#ASMEND
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
-; GCN-LABEL: store_load_vindex_large_offset_foo:
-; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    scratch_load_dword v1, off, s32 sc0 sc1
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v2, 15
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GCN-NEXT:    v_and_b32_e32 v0, v0, v2
-; GCN-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
-; GCN-NEXT:    scratch_store_dword v1, v2, vcc_hi sc0 sc1
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GCN-NEXT:    s_add_u32 vcc_hi, s32, 0x4000
-; GCN-NEXT:    scratch_load_dword v0, v0, vcc_hi sc0 sc1
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %padding = alloca [4096 x i32], align 4, addrspace(5)
   %i = alloca [32 x float], align 4, addrspace(5)
@@ -3084,6 +3915,8 @@ bb:
   %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9
   %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)*
   %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4
+  call void asm sideeffect "; use $0", "s"([4096 x i32] addrspace(5)* %padding) #0
+  call void asm sideeffect "; use $0", "s"([32 x float] addrspace(5)* %i) #0
   ret void
 }
 
@@ -3103,6 +3936,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, 4
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: store_load_large_imm_offset_kernel:
@@ -3121,6 +3958,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, 4
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v0
+; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: store_load_large_imm_offset_kernel:
@@ -3133,6 +3974,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_load_b32 v0, v1, off offset:3716 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, 4
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use v0
+; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-PAL-LABEL: store_load_large_imm_offset_kernel:
@@ -3155,6 +4000,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 4
+; GFX9-PAL-NEXT:    ;;#ASMSTART
+; GFX9-PAL-NEXT:    ; use v0
+; GFX9-PAL-NEXT:    ;;#ASMEND
 ; GFX9-PAL-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: store_load_large_imm_offset_kernel:
@@ -3168,6 +4017,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:3716 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v0, 4
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use v0
+; GFX940-NEXT:    ;;#ASMEND
 ; GFX940-NEXT:    s_endpgm
 ;
 ; GFX1010-PAL-LABEL: store_load_large_imm_offset_kernel:
@@ -3192,6 +4045,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ; GFX1010-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1010-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
 ; GFX1010-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1010-PAL-NEXT:    v_mov_b32_e32 v0, 4
+; GFX1010-PAL-NEXT:    ;;#ASMSTART
+; GFX1010-PAL-NEXT:    ; use v0
+; GFX1010-PAL-NEXT:    ;;#ASMEND
 ; GFX1010-PAL-NEXT:    s_endpgm
 ;
 ; GFX1030-PAL-LABEL: store_load_large_imm_offset_kernel:
@@ -3215,6 +4072,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ; GFX1030-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX1030-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
 ; GFX1030-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1030-PAL-NEXT:    v_mov_b32_e32 v0, 4
+; GFX1030-PAL-NEXT:    ;;#ASMSTART
+; GFX1030-PAL-NEXT:    ; use v0
+; GFX1030-PAL-NEXT:    ;;#ASMEND
 ; GFX1030-PAL-NEXT:    s_endpgm
 ;
 ; GFX11-PAL-LABEL: store_load_large_imm_offset_kernel:
@@ -3227,6 +4088,10 @@ define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, off offset:3716 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 4
+; GFX11-PAL-NEXT:    ;;#ASMSTART
+; GFX11-PAL-NEXT:    ; use v0
+; GFX11-PAL-NEXT:    ;;#ASMEND
 ; GFX11-PAL-NEXT:    s_endpgm
 bb:
   %i = alloca [4096 x i32], align 4, addrspace(5)
@@ -3236,6 +4101,7 @@ bb:
   store volatile i32 15, i32 addrspace(5)* %i7, align 4
   %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
   %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4
+  call void asm sideeffect "; use $0", "s"([4096 x i32] addrspace(5)* %i) #0
   ret void
 }
 
@@ -3245,15 +4111,20 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 13
 ; GFX9-NEXT:    s_movk_i32 s0, 0x3000
-; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 4
+; GFX9-NEXT:    s_add_i32 vcc_lo, s32, 4
 ; GFX9-NEXT:    scratch_store_dword off, v0, s32 offset:4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_add_i32 s0, s0, vcc_hi
+; GFX9-NEXT:    s_add_i32 s0, s0, vcc_lo
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 15
 ; GFX9-NEXT:    scratch_store_dword off, v0, s0 offset:3712
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_add_i32 vcc_hi, s32, 4
+; GFX9-NEXT:    v_mov_b32_e32 v0, vcc_hi
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v0
+; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: store_load_large_imm_offset_foo:
@@ -3263,14 +4134,19 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 13
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 15
 ; GFX10-NEXT:    s_movk_i32 s0, 0x3800
-; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 4
-; GFX10-NEXT:    s_add_i32 s0, s0, vcc_lo
+; GFX10-NEXT:    s_add_i32 s1, s32, 4
+; GFX10-NEXT:    s_add_i32 s0, s0, s1
 ; GFX10-NEXT:    scratch_store_dword off, v0, s32 offset:4
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_store_dword off, v1, s0 offset:1664
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    s_add_i32 vcc_lo, s32, 4
+; GFX10-NEXT:    v_mov_b32_e32 v0, vcc_lo
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v0
+; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: store_load_large_imm_offset_foo:
@@ -3279,12 +4155,17 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 15
+; GFX11-NEXT:    s_add_i32 vcc_lo, s32, 4
 ; GFX11-NEXT:    scratch_store_b32 off, v0, s32 offset:4 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_store_b32 v1, v2, s32 offset:3716 dlc
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_load_b32 v0, v1, s32 offset:3716 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, vcc_lo
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use v0
+; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-PAL-LABEL: store_load_large_imm_offset_foo:
@@ -3292,15 +4173,20 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 13
 ; GFX9-PAL-NEXT:    s_movk_i32 s0, 0x3000
-; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 4
+; GFX9-PAL-NEXT:    s_add_i32 vcc_lo, s32, 4
 ; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s32 offset:4
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-PAL-NEXT:    s_add_i32 s0, s0, vcc_hi
+; GFX9-PAL-NEXT:    s_add_i32 s0, s0, vcc_lo
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, 15
 ; GFX9-PAL-NEXT:    scratch_store_dword off, v0, s0 offset:3712
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:3712 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT:    s_add_i32 vcc_hi, s32, 4
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v0, vcc_hi
+; GFX9-PAL-NEXT:    ;;#ASMSTART
+; GFX9-PAL-NEXT:    ; use v0
+; GFX9-PAL-NEXT:    ;;#ASMEND
 ; GFX9-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX940-LABEL: store_load_large_imm_offset_foo:
@@ -3315,6 +4201,11 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    scratch_load_dword v0, v0, s32 offset:3716 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    s_add_i32 vcc_hi, s32, 4
+; GFX940-NEXT:    v_mov_b32_e32 v0, vcc_hi
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use v0
+; GFX940-NEXT:    ;;#ASMEND
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-PAL-LABEL: store_load_large_imm_offset_foo:
@@ -3324,14 +4215,19 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 13
 ; GFX10-PAL-NEXT:    v_mov_b32_e32 v1, 15
 ; GFX10-PAL-NEXT:    s_movk_i32 s0, 0x3800
-; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 4
-; GFX10-PAL-NEXT:    s_add_i32 s0, s0, vcc_lo
+; GFX10-PAL-NEXT:    s_add_i32 s1, s32, 4
+; GFX10-PAL-NEXT:    s_add_i32 s0, s0, s1
 ; GFX10-PAL-NEXT:    scratch_store_dword off, v0, s32 offset:4
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-PAL-NEXT:    scratch_store_dword off, v1, s0 offset:1664
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-PAL-NEXT:    scratch_load_dword v0, off, s0 offset:1664 glc dlc
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT:    s_add_i32 vcc_lo, s32, 4
+; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, vcc_lo
+; GFX10-PAL-NEXT:    ;;#ASMSTART
+; GFX10-PAL-NEXT:    ; use v0
+; GFX10-PAL-NEXT:    ;;#ASMEND
 ; GFX10-PAL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-PAL-LABEL: store_load_large_imm_offset_foo:
@@ -3340,26 +4236,18 @@ define void @store_load_large_imm_offset_foo() {
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-PAL-NEXT:    v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000
 ; GFX11-PAL-NEXT:    v_mov_b32_e32 v2, 15
+; GFX11-PAL-NEXT:    s_add_i32 vcc_lo, s32, 4
 ; GFX11-PAL-NEXT:    scratch_store_b32 off, v0, s32 offset:4 dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-PAL-NEXT:    scratch_store_b32 v1, v2, s32 offset:3716 dlc
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, v1, s32 offset:3716 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, vcc_lo
+; GFX11-PAL-NEXT:    ;;#ASMSTART
+; GFX11-PAL-NEXT:    ; use v0
+; GFX11-PAL-NEXT:    ;;#ASMEND
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
-; GCN-LABEL: store_load_large_imm_offset_foo:
-; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, 13
-; GCN-NEXT:    scratch_store_dword off, v0, s32 sc0 sc1
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, 0x3000
-; GCN-NEXT:    v_mov_b32_e32 v1, 15
-; GCN-NEXT:    scratch_store_dword v0, v1, s32 offset:3712 sc0 sc1
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    scratch_load_dword v0, v0, s32 offset:3712 sc0 sc1
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = alloca [4096 x i32], align 4, addrspace(5)
   %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef
@@ -3368,6 +4256,7 @@ bb:
   store volatile i32 15, i32 addrspace(5)* %i7, align 4
   %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000
   %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4
+  call void asm sideeffect "; use $0", "s"([4096 x i32] addrspace(5)* %i) #0
   ret void
 }
 
@@ -3378,14 +4267,17 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s2, s5
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s3, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v0, s0, v0
 ; GFX9-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
-; GFX9-NEXT:    v_mov_b32_e32 v1, 15
-; GFX9-NEXT:    scratch_store_dword v0, v1, off offset:1024
+; GFX9-NEXT:    scratch_store_dword v0, v2, off offset:1024
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    ;;#ASMSTART
+; GFX9-NEXT:    ; use v1
+; GFX9-NEXT:    ;;#ASMEND
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: store_load_vidx_sidx_offset:
@@ -3403,6 +4295,10 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc dlc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v0, 4
+; GFX10-NEXT:    ;;#ASMSTART
+; GFX10-NEXT:    ; use v0
+; GFX10-NEXT:    ;;#ASMEND
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: store_load_vidx_sidx_offset:
@@ -3415,6 +4311,10 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    scratch_load_b32 v0, v0, off offset:1028 glc dlc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v0, 4
+; GFX11-NEXT:    ;;#ASMSTART
+; GFX11-NEXT:    ; use v0
+; GFX11-NEXT:    ;;#ASMEND
 ; GFX11-NEXT:    s_endpgm
 ;
 ; GFX9-PAL-LABEL: store_load_vidx_sidx_offset:
@@ -3424,17 +4324,20 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX9-PAL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 4
 ; GFX9-PAL-NEXT:    s_load_dword s0, s[0:1], 0x0
+; GFX9-PAL-NEXT:    v_mov_b32_e32 v2, 15
 ; GFX9-PAL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-PAL-NEXT:    s_and_b32 s5, s5, 0xffff
 ; GFX9-PAL-NEXT:    s_add_u32 flat_scratch_lo, s4, s3
 ; GFX9-PAL-NEXT:    v_add_u32_e32 v0, s0, v0
 ; GFX9-PAL-NEXT:    s_addc_u32 flat_scratch_hi, s5, 0
 ; GFX9-PAL-NEXT:    v_lshl_add_u32 v0, v0, 2, v1
-; GFX9-PAL-NEXT:    v_mov_b32_e32 v1, 15
-; GFX9-PAL-NEXT:    scratch_store_dword v0, v1, off offset:1024
+; GFX9-PAL-NEXT:    scratch_store_dword v0, v2, off offset:1024
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-PAL-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc
 ; GFX9-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-PAL-NEXT:    ;;#ASMSTART
+; GFX9-PAL-NEXT:    ; use v1
+; GFX9-PAL-NEXT:    ;;#ASMEND
 ; GFX9-PAL-NEXT:    s_endpgm
 ;
 ; GFX940-LABEL: store_load_vidx_sidx_offset:
@@ -3447,6 +4350,10 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    scratch_load_dword v0, v0, off offset:1028 sc0 sc1
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v0, 4
+; GFX940-NEXT:    ;;#ASMSTART
+; GFX940-NEXT:    ; use v0
+; GFX940-NEXT:    ;;#ASMEND
 ; GFX940-NEXT:    s_endpgm
 ;
 ; GFX10-PAL-LABEL: store_load_vidx_sidx_offset:
@@ -3469,6 +4376,10 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX10-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-PAL-NEXT:    scratch_load_dword v0, v0, off offset:1024 glc dlc
 ; GFX10-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-PAL-NEXT:    v_mov_b32_e32 v0, 4
+; GFX10-PAL-NEXT:    ;;#ASMSTART
+; GFX10-PAL-NEXT:    ; use v0
+; GFX10-PAL-NEXT:    ;;#ASMEND
 ; GFX10-PAL-NEXT:    s_endpgm
 ;
 ; GFX11-PAL-LABEL: store_load_vidx_sidx_offset:
@@ -3481,18 +4392,11 @@ define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
 ; GFX11-PAL-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-PAL-NEXT:    scratch_load_b32 v0, v0, off offset:1028 glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-PAL-NEXT:    v_mov_b32_e32 v0, 4
+; GFX11-PAL-NEXT:    ;;#ASMSTART
+; GFX11-PAL-NEXT:    ; use v0
+; GFX11-PAL-NEXT:    ;;#ASMEND
 ; GFX11-PAL-NEXT:    s_endpgm
-; GCN-LABEL: store_load_vidx_sidx_offset:
-; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dword s0, s[0:1], 0x24
-; GCN-NEXT:    v_mov_b32_e32 v1, 15
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_add_lshl_u32 v0, s0, v0, 2
-; GCN-NEXT:    scratch_store_dword v0, v1, off offset:1028 sc0 sc1
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    scratch_load_dword v0, v0, off offset:1028 sc0 sc1
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_endpgm
 bb:
   %alloca = alloca [32 x i32], align 4, addrspace(5)
   %vidx = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -3501,6 +4405,7 @@ bb:
   %gep = getelementptr inbounds [32 x i32], [32 x i32] addrspace(5)* %alloca, i32 0, i32 %add2
   store volatile i32 15, i32 addrspace(5)* %gep, align 4
   %load = load volatile i32, i32 addrspace(5)* %gep, align 4
+  call void asm sideeffect "; use $0", "s"([32 x i32] addrspace(5)* %alloca) #0
   ret void
 }
 
@@ -3583,16 +4488,6 @@ define void @store_load_i64_aligned(i64 addrspace(5)* nocapture %arg) {
 ; GFX11-PAL-NEXT:    scratch_load_b64 v[0:1], v0, off glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
-; GCN-LABEL: store_load_i64_aligned:
-; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v2, 15
-; GCN-NEXT:    v_mov_b32_e32 v3, 0
-; GCN-NEXT:    scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
 bb:
   store volatile i64 15, i64 addrspace(5)* %arg, align 8
   %load = load volatile i64, i64 addrspace(5)* %arg, align 8
@@ -3678,16 +4573,6 @@ define void @store_load_i64_unaligned(i64 addrspace(5)* nocapture %arg) {
 ; GFX11-PAL-NEXT:    scratch_load_b64 v[0:1], v0, off glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
-; GCN-LABEL: store_load_i64_unaligned:
-; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v2, 15
-; GCN-NEXT:    v_mov_b32_e32 v3, 0
-; GCN-NEXT:    scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
 bb:
   store volatile i64 15, i64 addrspace(5)* %arg, align 1
   %load = load volatile i64, i64 addrspace(5)* %arg, align 1
@@ -3780,17 +4665,6 @@ define void @store_load_v3i32_unaligned(<3 x i32> addrspace(5)* nocapture %arg)
 ; GFX11-PAL-NEXT:    scratch_load_b96 v[0:2], v0, off glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
-; GCN-LABEL: store_load_v3i32_unaligned:
-; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v2, 1
-; GCN-NEXT:    v_mov_b32_e32 v3, 2
-; GCN-NEXT:    v_mov_b32_e32 v4, 3
-; GCN-NEXT:    scratch_store_dwordx3 v0, v[2:4], off sc0 sc1
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    scratch_load_dwordx3 v[0:2], v0, off sc0 sc1
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
 bb:
   store volatile <3 x i32> <i32 1, i32 2, i32 3>, <3 x i32> addrspace(5)* %arg, align 1
   %load = load volatile <3 x i32>, <3 x i32> addrspace(5)* %arg, align 1
@@ -3888,18 +4762,6 @@ define void @store_load_v4i32_unaligned(<4 x i32> addrspace(5)* nocapture %arg)
 ; GFX11-PAL-NEXT:    scratch_load_b128 v[0:3], v0, off glc dlc
 ; GFX11-PAL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-PAL-NEXT:    s_setpc_b64 s[30:31]
-; GCN-LABEL: store_load_v4i32_unaligned:
-; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v2, 1
-; GCN-NEXT:    v_mov_b32_e32 v3, 2
-; GCN-NEXT:    v_mov_b32_e32 v4, 3
-; GCN-NEXT:    v_mov_b32_e32 v5, 4
-; GCN-NEXT:    scratch_store_dwordx4 v0, v[2:5], off sc0 sc1
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    scratch_load_dwordx4 v[0:3], v0, off sc0 sc1
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    s_setpc_b64 s[30:31]
 bb:
   store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %arg, align 1
   %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* %arg, align 1

diff  --git a/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll b/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll
index 6be01670a7beb..3e1cb1ffb2995 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll
@@ -10,6 +10,7 @@ define void @memcpy_fixed_align(i8 addrspace(5)*  %dst, i8 addrspace(1)* %src) {
 ; MUBUF-NEXT:    global_load_dwordx2 v[11:12], v[1:2], off offset:32
 ; MUBUF-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off offset:16
 ; MUBUF-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off
+; MUBUF-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
 ; MUBUF-NEXT:    s_waitcnt vmcnt(2)
 ; MUBUF-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:36
 ; MUBUF-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:32
@@ -23,6 +24,9 @@ define void @memcpy_fixed_align(i8 addrspace(5)*  %dst, i8 addrspace(1)* %src) {
 ; MUBUF-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:8
 ; MUBUF-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:4
 ; MUBUF-NEXT:    buffer_store_dword v7, off, s[0:3], s32
+; MUBUF-NEXT:    ;;#ASMSTART
+; MUBUF-NEXT:    ; use v0
+; MUBUF-NEXT:    ;;#ASMEND
 ; MUBUF-NEXT:    s_waitcnt vmcnt(0)
 ; MUBUF-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -32,17 +36,22 @@ define void @memcpy_fixed_align(i8 addrspace(5)*  %dst, i8 addrspace(1)* %src) {
 ; FLATSCR-NEXT:    global_load_dwordx2 v[11:12], v[1:2], off offset:32
 ; FLATSCR-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off offset:16
 ; FLATSCR-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off
+; FLATSCR-NEXT:    v_mov_b32_e32 v0, s32
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(2)
 ; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[11:12], s32 offset:32
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(2)
 ; FLATSCR-NEXT:    scratch_store_dwordx4 off, v[3:6], s32 offset:16
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(2)
 ; FLATSCR-NEXT:    scratch_store_dwordx4 off, v[7:10], s32
+; FLATSCR-NEXT:    ;;#ASMSTART
+; FLATSCR-NEXT:    ; use v0
+; FLATSCR-NEXT:    ;;#ASMEND
 ; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
 ; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca [40 x i8], addrspace(5)
   %cast = bitcast [40 x i8] addrspace(5)* %alloca to i8 addrspace(5)*
   call void @llvm.memcpy.p5i8.p1i8.i64(i8 addrspace(5)* align 4 dereferenceable(40) %cast, i8 addrspace(1)* align 4 dereferenceable(40) %src, i64 40, i1 false)
+  call void asm sideeffect "; use $0", "s"([40 x i8] addrspace(5)* %alloca) #0
   ret void
 }
 


        


More information about the llvm-commits mailing list