[llvm] [AMDGPU][DAG] Enable ganging up of memcpy loads/stores for AMDGPU (PR #96185)
Fabian Ritter via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 25 07:01:42 PDT 2024
https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/96185
>From 3b2b399f4e3d07f6c3a6a663ef4540c9f1684106 Mon Sep 17 00:00:00 2001
From: Fabian Ritter <fabian.ritter at amd.com>
Date: Thu, 20 Jun 2024 08:21:41 -0400
Subject: [PATCH 1/2] [AMDGPU][DAG] Enable ganging up of memcpy loads/stores
for AMDGPU
In the SelectionDAG lowering of the memcpy intrinsic, this optimization
introduces additional chains between fixed-size groups of loads and the
corresponding stores. While initially introduced to ensure that wider
load/store-pair instructions are generated on AArch64, this optimization also
improves code generation for AMDGPU: Ganged loads are scheduled into a clause;
stores only await completion of their corresponding load.
The chosen value of 32 performed well in microbenchmarks; values of 8, 16, or
64 would perform similarly.
The testcase updates are autogenerated by utils/update_llc_test_checks.py.
See also:
- PR introducing this optimization: https://reviews.llvm.org/D46477
Part of SWDEV-455845.
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 3 +
.../AMDGPU/gfx-callable-argument-types.ll | 16 +-
.../test/CodeGen/AMDGPU/memcpy-fixed-align.ll | 32 +-
llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll | 4302 ++++++++---------
4 files changed, 2089 insertions(+), 2264 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 522b3a34161cd..63561ec3c77f3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -67,6 +67,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = ~0U;
MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = ~0U;
+ // Enable ganging up loads and stores in the memcpy DAG lowering.
+ MaxGluedStoresPerMemcpy = 32;
+
// Lower floating point store/load to integer store/load to reduce the number
// of patterns in tablegen.
setOperationAction(ISD::LOAD, MVT::f32, Promote);
diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
index a118fa388f86d..645e48f1bb1ab 100644
--- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll
@@ -9074,8 +9074,8 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:20
-; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:16
+; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:16
+; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:20
; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33
; GFX9-NEXT: v_writelane_b32 v40, s30, 0
; GFX9-NEXT: v_writelane_b32 v40, s31, 1
@@ -9113,9 +9113,9 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
; GFX9-NEXT: s_mov_b32 s4, byval_align16_f64_arg@abs32@lo
; GFX9-NEXT: v_writelane_b32 v40, s63, 31
; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4
+; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32
; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32
+; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: v_readlane_b32 s63, v40, 31
; GFX9-NEXT: v_readlane_b32 s62, v40, 30
@@ -9167,17 +9167,17 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
; GFX10-NEXT: s_mov_b32 exec_lo, s4
; GFX10-NEXT: s_clause 0x2
-; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:20
-; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:16
+; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:16
+; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:20
; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s33
; GFX10-NEXT: v_writelane_b32 v40, s30, 0
; GFX10-NEXT: s_addk_i32 s32, 0x400
; GFX10-NEXT: s_mov_b32 s5, byval_align16_f64_arg@abs32@hi
; GFX10-NEXT: s_mov_b32 s4, byval_align16_f64_arg@abs32@lo
; GFX10-NEXT: s_waitcnt vmcnt(2)
-; GFX10-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4
+; GFX10-NEXT: buffer_store_dword v32, off, s[0:3], s32
; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32
+; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4
; GFX10-NEXT: v_writelane_b32 v40, s31, 1
; GFX10-NEXT: v_writelane_b32 v40, s34, 2
; GFX10-NEXT: v_writelane_b32 v40, s35, 3
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll b/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll
index a5e0ceaa6b329..343925528a520 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll
@@ -8,22 +8,22 @@ define void @memcpy_fixed_align(ptr addrspace(5) %dst, ptr addrspace(1) %src) {
; MUBUF: ; %bb.0:
; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; MUBUF-NEXT: global_load_dwordx2 v[11:12], v[1:2], off offset:32
-; MUBUF-NEXT: global_load_dwordx4 v[3:6], v[1:2], off offset:16
-; MUBUF-NEXT: global_load_dwordx4 v[7:10], v[1:2], off
+; MUBUF-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; MUBUF-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16
; MUBUF-NEXT: v_lshrrev_b32_e64 v0, 6, s32
; MUBUF-NEXT: s_waitcnt vmcnt(2)
-; MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:36
; MUBUF-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:32
+; MUBUF-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:36
; MUBUF-NEXT: s_waitcnt vmcnt(3)
-; MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28
-; MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24
-; MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20
-; MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16
+; MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:12
+; MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:8
+; MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:4
+; MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], s32
; MUBUF-NEXT: s_waitcnt vmcnt(6)
-; MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:12
-; MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:8
-; MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:4
-; MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], s32
+; MUBUF-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:28
+; MUBUF-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:24
+; MUBUF-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:20
+; MUBUF-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:16
; MUBUF-NEXT: ;;#ASMSTART
; MUBUF-NEXT: ; use v0
; MUBUF-NEXT: ;;#ASMEND
@@ -33,16 +33,16 @@ define void @memcpy_fixed_align(ptr addrspace(5) %dst, ptr addrspace(1) %src) {
; FLATSCR-LABEL: memcpy_fixed_align:
; FLATSCR: ; %bb.0:
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FLATSCR-NEXT: global_load_dwordx4 v[3:6], v[1:2], off
+; FLATSCR-NEXT: global_load_dwordx4 v[7:10], v[1:2], off offset:16
; FLATSCR-NEXT: global_load_dwordx2 v[11:12], v[1:2], off offset:32
-; FLATSCR-NEXT: global_load_dwordx4 v[3:6], v[1:2], off offset:16
-; FLATSCR-NEXT: global_load_dwordx4 v[7:10], v[1:2], off
; FLATSCR-NEXT: v_mov_b32_e32 v0, s32
; FLATSCR-NEXT: s_waitcnt vmcnt(2)
-; FLATSCR-NEXT: scratch_store_dwordx2 off, v[11:12], s32 offset:32
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s32
; FLATSCR-NEXT: s_waitcnt vmcnt(2)
-; FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s32 offset:16
+; FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s32 offset:16
; FLATSCR-NEXT: s_waitcnt vmcnt(2)
-; FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s32
+; FLATSCR-NEXT: scratch_store_dwordx2 off, v[11:12], s32 offset:32
; FLATSCR-NEXT: ;;#ASMSTART
; FLATSCR-NEXT: ; use v0
; FLATSCR-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
index 358f42dfe8dd5..166bd90d098d0 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
@@ -13,148 +13,104 @@ define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0
; CHECK-NEXT: v_mov_b32_e32 v0, s2
; CHECK-NEXT: v_mov_b32_e32 v1, s3
; CHECK-NEXT: flat_load_ubyte v4, v[0:1]
+; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:1
+; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:2
+; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:3
+; CHECK-NEXT: flat_load_ubyte v8, v[0:1] offset:4
+; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:5
+; CHECK-NEXT: flat_load_ubyte v10, v[0:1] offset:6
+; CHECK-NEXT: flat_load_ubyte v11, v[0:1] offset:7
+; CHECK-NEXT: flat_load_ubyte v12, v[0:1] offset:8
+; CHECK-NEXT: flat_load_ubyte v13, v[0:1] offset:9
+; CHECK-NEXT: flat_load_ubyte v14, v[0:1] offset:10
+; CHECK-NEXT: flat_load_ubyte v15, v[0:1] offset:11
+; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:12
+; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:13
+; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:14
; CHECK-NEXT: v_mov_b32_e32 v3, s1
; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:1
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:1
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:2
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:2
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:3
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:3
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:4
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:4
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:5
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:5
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:6
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:6
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:7
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:7
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:8
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:8
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:9
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:9
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:10
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:10
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:11
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:11
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:12
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:12
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:13
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:13
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:14
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:14
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:15
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:15
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:16
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:17
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:17
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:18
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:18
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:19
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:19
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:20
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:20
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:21
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:21
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:22
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:22
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:23
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:23
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:24
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:24
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:25
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:25
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:26
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:26
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:27
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:27
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:28
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:28
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:29
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:29
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:30
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:31
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:31
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:32
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:32
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:33
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:33
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:34
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:34
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:35
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:35
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:36
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:36
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:37
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:37
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:38
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:38
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:39
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:39
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:40
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:40
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:41
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:41
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:42
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:42
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:43
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:43
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:44
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:44
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:45
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:45
-; CHECK-NEXT: flat_load_ubyte v0, v[0:1] offset:46
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v0 offset:46
+; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:1
+; CHECK-NEXT: flat_store_byte v[2:3], v6 offset:2
+; CHECK-NEXT: flat_store_byte v[2:3], v7 offset:3
+; CHECK-NEXT: flat_store_byte v[2:3], v8 offset:4
+; CHECK-NEXT: flat_store_byte v[2:3], v9 offset:5
+; CHECK-NEXT: flat_store_byte v[2:3], v10 offset:6
+; CHECK-NEXT: flat_store_byte v[2:3], v11 offset:7
+; CHECK-NEXT: flat_store_byte v[2:3], v12 offset:8
+; CHECK-NEXT: flat_store_byte v[2:3], v13 offset:9
+; CHECK-NEXT: flat_store_byte v[2:3], v14 offset:10
+; CHECK-NEXT: flat_store_byte v[2:3], v15 offset:11
+; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:12
+; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:13
+; CHECK-NEXT: flat_store_byte v[2:3], v18 offset:14
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:46
+; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:45
+; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:44
+; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:43
+; CHECK-NEXT: flat_load_ubyte v8, v[0:1] offset:42
+; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:41
+; CHECK-NEXT: flat_load_ubyte v10, v[0:1] offset:40
+; CHECK-NEXT: flat_load_ubyte v11, v[0:1] offset:39
+; CHECK-NEXT: flat_load_ubyte v12, v[0:1] offset:38
+; CHECK-NEXT: flat_load_ubyte v13, v[0:1] offset:37
+; CHECK-NEXT: flat_load_ubyte v14, v[0:1] offset:36
+; CHECK-NEXT: flat_load_ubyte v15, v[0:1] offset:35
+; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:34
+; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:33
+; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:32
+; CHECK-NEXT: flat_load_ubyte v19, v[0:1] offset:31
+; CHECK-NEXT: flat_load_ubyte v20, v[0:1] offset:30
+; CHECK-NEXT: flat_load_ubyte v21, v[0:1] offset:29
+; CHECK-NEXT: flat_load_ubyte v22, v[0:1] offset:28
+; CHECK-NEXT: flat_load_ubyte v23, v[0:1] offset:27
+; CHECK-NEXT: flat_load_ubyte v24, v[0:1] offset:26
+; CHECK-NEXT: flat_load_ubyte v25, v[0:1] offset:25
+; CHECK-NEXT: flat_load_ubyte v26, v[0:1] offset:24
+; CHECK-NEXT: flat_load_ubyte v27, v[0:1] offset:23
+; CHECK-NEXT: flat_load_ubyte v28, v[0:1] offset:22
+; CHECK-NEXT: flat_load_ubyte v29, v[0:1] offset:21
+; CHECK-NEXT: flat_load_ubyte v30, v[0:1] offset:20
+; CHECK-NEXT: flat_load_ubyte v31, v[0:1] offset:19
+; CHECK-NEXT: flat_load_ubyte v32, v[0:1] offset:18
+; CHECK-NEXT: flat_load_ubyte v33, v[0:1] offset:17
+; CHECK-NEXT: flat_load_ubyte v34, v[0:1] offset:16
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_load_ubyte v0, v[0:1] offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:46
+; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:45
+; CHECK-NEXT: flat_store_byte v[2:3], v6 offset:44
+; CHECK-NEXT: flat_store_byte v[2:3], v7 offset:43
+; CHECK-NEXT: flat_store_byte v[2:3], v8 offset:42
+; CHECK-NEXT: flat_store_byte v[2:3], v9 offset:41
+; CHECK-NEXT: flat_store_byte v[2:3], v10 offset:40
+; CHECK-NEXT: flat_store_byte v[2:3], v11 offset:39
+; CHECK-NEXT: flat_store_byte v[2:3], v12 offset:38
+; CHECK-NEXT: flat_store_byte v[2:3], v13 offset:37
+; CHECK-NEXT: flat_store_byte v[2:3], v14 offset:36
+; CHECK-NEXT: flat_store_byte v[2:3], v15 offset:35
+; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:34
+; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:33
+; CHECK-NEXT: flat_store_byte v[2:3], v18 offset:32
+; CHECK-NEXT: flat_store_byte v[2:3], v19 offset:31
+; CHECK-NEXT: flat_store_byte v[2:3], v20 offset:30
+; CHECK-NEXT: flat_store_byte v[2:3], v21 offset:29
+; CHECK-NEXT: flat_store_byte v[2:3], v22 offset:28
+; CHECK-NEXT: flat_store_byte v[2:3], v23 offset:27
+; CHECK-NEXT: flat_store_byte v[2:3], v24 offset:26
+; CHECK-NEXT: flat_store_byte v[2:3], v25 offset:25
+; CHECK-NEXT: flat_store_byte v[2:3], v26 offset:24
+; CHECK-NEXT: flat_store_byte v[2:3], v27 offset:23
+; CHECK-NEXT: flat_store_byte v[2:3], v28 offset:22
+; CHECK-NEXT: flat_store_byte v[2:3], v29 offset:21
+; CHECK-NEXT: flat_store_byte v[2:3], v30 offset:20
+; CHECK-NEXT: flat_store_byte v[2:3], v31 offset:19
+; CHECK-NEXT: flat_store_byte v[2:3], v32 offset:18
+; CHECK-NEXT: flat_store_byte v[2:3], v33 offset:17
+; CHECK-NEXT: flat_store_byte v[2:3], v34 offset:16
+; CHECK-NEXT: flat_store_byte v[2:3], v0 offset:15
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false)
@@ -165,20 +121,20 @@ define amdgpu_kernel void @memcpy_p1_p1_minsize(ptr addrspace(1) %dest, ptr addr
; CHECK-LABEL: memcpy_p1_p1_minsize:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-NEXT: v_mov_b32_e32 v12, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] offset:32
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:32
-; CHECK-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] offset:39
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:39
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; CHECK-NEXT: global_load_dwordx2 v[8:9], v12, s[2:3] offset:32
+; CHECK-NEXT: global_load_dwordx2 v[10:11], v12, s[2:3] offset:39
+; CHECK-NEXT: global_load_dwordx4 v[0:3], v12, s[2:3]
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v12, s[2:3] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] offset:32
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: global_store_dwordx2 v12, v[10:11], s[0:1] offset:39
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1]
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] offset:16
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 47, i1 false)
@@ -189,32 +145,32 @@ define amdgpu_kernel void @memcpy_p1_p4_minsize(ptr addrspace(1) %global, ptr ad
; CHECK-LABEL: memcpy_p1_p4_minsize:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:32
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:48
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:64
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:80
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:96
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:112
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112
+; CHECK-NEXT: v_mov_b32_e32 v32, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[0:3], v32, s[2:3]
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v32, s[2:3] offset:16
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v32, s[2:3] offset:32
+; CHECK-NEXT: global_load_dwordx4 v[12:15], v32, s[2:3] offset:48
+; CHECK-NEXT: global_load_dwordx4 v[16:19], v32, s[2:3] offset:64
+; CHECK-NEXT: global_load_dwordx4 v[20:23], v32, s[2:3] offset:80
+; CHECK-NEXT: global_load_dwordx4 v[24:27], v32, s[2:3] offset:96
+; CHECK-NEXT: global_load_dwordx4 v[28:31], v32, s[2:3] offset:112
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) %global, ptr addrspace(4) %0, i64 128, i1 false)
@@ -231,394 +187,372 @@ define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr add
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_add_u32 s8, s8, s7
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1]
-; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:1
-; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:2
-; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:3
-; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:4
-; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:5
-; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:6
-; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:7
-; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:8
-; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:9
-; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:10
-; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:11
-; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:12
-; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:13
-; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:14
-; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:15
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:31
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:30
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:29
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:28
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:27
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:26
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:25
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:24
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:23
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:22
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:21
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:20
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:19
+; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:18
+; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:17
+; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:16
+; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:15
+; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:14
; CHECK-NEXT: s_addc_u32 s9, s9, 0
; CHECK-NEXT: v_mov_b32_e32 v1, s2
-; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:16
-; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:17
-; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:18
+; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:31
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:30
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:28
; CHECK-NEXT: s_waitcnt vmcnt(18)
-; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen
-; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:19
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:26
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:7
; CHECK-NEXT: s_waitcnt vmcnt(19)
-; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:1
-; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:20
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:25
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:6
; CHECK-NEXT: s_waitcnt vmcnt(20)
-; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:2
-; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:21
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:24
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:5
; CHECK-NEXT: s_waitcnt vmcnt(21)
-; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:3
-; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:22
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:23
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:4
; CHECK-NEXT: s_waitcnt vmcnt(22)
-; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:4
-; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:23
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:22
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:3
; CHECK-NEXT: s_waitcnt vmcnt(23)
-; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:5
-; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:24
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:21
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:2
; CHECK-NEXT: s_waitcnt vmcnt(24)
-; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:6
-; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:25
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:20
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:1
; CHECK-NEXT: s_waitcnt vmcnt(25)
-; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:7
-; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:26
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:19
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1]
; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:8
-; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:27
+; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:18
+; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:63
+; CHECK-NEXT: s_waitcnt vmcnt(26)
+; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:14
+; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:17
+; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:62
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:15
+; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:60
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:59
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:11
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:10
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:9
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:8
; CHECK-NEXT: s_waitcnt vmcnt(27)
-; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:9
-; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:28
-; CHECK-NEXT: s_waitcnt vmcnt(28)
-; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:10
-; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:29
-; CHECK-NEXT: s_waitcnt vmcnt(29)
-; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:11
-; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(30)
-; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:12
-; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:31
-; CHECK-NEXT: s_waitcnt vmcnt(31)
-; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:13
-; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:32
-; CHECK-NEXT: s_waitcnt vmcnt(32)
-; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:14
-; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:33
-; CHECK-NEXT: s_waitcnt vmcnt(33)
-; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:15
-; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:34
-; CHECK-NEXT: s_waitcnt vmcnt(34)
-; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:16
-; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:35
-; CHECK-NEXT: s_waitcnt vmcnt(35)
-; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:17
-; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:36
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:18
-; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:37
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:19
-; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:38
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:20
-; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:39
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:21
-; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:40
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:22
-; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:41
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:23
-; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:42
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:24
-; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:43
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:25
-; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:44
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:26
-; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:45
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:27
-; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:46
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:28
-; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:47
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:29
-; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:48
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:30
-; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:49
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:31
-; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:50
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:32
-; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:51
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:33
-; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:52
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:34
-; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:53
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:35
-; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:54
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:36
-; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:55
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:37
-; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:56
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:38
-; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:57
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:39
-; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:58
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:40
-; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:59
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:41
-; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:60
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:42
-; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:61
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:43
-; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:62
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:44
-; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:63
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:45
-; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:64
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:46
-; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:65
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:47
-; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:66
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:48
-; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:67
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:49
-; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:68
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:50
-; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:69
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:51
-; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:70
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:52
-; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:71
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:53
-; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:72
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:54
-; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:73
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:55
-; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:74
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:56
-; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:75
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:57
-; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:76
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:58
-; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:77
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:59
-; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:78
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:60
-; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:79
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:61
-; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:80
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:62
-; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:81
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:63
-; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:82
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:64
-; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:83
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:65
-; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:84
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:66
-; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:85
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:67
-; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:86
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:68
-; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:87
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:69
-; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:88
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:70
-; CHECK-NEXT: s_waitcnt vmcnt(35)
-; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:71
-; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:89
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:90
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:72
-; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:91
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:73
-; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:92
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:74
-; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:93
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:75
-; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:94
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:76
-; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:95
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:77
-; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:96
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:78
-; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:97
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:79
-; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:98
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:80
-; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:99
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:81
-; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:100
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:82
-; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:101
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:83
-; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:102
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:84
-; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:103
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:85
-; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:104
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:86
-; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:105
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:87
-; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:106
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:88
-; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:107
-; CHECK-NEXT: s_waitcnt vmcnt(35)
-; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:89
-; CHECK-NEXT: s_waitcnt vmcnt(35)
-; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:90
-; CHECK-NEXT: s_waitcnt vmcnt(34)
-; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:91
-; CHECK-NEXT: s_waitcnt vmcnt(33)
-; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:92
-; CHECK-NEXT: s_waitcnt vmcnt(32)
-; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:93
-; CHECK-NEXT: s_waitcnt vmcnt(31)
-; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:94
-; CHECK-NEXT: s_waitcnt vmcnt(30)
-; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:95
-; CHECK-NEXT: s_waitcnt vmcnt(29)
-; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:96
-; CHECK-NEXT: s_waitcnt vmcnt(28)
-; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:97
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:7
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:52
; CHECK-NEXT: s_waitcnt vmcnt(27)
-; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:98
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:99
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:6
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:51
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:5
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:50
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:4
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:49
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:3
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:48
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:2
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:47
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:1
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:46
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:45
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:62
+; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:43
+; CHECK-NEXT: s_waitcnt vmcnt(22)
+; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:59
+; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:40
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:11
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:56
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:10
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:55
; CHECK-NEXT: s_waitcnt vmcnt(25)
-; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:100
-; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:108
-; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:109
-; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:110
-; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:111
-; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:112
-; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:113
-; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:114
-; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:115
-; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:116
-; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:117
-; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:118
-; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:119
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:101
-; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:120
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:102
-; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:121
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:103
-; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:122
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:104
-; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:123
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:105
-; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:124
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:106
-; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:125
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:107
-; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:126
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: global_load_ubyte v21, v0, s[0:1] offset:127
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:9
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:54
; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:108
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:8
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:53
; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:109
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:52
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:33
; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:110
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:111
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:112
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:113
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:114
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:51
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:32
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:12
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:13
; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:115
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:49
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:94
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:58
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:60
; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:116
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:47
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:92
; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:117
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:46
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:91
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:41
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:50
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:95
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:48
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:56
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:37
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:55
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:36
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:54
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:35
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:53
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:34
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:33
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:78
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:32
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:77
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:12
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:57
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:94
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:75
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:58
+; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:39
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:92
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:73
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:91
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:72
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:61
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:63
+; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:44
+; CHECK-NEXT: s_waitcnt vmcnt(28)
+; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:41
+; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:86
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:40
+; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:85
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:45
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:90
+; CHECK-NEXT: s_waitcnt vmcnt(29)
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:37
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:82
+; CHECK-NEXT: s_waitcnt vmcnt(29)
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:36
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:81
+; CHECK-NEXT: s_waitcnt vmcnt(29)
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:35
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:80
+; CHECK-NEXT: s_waitcnt vmcnt(29)
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:34
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:79
+; CHECK-NEXT: s_waitcnt vmcnt(29)
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:78
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:123
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:57
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:38
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:39
+; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:84
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:73
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:118
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:72
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:117
; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:118
+; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:61
+; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:42
; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:119
+; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:44
+; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:89
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:43
+; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:88
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:93
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:95
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:76
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:90
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:71
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:82
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:127
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:86
+; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:85
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:80
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:125
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:81
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:126
+; CHECK-NEXT: s_waitcnt vmcnt(29)
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:79
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:124
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:77
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:122
+; CHECK-NEXT: s_waitcnt vmcnt(29)
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:38
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:83
+; CHECK-NEXT: s_waitcnt vmcnt(29)
+; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:84
+; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:65
; CHECK-NEXT: s_waitcnt vmcnt(25)
-; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:120
-; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:42
+; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:87
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:89
+; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:69
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:88
+; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:70
+; CHECK-NEXT: s_waitcnt vmcnt(26)
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:93
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:74
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:68
+; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:66
+; CHECK-NEXT: s_waitcnt vmcnt(28)
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:76
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:121
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:75
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:120
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:71
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:116
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:83
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:64
+; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:87
+; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:67
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:74
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:119
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:70
+; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:69
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:68
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:67
+; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:66
+; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:112
+; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:113
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:114
+; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:115
+; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:104
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:65
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:64
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:127
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:126
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:125
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:124
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:123
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:122
; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:121
-; CHECK-NEXT: s_waitcnt vmcnt(23)
-; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:122
-; CHECK-NEXT: s_waitcnt vmcnt(22)
-; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:123
-; CHECK-NEXT: s_waitcnt vmcnt(21)
-; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:124
-; CHECK-NEXT: s_waitcnt vmcnt(20)
-; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:125
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:120
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:117
+; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:105
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:106
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:107
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:108
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:109
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:110
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:111
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:100
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:101
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:102
+; CHECK-NEXT: s_waitcnt vmcnt(31)
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:119
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:103
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:118
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:116
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:98
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:99
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:97
+; CHECK-NEXT: global_load_ubyte v21, v0, s[0:1] offset:96
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:115
+; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:114
+; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:113
+; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:112
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:111
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:110
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:109
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:108
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:107
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:106
+; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:105
+; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:104
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:103
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:102
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:101
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:100
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:99
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:98
; CHECK-NEXT: s_waitcnt vmcnt(19)
-; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:126
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:97
; CHECK-NEXT: s_waitcnt vmcnt(19)
-; CHECK-NEXT: buffer_store_byte v21, v1, s[8:11], 0 offen offset:127
+; CHECK-NEXT: buffer_store_byte v21, v1, s[8:11], 0 offen offset:96
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false)
@@ -635,367 +569,366 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
; CHECK-NEXT: s_addc_u32 s9, s9, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v2, s0
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen
-; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:1
-; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:2
-; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:3
-; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:4
-; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:5
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:6
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:7
-; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:8
-; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:9
-; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:10
-; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:11
-; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:12
-; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:13
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:31
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:30
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:29
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:27
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:26
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:25
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:23
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:22
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:21
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:20
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:19
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:18
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:14
-; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:15
-; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:16
-; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:17
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:17
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:15
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:14
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, s0
; CHECK-NEXT: v_mov_b32_e32 v1, s1
; CHECK-NEXT: s_waitcnt vmcnt(17)
-; CHECK-NEXT: flat_store_byte v[0:1], v3
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:18
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:31
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:1
-; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:19
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:2
-; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:20
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:7
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:3
-; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:21
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:6
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:4
-; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:22
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:5
-; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:23
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:5
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:6
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:24
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:4
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:25
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:3
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8
-; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:26
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:2
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:9
-; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:27
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:1
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:10
-; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:28
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:11
-; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:29
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:63
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:12
-; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:30
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:15
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:62
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:13
-; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:31
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:11
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:9
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:7
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:53
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:14
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:32
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:6
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:52
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15
-; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:33
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:5
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:51
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16
-; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:34
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:4
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:50
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17
-; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:35
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:3
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:49
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:18
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:36
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:19
-; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:37
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:2
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:48
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:20
-; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:38
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:1
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:47
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:21
-; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:39
+; CHECK-NEXT: flat_store_byte v[0:1], v16
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:46
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:22
-; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:40
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:12
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:58
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:23
-; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:41
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:11
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:57
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:24
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:42
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:10
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:56
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:43
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:9
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:55
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:26
-; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:44
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:8
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:54
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:53
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:35
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:27
-; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:45
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:52
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:34
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:28
-; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:46
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:51
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:33
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:29
-; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:47
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:50
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:32
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:30
-; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:48
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:13
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:31
-; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:49
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:14
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:60
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:32
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:50
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:62
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:47
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:93
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:33
-; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:51
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:44
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:34
-; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:52
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:49
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:95
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:35
-; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:53
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:48
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:58
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:40
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:36
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:54
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:37
-; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:55
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:57
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:39
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:38
-; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:56
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:56
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:38
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:39
-; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:57
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:55
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:37
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:40
-; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:58
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:54
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:36
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:35
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:81
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:41
-; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:59
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:34
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:80
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:42
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:60
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:33
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:79
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:43
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:61
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:32
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:78
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:44
-; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:62
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:13
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:59
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:45
-; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:63
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:60
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:42
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:46
-; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:64
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:93
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:75
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:47
-; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:65
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:61
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:48
-; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:66
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:63
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:45
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:49
-; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:67
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:46
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:92
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:50
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:68
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:40
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:86
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:51
-; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:69
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:39
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:85
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:52
-; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:70
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:38
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:84
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:53
-; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:71
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:37
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:83
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:54
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:72
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:36
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:82
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:55
-; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:73
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:81
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:127
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:56
-; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:74
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:80
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:126
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:57
-; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:75
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:79
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:125
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:58
-; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:76
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:59
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:41
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:59
-; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:77
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:42
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:88
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:60
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:78
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:75
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:121
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:61
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:79
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:61
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:43
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:62
-; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:80
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:94
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:63
-; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:81
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:45
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:91
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:64
-; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:82
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:44
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:90
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:65
-; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:83
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:95
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:77
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:66
-; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:84
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:86
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:85
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:66
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:67
-; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:85
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:84
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:67
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:68
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:86
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:92
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:74
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:69
-; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:87
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:68
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:70
-; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:88
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:83
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:65
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:71
-; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:89
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:82
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:64
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:72
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:90
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:78
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:124
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:73
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:74
-; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:91
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:41
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:87
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:92
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:88
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:70
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:75
-; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:93
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:43
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:89
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:76
-; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:94
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:94
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:76
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:77
-; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:95
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:91
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:73
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:78
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:96
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:90
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:72
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:79
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:97
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:77
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:123
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:80
-; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:98
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:81
-; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:99
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:74
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:120
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:87
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:69
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:82
-; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:100
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:70
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:116
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:83
-; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:101
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:89
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:71
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:84
-; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:102
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:76
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:122
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:85
-; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:103
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:73
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:119
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:86
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:104
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:72
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:118
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:69
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:115
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:87
-; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:105
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:71
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:117
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:88
-; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:106
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:68
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:67
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:66
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:65
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:112
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:113
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:114
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:104
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:64
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:127
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:126
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:125
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:124
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:123
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:121
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:105
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:106
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:107
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:108
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:109
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:110
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:100
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:122
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:120
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:101
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:119
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:102
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:118
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:103
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:116
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:111
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:99
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:115
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:117
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:98
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:89
-; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:107
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:97
+; CHECK-NEXT: buffer_load_ubyte v21, v2, s[8:11], 0 offen offset:96
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:90
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:108
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:91
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:92
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:93
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:94
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:95
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:96
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:97
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:98
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:99
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:100
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:101
-; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:109
-; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:110
-; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:111
-; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:112
-; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:113
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:114
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:115
-; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:116
-; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:117
-; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:118
-; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:119
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:102
-; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:120
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:103
-; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:121
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:104
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:122
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:105
-; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:123
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:106
-; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:124
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:107
-; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:125
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:108
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:126
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_load_ubyte v21, v2, s[8:11], 0 offen offset:127
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:114
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:113
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:112
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:111
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:110
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:109
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:108
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:107
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:106
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:105
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:104
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:103
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:102
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:101
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:100
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:99
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:109
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:110
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:111
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:112
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:113
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:114
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:115
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:116
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:117
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:118
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:119
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:120
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:121
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:122
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:123
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:124
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:125
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:126
-; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:127
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:98
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:97
+; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:96
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p0.p5.i64(ptr %generic, ptr addrspace(5) %src, i64 128, i1 false)
@@ -1008,30 +941,30 @@ define amdgpu_kernel void @memcpy_p3_p4_minsize(ptr addrspace(4) %0) #0 {
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v24, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112
-; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96
-; CHECK-NEXT: global_load_dwordx4 v[8:11], v24, s[0:1] offset:80
-; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64
-; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48
-; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:32
+; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1]
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:16
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v24, s[0:1] offset:32
+; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:48
+; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:64
+; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:80
; CHECK-NEXT: s_waitcnt vmcnt(5)
-; CHECK-NEXT: ds_write2_b64 v24, v[0:1], v[2:3] offset0:14 offset1:15
+; CHECK-NEXT: ds_write2_b64 v24, v[0:1], v[2:3] offset1:1
; CHECK-NEXT: s_waitcnt vmcnt(4)
-; CHECK-NEXT: ds_write2_b64 v24, v[4:5], v[6:7] offset0:12 offset1:13
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:16
-; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1]
+; CHECK-NEXT: ds_write2_b64 v24, v[4:5], v[6:7] offset0:2 offset1:3
+; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:96
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:112
; CHECK-NEXT: s_waitcnt vmcnt(5)
-; CHECK-NEXT: ds_write2_b64 v24, v[8:9], v[10:11] offset0:10 offset1:11
+; CHECK-NEXT: ds_write2_b64 v24, v[8:9], v[10:11] offset0:4 offset1:5
; CHECK-NEXT: s_waitcnt vmcnt(4)
-; CHECK-NEXT: ds_write2_b64 v24, v[12:13], v[14:15] offset0:8 offset1:9
+; CHECK-NEXT: ds_write2_b64 v24, v[12:13], v[14:15] offset0:6 offset1:7
; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: ds_write2_b64 v24, v[16:17], v[18:19] offset0:6 offset1:7
+; CHECK-NEXT: ds_write2_b64 v24, v[16:17], v[18:19] offset0:8 offset1:9
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: ds_write2_b64 v24, v[20:21], v[22:23] offset0:4 offset1:5
+; CHECK-NEXT: ds_write2_b64 v24, v[20:21], v[22:23] offset0:10 offset1:11
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: ds_write2_b64 v24, v[0:1], v[2:3] offset0:2 offset1:3
+; CHECK-NEXT: ds_write2_b64 v24, v[0:1], v[2:3] offset0:12 offset1:13
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: ds_write2_b64 v24, v[4:5], v[6:7] offset1:1
+; CHECK-NEXT: ds_write2_b64 v24, v[4:5], v[6:7] offset0:14 offset1:15
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) @shared, ptr addrspace(4) %0, i64 128, i1 false)
@@ -1043,454 +976,388 @@ define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v2, 0
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:127
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:126
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:125
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:124
+; CHECK-NEXT: ds_read_u8 v3, v2
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:1
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:2
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:3
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:4
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:5
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:6
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:7
+; CHECK-NEXT: ds_read_u8 v11, v2 offset:24
+; CHECK-NEXT: ds_read_u8 v12, v2 offset:25
+; CHECK-NEXT: ds_read_u8 v13, v2 offset:26
+; CHECK-NEXT: ds_read_u8 v14, v2 offset:27
+; CHECK-NEXT: ds_read_u8 v15, v2 offset:28
+; CHECK-NEXT: ds_read_u8 v16, v2 offset:29
+; CHECK-NEXT: ds_read_u8 v17, v2 offset:31
+; CHECK-NEXT: ds_read_u8 v18, v2 offset:30
+; CHECK-NEXT: ds_read_u8 v19, v2 offset:16
+; CHECK-NEXT: ds_read_u8 v20, v2 offset:17
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, s0
; CHECK-NEXT: v_mov_b32_e32 v1, s1
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:127
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:126
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:123
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:125
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:124
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:122
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:121
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:123
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:120
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:119
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:122
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:121
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:118
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:120
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:119
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:117
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:116
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:118
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:115
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:114
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:117
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:116
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:113
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:115
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:114
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:112
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:111
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:113
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:110
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:109
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:112
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:111
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:108
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:110
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:109
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:107
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:106
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:108
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:105
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:104
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:107
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:106
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:103
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:105
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:104
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:102
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:101
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:103
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:100
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:99
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:102
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:101
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:98
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:100
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:99
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:97
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:96
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:98
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:95
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:94
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:97
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:96
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:93
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:95
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:94
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:92
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:91
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:93
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:90
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:89
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:92
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:91
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:88
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:90
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:89
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:87
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:86
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:88
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:85
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:84
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:87
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:86
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:83
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:85
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:84
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:82
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:81
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:83
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:80
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:79
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:82
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:81
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:78
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:80
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:79
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:77
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:76
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:78
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:75
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:74
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:77
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:76
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:73
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:75
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:74
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:72
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:71
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:73
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:70
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:69
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:72
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:71
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:68
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:70
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:69
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:67
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:66
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:68
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:65
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:64
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:67
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:66
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:63
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:65
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:64
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:62
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:61
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:63
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:60
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:59
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:62
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:61
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:58
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:60
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:59
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:57
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:56
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:58
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:55
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:54
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:57
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:56
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:53
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:55
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:54
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:52
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:51
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:53
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:50
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:49
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:52
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:51
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:48
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:50
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:49
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:47
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:46
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:48
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:45
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:44
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:47
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:46
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:43
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:45
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:44
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:42
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:41
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:43
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:40
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:39
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:42
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:41
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:38
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:40
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:39
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:37
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:36
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:38
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:35
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:34
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:37
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:36
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:33
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:35
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:34
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:32
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:31
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:33
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:30
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:29
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:32
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:31
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:28
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:27
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:26
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:28
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:25
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:24
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:27
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:26
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:23
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:25
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:24
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:22
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:21
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:23
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:20
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:19
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:22
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:21
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:18
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:20
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:19
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:16
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:17
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:18
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:8
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:9
-; CHECK-NEXT: ds_read_u8 v7, v2 offset:10
-; CHECK-NEXT: ds_read_u8 v8, v2 offset:11
-; CHECK-NEXT: ds_read_u8 v9, v2 offset:12
-; CHECK-NEXT: ds_read_u8 v10, v2 offset:13
-; CHECK-NEXT: ds_read_u8 v11, v2 offset:14
-; CHECK-NEXT: ds_read_u8 v12, v2 offset:15
-; CHECK-NEXT: ds_read_u8 v13, v2
-; CHECK-NEXT: ds_read_u8 v14, v2 offset:1
-; CHECK-NEXT: ds_read_u8 v15, v2 offset:2
-; CHECK-NEXT: ds_read_u8 v16, v2 offset:3
-; CHECK-NEXT: ds_read_u8 v17, v2 offset:4
-; CHECK-NEXT: ds_read_u8 v18, v2 offset:5
-; CHECK-NEXT: ds_read_u8 v19, v2 offset:6
-; CHECK-NEXT: ds_read_u8 v2, v2 offset:7
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:17
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:16
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:15
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:14
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:13
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:12
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:10
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:9
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:8
-; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:7
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:6
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:5
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:4
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:2
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:1
-; CHECK-NEXT: flat_store_byte v[0:1], v13
-; CHECK-NEXT: s_endpgm
-entry:
- tail call void @llvm.memcpy.p0.p3.i64(ptr %generic, ptr addrspace(3) @shared, i64 128, i1 false)
- ret void
-}
-
-define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 {
-; CHECK-LABEL: memcpy_p0_p0_optsize:
-; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:31
+; CHECK-NEXT: ds_read_u8 v17, v2 offset:18
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:30
+; CHECK-NEXT: ds_read_u8 v18, v2 offset:19
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:29
+; CHECK-NEXT: ds_read_u8 v16, v2 offset:20
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:28
+; CHECK-NEXT: ds_read_u8 v15, v2 offset:21
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:27
+; CHECK-NEXT: ds_read_u8 v14, v2 offset:22
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:26
+; CHECK-NEXT: ds_read_u8 v13, v2 offset:23
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:20
+; CHECK-NEXT: ds_read_u8 v16, v2 offset:14
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:19
+; CHECK-NEXT: ds_read_u8 v18, v2 offset:15
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:25
+; CHECK-NEXT: ds_read_u8 v12, v2 offset:8
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:24
+; CHECK-NEXT: ds_read_u8 v11, v2 offset:9
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:23
+; CHECK-NEXT: ds_read_u8 v13, v2 offset:10
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:22
+; CHECK-NEXT: ds_read_u8 v14, v2 offset:11
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:21
+; CHECK-NEXT: ds_read_u8 v15, v2 offset:12
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:18
+; CHECK-NEXT: ds_read_u8 v17, v2 offset:13
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1]
-; CHECK-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-NEXT: v_mov_b32_e32 v2, s0
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:1
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:1
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:2
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:2
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:3
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:3
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:4
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:4
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:5
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:5
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:6
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:6
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:7
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:7
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:8
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:8
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:9
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:9
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:10
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:10
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:11
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:11
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:12
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:12
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:13
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:13
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:14
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:14
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:15
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:15
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:16
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:17
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:17
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:18
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:18
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:19
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:19
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:20
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:20
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:21
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:21
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:22
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:22
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:23
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:23
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:24
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:24
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:25
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:25
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:26
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:26
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:27
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:27
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:28
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:28
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:29
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:29
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:30
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:31
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:31
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:32
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:32
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:33
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:33
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:34
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:34
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:35
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:35
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:36
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:36
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:37
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:37
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:38
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:38
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:39
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:39
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:40
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:40
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:41
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:41
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:42
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:42
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:43
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:43
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:44
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:44
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:45
-; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:45
-; CHECK-NEXT: flat_load_ubyte v0, v[0:1] offset:46
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:14
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:13
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:12
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:11
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:10
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:9
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:8
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:6
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:5
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:4
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:3
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:2
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:1
+; CHECK-NEXT: flat_store_byte v[0:1], v3
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:32
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:33
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:34
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:35
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:36
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:37
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:38
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:39
+; CHECK-NEXT: ds_read_u8 v11, v2 offset:56
+; CHECK-NEXT: ds_read_u8 v12, v2 offset:57
+; CHECK-NEXT: ds_read_u8 v13, v2 offset:58
+; CHECK-NEXT: ds_read_u8 v14, v2 offset:59
+; CHECK-NEXT: ds_read_u8 v15, v2 offset:60
+; CHECK-NEXT: ds_read_u8 v16, v2 offset:63
+; CHECK-NEXT: ds_read_u8 v17, v2 offset:61
+; CHECK-NEXT: ds_read_u8 v18, v2 offset:62
+; CHECK-NEXT: ds_read_u8 v19, v2 offset:48
+; CHECK-NEXT: ds_read_u8 v20, v2 offset:49
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:63
+; CHECK-NEXT: ds_read_u8 v16, v2 offset:50
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:62
+; CHECK-NEXT: ds_read_u8 v18, v2 offset:51
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:61
+; CHECK-NEXT: ds_read_u8 v17, v2 offset:52
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:60
+; CHECK-NEXT: ds_read_u8 v15, v2 offset:53
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:59
+; CHECK-NEXT: ds_read_u8 v14, v2 offset:54
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:58
+; CHECK-NEXT: ds_read_u8 v13, v2 offset:55
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:52
+; CHECK-NEXT: ds_read_u8 v17, v2 offset:46
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:51
+; CHECK-NEXT: ds_read_u8 v18, v2 offset:47
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:57
+; CHECK-NEXT: ds_read_u8 v12, v2 offset:40
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:56
+; CHECK-NEXT: ds_read_u8 v11, v2 offset:41
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:55
+; CHECK-NEXT: ds_read_u8 v13, v2 offset:42
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:54
+; CHECK-NEXT: ds_read_u8 v14, v2 offset:43
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:53
+; CHECK-NEXT: ds_read_u8 v15, v2 offset:44
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:50
+; CHECK-NEXT: ds_read_u8 v16, v2 offset:45
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:49
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:48
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:47
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:46
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:45
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:44
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:43
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:42
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:41
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:40
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:39
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:38
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:37
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:36
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:35
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:34
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:33
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:32
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:64
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:65
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:66
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:67
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:68
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:69
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:70
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:71
+; CHECK-NEXT: ds_read_u8 v11, v2 offset:88
+; CHECK-NEXT: ds_read_u8 v12, v2 offset:89
+; CHECK-NEXT: ds_read_u8 v13, v2 offset:90
+; CHECK-NEXT: ds_read_u8 v14, v2 offset:91
+; CHECK-NEXT: ds_read_u8 v15, v2 offset:92
+; CHECK-NEXT: ds_read_u8 v16, v2 offset:95
+; CHECK-NEXT: ds_read_u8 v17, v2 offset:93
+; CHECK-NEXT: ds_read_u8 v18, v2 offset:94
+; CHECK-NEXT: ds_read_u8 v19, v2 offset:80
+; CHECK-NEXT: ds_read_u8 v20, v2 offset:81
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:95
+; CHECK-NEXT: ds_read_u8 v16, v2 offset:82
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:94
+; CHECK-NEXT: ds_read_u8 v18, v2 offset:83
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:93
+; CHECK-NEXT: ds_read_u8 v17, v2 offset:84
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:92
+; CHECK-NEXT: ds_read_u8 v15, v2 offset:85
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:91
+; CHECK-NEXT: ds_read_u8 v14, v2 offset:86
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:90
+; CHECK-NEXT: ds_read_u8 v13, v2 offset:87
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:84
+; CHECK-NEXT: ds_read_u8 v17, v2 offset:78
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:83
+; CHECK-NEXT: ds_read_u8 v18, v2 offset:79
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:89
+; CHECK-NEXT: ds_read_u8 v12, v2 offset:72
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:88
+; CHECK-NEXT: ds_read_u8 v11, v2 offset:73
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:87
+; CHECK-NEXT: ds_read_u8 v13, v2 offset:74
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:86
+; CHECK-NEXT: ds_read_u8 v14, v2 offset:75
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:85
+; CHECK-NEXT: ds_read_u8 v15, v2 offset:76
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:82
+; CHECK-NEXT: ds_read_u8 v16, v2 offset:77
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:81
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:80
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:79
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:78
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:77
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:76
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:75
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:74
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:73
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:72
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:71
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:70
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:69
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:68
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:67
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:66
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:65
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:64
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:96
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:97
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:98
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:99
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:100
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:101
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:102
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:103
+; CHECK-NEXT: ds_read_u8 v11, v2 offset:120
+; CHECK-NEXT: ds_read_u8 v12, v2 offset:121
+; CHECK-NEXT: ds_read_u8 v13, v2 offset:122
+; CHECK-NEXT: ds_read_u8 v14, v2 offset:123
+; CHECK-NEXT: ds_read_u8 v15, v2 offset:124
+; CHECK-NEXT: ds_read_u8 v16, v2 offset:127
+; CHECK-NEXT: ds_read_u8 v17, v2 offset:125
+; CHECK-NEXT: ds_read_u8 v18, v2 offset:126
+; CHECK-NEXT: ds_read_u8 v19, v2 offset:112
+; CHECK-NEXT: ds_read_u8 v20, v2 offset:113
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:127
+; CHECK-NEXT: ds_read_u8 v16, v2 offset:114
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:126
+; CHECK-NEXT: ds_read_u8 v18, v2 offset:115
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:125
+; CHECK-NEXT: ds_read_u8 v17, v2 offset:116
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:124
+; CHECK-NEXT: ds_read_u8 v15, v2 offset:117
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:123
+; CHECK-NEXT: ds_read_u8 v14, v2 offset:118
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:122
+; CHECK-NEXT: ds_read_u8 v13, v2 offset:119
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:121
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:120
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:118
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:117
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:119
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:116
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:115
+; CHECK-NEXT: ds_read_u8 v12, v2 offset:104
+; CHECK-NEXT: ds_read_u8 v11, v2 offset:105
+; CHECK-NEXT: ds_read_u8 v13, v2 offset:106
+; CHECK-NEXT: ds_read_u8 v14, v2 offset:107
+; CHECK-NEXT: ds_read_u8 v15, v2 offset:108
+; CHECK-NEXT: ds_read_u8 v17, v2 offset:109
+; CHECK-NEXT: ds_read_u8 v18, v2 offset:111
+; CHECK-NEXT: ds_read_u8 v2, v2 offset:110
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:114
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:113
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:112
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:111
+; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:110
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:109
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:108
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:107
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:106
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:105
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:104
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:103
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:102
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:101
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:100
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:99
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:98
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:97
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:96
+; CHECK-NEXT: s_endpgm
+entry:
+ tail call void @llvm.memcpy.p0.p3.i64(ptr %generic, ptr addrspace(3) @shared, i64 128, i1 false)
+ ret void
+}
+
+define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 {
+; CHECK-LABEL: memcpy_p0_p0_optsize:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1]
+; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:1
+; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:2
+; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:3
+; CHECK-NEXT: flat_load_ubyte v8, v[0:1] offset:4
+; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:5
+; CHECK-NEXT: flat_load_ubyte v10, v[0:1] offset:6
+; CHECK-NEXT: flat_load_ubyte v11, v[0:1] offset:7
+; CHECK-NEXT: flat_load_ubyte v12, v[0:1] offset:8
+; CHECK-NEXT: flat_load_ubyte v13, v[0:1] offset:9
+; CHECK-NEXT: flat_load_ubyte v14, v[0:1] offset:10
+; CHECK-NEXT: flat_load_ubyte v15, v[0:1] offset:11
+; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:12
+; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:13
+; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:14
+; CHECK-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-NEXT: v_mov_b32_e32 v2, s0
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[2:3], v0 offset:46
+; CHECK-NEXT: flat_store_byte v[2:3], v4
+; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:1
+; CHECK-NEXT: flat_store_byte v[2:3], v6 offset:2
+; CHECK-NEXT: flat_store_byte v[2:3], v7 offset:3
+; CHECK-NEXT: flat_store_byte v[2:3], v8 offset:4
+; CHECK-NEXT: flat_store_byte v[2:3], v9 offset:5
+; CHECK-NEXT: flat_store_byte v[2:3], v10 offset:6
+; CHECK-NEXT: flat_store_byte v[2:3], v11 offset:7
+; CHECK-NEXT: flat_store_byte v[2:3], v12 offset:8
+; CHECK-NEXT: flat_store_byte v[2:3], v13 offset:9
+; CHECK-NEXT: flat_store_byte v[2:3], v14 offset:10
+; CHECK-NEXT: flat_store_byte v[2:3], v15 offset:11
+; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:12
+; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:13
+; CHECK-NEXT: flat_store_byte v[2:3], v18 offset:14
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:46
+; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:45
+; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:44
+; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:43
+; CHECK-NEXT: flat_load_ubyte v8, v[0:1] offset:42
+; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:41
+; CHECK-NEXT: flat_load_ubyte v10, v[0:1] offset:40
+; CHECK-NEXT: flat_load_ubyte v11, v[0:1] offset:39
+; CHECK-NEXT: flat_load_ubyte v12, v[0:1] offset:38
+; CHECK-NEXT: flat_load_ubyte v13, v[0:1] offset:37
+; CHECK-NEXT: flat_load_ubyte v14, v[0:1] offset:36
+; CHECK-NEXT: flat_load_ubyte v15, v[0:1] offset:35
+; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:34
+; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:33
+; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:32
+; CHECK-NEXT: flat_load_ubyte v19, v[0:1] offset:31
+; CHECK-NEXT: flat_load_ubyte v20, v[0:1] offset:30
+; CHECK-NEXT: flat_load_ubyte v21, v[0:1] offset:29
+; CHECK-NEXT: flat_load_ubyte v22, v[0:1] offset:28
+; CHECK-NEXT: flat_load_ubyte v23, v[0:1] offset:27
+; CHECK-NEXT: flat_load_ubyte v24, v[0:1] offset:26
+; CHECK-NEXT: flat_load_ubyte v25, v[0:1] offset:25
+; CHECK-NEXT: flat_load_ubyte v26, v[0:1] offset:24
+; CHECK-NEXT: flat_load_ubyte v27, v[0:1] offset:23
+; CHECK-NEXT: flat_load_ubyte v28, v[0:1] offset:22
+; CHECK-NEXT: flat_load_ubyte v29, v[0:1] offset:21
+; CHECK-NEXT: flat_load_ubyte v30, v[0:1] offset:20
+; CHECK-NEXT: flat_load_ubyte v31, v[0:1] offset:19
+; CHECK-NEXT: flat_load_ubyte v32, v[0:1] offset:18
+; CHECK-NEXT: flat_load_ubyte v33, v[0:1] offset:17
+; CHECK-NEXT: flat_load_ubyte v34, v[0:1] offset:16
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_load_ubyte v0, v[0:1] offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:46
+; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:45
+; CHECK-NEXT: flat_store_byte v[2:3], v6 offset:44
+; CHECK-NEXT: flat_store_byte v[2:3], v7 offset:43
+; CHECK-NEXT: flat_store_byte v[2:3], v8 offset:42
+; CHECK-NEXT: flat_store_byte v[2:3], v9 offset:41
+; CHECK-NEXT: flat_store_byte v[2:3], v10 offset:40
+; CHECK-NEXT: flat_store_byte v[2:3], v11 offset:39
+; CHECK-NEXT: flat_store_byte v[2:3], v12 offset:38
+; CHECK-NEXT: flat_store_byte v[2:3], v13 offset:37
+; CHECK-NEXT: flat_store_byte v[2:3], v14 offset:36
+; CHECK-NEXT: flat_store_byte v[2:3], v15 offset:35
+; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:34
+; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:33
+; CHECK-NEXT: flat_store_byte v[2:3], v18 offset:32
+; CHECK-NEXT: flat_store_byte v[2:3], v19 offset:31
+; CHECK-NEXT: flat_store_byte v[2:3], v20 offset:30
+; CHECK-NEXT: flat_store_byte v[2:3], v21 offset:29
+; CHECK-NEXT: flat_store_byte v[2:3], v22 offset:28
+; CHECK-NEXT: flat_store_byte v[2:3], v23 offset:27
+; CHECK-NEXT: flat_store_byte v[2:3], v24 offset:26
+; CHECK-NEXT: flat_store_byte v[2:3], v25 offset:25
+; CHECK-NEXT: flat_store_byte v[2:3], v26 offset:24
+; CHECK-NEXT: flat_store_byte v[2:3], v27 offset:23
+; CHECK-NEXT: flat_store_byte v[2:3], v28 offset:22
+; CHECK-NEXT: flat_store_byte v[2:3], v29 offset:21
+; CHECK-NEXT: flat_store_byte v[2:3], v30 offset:20
+; CHECK-NEXT: flat_store_byte v[2:3], v31 offset:19
+; CHECK-NEXT: flat_store_byte v[2:3], v32 offset:18
+; CHECK-NEXT: flat_store_byte v[2:3], v33 offset:17
+; CHECK-NEXT: flat_store_byte v[2:3], v34 offset:16
+; CHECK-NEXT: flat_store_byte v[2:3], v0 offset:15
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false)
@@ -1501,20 +1368,20 @@ define amdgpu_kernel void @memcpy_p1_p1_optsize(ptr addrspace(1) %dest, ptr addr
; CHECK-LABEL: memcpy_p1_p1_optsize:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-NEXT: v_mov_b32_e32 v4, 0
+; CHECK-NEXT: v_mov_b32_e32 v12, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] offset:32
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:32
-; CHECK-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] offset:39
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:39
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; CHECK-NEXT: global_load_dwordx2 v[8:9], v12, s[2:3] offset:32
+; CHECK-NEXT: global_load_dwordx2 v[10:11], v12, s[2:3] offset:39
+; CHECK-NEXT: global_load_dwordx4 v[0:3], v12, s[2:3]
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v12, s[2:3] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: global_store_dwordx2 v12, v[8:9], s[0:1] offset:32
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: global_store_dwordx2 v12, v[10:11], s[0:1] offset:39
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: global_store_dwordx4 v12, v[0:3], s[0:1]
+; CHECK-NEXT: s_waitcnt vmcnt(3)
+; CHECK-NEXT: global_store_dwordx4 v12, v[4:7], s[0:1] offset:16
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 47, i1 false)
@@ -1525,32 +1392,32 @@ define amdgpu_kernel void @memcpy_p1_p4_optsize(ptr addrspace(1) %global, ptr ad
; CHECK-LABEL: memcpy_p1_p4_optsize:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
-; CHECK-NEXT: v_mov_b32_e32 v4, 0
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:32
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:48
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:64
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:80
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:96
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:112
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112
+; CHECK-NEXT: v_mov_b32_e32 v32, 0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: global_load_dwordx4 v[0:3], v32, s[2:3]
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v32, s[2:3] offset:16
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v32, s[2:3] offset:32
+; CHECK-NEXT: global_load_dwordx4 v[12:15], v32, s[2:3] offset:48
+; CHECK-NEXT: global_load_dwordx4 v[16:19], v32, s[2:3] offset:64
+; CHECK-NEXT: global_load_dwordx4 v[20:23], v32, s[2:3] offset:80
+; CHECK-NEXT: global_load_dwordx4 v[24:27], v32, s[2:3] offset:96
+; CHECK-NEXT: global_load_dwordx4 v[28:31], v32, s[2:3] offset:112
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; CHECK-NEXT: s_waitcnt vmcnt(7)
+; CHECK-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) %global, ptr addrspace(4) %0, i64 128, i1 false)
@@ -1567,394 +1434,372 @@ define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr add
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_add_u32 s8, s8, s7
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1]
-; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:1
-; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:2
-; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:3
-; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:4
-; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:5
-; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:6
-; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:7
-; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:8
-; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:9
-; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:10
-; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:11
-; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:12
-; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:13
-; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:14
-; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:15
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:31
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:30
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:29
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:28
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:27
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:26
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:25
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:24
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:23
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:22
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:21
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:20
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:19
+; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:18
+; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:17
+; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:16
+; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:15
+; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:14
; CHECK-NEXT: s_addc_u32 s9, s9, 0
; CHECK-NEXT: v_mov_b32_e32 v1, s2
-; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:16
-; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:17
-; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:18
+; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:13
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:31
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:30
; CHECK-NEXT: s_waitcnt vmcnt(18)
-; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen
-; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:19
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:29
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:28
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:27
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:26
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:7
; CHECK-NEXT: s_waitcnt vmcnt(19)
-; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:1
-; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:20
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:25
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:6
; CHECK-NEXT: s_waitcnt vmcnt(20)
-; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:2
-; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:21
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:24
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:5
; CHECK-NEXT: s_waitcnt vmcnt(21)
-; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:3
-; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:22
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:23
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:4
; CHECK-NEXT: s_waitcnt vmcnt(22)
-; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:4
-; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:23
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:22
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:3
; CHECK-NEXT: s_waitcnt vmcnt(23)
-; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:5
-; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:24
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:21
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:2
; CHECK-NEXT: s_waitcnt vmcnt(24)
-; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:6
-; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:25
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:20
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:1
; CHECK-NEXT: s_waitcnt vmcnt(25)
-; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:7
-; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:26
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:19
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1]
+; CHECK-NEXT: s_waitcnt vmcnt(26)
+; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:18
+; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:63
; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:8
-; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:27
+; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:16
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:14
+; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:17
+; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:62
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:15
+; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:60
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:59
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:11
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:10
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:9
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:8
; CHECK-NEXT: s_waitcnt vmcnt(27)
-; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:9
-; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:28
-; CHECK-NEXT: s_waitcnt vmcnt(28)
-; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:10
-; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:29
-; CHECK-NEXT: s_waitcnt vmcnt(29)
-; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:11
-; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:30
-; CHECK-NEXT: s_waitcnt vmcnt(30)
-; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:12
-; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:31
-; CHECK-NEXT: s_waitcnt vmcnt(31)
-; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:13
-; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:32
-; CHECK-NEXT: s_waitcnt vmcnt(32)
-; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:14
-; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:33
-; CHECK-NEXT: s_waitcnt vmcnt(33)
-; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:15
-; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:34
-; CHECK-NEXT: s_waitcnt vmcnt(34)
-; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:16
-; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:35
-; CHECK-NEXT: s_waitcnt vmcnt(35)
-; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:17
-; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:36
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:18
-; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:37
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:19
-; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:38
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:20
-; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:39
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:21
-; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:40
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:22
-; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:41
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:23
-; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:42
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:24
-; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:43
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:25
-; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:44
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:26
-; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:45
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:27
-; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:46
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:28
-; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:47
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:29
-; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:48
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:30
-; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:49
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:31
-; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:50
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:32
-; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:51
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:33
-; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:52
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:34
-; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:53
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:35
-; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:54
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:36
-; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:55
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:37
-; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:56
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:38
-; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:57
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:39
-; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:58
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:40
-; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:59
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:41
-; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:60
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:42
-; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:61
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:43
-; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:62
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:44
-; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:63
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:45
-; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:64
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:46
-; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:65
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:47
-; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:66
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:48
-; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:67
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:49
-; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:68
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:50
-; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:69
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:51
-; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:70
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:52
-; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:71
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:53
-; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:72
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:54
-; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:73
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:55
-; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:74
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:56
-; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:75
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:57
-; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:76
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:58
-; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:77
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:59
-; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:78
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:60
-; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:79
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:61
-; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:80
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:62
-; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:81
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:63
-; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:82
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:64
-; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:83
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:65
-; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:84
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:66
-; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:85
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:67
-; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:86
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:68
-; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:87
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:69
-; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:88
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:70
-; CHECK-NEXT: s_waitcnt vmcnt(35)
-; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:71
-; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:89
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:90
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:72
-; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:91
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:73
-; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:92
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:74
-; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:93
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:75
-; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:94
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:76
-; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:95
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:77
-; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:96
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:78
-; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:97
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:79
-; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:98
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:80
-; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:99
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:81
-; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:100
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:82
-; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:101
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:83
-; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:102
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:84
-; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:103
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:85
-; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:104
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:86
-; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:105
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:87
-; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:106
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:88
-; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:107
-; CHECK-NEXT: s_waitcnt vmcnt(35)
-; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:89
-; CHECK-NEXT: s_waitcnt vmcnt(35)
-; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:90
-; CHECK-NEXT: s_waitcnt vmcnt(34)
-; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:91
-; CHECK-NEXT: s_waitcnt vmcnt(33)
-; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:92
-; CHECK-NEXT: s_waitcnt vmcnt(32)
-; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:93
-; CHECK-NEXT: s_waitcnt vmcnt(31)
-; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:94
-; CHECK-NEXT: s_waitcnt vmcnt(30)
-; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:95
-; CHECK-NEXT: s_waitcnt vmcnt(29)
-; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:96
-; CHECK-NEXT: s_waitcnt vmcnt(28)
-; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:97
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:7
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:52
; CHECK-NEXT: s_waitcnt vmcnt(27)
-; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:98
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:99
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:6
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:51
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:5
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:50
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:4
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:49
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:3
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:48
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:2
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:47
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:1
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:46
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:45
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:62
+; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:43
+; CHECK-NEXT: s_waitcnt vmcnt(22)
+; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:59
+; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:40
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:11
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:56
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:10
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:55
; CHECK-NEXT: s_waitcnt vmcnt(25)
-; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:100
-; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:108
-; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:109
-; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:110
-; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:111
-; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:112
-; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:113
-; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:114
-; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:115
-; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:116
-; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:117
-; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:118
-; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:119
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:101
-; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:120
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:102
-; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:121
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:103
-; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:122
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:104
-; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:123
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:105
-; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:124
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:106
-; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:125
-; CHECK-NEXT: s_waitcnt vmcnt(36)
-; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:107
-; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:126
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: global_load_ubyte v21, v0, s[0:1] offset:127
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:108
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:109
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:9
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:54
; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:110
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:8
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:53
; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:111
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:52
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:33
; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:112
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:113
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:114
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:51
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:32
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:12
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:13
; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:115
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:49
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:94
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:58
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:60
; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:116
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:47
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:92
; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:117
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:46
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:91
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:41
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:50
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:95
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:48
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:56
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:37
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:55
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:36
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:54
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:35
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:53
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:34
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:33
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:78
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:32
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:77
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:12
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:57
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:94
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:75
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:58
+; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:39
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:92
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:73
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:91
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:72
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:61
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:63
+; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:44
+; CHECK-NEXT: s_waitcnt vmcnt(28)
+; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:41
+; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:86
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:40
+; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:85
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:45
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:90
+; CHECK-NEXT: s_waitcnt vmcnt(29)
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:37
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:82
+; CHECK-NEXT: s_waitcnt vmcnt(29)
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:36
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:81
+; CHECK-NEXT: s_waitcnt vmcnt(29)
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:35
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:80
+; CHECK-NEXT: s_waitcnt vmcnt(29)
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:34
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:79
+; CHECK-NEXT: s_waitcnt vmcnt(29)
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:78
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:123
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:57
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:38
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:39
+; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:84
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:73
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:118
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:72
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:117
; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:118
+; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:61
+; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:42
; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:119
+; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:44
+; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:89
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:43
+; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:88
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:93
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:95
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:76
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:90
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:71
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:82
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:127
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:86
+; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:85
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:80
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:125
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:81
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:126
+; CHECK-NEXT: s_waitcnt vmcnt(29)
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:79
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:124
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:77
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:122
+; CHECK-NEXT: s_waitcnt vmcnt(29)
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:38
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:83
+; CHECK-NEXT: s_waitcnt vmcnt(29)
+; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:84
+; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:65
; CHECK-NEXT: s_waitcnt vmcnt(25)
-; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:120
-; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:42
+; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:87
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:89
+; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:69
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:88
+; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:70
+; CHECK-NEXT: s_waitcnt vmcnt(26)
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:93
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:74
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:68
+; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:66
+; CHECK-NEXT: s_waitcnt vmcnt(28)
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:76
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:121
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:75
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:120
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:71
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:116
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:83
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:64
+; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:87
+; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:67
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:74
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:119
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:70
+; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:69
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:68
+; CHECK-NEXT: s_waitcnt vmcnt(5)
+; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:67
+; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:66
+; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:112
+; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:113
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:114
+; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:115
+; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:104
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:65
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:64
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:127
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:126
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:125
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:124
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:123
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:122
; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:121
-; CHECK-NEXT: s_waitcnt vmcnt(23)
-; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:122
-; CHECK-NEXT: s_waitcnt vmcnt(22)
-; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:123
-; CHECK-NEXT: s_waitcnt vmcnt(21)
-; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:124
-; CHECK-NEXT: s_waitcnt vmcnt(20)
-; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:125
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:120
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:117
+; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:105
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:106
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:107
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:108
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:109
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:110
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:111
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:100
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:101
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:102
+; CHECK-NEXT: s_waitcnt vmcnt(31)
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:119
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:103
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:118
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:116
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:98
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:99
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:97
+; CHECK-NEXT: global_load_ubyte v21, v0, s[0:1] offset:96
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:115
+; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:114
+; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:113
+; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:112
+; CHECK-NEXT: s_waitcnt vmcnt(15)
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:111
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:110
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:109
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:108
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:107
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:106
+; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:105
+; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:104
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:103
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:102
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:101
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:100
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:99
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:98
; CHECK-NEXT: s_waitcnt vmcnt(19)
-; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:126
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:97
; CHECK-NEXT: s_waitcnt vmcnt(19)
-; CHECK-NEXT: buffer_store_byte v21, v1, s[8:11], 0 offen offset:127
+; CHECK-NEXT: buffer_store_byte v21, v1, s[8:11], 0 offen offset:96
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false)
@@ -1971,367 +1816,366 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
; CHECK-NEXT: s_addc_u32 s9, s9, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v2, s0
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen
-; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:1
-; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:2
-; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:3
-; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:4
-; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:5
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:6
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:7
-; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:8
-; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:9
-; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:10
-; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:11
-; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:12
-; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:13
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:31
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:30
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:29
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:27
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:26
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:25
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:23
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:22
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:21
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:20
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:19
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:18
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:14
-; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:15
-; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:16
-; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:17
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:17
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:16
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:15
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:14
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, s0
; CHECK-NEXT: v_mov_b32_e32 v1, s1
; CHECK-NEXT: s_waitcnt vmcnt(17)
-; CHECK-NEXT: flat_store_byte v[0:1], v3
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:18
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:31
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:1
-; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:19
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:2
-; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:20
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:3
-; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:21
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:7
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:4
-; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:22
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:6
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:5
-; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:23
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:5
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:6
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:24
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:4
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:25
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:3
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8
-; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:26
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:2
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:9
-; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:27
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:1
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:10
-; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:28
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:11
-; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:29
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:63
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:12
-; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:30
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:15
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:62
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:13
-; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:31
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:11
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:9
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:8
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:7
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:53
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:14
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:32
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:6
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:52
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15
-; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:33
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:5
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:51
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16
-; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:34
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:4
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:50
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17
-; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:35
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:3
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:49
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:18
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:36
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:19
-; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:37
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:2
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:48
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:20
-; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:38
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:1
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:47
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:21
-; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:39
+; CHECK-NEXT: flat_store_byte v[0:1], v16
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:46
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:22
-; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:40
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:12
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:58
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:23
-; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:41
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:11
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:57
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:24
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:42
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:10
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:56
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:43
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:9
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:55
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:26
-; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:44
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:8
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:54
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:53
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:35
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:27
-; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:45
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:52
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:34
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:28
-; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:46
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:51
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:33
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:29
-; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:47
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:50
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:32
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:30
-; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:48
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:13
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:31
-; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:49
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:14
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:60
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:32
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:50
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:62
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:47
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:93
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:33
-; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:51
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:44
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:34
-; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:52
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:49
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:95
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:35
-; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:53
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:48
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:58
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:40
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:36
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:54
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:37
-; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:55
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:57
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:39
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:38
-; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:56
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:56
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:38
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:39
-; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:57
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:55
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:37
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:40
-; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:58
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:54
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:36
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:35
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:81
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:41
-; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:59
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:34
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:80
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:42
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:60
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:33
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:79
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:43
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:61
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:32
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:78
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:44
-; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:62
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:13
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:59
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:45
-; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:63
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:60
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:42
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:46
-; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:64
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:93
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:75
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:47
-; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:65
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:61
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:48
-; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:66
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:63
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:45
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:49
-; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:67
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:46
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:92
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:50
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:68
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:40
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:86
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:51
-; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:69
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:39
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:85
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:52
-; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:70
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:38
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:84
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:53
-; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:71
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:37
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:83
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:54
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:72
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:36
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:82
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:55
-; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:73
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:81
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:127
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:56
-; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:74
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:80
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:126
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:57
-; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:75
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:79
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:125
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:58
-; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:76
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:59
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:41
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:59
-; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:77
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:42
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:88
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:60
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:78
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:75
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:121
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:61
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:79
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:61
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:43
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:62
-; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:80
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:94
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:63
-; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:81
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:45
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:91
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:64
-; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:82
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:44
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:90
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:65
-; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:83
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:95
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:77
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:66
-; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:84
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:86
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:85
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:66
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:67
-; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:85
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:84
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:67
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:68
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:86
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:92
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:74
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:69
-; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:87
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:68
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:70
-; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:88
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:83
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:65
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:71
-; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:89
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:82
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:64
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:72
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:90
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:78
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:124
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:73
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:74
-; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:91
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:92
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:41
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:87
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:75
-; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:93
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:88
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:70
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:76
-; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:94
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:43
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:89
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:77
-; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:95
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:94
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:76
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:78
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:96
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:91
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:73
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:79
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:97
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:90
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:72
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:80
-; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:98
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:77
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:123
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:81
-; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:99
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:74
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:120
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:87
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:69
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:82
-; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:100
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:70
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:116
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:83
-; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:101
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:89
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:71
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:84
-; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:102
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:76
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:122
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:85
-; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:103
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:73
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:119
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:86
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:104
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:72
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:118
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:69
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:115
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:87
-; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:105
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:71
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:117
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:88
-; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:106
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:68
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:67
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:66
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:65
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:112
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:113
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:114
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:104
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:64
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:127
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:126
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:125
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:124
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:123
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:121
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:105
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:106
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:107
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:108
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:109
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:110
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:100
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:122
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:120
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:101
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:119
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:102
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:118
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:103
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:116
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:111
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:99
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:115
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:117
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:98
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:89
-; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:107
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:97
+; CHECK-NEXT: buffer_load_ubyte v21, v2, s[8:11], 0 offen offset:96
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:90
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:108
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:91
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:92
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:93
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:94
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:95
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:96
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:97
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:98
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:99
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:100
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:101
-; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:109
-; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:110
-; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:111
-; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:112
-; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:113
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:114
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:115
-; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:116
-; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:117
-; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:118
-; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:119
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:102
-; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:120
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:103
-; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:121
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:104
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:122
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:105
-; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:123
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:106
-; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:124
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:107
-; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:125
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:108
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:126
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_load_ubyte v21, v2, s[8:11], 0 offen offset:127
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:114
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:113
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:112
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:111
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:110
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:109
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:108
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:107
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:106
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:105
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:104
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:103
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:102
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:101
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:100
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:99
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:109
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:110
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:111
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:112
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:113
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:114
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:115
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:116
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:117
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:118
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:119
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:120
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:121
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:122
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:123
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:124
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:125
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:126
-; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:127
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:98
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:97
+; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:96
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p0.p5.i64(ptr %generic, ptr addrspace(5) %src, i64 128, i1 false)
@@ -2344,30 +2188,30 @@ define amdgpu_kernel void @memcpy_p3_p4_optsize(ptr addrspace(4) %0) #1 {
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v24, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112
-; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96
-; CHECK-NEXT: global_load_dwordx4 v[8:11], v24, s[0:1] offset:80
-; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64
-; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48
-; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:32
+; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1]
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:16
+; CHECK-NEXT: global_load_dwordx4 v[8:11], v24, s[0:1] offset:32
+; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:48
+; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:64
+; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:80
; CHECK-NEXT: s_waitcnt vmcnt(5)
-; CHECK-NEXT: ds_write2_b64 v24, v[0:1], v[2:3] offset0:14 offset1:15
+; CHECK-NEXT: ds_write2_b64 v24, v[0:1], v[2:3] offset1:1
; CHECK-NEXT: s_waitcnt vmcnt(4)
-; CHECK-NEXT: ds_write2_b64 v24, v[4:5], v[6:7] offset0:12 offset1:13
-; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:16
-; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1]
+; CHECK-NEXT: ds_write2_b64 v24, v[4:5], v[6:7] offset0:2 offset1:3
+; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:96
+; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:112
; CHECK-NEXT: s_waitcnt vmcnt(5)
-; CHECK-NEXT: ds_write2_b64 v24, v[8:9], v[10:11] offset0:10 offset1:11
+; CHECK-NEXT: ds_write2_b64 v24, v[8:9], v[10:11] offset0:4 offset1:5
; CHECK-NEXT: s_waitcnt vmcnt(4)
-; CHECK-NEXT: ds_write2_b64 v24, v[12:13], v[14:15] offset0:8 offset1:9
+; CHECK-NEXT: ds_write2_b64 v24, v[12:13], v[14:15] offset0:6 offset1:7
; CHECK-NEXT: s_waitcnt vmcnt(3)
-; CHECK-NEXT: ds_write2_b64 v24, v[16:17], v[18:19] offset0:6 offset1:7
+; CHECK-NEXT: ds_write2_b64 v24, v[16:17], v[18:19] offset0:8 offset1:9
; CHECK-NEXT: s_waitcnt vmcnt(2)
-; CHECK-NEXT: ds_write2_b64 v24, v[20:21], v[22:23] offset0:4 offset1:5
+; CHECK-NEXT: ds_write2_b64 v24, v[20:21], v[22:23] offset0:10 offset1:11
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: ds_write2_b64 v24, v[0:1], v[2:3] offset0:2 offset1:3
+; CHECK-NEXT: ds_write2_b64 v24, v[0:1], v[2:3] offset0:12 offset1:13
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: ds_write2_b64 v24, v[4:5], v[6:7] offset1:1
+; CHECK-NEXT: ds_write2_b64 v24, v[4:5], v[6:7] offset0:14 offset1:15
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) @shared, ptr addrspace(4) %0, i64 128, i1 false)
@@ -2379,298 +2223,276 @@ define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v2, 0
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:127
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:126
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:125
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:124
+; CHECK-NEXT: ds_read_u8 v3, v2
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:1
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:2
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:3
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:4
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:5
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:6
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:7
+; CHECK-NEXT: ds_read_u8 v11, v2 offset:24
+; CHECK-NEXT: ds_read_u8 v12, v2 offset:25
+; CHECK-NEXT: ds_read_u8 v13, v2 offset:26
+; CHECK-NEXT: ds_read_u8 v14, v2 offset:27
+; CHECK-NEXT: ds_read_u8 v15, v2 offset:28
+; CHECK-NEXT: ds_read_u8 v16, v2 offset:29
+; CHECK-NEXT: ds_read_u8 v17, v2 offset:31
+; CHECK-NEXT: ds_read_u8 v18, v2 offset:30
+; CHECK-NEXT: ds_read_u8 v19, v2 offset:16
+; CHECK-NEXT: ds_read_u8 v20, v2 offset:17
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, s0
; CHECK-NEXT: v_mov_b32_e32 v1, s1
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:127
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:126
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:123
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:125
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:124
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:122
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:121
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:123
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:120
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:119
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:122
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:121
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:118
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:120
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:119
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:117
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:116
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:118
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:115
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:114
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:117
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:116
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:113
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:115
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:114
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:112
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:111
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:113
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:110
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:109
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:112
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:111
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:108
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:110
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:109
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:107
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:106
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:108
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:105
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:104
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:107
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:106
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:103
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:105
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:104
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:102
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:101
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:103
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:100
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:99
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:102
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:101
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:98
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:100
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:99
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:97
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:96
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:98
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:95
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:94
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:97
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:96
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:93
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:95
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:94
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:92
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:91
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:93
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:90
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:89
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:92
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:91
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:88
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:90
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:89
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:87
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:86
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:88
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:85
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:84
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:87
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:86
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:83
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:85
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:84
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:82
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:81
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:83
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:80
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:79
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:82
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:81
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:78
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:80
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:79
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:77
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:76
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:78
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:75
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:74
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:77
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:76
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:73
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:75
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:74
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:72
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:71
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:73
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:70
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:69
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:72
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:71
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:68
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:31
+; CHECK-NEXT: ds_read_u8 v17, v2 offset:18
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:30
+; CHECK-NEXT: ds_read_u8 v18, v2 offset:19
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:29
+; CHECK-NEXT: ds_read_u8 v16, v2 offset:20
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:28
+; CHECK-NEXT: ds_read_u8 v15, v2 offset:21
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:27
+; CHECK-NEXT: ds_read_u8 v14, v2 offset:22
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:26
+; CHECK-NEXT: ds_read_u8 v13, v2 offset:23
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:20
+; CHECK-NEXT: ds_read_u8 v16, v2 offset:14
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:19
+; CHECK-NEXT: ds_read_u8 v18, v2 offset:15
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:25
+; CHECK-NEXT: ds_read_u8 v12, v2 offset:8
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:24
+; CHECK-NEXT: ds_read_u8 v11, v2 offset:9
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:23
+; CHECK-NEXT: ds_read_u8 v13, v2 offset:10
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:22
+; CHECK-NEXT: ds_read_u8 v14, v2 offset:11
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:21
+; CHECK-NEXT: ds_read_u8 v15, v2 offset:12
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:18
+; CHECK-NEXT: ds_read_u8 v17, v2 offset:13
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:70
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:69
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:67
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:66
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:68
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:14
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:13
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:12
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:11
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:10
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:9
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:8
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:6
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:5
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:4
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:3
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:2
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:1
+; CHECK-NEXT: flat_store_byte v[0:1], v3
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:32
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:33
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:34
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:35
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:36
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:37
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:38
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:39
+; CHECK-NEXT: ds_read_u8 v11, v2 offset:56
+; CHECK-NEXT: ds_read_u8 v12, v2 offset:57
+; CHECK-NEXT: ds_read_u8 v13, v2 offset:58
+; CHECK-NEXT: ds_read_u8 v14, v2 offset:59
+; CHECK-NEXT: ds_read_u8 v15, v2 offset:60
+; CHECK-NEXT: ds_read_u8 v16, v2 offset:63
+; CHECK-NEXT: ds_read_u8 v17, v2 offset:61
+; CHECK-NEXT: ds_read_u8 v18, v2 offset:62
+; CHECK-NEXT: ds_read_u8 v19, v2 offset:48
+; CHECK-NEXT: ds_read_u8 v20, v2 offset:49
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:63
+; CHECK-NEXT: ds_read_u8 v16, v2 offset:50
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:62
+; CHECK-NEXT: ds_read_u8 v18, v2 offset:51
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:61
+; CHECK-NEXT: ds_read_u8 v17, v2 offset:52
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:60
+; CHECK-NEXT: ds_read_u8 v15, v2 offset:53
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:59
+; CHECK-NEXT: ds_read_u8 v14, v2 offset:54
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:58
+; CHECK-NEXT: ds_read_u8 v13, v2 offset:55
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:52
+; CHECK-NEXT: ds_read_u8 v17, v2 offset:46
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:51
+; CHECK-NEXT: ds_read_u8 v18, v2 offset:47
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:57
+; CHECK-NEXT: ds_read_u8 v12, v2 offset:40
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:56
+; CHECK-NEXT: ds_read_u8 v11, v2 offset:41
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:55
+; CHECK-NEXT: ds_read_u8 v13, v2 offset:42
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:54
+; CHECK-NEXT: ds_read_u8 v14, v2 offset:43
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:53
+; CHECK-NEXT: ds_read_u8 v15, v2 offset:44
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:50
+; CHECK-NEXT: ds_read_u8 v16, v2 offset:45
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:49
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:48
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:47
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:46
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:45
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:44
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:43
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:42
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:41
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:40
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:39
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:38
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:37
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:36
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:35
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:34
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:33
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:32
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:64
; CHECK-NEXT: ds_read_u8 v4, v2 offset:65
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:64
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:67
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:66
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:67
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:68
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:69
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:70
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:71
+; CHECK-NEXT: ds_read_u8 v11, v2 offset:88
+; CHECK-NEXT: ds_read_u8 v12, v2 offset:89
+; CHECK-NEXT: ds_read_u8 v13, v2 offset:90
+; CHECK-NEXT: ds_read_u8 v14, v2 offset:91
+; CHECK-NEXT: ds_read_u8 v15, v2 offset:92
+; CHECK-NEXT: ds_read_u8 v16, v2 offset:95
+; CHECK-NEXT: ds_read_u8 v17, v2 offset:93
+; CHECK-NEXT: ds_read_u8 v18, v2 offset:94
+; CHECK-NEXT: ds_read_u8 v19, v2 offset:80
+; CHECK-NEXT: ds_read_u8 v20, v2 offset:81
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:95
+; CHECK-NEXT: ds_read_u8 v16, v2 offset:82
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:94
+; CHECK-NEXT: ds_read_u8 v18, v2 offset:83
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:93
+; CHECK-NEXT: ds_read_u8 v17, v2 offset:84
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:92
+; CHECK-NEXT: ds_read_u8 v15, v2 offset:85
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:91
+; CHECK-NEXT: ds_read_u8 v14, v2 offset:86
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:90
+; CHECK-NEXT: ds_read_u8 v13, v2 offset:87
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:84
+; CHECK-NEXT: ds_read_u8 v17, v2 offset:78
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:83
+; CHECK-NEXT: ds_read_u8 v18, v2 offset:79
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:89
+; CHECK-NEXT: ds_read_u8 v12, v2 offset:72
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:88
+; CHECK-NEXT: ds_read_u8 v11, v2 offset:73
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:87
+; CHECK-NEXT: ds_read_u8 v13, v2 offset:74
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:86
+; CHECK-NEXT: ds_read_u8 v14, v2 offset:75
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:85
+; CHECK-NEXT: ds_read_u8 v15, v2 offset:76
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:82
+; CHECK-NEXT: ds_read_u8 v16, v2 offset:77
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:81
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:80
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:79
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:78
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:77
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:76
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:75
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:74
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:73
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:72
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:71
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:70
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:69
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:68
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:67
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:66
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:63
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:65
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:64
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:62
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:61
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:63
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:60
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:59
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:62
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:61
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:58
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:60
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:59
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:57
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:56
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:58
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:55
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:54
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:57
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:56
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:53
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:55
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:54
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:52
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:51
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:53
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:50
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:49
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:52
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:51
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:48
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:50
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:49
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:47
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:46
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:48
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:45
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:44
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:47
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:46
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:43
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:45
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:44
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:42
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:41
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:43
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:40
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:39
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:42
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:41
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:38
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:40
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:39
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:37
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:36
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:38
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:35
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:34
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:37
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:36
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:33
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:35
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:34
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:32
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:31
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:33
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:30
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:29
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:32
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:31
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:28
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:27
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:26
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:28
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:25
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:24
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:27
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:26
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:23
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:25
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:24
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:22
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:21
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:23
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:20
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:19
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:22
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:21
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:18
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:20
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:19
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:16
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:17
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:18
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:8
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:9
-; CHECK-NEXT: ds_read_u8 v7, v2 offset:10
-; CHECK-NEXT: ds_read_u8 v8, v2 offset:11
-; CHECK-NEXT: ds_read_u8 v9, v2 offset:12
-; CHECK-NEXT: ds_read_u8 v10, v2 offset:13
-; CHECK-NEXT: ds_read_u8 v11, v2 offset:14
-; CHECK-NEXT: ds_read_u8 v12, v2 offset:15
-; CHECK-NEXT: ds_read_u8 v13, v2
-; CHECK-NEXT: ds_read_u8 v14, v2 offset:1
-; CHECK-NEXT: ds_read_u8 v15, v2 offset:2
-; CHECK-NEXT: ds_read_u8 v16, v2 offset:3
-; CHECK-NEXT: ds_read_u8 v17, v2 offset:4
-; CHECK-NEXT: ds_read_u8 v18, v2 offset:5
-; CHECK-NEXT: ds_read_u8 v19, v2 offset:6
-; CHECK-NEXT: ds_read_u8 v2, v2 offset:7
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:17
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:16
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:15
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:14
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:13
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:12
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:10
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:9
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:8
-; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:7
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:6
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:5
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:4
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:2
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:1
-; CHECK-NEXT: flat_store_byte v[0:1], v13
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:64
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:96
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:97
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:98
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:99
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:100
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:101
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:102
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:103
+; CHECK-NEXT: ds_read_u8 v11, v2 offset:120
+; CHECK-NEXT: ds_read_u8 v12, v2 offset:121
+; CHECK-NEXT: ds_read_u8 v13, v2 offset:122
+; CHECK-NEXT: ds_read_u8 v14, v2 offset:123
+; CHECK-NEXT: ds_read_u8 v15, v2 offset:124
+; CHECK-NEXT: ds_read_u8 v16, v2 offset:127
+; CHECK-NEXT: ds_read_u8 v17, v2 offset:125
+; CHECK-NEXT: ds_read_u8 v18, v2 offset:126
+; CHECK-NEXT: ds_read_u8 v19, v2 offset:112
+; CHECK-NEXT: ds_read_u8 v20, v2 offset:113
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:127
+; CHECK-NEXT: ds_read_u8 v16, v2 offset:114
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:126
+; CHECK-NEXT: ds_read_u8 v18, v2 offset:115
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:125
+; CHECK-NEXT: ds_read_u8 v17, v2 offset:116
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:124
+; CHECK-NEXT: ds_read_u8 v15, v2 offset:117
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:123
+; CHECK-NEXT: ds_read_u8 v14, v2 offset:118
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:122
+; CHECK-NEXT: ds_read_u8 v13, v2 offset:119
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:121
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:120
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:118
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:117
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:119
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:116
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:115
+; CHECK-NEXT: ds_read_u8 v12, v2 offset:104
+; CHECK-NEXT: ds_read_u8 v11, v2 offset:105
+; CHECK-NEXT: ds_read_u8 v13, v2 offset:106
+; CHECK-NEXT: ds_read_u8 v14, v2 offset:107
+; CHECK-NEXT: ds_read_u8 v15, v2 offset:108
+; CHECK-NEXT: ds_read_u8 v17, v2 offset:109
+; CHECK-NEXT: ds_read_u8 v18, v2 offset:111
+; CHECK-NEXT: ds_read_u8 v2, v2 offset:110
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:114
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:113
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:112
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:111
+; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:110
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:109
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:108
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:107
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:106
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:105
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:104
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:103
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:102
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:101
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:100
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:99
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:98
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:97
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:96
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p0.p3.i64(ptr %generic, ptr addrspace(3) @shared, i64 128, i1 false)
>From 6a803ad60254b61048c57ee9cf022561658d4dbc Mon Sep 17 00:00:00 2001
From: Fabian Ritter <fabian.ritter at amd.com>
Date: Tue, 25 Jun 2024 09:40:08 -0400
Subject: [PATCH 2/2] amend! [AMDGPU][DAG] Enable ganging up of memcpy
loads/stores for AMDGPU
[AMDGPU][DAG] Enable ganging up of memcpy loads/stores for AMDGPU
In the SelectionDAG lowering of the memcpy intrinsic, this optimization
introduces additional chains between fixed-size groups of loads and the
corresponding stores. While initially introduced to ensure that wider
load/store-pair instructions are generated on AArch64, this optimization also
improves code generation for AMDGPU: Ganged loads are scheduled into a clause;
stores only await completion of their corresponding load.
The chosen value of 16 performed well in microbenchmarks; values of 8, 32, or
64 would perform similarly, trading off performance in different scenarios.
The testcase updates are autogenerated by utils/update_llc_test_checks.py.
See also:
- PR introducing this optimization: https://reviews.llvm.org/D46477
Part of SWDEV-455845.
---
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 2 +-
llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll | 3536 ++++++++---------
2 files changed, 1768 insertions(+), 1770 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 63561ec3c77f3..7409fc33cbc3b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -68,7 +68,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = ~0U;
// Enable ganging up loads and stores in the memcpy DAG lowering.
- MaxGluedStoresPerMemcpy = 32;
+ MaxGluedStoresPerMemcpy = 16;
// Lower floating point store/load to integer store/load to reduce the number
// of patterns in tablegen.
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
index 166bd90d098d0..ae1f31272a15f 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
@@ -45,6 +45,39 @@ define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0
; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:12
; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:13
; CHECK-NEXT: flat_store_byte v[2:3], v18 offset:14
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:30
+; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:29
+; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:28
+; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:27
+; CHECK-NEXT: flat_load_ubyte v8, v[0:1] offset:26
+; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:25
+; CHECK-NEXT: flat_load_ubyte v10, v[0:1] offset:24
+; CHECK-NEXT: flat_load_ubyte v11, v[0:1] offset:23
+; CHECK-NEXT: flat_load_ubyte v12, v[0:1] offset:22
+; CHECK-NEXT: flat_load_ubyte v13, v[0:1] offset:21
+; CHECK-NEXT: flat_load_ubyte v14, v[0:1] offset:20
+; CHECK-NEXT: flat_load_ubyte v15, v[0:1] offset:19
+; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:18
+; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:17
+; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:16
+; CHECK-NEXT: flat_load_ubyte v19, v[0:1] offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:30
+; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:29
+; CHECK-NEXT: flat_store_byte v[2:3], v6 offset:28
+; CHECK-NEXT: flat_store_byte v[2:3], v7 offset:27
+; CHECK-NEXT: flat_store_byte v[2:3], v8 offset:26
+; CHECK-NEXT: flat_store_byte v[2:3], v9 offset:25
+; CHECK-NEXT: flat_store_byte v[2:3], v10 offset:24
+; CHECK-NEXT: flat_store_byte v[2:3], v11 offset:23
+; CHECK-NEXT: flat_store_byte v[2:3], v12 offset:22
+; CHECK-NEXT: flat_store_byte v[2:3], v13 offset:21
+; CHECK-NEXT: flat_store_byte v[2:3], v14 offset:20
+; CHECK-NEXT: flat_store_byte v[2:3], v15 offset:19
+; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:18
+; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:17
+; CHECK-NEXT: flat_store_byte v[2:3], v18 offset:16
+; CHECK-NEXT: flat_store_byte v[2:3], v19 offset:15
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:46
; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:45
; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:44
@@ -60,24 +93,8 @@ define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0
; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:34
; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:33
; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:32
-; CHECK-NEXT: flat_load_ubyte v19, v[0:1] offset:31
-; CHECK-NEXT: flat_load_ubyte v20, v[0:1] offset:30
-; CHECK-NEXT: flat_load_ubyte v21, v[0:1] offset:29
-; CHECK-NEXT: flat_load_ubyte v22, v[0:1] offset:28
-; CHECK-NEXT: flat_load_ubyte v23, v[0:1] offset:27
-; CHECK-NEXT: flat_load_ubyte v24, v[0:1] offset:26
-; CHECK-NEXT: flat_load_ubyte v25, v[0:1] offset:25
-; CHECK-NEXT: flat_load_ubyte v26, v[0:1] offset:24
-; CHECK-NEXT: flat_load_ubyte v27, v[0:1] offset:23
-; CHECK-NEXT: flat_load_ubyte v28, v[0:1] offset:22
-; CHECK-NEXT: flat_load_ubyte v29, v[0:1] offset:21
-; CHECK-NEXT: flat_load_ubyte v30, v[0:1] offset:20
-; CHECK-NEXT: flat_load_ubyte v31, v[0:1] offset:19
-; CHECK-NEXT: flat_load_ubyte v32, v[0:1] offset:18
-; CHECK-NEXT: flat_load_ubyte v33, v[0:1] offset:17
-; CHECK-NEXT: flat_load_ubyte v34, v[0:1] offset:16
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_load_ubyte v0, v[0:1] offset:15
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_load_ubyte v0, v[0:1] offset:31
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:46
; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:45
@@ -94,23 +111,7 @@ define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0
; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:34
; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:33
; CHECK-NEXT: flat_store_byte v[2:3], v18 offset:32
-; CHECK-NEXT: flat_store_byte v[2:3], v19 offset:31
-; CHECK-NEXT: flat_store_byte v[2:3], v20 offset:30
-; CHECK-NEXT: flat_store_byte v[2:3], v21 offset:29
-; CHECK-NEXT: flat_store_byte v[2:3], v22 offset:28
-; CHECK-NEXT: flat_store_byte v[2:3], v23 offset:27
-; CHECK-NEXT: flat_store_byte v[2:3], v24 offset:26
-; CHECK-NEXT: flat_store_byte v[2:3], v25 offset:25
-; CHECK-NEXT: flat_store_byte v[2:3], v26 offset:24
-; CHECK-NEXT: flat_store_byte v[2:3], v27 offset:23
-; CHECK-NEXT: flat_store_byte v[2:3], v28 offset:22
-; CHECK-NEXT: flat_store_byte v[2:3], v29 offset:21
-; CHECK-NEXT: flat_store_byte v[2:3], v30 offset:20
-; CHECK-NEXT: flat_store_byte v[2:3], v31 offset:19
-; CHECK-NEXT: flat_store_byte v[2:3], v32 offset:18
-; CHECK-NEXT: flat_store_byte v[2:3], v33 offset:17
-; CHECK-NEXT: flat_store_byte v[2:3], v34 offset:16
-; CHECK-NEXT: flat_store_byte v[2:3], v0 offset:15
+; CHECK-NEXT: flat_store_byte v[2:3], v0 offset:31
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false)
@@ -187,372 +188,372 @@ define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr add
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_add_u32 s8, s8, s7
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:31
-; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:30
-; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:29
-; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:28
-; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:27
-; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:26
-; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:25
-; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:24
-; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:23
-; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:22
-; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:21
-; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:20
-; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:19
-; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:18
-; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:17
-; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:16
-; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:15
-; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:14
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:15
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:14
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:13
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:12
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:11
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:10
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:9
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:8
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:7
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:6
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:5
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:4
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:3
+; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:2
+; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:1
+; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1]
+; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:31
+; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:30
; CHECK-NEXT: s_addc_u32 s9, s9, 0
; CHECK-NEXT: v_mov_b32_e32 v1, s2
-; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:13
+; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:29
; CHECK-NEXT: s_waitcnt vmcnt(18)
-; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:31
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:15
; CHECK-NEXT: s_waitcnt vmcnt(18)
-; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:30
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:14
; CHECK-NEXT: s_waitcnt vmcnt(18)
-; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:29
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:13
; CHECK-NEXT: s_waitcnt vmcnt(18)
-; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:28
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:12
; CHECK-NEXT: s_waitcnt vmcnt(18)
-; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:27
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:11
; CHECK-NEXT: s_waitcnt vmcnt(18)
-; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:26
-; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:7
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:10
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:23
; CHECK-NEXT: s_waitcnt vmcnt(19)
-; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:25
-; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:6
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:9
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:22
; CHECK-NEXT: s_waitcnt vmcnt(20)
-; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:24
-; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:5
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:8
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:21
; CHECK-NEXT: s_waitcnt vmcnt(21)
-; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:23
-; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:4
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:7
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:20
; CHECK-NEXT: s_waitcnt vmcnt(22)
-; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:22
-; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:3
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:6
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:19
; CHECK-NEXT: s_waitcnt vmcnt(23)
-; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:21
-; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:2
-; CHECK-NEXT: s_waitcnt vmcnt(24)
-; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:20
-; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:1
-; CHECK-NEXT: s_waitcnt vmcnt(25)
-; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:19
-; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1]
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:18
-; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:63
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(25)
-; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:14
-; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:17
-; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:62
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:15
-; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:60
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:59
-; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:11
-; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:10
-; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:9
-; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:8
-; CHECK-NEXT: s_waitcnt vmcnt(27)
-; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:7
-; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:52
-; CHECK-NEXT: s_waitcnt vmcnt(27)
-; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:6
-; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:51
-; CHECK-NEXT: s_waitcnt vmcnt(27)
-; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:5
-; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:50
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:5
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(22)
+; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:2
+; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:47
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:31
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:30
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:4
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:17
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:3
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:16
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:27
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:26
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:25
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:24
+; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:45
+; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:44
+; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:43
; CHECK-NEXT: s_waitcnt vmcnt(27)
-; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:4
-; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:49
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:23
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:36
; CHECK-NEXT: s_waitcnt vmcnt(27)
-; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:3
-; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:48
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:22
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:35
; CHECK-NEXT: s_waitcnt vmcnt(27)
-; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:2
-; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:47
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:21
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:34
; CHECK-NEXT: s_waitcnt vmcnt(27)
-; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:1
-; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:46
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:20
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:33
; CHECK-NEXT: s_waitcnt vmcnt(27)
-; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen
-; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:45
-; CHECK-NEXT: s_waitcnt vmcnt(23)
-; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:62
-; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:43
-; CHECK-NEXT: s_waitcnt vmcnt(22)
-; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:59
-; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:40
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:19
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:32
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:28
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:29
+; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:42
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:18
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:63
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:17
; CHECK-NEXT: s_waitcnt vmcnt(23)
-; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:11
-; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:56
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:16
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:61
; CHECK-NEXT: s_waitcnt vmcnt(24)
-; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:10
-; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:55
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:27
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:40
; CHECK-NEXT: s_waitcnt vmcnt(25)
-; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:9
-; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:54
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:8
-; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:53
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:52
-; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:33
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:51
-; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:32
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:12
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:13
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:26
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:39
; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:49
-; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:94
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:58
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:25
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:38
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:24
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:37
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:44
+; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:57
+; CHECK-NEXT: s_waitcnt vmcnt(28)
+; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:43
+; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:56
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:60
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:47
-; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:92
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:46
-; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:91
+; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:45
+; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:58
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:36
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:49
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:35
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:48
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:41
+; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:46
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:50
-; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:95
+; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:47
+; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:60
+; CHECK-NEXT: s_waitcnt vmcnt(33)
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:34
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:79
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:28
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:41
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:42
+; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:55
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:48
-; CHECK-NEXT: s_waitcnt vmcnt(24)
-; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:56
-; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:37
-; CHECK-NEXT: s_waitcnt vmcnt(24)
-; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:55
-; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:36
-; CHECK-NEXT: s_waitcnt vmcnt(24)
-; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:54
-; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:35
-; CHECK-NEXT: s_waitcnt vmcnt(24)
-; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:53
-; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:34
-; CHECK-NEXT: s_waitcnt vmcnt(24)
-; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:33
-; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:78
-; CHECK-NEXT: s_waitcnt vmcnt(24)
-; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:32
-; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:77
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:33
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:32
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:77
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:61
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:74
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:40
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:53
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:39
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:52
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:38
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:51
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:37
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:50
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:57
+; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:70
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:56
+; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:69
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:58
+; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:71
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:49
+; CHECK-NEXT: s_waitcnt vmcnt(29)
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:48
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:93
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:46
+; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:59
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:60
+; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:73
+; CHECK-NEXT: s_waitcnt vmcnt(28)
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:41
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:54
+; CHECK-NEXT: s_waitcnt vmcnt(28)
+; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:55
+; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:68
; CHECK-NEXT: s_waitcnt vmcnt(25)
-; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:12
-; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:57
-; CHECK-NEXT: s_waitcnt vmcnt(24)
-; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:94
-; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:75
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:74
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:87
; CHECK-NEXT: s_waitcnt vmcnt(25)
-; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:58
-; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:39
-; CHECK-NEXT: s_waitcnt vmcnt(24)
-; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:92
-; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:73
-; CHECK-NEXT: s_waitcnt vmcnt(24)
-; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:91
-; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:72
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:53
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:66
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:52
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:65
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:51
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:64
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:61
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:62
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:63
-; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:44
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:63
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:76
; CHECK-NEXT: s_waitcnt vmcnt(28)
-; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:41
-; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:86
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:50
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:95
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:40
-; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:85
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:77
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:90
+; CHECK-NEXT: s_waitcnt vmcnt(26)
+; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:71
+; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:83
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:45
-; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:90
-; CHECK-NEXT: s_waitcnt vmcnt(29)
-; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:37
-; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:82
-; CHECK-NEXT: s_waitcnt vmcnt(29)
-; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:36
-; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:81
-; CHECK-NEXT: s_waitcnt vmcnt(29)
-; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:35
-; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:80
-; CHECK-NEXT: s_waitcnt vmcnt(29)
-; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:34
-; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:79
-; CHECK-NEXT: s_waitcnt vmcnt(29)
-; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:78
-; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:123
-; CHECK-NEXT: s_waitcnt vmcnt(27)
-; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:57
-; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:38
+; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:70
+; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:69
; CHECK-NEXT: s_waitcnt vmcnt(25)
-; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:39
-; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:84
+; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:59
+; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:72
; CHECK-NEXT: s_waitcnt vmcnt(25)
-; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:73
-; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:118
+; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:73
+; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:85
; CHECK-NEXT: s_waitcnt vmcnt(25)
-; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:72
-; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:117
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:61
-; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:42
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:44
-; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:89
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:54
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:67
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:68
+; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:81
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:66
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:111
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:65
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:110
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:64
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:109
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:62
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:75
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:76
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:89
+; CHECK-NEXT: s_waitcnt vmcnt(22)
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:90
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:103
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:72
+; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:86
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:43
-; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:88
+; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:84
+; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:82
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:93
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:87
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:100
+; CHECK-NEXT: s_waitcnt vmcnt(20)
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:67
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:80
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:95
-; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:76
-; CHECK-NEXT: s_waitcnt vmcnt(27)
-; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:90
-; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:71
-; CHECK-NEXT: s_waitcnt vmcnt(27)
-; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:82
-; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:127
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:78
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:94
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:86
-; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:85
-; CHECK-NEXT: s_waitcnt vmcnt(27)
-; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:80
-; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:125
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:79
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:92
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:81
-; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:126
-; CHECK-NEXT: s_waitcnt vmcnt(29)
-; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:79
-; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:124
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:95
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:108
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:77
-; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:122
-; CHECK-NEXT: s_waitcnt vmcnt(29)
-; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:38
-; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:83
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:93
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:106
+; CHECK-NEXT: s_waitcnt vmcnt(20)
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:75
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:88
+; CHECK-NEXT: s_waitcnt vmcnt(20)
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:89
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:102
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:78
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:91
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:94
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:107
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:92
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:105
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:88
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:101
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:91
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:104
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:86
+; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:85
+; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:84
+; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:83
+; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:82
+; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:96
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:97
+; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:98
+; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:99
+; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:120
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:81
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:80
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:111
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:110
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:109
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:108
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:100
+; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:121
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:122
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:123
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:124
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:125
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:126
; CHECK-NEXT: s_waitcnt vmcnt(29)
-; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:84
-; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:65
-; CHECK-NEXT: s_waitcnt vmcnt(25)
-; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:42
-; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:87
-; CHECK-NEXT: s_waitcnt vmcnt(25)
-; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:89
-; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:69
-; CHECK-NEXT: s_waitcnt vmcnt(25)
-; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:88
-; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:70
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:93
-; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:74
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:68
-; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:66
-; CHECK-NEXT: s_waitcnt vmcnt(28)
-; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:76
-; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:121
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:107
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:127
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:75
-; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:120
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:106
; CHECK-NEXT: s_waitcnt vmcnt(30)
-; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:71
-; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:116
-; CHECK-NEXT: s_waitcnt vmcnt(18)
-; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:83
-; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:64
-; CHECK-NEXT: s_waitcnt vmcnt(16)
-; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:87
-; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:67
-; CHECK-NEXT: s_waitcnt vmcnt(12)
-; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:74
-; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:119
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:70
-; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:69
-; CHECK-NEXT: s_waitcnt vmcnt(15)
-; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:68
-; CHECK-NEXT: s_waitcnt vmcnt(5)
-; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:67
-; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:66
-; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:112
-; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:113
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:114
-; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:115
-; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:104
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:65
-; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:64
-; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:127
-; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:126
-; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:125
-; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:124
-; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:123
-; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:122
-; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:121
-; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:120
-; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:117
-; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:105
-; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:106
-; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:107
-; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:108
-; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:109
-; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:110
-; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:111
-; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:100
-; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:101
-; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:102
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:105
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:103
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:102
; CHECK-NEXT: s_waitcnt vmcnt(31)
-; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:119
-; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:103
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:118
-; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:116
-; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:98
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:99
-; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:97
-; CHECK-NEXT: global_load_ubyte v21, v0, s[0:1] offset:96
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:101
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:116
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:117
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:119
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:114
+; CHECK-NEXT: s_waitcnt vmcnt(34)
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:104
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:118
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:115
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:113
+; CHECK-NEXT: global_load_ubyte v21, v0, s[0:1] offset:112
; CHECK-NEXT: s_waitcnt vmcnt(30)
-; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:115
-; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:114
-; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:113
-; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:112
-; CHECK-NEXT: s_waitcnt vmcnt(15)
-; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:111
-; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:110
-; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:109
-; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:108
-; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:107
-; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:106
-; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:105
-; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:104
+; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:99
+; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:98
+; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:97
+; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:96
; CHECK-NEXT: s_waitcnt vmcnt(18)
-; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:103
-; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:102
-; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:101
-; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:100
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:127
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:126
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:125
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:124
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:123
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:122
+; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:121
+; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:120
; CHECK-NEXT: s_waitcnt vmcnt(18)
-; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:99
-; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:98
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:119
+; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:118
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:117
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:116
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:115
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:114
; CHECK-NEXT: s_waitcnt vmcnt(19)
-; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:97
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:113
; CHECK-NEXT: s_waitcnt vmcnt(19)
-; CHECK-NEXT: buffer_store_byte v21, v1, s[8:11], 0 offen offset:96
+; CHECK-NEXT: buffer_store_byte v21, v1, s[8:11], 0 offen offset:112
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false)
@@ -569,366 +570,362 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
; CHECK-NEXT: s_addc_u32 s9, s9, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v2, s0
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:31
-; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:30
-; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:29
-; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:28
-; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:27
-; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:26
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:25
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:24
-; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:23
-; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:22
-; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:21
-; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:20
-; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:19
-; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:18
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:15
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:13
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:11
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:9
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:7
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:5
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:3
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:2
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:17
-; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:16
-; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:15
-; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:1
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:31
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:30
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, s0
; CHECK-NEXT: v_mov_b32_e32 v1, s1
; CHECK-NEXT: s_waitcnt vmcnt(17)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:31
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:7
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:6
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23
-; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:5
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22
-; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:4
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21
-; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:3
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20
-; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:2
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19
-; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:1
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18
-; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:63
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:23
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:22
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:21
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:20
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:19
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:18
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:17
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:47
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v18
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:31
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:16
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:27
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:26
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:25
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:45
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:23
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:37
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:15
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16
-; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:62
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:22
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:36
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:12
-; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:11
-; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:10
-; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:9
-; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:8
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:7
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:53
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:21
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:35
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:6
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:52
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:20
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:34
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:5
-; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:51
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:19
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:33
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:4
-; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:50
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:18
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:32
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:3
-; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:49
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:29
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:2
-; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:48
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:30
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:44
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:1
-; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:47
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:17
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:63
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v16
-; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:46
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:16
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:28
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:42
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:12
-; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:58
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:26
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:40
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:11
-; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:57
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:25
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:39
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:10
-; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:56
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:24
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:38
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:9
-; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:55
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:27
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:41
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:8
-; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:54
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:45
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:59
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:53
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:35
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:37
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:51
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:36
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:50
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:52
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:34
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:35
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:49
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:51
-; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:33
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:34
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:48
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:50
-; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:32
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:46
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:13
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:47
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:61
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:14
-; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:60
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:29
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:43
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:62
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:47
-; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:93
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:44
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:58
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:44
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:33
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:79
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:49
-; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:95
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:32
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:42
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:56
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:48
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:58
-; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:40
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:40
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:54
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:57
-; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:39
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:39
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:53
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:56
-; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:38
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:38
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:52
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:55
-; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:37
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:41
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:55
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:54
-; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:36
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:59
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:73
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:35
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:81
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:51
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:65
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:34
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:80
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:50
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:64
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:33
-; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:79
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:62
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:32
-; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:78
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:63
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:77
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:13
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:59
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:46
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:60
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:60
-; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:42
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:61
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:75
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:93
-; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:75
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:43
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:57
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:61
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:58
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:72
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:63
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:45
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:49
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:95
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:46
-; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:92
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:48
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:56
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:70
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:40
-; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:86
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:54
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:68
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:39
-; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:85
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:53
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:67
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:38
-; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:84
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:52
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:66
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:37
-; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:83
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:55
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:69
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:36
-; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:82
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:73
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:87
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:81
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:127
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:80
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:126
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:65
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:111
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:79
-; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:125
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:64
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:110
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:59
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:41
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:62
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:76
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:42
-; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:88
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:77
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:91
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:75
-; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:121
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:60
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:74
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:61
-; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:43
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:75
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:89
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:94
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:57
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:71
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:45
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:91
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:72
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:86
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:44
-; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:90
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:70
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:84
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:95
-; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:77
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:68
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:83
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:86
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:85
-; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:66
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:67
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:81
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:84
-; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:67
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:66
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:80
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:92
-; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:74
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:78
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:68
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:79
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:93
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:83
-; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:65
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:69
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:82
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:82
-; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:64
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:78
-; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:124
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:87
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:101
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:41
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:87
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:76
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:90
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:88
-; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:70
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:91
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:105
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:43
-; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:89
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:74
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:88
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:94
-; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:76
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:89
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:103
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:91
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:73
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:71
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:85
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:90
-; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:72
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:86
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:100
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:77
-; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:123
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:78
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:92
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:74
-; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:120
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:93
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:107
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:87
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:69
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:90
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:104
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:70
-; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:116
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:88
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:102
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:89
-; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:71
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:85
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:99
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:76
-; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:122
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:94
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:73
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:119
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:95
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:109
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:72
-; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:118
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:92
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:106
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:94
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:108
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:84
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:83
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:82
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:81
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:96
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:97
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:98
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:120
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:80
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:111
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:110
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:109
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:99
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:121
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:122
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:123
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:124
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:107
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:105
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:104
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:103
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:106
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:102
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:101
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:100
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:126
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:116
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:117
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:118
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:119
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:127
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:114
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:115
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:69
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:115
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:108
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:125
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:71
-; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:117
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:113
+; CHECK-NEXT: buffer_load_ubyte v21, v2, s[8:11], 0 offen offset:112
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:68
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:67
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:66
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:65
-; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:112
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:113
-; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:114
-; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:104
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:64
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:127
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:126
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:125
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:124
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:123
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:121
-; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:105
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:106
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:107
-; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:108
-; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:109
-; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:110
-; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:100
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:122
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:120
-; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:101
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:119
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:102
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:118
-; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:103
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:116
-; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:111
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:99
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:98
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:97
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:96
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:127
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:126
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:115
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:117
-; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:98
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:97
-; CHECK-NEXT: buffer_load_ubyte v21, v2, s[8:11], 0 offen offset:96
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:114
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:113
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:112
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:111
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:110
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:109
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:108
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:107
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:106
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:105
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:104
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:103
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:102
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:101
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:100
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:99
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:98
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:97
-; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:96
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:125
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:124
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:123
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:122
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:121
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:120
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:119
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:118
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:117
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:116
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:115
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:114
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:113
+; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:112
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p0.p5.i64(ptr %generic, ptr addrspace(5) %src, i64 128, i1 false)
@@ -976,142 +973,110 @@ define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v2, 0
-; CHECK-NEXT: ds_read_u8 v3, v2
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:1
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:2
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:3
-; CHECK-NEXT: ds_read_u8 v7, v2 offset:4
-; CHECK-NEXT: ds_read_u8 v8, v2 offset:5
-; CHECK-NEXT: ds_read_u8 v9, v2 offset:6
-; CHECK-NEXT: ds_read_u8 v10, v2 offset:7
-; CHECK-NEXT: ds_read_u8 v11, v2 offset:24
-; CHECK-NEXT: ds_read_u8 v12, v2 offset:25
-; CHECK-NEXT: ds_read_u8 v13, v2 offset:26
-; CHECK-NEXT: ds_read_u8 v14, v2 offset:27
-; CHECK-NEXT: ds_read_u8 v15, v2 offset:28
-; CHECK-NEXT: ds_read_u8 v16, v2 offset:29
-; CHECK-NEXT: ds_read_u8 v17, v2 offset:31
-; CHECK-NEXT: ds_read_u8 v18, v2 offset:30
-; CHECK-NEXT: ds_read_u8 v19, v2 offset:16
-; CHECK-NEXT: ds_read_u8 v20, v2 offset:17
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:112
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:113
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:114
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:115
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:116
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:117
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:118
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:119
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, s0
; CHECK-NEXT: v_mov_b32_e32 v1, s1
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:31
-; CHECK-NEXT: ds_read_u8 v17, v2 offset:18
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:30
-; CHECK-NEXT: ds_read_u8 v18, v2 offset:19
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:29
-; CHECK-NEXT: ds_read_u8 v16, v2 offset:20
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:28
-; CHECK-NEXT: ds_read_u8 v15, v2 offset:21
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:27
-; CHECK-NEXT: ds_read_u8 v14, v2 offset:22
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:26
-; CHECK-NEXT: ds_read_u8 v13, v2 offset:23
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:112
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:113
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:114
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:115
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:116
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:117
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:118
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:119
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:120
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:121
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:122
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:123
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:124
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:125
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:126
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:127
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:20
-; CHECK-NEXT: ds_read_u8 v16, v2 offset:14
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:19
-; CHECK-NEXT: ds_read_u8 v18, v2 offset:15
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:25
-; CHECK-NEXT: ds_read_u8 v12, v2 offset:8
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:24
-; CHECK-NEXT: ds_read_u8 v11, v2 offset:9
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:23
-; CHECK-NEXT: ds_read_u8 v13, v2 offset:10
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:22
-; CHECK-NEXT: ds_read_u8 v14, v2 offset:11
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:21
-; CHECK-NEXT: ds_read_u8 v15, v2 offset:12
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:18
-; CHECK-NEXT: ds_read_u8 v17, v2 offset:13
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:120
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:121
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:122
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:123
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:124
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:125
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:126
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:127
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:96
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:97
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:98
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:99
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:100
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:101
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:102
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:103
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:14
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:13
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:12
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:11
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:10
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:9
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:8
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:6
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:5
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:4
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:3
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:2
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:1
-; CHECK-NEXT: flat_store_byte v[0:1], v3
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:32
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:33
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:34
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:35
-; CHECK-NEXT: ds_read_u8 v7, v2 offset:36
-; CHECK-NEXT: ds_read_u8 v8, v2 offset:37
-; CHECK-NEXT: ds_read_u8 v9, v2 offset:38
-; CHECK-NEXT: ds_read_u8 v10, v2 offset:39
-; CHECK-NEXT: ds_read_u8 v11, v2 offset:56
-; CHECK-NEXT: ds_read_u8 v12, v2 offset:57
-; CHECK-NEXT: ds_read_u8 v13, v2 offset:58
-; CHECK-NEXT: ds_read_u8 v14, v2 offset:59
-; CHECK-NEXT: ds_read_u8 v15, v2 offset:60
-; CHECK-NEXT: ds_read_u8 v16, v2 offset:63
-; CHECK-NEXT: ds_read_u8 v17, v2 offset:61
-; CHECK-NEXT: ds_read_u8 v18, v2 offset:62
-; CHECK-NEXT: ds_read_u8 v19, v2 offset:48
-; CHECK-NEXT: ds_read_u8 v20, v2 offset:49
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:96
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:97
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:98
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:99
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:100
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:101
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:102
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:103
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:104
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:105
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:106
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:107
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:108
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:109
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:110
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:111
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:63
-; CHECK-NEXT: ds_read_u8 v16, v2 offset:50
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:62
-; CHECK-NEXT: ds_read_u8 v18, v2 offset:51
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:61
-; CHECK-NEXT: ds_read_u8 v17, v2 offset:52
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:60
-; CHECK-NEXT: ds_read_u8 v15, v2 offset:53
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:59
-; CHECK-NEXT: ds_read_u8 v14, v2 offset:54
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:58
-; CHECK-NEXT: ds_read_u8 v13, v2 offset:55
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:104
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:105
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:106
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:107
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:108
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:109
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:110
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:111
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:80
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:81
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:82
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:83
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:84
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:85
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:86
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:87
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:52
-; CHECK-NEXT: ds_read_u8 v17, v2 offset:46
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:51
-; CHECK-NEXT: ds_read_u8 v18, v2 offset:47
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:57
-; CHECK-NEXT: ds_read_u8 v12, v2 offset:40
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:56
-; CHECK-NEXT: ds_read_u8 v11, v2 offset:41
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:55
-; CHECK-NEXT: ds_read_u8 v13, v2 offset:42
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:54
-; CHECK-NEXT: ds_read_u8 v14, v2 offset:43
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:53
-; CHECK-NEXT: ds_read_u8 v15, v2 offset:44
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:50
-; CHECK-NEXT: ds_read_u8 v16, v2 offset:45
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:49
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:48
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:80
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:81
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:82
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:83
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:84
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:85
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:86
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:87
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:88
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:89
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:90
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:91
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:92
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:93
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:94
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:95
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:47
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:46
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:45
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:44
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:43
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:42
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:41
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:40
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:39
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:38
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:37
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:36
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:35
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:34
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:33
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:32
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:88
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:89
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:90
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:91
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:92
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:93
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:94
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:95
; CHECK-NEXT: ds_read_u8 v3, v2 offset:64
; CHECK-NEXT: ds_read_u8 v4, v2 offset:65
; CHECK-NEXT: ds_read_u8 v5, v2 offset:66
@@ -1120,151 +1085,185 @@ define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 {
; CHECK-NEXT: ds_read_u8 v8, v2 offset:69
; CHECK-NEXT: ds_read_u8 v9, v2 offset:70
; CHECK-NEXT: ds_read_u8 v10, v2 offset:71
-; CHECK-NEXT: ds_read_u8 v11, v2 offset:88
-; CHECK-NEXT: ds_read_u8 v12, v2 offset:89
-; CHECK-NEXT: ds_read_u8 v13, v2 offset:90
-; CHECK-NEXT: ds_read_u8 v14, v2 offset:91
-; CHECK-NEXT: ds_read_u8 v15, v2 offset:92
-; CHECK-NEXT: ds_read_u8 v16, v2 offset:95
-; CHECK-NEXT: ds_read_u8 v17, v2 offset:93
-; CHECK-NEXT: ds_read_u8 v18, v2 offset:94
-; CHECK-NEXT: ds_read_u8 v19, v2 offset:80
-; CHECK-NEXT: ds_read_u8 v20, v2 offset:81
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:95
-; CHECK-NEXT: ds_read_u8 v16, v2 offset:82
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:94
-; CHECK-NEXT: ds_read_u8 v18, v2 offset:83
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:93
-; CHECK-NEXT: ds_read_u8 v17, v2 offset:84
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:92
-; CHECK-NEXT: ds_read_u8 v15, v2 offset:85
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:91
-; CHECK-NEXT: ds_read_u8 v14, v2 offset:86
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:90
-; CHECK-NEXT: ds_read_u8 v13, v2 offset:87
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:84
-; CHECK-NEXT: ds_read_u8 v17, v2 offset:78
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:83
-; CHECK-NEXT: ds_read_u8 v18, v2 offset:79
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:89
-; CHECK-NEXT: ds_read_u8 v12, v2 offset:72
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:88
-; CHECK-NEXT: ds_read_u8 v11, v2 offset:73
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:87
-; CHECK-NEXT: ds_read_u8 v13, v2 offset:74
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:86
-; CHECK-NEXT: ds_read_u8 v14, v2 offset:75
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:85
-; CHECK-NEXT: ds_read_u8 v15, v2 offset:76
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:82
-; CHECK-NEXT: ds_read_u8 v16, v2 offset:77
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:81
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:80
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:79
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:78
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:77
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:76
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:75
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:74
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:73
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:72
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:71
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:70
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:69
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:68
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:67
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:66
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:65
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:64
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:96
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:97
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:98
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:99
-; CHECK-NEXT: ds_read_u8 v7, v2 offset:100
-; CHECK-NEXT: ds_read_u8 v8, v2 offset:101
-; CHECK-NEXT: ds_read_u8 v9, v2 offset:102
-; CHECK-NEXT: ds_read_u8 v10, v2 offset:103
-; CHECK-NEXT: ds_read_u8 v11, v2 offset:120
-; CHECK-NEXT: ds_read_u8 v12, v2 offset:121
-; CHECK-NEXT: ds_read_u8 v13, v2 offset:122
-; CHECK-NEXT: ds_read_u8 v14, v2 offset:123
-; CHECK-NEXT: ds_read_u8 v15, v2 offset:124
-; CHECK-NEXT: ds_read_u8 v16, v2 offset:127
-; CHECK-NEXT: ds_read_u8 v17, v2 offset:125
-; CHECK-NEXT: ds_read_u8 v18, v2 offset:126
-; CHECK-NEXT: ds_read_u8 v19, v2 offset:112
-; CHECK-NEXT: ds_read_u8 v20, v2 offset:113
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:65
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:66
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:67
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:68
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:69
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:70
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:71
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:72
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:73
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:74
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:75
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:76
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:77
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:78
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:79
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:127
-; CHECK-NEXT: ds_read_u8 v16, v2 offset:114
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:126
-; CHECK-NEXT: ds_read_u8 v18, v2 offset:115
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:125
-; CHECK-NEXT: ds_read_u8 v17, v2 offset:116
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:124
-; CHECK-NEXT: ds_read_u8 v15, v2 offset:117
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:123
-; CHECK-NEXT: ds_read_u8 v14, v2 offset:118
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:122
-; CHECK-NEXT: ds_read_u8 v13, v2 offset:119
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:121
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:120
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:72
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:73
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:74
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:75
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:76
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:77
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:78
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:79
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:48
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:49
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:50
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:51
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:52
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:53
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:54
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:55
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:118
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:117
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:119
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:116
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:115
-; CHECK-NEXT: ds_read_u8 v12, v2 offset:104
-; CHECK-NEXT: ds_read_u8 v11, v2 offset:105
-; CHECK-NEXT: ds_read_u8 v13, v2 offset:106
-; CHECK-NEXT: ds_read_u8 v14, v2 offset:107
-; CHECK-NEXT: ds_read_u8 v15, v2 offset:108
-; CHECK-NEXT: ds_read_u8 v17, v2 offset:109
-; CHECK-NEXT: ds_read_u8 v18, v2 offset:111
-; CHECK-NEXT: ds_read_u8 v2, v2 offset:110
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:114
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:113
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:112
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:48
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:49
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:50
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:51
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:52
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:53
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:54
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:55
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:56
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:57
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:58
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:59
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:60
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:61
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:62
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:63
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:111
-; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:110
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:109
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:108
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:107
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:106
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:105
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:104
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:103
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:102
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:101
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:100
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:99
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:98
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:97
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:96
-; CHECK-NEXT: s_endpgm
-entry:
- tail call void @llvm.memcpy.p0.p3.i64(ptr %generic, ptr addrspace(3) @shared, i64 128, i1 false)
- ret void
-}
-
-define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 {
-; CHECK-LABEL: memcpy_p0_p0_optsize:
-; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:56
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:57
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:58
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:59
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:60
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:61
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:62
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:63
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:32
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:33
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:34
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:35
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:36
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:37
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:38
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:39
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-NEXT: v_mov_b32_e32 v1, s3
-; CHECK-NEXT: flat_load_ubyte v4, v[0:1]
-; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:1
-; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:2
-; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:3
-; CHECK-NEXT: flat_load_ubyte v8, v[0:1] offset:4
-; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:5
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:32
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:33
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:34
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:35
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:36
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:37
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:38
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:39
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:40
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:41
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:42
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:43
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:44
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:45
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:46
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:47
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:40
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:41
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:42
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:43
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:44
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:45
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:46
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:47
+; CHECK-NEXT: ds_read_u8 v3, v2
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:1
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:2
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:3
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:4
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:5
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:6
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:7
+; CHECK-NEXT: ds_read_u8 v11, v2 offset:8
+; CHECK-NEXT: ds_read_u8 v12, v2 offset:9
+; CHECK-NEXT: ds_read_u8 v13, v2 offset:10
+; CHECK-NEXT: ds_read_u8 v14, v2 offset:11
+; CHECK-NEXT: ds_read_u8 v15, v2 offset:12
+; CHECK-NEXT: ds_read_u8 v16, v2 offset:13
+; CHECK-NEXT: ds_read_u8 v17, v2 offset:14
+; CHECK-NEXT: ds_read_u8 v18, v2 offset:15
+; CHECK-NEXT: ds_read_u8 v19, v2 offset:16
+; CHECK-NEXT: ds_read_u8 v20, v2 offset:17
+; CHECK-NEXT: ds_read_u8 v21, v2 offset:18
+; CHECK-NEXT: ds_read_u8 v22, v2 offset:19
+; CHECK-NEXT: ds_read_u8 v23, v2 offset:20
+; CHECK-NEXT: ds_read_u8 v24, v2 offset:21
+; CHECK-NEXT: ds_read_u8 v25, v2 offset:22
+; CHECK-NEXT: ds_read_u8 v26, v2 offset:23
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17
+; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:18
+; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:19
+; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:20
+; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:21
+; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:22
+; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:23
+; CHECK-NEXT: ds_read_u8 v19, v2 offset:24
+; CHECK-NEXT: ds_read_u8 v20, v2 offset:25
+; CHECK-NEXT: ds_read_u8 v21, v2 offset:26
+; CHECK-NEXT: ds_read_u8 v22, v2 offset:27
+; CHECK-NEXT: ds_read_u8 v23, v2 offset:28
+; CHECK-NEXT: ds_read_u8 v24, v2 offset:29
+; CHECK-NEXT: ds_read_u8 v25, v2 offset:30
+; CHECK-NEXT: ds_read_u8 v2, v2 offset:31
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:24
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:25
+; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:26
+; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:27
+; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:28
+; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:29
+; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:30
+; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:31
+; CHECK-NEXT: flat_store_byte v[0:1], v3
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:1
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:2
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:3
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:4
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:5
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:6
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:9
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:10
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:11
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:12
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:13
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:14
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15
+; CHECK-NEXT: s_endpgm
+entry:
+ tail call void @llvm.memcpy.p0.p3.i64(ptr %generic, ptr addrspace(3) @shared, i64 128, i1 false)
+ ret void
+}
+
+define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 {
+; CHECK-LABEL: memcpy_p0_p0_optsize:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1]
+; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:1
+; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:2
+; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:3
+; CHECK-NEXT: flat_load_ubyte v8, v[0:1] offset:4
+; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:5
; CHECK-NEXT: flat_load_ubyte v10, v[0:1] offset:6
; CHECK-NEXT: flat_load_ubyte v11, v[0:1] offset:7
; CHECK-NEXT: flat_load_ubyte v12, v[0:1] offset:8
@@ -1292,6 +1291,39 @@ define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 {
; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:12
; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:13
; CHECK-NEXT: flat_store_byte v[2:3], v18 offset:14
+; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:30
+; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:29
+; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:28
+; CHECK-NEXT: flat_load_ubyte v7, v[0:1] offset:27
+; CHECK-NEXT: flat_load_ubyte v8, v[0:1] offset:26
+; CHECK-NEXT: flat_load_ubyte v9, v[0:1] offset:25
+; CHECK-NEXT: flat_load_ubyte v10, v[0:1] offset:24
+; CHECK-NEXT: flat_load_ubyte v11, v[0:1] offset:23
+; CHECK-NEXT: flat_load_ubyte v12, v[0:1] offset:22
+; CHECK-NEXT: flat_load_ubyte v13, v[0:1] offset:21
+; CHECK-NEXT: flat_load_ubyte v14, v[0:1] offset:20
+; CHECK-NEXT: flat_load_ubyte v15, v[0:1] offset:19
+; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:18
+; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:17
+; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:16
+; CHECK-NEXT: flat_load_ubyte v19, v[0:1] offset:15
+; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:30
+; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:29
+; CHECK-NEXT: flat_store_byte v[2:3], v6 offset:28
+; CHECK-NEXT: flat_store_byte v[2:3], v7 offset:27
+; CHECK-NEXT: flat_store_byte v[2:3], v8 offset:26
+; CHECK-NEXT: flat_store_byte v[2:3], v9 offset:25
+; CHECK-NEXT: flat_store_byte v[2:3], v10 offset:24
+; CHECK-NEXT: flat_store_byte v[2:3], v11 offset:23
+; CHECK-NEXT: flat_store_byte v[2:3], v12 offset:22
+; CHECK-NEXT: flat_store_byte v[2:3], v13 offset:21
+; CHECK-NEXT: flat_store_byte v[2:3], v14 offset:20
+; CHECK-NEXT: flat_store_byte v[2:3], v15 offset:19
+; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:18
+; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:17
+; CHECK-NEXT: flat_store_byte v[2:3], v18 offset:16
+; CHECK-NEXT: flat_store_byte v[2:3], v19 offset:15
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:46
; CHECK-NEXT: flat_load_ubyte v5, v[0:1] offset:45
; CHECK-NEXT: flat_load_ubyte v6, v[0:1] offset:44
@@ -1307,24 +1339,8 @@ define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 {
; CHECK-NEXT: flat_load_ubyte v16, v[0:1] offset:34
; CHECK-NEXT: flat_load_ubyte v17, v[0:1] offset:33
; CHECK-NEXT: flat_load_ubyte v18, v[0:1] offset:32
-; CHECK-NEXT: flat_load_ubyte v19, v[0:1] offset:31
-; CHECK-NEXT: flat_load_ubyte v20, v[0:1] offset:30
-; CHECK-NEXT: flat_load_ubyte v21, v[0:1] offset:29
-; CHECK-NEXT: flat_load_ubyte v22, v[0:1] offset:28
-; CHECK-NEXT: flat_load_ubyte v23, v[0:1] offset:27
-; CHECK-NEXT: flat_load_ubyte v24, v[0:1] offset:26
-; CHECK-NEXT: flat_load_ubyte v25, v[0:1] offset:25
-; CHECK-NEXT: flat_load_ubyte v26, v[0:1] offset:24
-; CHECK-NEXT: flat_load_ubyte v27, v[0:1] offset:23
-; CHECK-NEXT: flat_load_ubyte v28, v[0:1] offset:22
-; CHECK-NEXT: flat_load_ubyte v29, v[0:1] offset:21
-; CHECK-NEXT: flat_load_ubyte v30, v[0:1] offset:20
-; CHECK-NEXT: flat_load_ubyte v31, v[0:1] offset:19
-; CHECK-NEXT: flat_load_ubyte v32, v[0:1] offset:18
-; CHECK-NEXT: flat_load_ubyte v33, v[0:1] offset:17
-; CHECK-NEXT: flat_load_ubyte v34, v[0:1] offset:16
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_load_ubyte v0, v[0:1] offset:15
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_load_ubyte v0, v[0:1] offset:31
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:46
; CHECK-NEXT: flat_store_byte v[2:3], v5 offset:45
@@ -1341,23 +1357,7 @@ define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 {
; CHECK-NEXT: flat_store_byte v[2:3], v16 offset:34
; CHECK-NEXT: flat_store_byte v[2:3], v17 offset:33
; CHECK-NEXT: flat_store_byte v[2:3], v18 offset:32
-; CHECK-NEXT: flat_store_byte v[2:3], v19 offset:31
-; CHECK-NEXT: flat_store_byte v[2:3], v20 offset:30
-; CHECK-NEXT: flat_store_byte v[2:3], v21 offset:29
-; CHECK-NEXT: flat_store_byte v[2:3], v22 offset:28
-; CHECK-NEXT: flat_store_byte v[2:3], v23 offset:27
-; CHECK-NEXT: flat_store_byte v[2:3], v24 offset:26
-; CHECK-NEXT: flat_store_byte v[2:3], v25 offset:25
-; CHECK-NEXT: flat_store_byte v[2:3], v26 offset:24
-; CHECK-NEXT: flat_store_byte v[2:3], v27 offset:23
-; CHECK-NEXT: flat_store_byte v[2:3], v28 offset:22
-; CHECK-NEXT: flat_store_byte v[2:3], v29 offset:21
-; CHECK-NEXT: flat_store_byte v[2:3], v30 offset:20
-; CHECK-NEXT: flat_store_byte v[2:3], v31 offset:19
-; CHECK-NEXT: flat_store_byte v[2:3], v32 offset:18
-; CHECK-NEXT: flat_store_byte v[2:3], v33 offset:17
-; CHECK-NEXT: flat_store_byte v[2:3], v34 offset:16
-; CHECK-NEXT: flat_store_byte v[2:3], v0 offset:15
+; CHECK-NEXT: flat_store_byte v[2:3], v0 offset:31
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false)
@@ -1434,372 +1434,372 @@ define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr add
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_add_u32 s8, s8, s7
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:31
-; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:30
-; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:29
-; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:28
-; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:27
-; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:26
-; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:25
-; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:24
-; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:23
-; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:22
-; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:21
-; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:20
-; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:19
-; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:18
-; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:17
-; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:16
-; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:15
-; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:14
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:15
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:14
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:13
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:12
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:11
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:10
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:9
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:8
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:7
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:6
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:5
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:4
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:3
+; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:2
+; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:1
+; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1]
+; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:31
+; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:30
; CHECK-NEXT: s_addc_u32 s9, s9, 0
; CHECK-NEXT: v_mov_b32_e32 v1, s2
-; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:13
+; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:29
; CHECK-NEXT: s_waitcnt vmcnt(18)
-; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:31
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:15
; CHECK-NEXT: s_waitcnt vmcnt(18)
-; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:30
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:14
; CHECK-NEXT: s_waitcnt vmcnt(18)
-; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:29
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:13
; CHECK-NEXT: s_waitcnt vmcnt(18)
-; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:28
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:12
; CHECK-NEXT: s_waitcnt vmcnt(18)
-; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:27
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:11
; CHECK-NEXT: s_waitcnt vmcnt(18)
-; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:26
-; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:7
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:10
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:23
; CHECK-NEXT: s_waitcnt vmcnt(19)
-; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:25
-; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:6
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:9
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:22
; CHECK-NEXT: s_waitcnt vmcnt(20)
-; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:24
-; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:5
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:8
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:21
; CHECK-NEXT: s_waitcnt vmcnt(21)
-; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:23
-; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:4
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:7
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:20
; CHECK-NEXT: s_waitcnt vmcnt(22)
-; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:22
-; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:3
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:6
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:19
; CHECK-NEXT: s_waitcnt vmcnt(23)
-; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:21
-; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:2
-; CHECK-NEXT: s_waitcnt vmcnt(24)
-; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:20
-; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:1
-; CHECK-NEXT: s_waitcnt vmcnt(25)
-; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:19
-; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1]
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:18
-; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:63
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:16
-; CHECK-NEXT: s_waitcnt vmcnt(25)
-; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:14
-; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:17
-; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:62
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:15
-; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:60
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:59
-; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:11
-; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:10
-; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:9
-; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:8
-; CHECK-NEXT: s_waitcnt vmcnt(27)
-; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:7
-; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:52
-; CHECK-NEXT: s_waitcnt vmcnt(27)
-; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:6
-; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:51
-; CHECK-NEXT: s_waitcnt vmcnt(27)
-; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:5
-; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:50
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:5
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:18
+; CHECK-NEXT: s_waitcnt vmcnt(22)
+; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:2
+; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:47
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:1
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:31
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:30
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:4
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:17
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:3
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:16
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:27
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:26
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:25
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:24
+; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:45
+; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:44
+; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:43
; CHECK-NEXT: s_waitcnt vmcnt(27)
-; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:4
-; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:49
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:23
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:36
; CHECK-NEXT: s_waitcnt vmcnt(27)
-; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:3
-; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:48
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:22
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:35
; CHECK-NEXT: s_waitcnt vmcnt(27)
-; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:2
-; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:47
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:21
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:34
; CHECK-NEXT: s_waitcnt vmcnt(27)
-; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:1
-; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:46
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:20
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:33
; CHECK-NEXT: s_waitcnt vmcnt(27)
-; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen
-; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:45
-; CHECK-NEXT: s_waitcnt vmcnt(23)
-; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:62
-; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:43
-; CHECK-NEXT: s_waitcnt vmcnt(22)
-; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:59
-; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:40
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:19
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:32
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:28
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:29
+; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:42
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:18
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:63
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:17
; CHECK-NEXT: s_waitcnt vmcnt(23)
-; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:11
-; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:56
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:16
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:61
; CHECK-NEXT: s_waitcnt vmcnt(24)
-; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:10
-; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:55
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:27
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:40
; CHECK-NEXT: s_waitcnt vmcnt(25)
-; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:9
-; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:54
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:8
-; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:53
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:52
-; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:33
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:51
-; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:32
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:12
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:13
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:26
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:39
; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:49
-; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:94
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:58
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:25
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:38
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:24
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:37
+; CHECK-NEXT: s_waitcnt vmcnt(27)
+; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:44
+; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:57
+; CHECK-NEXT: s_waitcnt vmcnt(28)
+; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:43
+; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:56
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:60
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:47
-; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:92
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:46
-; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:91
+; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:45
+; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:58
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:36
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:49
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:35
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:48
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:41
+; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:46
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:50
-; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:95
+; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:47
+; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:60
+; CHECK-NEXT: s_waitcnt vmcnt(33)
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:34
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:79
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:28
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:41
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:42
+; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:55
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:48
-; CHECK-NEXT: s_waitcnt vmcnt(24)
-; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:56
-; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:37
-; CHECK-NEXT: s_waitcnt vmcnt(24)
-; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:55
-; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:36
-; CHECK-NEXT: s_waitcnt vmcnt(24)
-; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:54
-; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:35
-; CHECK-NEXT: s_waitcnt vmcnt(24)
-; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:53
-; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:34
-; CHECK-NEXT: s_waitcnt vmcnt(24)
-; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:33
-; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:78
-; CHECK-NEXT: s_waitcnt vmcnt(24)
-; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:32
-; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:77
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:33
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:32
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:77
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:61
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:74
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:40
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:53
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:39
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:52
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:38
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:51
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:37
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:50
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:57
+; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:70
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:56
+; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:69
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:58
+; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:71
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:49
+; CHECK-NEXT: s_waitcnt vmcnt(29)
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:48
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:93
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:46
+; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:59
+; CHECK-NEXT: s_waitcnt vmcnt(30)
+; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:60
+; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:73
+; CHECK-NEXT: s_waitcnt vmcnt(28)
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:41
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:54
+; CHECK-NEXT: s_waitcnt vmcnt(28)
+; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:55
+; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:68
; CHECK-NEXT: s_waitcnt vmcnt(25)
-; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:12
-; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:57
-; CHECK-NEXT: s_waitcnt vmcnt(24)
-; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:94
-; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:75
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:74
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:87
; CHECK-NEXT: s_waitcnt vmcnt(25)
-; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:58
-; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:39
-; CHECK-NEXT: s_waitcnt vmcnt(24)
-; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:92
-; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:73
-; CHECK-NEXT: s_waitcnt vmcnt(24)
-; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:91
-; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:72
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:53
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:66
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:52
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:65
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:51
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:64
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:61
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:62
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:63
-; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:44
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:63
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:76
; CHECK-NEXT: s_waitcnt vmcnt(28)
-; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:41
-; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:86
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:50
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:95
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:40
-; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:85
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:77
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:90
+; CHECK-NEXT: s_waitcnt vmcnt(26)
+; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:71
+; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:83
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:45
-; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:90
-; CHECK-NEXT: s_waitcnt vmcnt(29)
-; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:37
-; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:82
-; CHECK-NEXT: s_waitcnt vmcnt(29)
-; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:36
-; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:81
-; CHECK-NEXT: s_waitcnt vmcnt(29)
-; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:35
-; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:80
-; CHECK-NEXT: s_waitcnt vmcnt(29)
-; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:34
-; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:79
-; CHECK-NEXT: s_waitcnt vmcnt(29)
-; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:78
-; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:123
-; CHECK-NEXT: s_waitcnt vmcnt(27)
-; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:57
-; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:38
+; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:70
+; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:69
; CHECK-NEXT: s_waitcnt vmcnt(25)
-; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:39
-; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:84
+; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:59
+; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:72
; CHECK-NEXT: s_waitcnt vmcnt(25)
-; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:73
-; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:118
+; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:73
+; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:85
; CHECK-NEXT: s_waitcnt vmcnt(25)
-; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:72
-; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:117
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:61
-; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:42
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:44
-; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:89
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:54
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:67
+; CHECK-NEXT: s_waitcnt vmcnt(25)
+; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:68
+; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:81
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:66
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:111
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:65
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:110
+; CHECK-NEXT: s_waitcnt vmcnt(23)
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:64
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:109
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:62
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:75
+; CHECK-NEXT: s_waitcnt vmcnt(24)
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:76
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:89
+; CHECK-NEXT: s_waitcnt vmcnt(22)
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:90
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:103
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:72
+; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:86
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:43
-; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:88
+; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:84
+; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:82
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:93
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:87
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:100
+; CHECK-NEXT: s_waitcnt vmcnt(20)
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:67
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:80
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:95
-; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:76
-; CHECK-NEXT: s_waitcnt vmcnt(27)
-; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:90
-; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:71
-; CHECK-NEXT: s_waitcnt vmcnt(27)
-; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:82
-; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:127
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:78
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:94
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:86
-; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:85
-; CHECK-NEXT: s_waitcnt vmcnt(27)
-; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:80
-; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:125
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:79
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:92
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:81
-; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:126
-; CHECK-NEXT: s_waitcnt vmcnt(29)
-; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:79
-; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:124
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:95
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:108
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:77
-; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:122
-; CHECK-NEXT: s_waitcnt vmcnt(29)
-; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:38
-; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:83
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:93
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:106
+; CHECK-NEXT: s_waitcnt vmcnt(20)
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:75
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:88
+; CHECK-NEXT: s_waitcnt vmcnt(20)
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:89
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:102
+; CHECK-NEXT: s_waitcnt vmcnt(11)
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:78
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:91
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:94
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:107
+; CHECK-NEXT: s_waitcnt vmcnt(12)
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:92
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:105
+; CHECK-NEXT: s_waitcnt vmcnt(8)
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:88
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:101
+; CHECK-NEXT: s_waitcnt vmcnt(6)
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:91
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:104
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:86
+; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:85
+; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:84
+; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:83
+; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:82
+; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:96
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:97
+; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:98
+; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:99
+; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:120
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:81
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:80
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:111
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:110
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:109
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:108
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:100
+; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:121
+; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:122
+; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:123
+; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:124
+; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:125
+; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:126
; CHECK-NEXT: s_waitcnt vmcnt(29)
-; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:84
-; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:65
-; CHECK-NEXT: s_waitcnt vmcnt(25)
-; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:42
-; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:87
-; CHECK-NEXT: s_waitcnt vmcnt(25)
-; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:89
-; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:69
-; CHECK-NEXT: s_waitcnt vmcnt(25)
-; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:88
-; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:70
-; CHECK-NEXT: s_waitcnt vmcnt(26)
-; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:93
-; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:74
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:68
-; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:66
-; CHECK-NEXT: s_waitcnt vmcnt(28)
-; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:76
-; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:121
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:107
+; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:127
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:75
-; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:120
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:106
; CHECK-NEXT: s_waitcnt vmcnt(30)
-; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:71
-; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:116
-; CHECK-NEXT: s_waitcnt vmcnt(18)
-; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:83
-; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:64
-; CHECK-NEXT: s_waitcnt vmcnt(16)
-; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:87
-; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:67
-; CHECK-NEXT: s_waitcnt vmcnt(12)
-; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:74
-; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:119
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:70
-; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:69
-; CHECK-NEXT: s_waitcnt vmcnt(15)
-; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:68
-; CHECK-NEXT: s_waitcnt vmcnt(5)
-; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:67
-; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:66
-; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:112
-; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:113
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:114
-; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:115
-; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:104
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:65
-; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:64
-; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:127
-; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:126
-; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:125
-; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:124
-; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:123
-; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:122
-; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:121
-; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:120
-; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:117
-; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:105
-; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:106
-; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:107
-; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:108
-; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:109
-; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:110
-; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:111
-; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:100
-; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:101
-; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:102
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:105
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:103
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:102
; CHECK-NEXT: s_waitcnt vmcnt(31)
-; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:119
-; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:103
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:118
-; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:116
-; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:98
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:99
-; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:97
-; CHECK-NEXT: global_load_ubyte v21, v0, s[0:1] offset:96
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:101
+; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:116
+; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:117
+; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:119
+; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:114
+; CHECK-NEXT: s_waitcnt vmcnt(34)
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:104
+; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:118
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:115
+; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:113
+; CHECK-NEXT: global_load_ubyte v21, v0, s[0:1] offset:112
; CHECK-NEXT: s_waitcnt vmcnt(30)
-; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:115
-; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:114
-; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:113
-; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:112
-; CHECK-NEXT: s_waitcnt vmcnt(15)
-; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:111
-; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:110
-; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:109
-; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:108
-; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:107
-; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:106
-; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:105
-; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:104
+; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:99
+; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:98
+; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:97
+; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:96
; CHECK-NEXT: s_waitcnt vmcnt(18)
-; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:103
-; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:102
-; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:101
-; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:100
+; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:127
+; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:126
+; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:125
+; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:124
+; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:123
+; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:122
+; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:121
+; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:120
; CHECK-NEXT: s_waitcnt vmcnt(18)
-; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:99
-; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:98
+; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:119
+; CHECK-NEXT: s_waitcnt vmcnt(16)
+; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:118
+; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:117
+; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:116
+; CHECK-NEXT: s_waitcnt vmcnt(18)
+; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:115
+; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:114
; CHECK-NEXT: s_waitcnt vmcnt(19)
-; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:97
+; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:113
; CHECK-NEXT: s_waitcnt vmcnt(19)
-; CHECK-NEXT: buffer_store_byte v21, v1, s[8:11], 0 offen offset:96
+; CHECK-NEXT: buffer_store_byte v21, v1, s[8:11], 0 offen offset:112
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false)
@@ -1816,366 +1816,362 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
; CHECK-NEXT: s_addc_u32 s9, s9, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v2, s0
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:31
-; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:30
-; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:29
-; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:28
-; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:27
-; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:26
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:25
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:24
-; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:23
-; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:22
-; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:21
-; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:20
-; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:19
-; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:18
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:15
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:13
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:12
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:11
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:10
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:9
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:8
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:7
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:6
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:5
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:4
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:3
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:2
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:17
-; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:16
-; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:15
-; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:14
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:1
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:31
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:30
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, s0
; CHECK-NEXT: v_mov_b32_e32 v1, s1
; CHECK-NEXT: s_waitcnt vmcnt(17)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:31
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:15
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:30
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:29
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:28
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:27
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:26
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:25
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:7
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:24
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:6
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:23
-; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:5
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:22
-; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:4
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:21
-; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:3
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:20
-; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:2
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:19
-; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:1
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:18
-; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:14
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:13
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:12
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:11
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:10
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:9
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:23
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:8
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:22
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:7
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:21
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:6
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:20
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:5
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:19
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:4
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:18
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:3
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:17
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:1
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:47
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v18
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:31
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:2
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:16
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:28
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:27
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:26
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:25
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:24
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:45
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:23
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:37
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:17
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:63
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:22
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:36
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:15
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:16
-; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:62
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:21
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:35
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:12
-; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:11
-; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:10
-; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:9
-; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:8
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:7
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:53
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:20
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:34
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:6
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:52
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:19
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:33
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:5
-; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:51
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:18
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:32
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:4
-; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:50
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:29
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:3
-; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:49
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:30
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:44
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:2
-; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:48
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:17
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:63
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:1
-; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:47
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:16
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:28
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:42
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v16
-; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:46
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:26
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:40
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:12
-; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:58
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:25
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:39
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:11
-; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:57
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:24
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:38
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:10
-; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:56
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:27
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:41
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:9
-; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:55
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:8
-; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:54
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:45
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:59
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:53
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:35
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:37
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:51
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:36
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:50
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:52
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:34
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:35
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:49
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:51
-; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:33
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:34
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:48
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:50
-; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:32
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:46
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:13
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:47
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:61
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:14
-; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:60
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:29
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:43
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:62
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:47
-; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:93
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:44
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:58
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:44
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:33
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:79
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:49
-; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:95
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:32
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:42
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:56
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:48
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:58
-; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:40
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:40
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:54
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:57
-; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:39
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:39
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:53
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:56
-; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:38
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:38
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:52
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:55
-; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:37
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:41
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:55
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:54
-; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:36
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:59
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:73
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:35
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:81
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:51
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:65
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:34
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:80
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:50
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:64
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:33
-; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:79
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:62
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:32
-; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:78
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:63
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:77
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:13
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:59
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:46
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:60
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:60
-; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:42
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:61
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:75
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:93
-; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:75
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:43
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:57
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:61
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:58
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:72
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:63
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:45
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:49
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:95
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:46
-; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:92
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:48
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:56
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:70
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:40
-; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:86
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:54
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:68
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:39
-; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:85
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:53
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:67
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:38
-; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:84
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:52
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:66
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:37
-; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:83
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:55
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:69
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:36
-; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:82
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:73
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:87
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:81
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:127
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:80
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:126
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:79
-; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:125
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:65
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:111
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:59
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:41
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:64
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:110
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:42
-; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:88
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:62
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:76
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:75
-; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:121
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:77
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:91
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:61
-; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:43
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:60
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:74
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:94
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:75
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:89
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:45
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:91
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:57
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:71
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:44
-; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:90
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:72
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:86
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:95
-; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:77
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:70
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:84
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:86
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:85
-; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:66
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:68
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:83
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:84
-; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:67
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:67
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:81
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:92
-; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:74
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:66
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:80
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:68
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:78
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:83
-; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:65
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:79
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:93
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:82
-; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:64
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:69
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:82
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:78
-; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:124
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:87
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:101
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:41
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:87
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:76
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:90
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:88
-; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:70
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:91
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:105
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:43
-; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:89
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:74
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:88
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:94
-; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:76
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:89
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:103
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:91
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:73
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:71
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:85
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:90
-; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:72
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:86
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:100
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:77
-; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:123
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:78
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:92
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:74
-; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:120
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:93
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:107
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:87
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:69
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:90
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:104
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:70
-; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:116
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:88
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:102
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:89
-; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:71
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:85
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:99
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:76
-; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:122
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:94
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:73
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:119
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:95
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:109
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:72
-; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:118
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:92
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:106
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:69
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:115
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:94
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:108
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:84
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:83
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:82
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:81
+; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:96
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:97
+; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:98
+; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:120
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:80
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:111
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:110
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:109
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:99
+; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:121
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:122
+; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:123
+; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:124
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:107
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:105
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:104
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:103
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:106
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:102
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:101
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:100
+; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:126
+; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:116
+; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:117
+; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:118
+; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:119
+; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:127
+; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:114
+; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:115
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:108
+; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:125
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:71
-; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:117
+; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:113
+; CHECK-NEXT: buffer_load_ubyte v21, v2, s[8:11], 0 offen offset:112
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:68
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:67
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:66
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:65
-; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:112
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:113
-; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:114
-; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:104
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:64
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:127
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:126
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:125
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:124
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:123
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:121
-; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:105
-; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:106
-; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:107
-; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:108
-; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:109
-; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:110
-; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:100
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:122
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:120
-; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:101
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:119
-; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:102
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:118
-; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:103
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:116
-; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:111
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:99
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:98
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:97
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:96
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:127
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:126
; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:115
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:117
-; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:98
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:97
-; CHECK-NEXT: buffer_load_ubyte v21, v2, s[8:11], 0 offen offset:96
-; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:114
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:113
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:112
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:111
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:110
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:109
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:108
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:107
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:106
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:105
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:104
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:103
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:102
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:101
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:100
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:99
-; CHECK-NEXT: s_waitcnt vmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:98
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:97
-; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:96
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:125
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:124
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:123
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:122
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:121
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:120
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:119
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:118
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:117
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:116
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:115
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:114
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:113
+; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:112
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p0.p5.i64(ptr %generic, ptr addrspace(5) %src, i64 128, i1 false)
@@ -2223,142 +2219,110 @@ define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v2, 0
-; CHECK-NEXT: ds_read_u8 v3, v2
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:1
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:2
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:3
-; CHECK-NEXT: ds_read_u8 v7, v2 offset:4
-; CHECK-NEXT: ds_read_u8 v8, v2 offset:5
-; CHECK-NEXT: ds_read_u8 v9, v2 offset:6
-; CHECK-NEXT: ds_read_u8 v10, v2 offset:7
-; CHECK-NEXT: ds_read_u8 v11, v2 offset:24
-; CHECK-NEXT: ds_read_u8 v12, v2 offset:25
-; CHECK-NEXT: ds_read_u8 v13, v2 offset:26
-; CHECK-NEXT: ds_read_u8 v14, v2 offset:27
-; CHECK-NEXT: ds_read_u8 v15, v2 offset:28
-; CHECK-NEXT: ds_read_u8 v16, v2 offset:29
-; CHECK-NEXT: ds_read_u8 v17, v2 offset:31
-; CHECK-NEXT: ds_read_u8 v18, v2 offset:30
-; CHECK-NEXT: ds_read_u8 v19, v2 offset:16
-; CHECK-NEXT: ds_read_u8 v20, v2 offset:17
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:112
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:113
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:114
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:115
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:116
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:117
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:118
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:119
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v0, s0
; CHECK-NEXT: v_mov_b32_e32 v1, s1
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:31
-; CHECK-NEXT: ds_read_u8 v17, v2 offset:18
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:30
-; CHECK-NEXT: ds_read_u8 v18, v2 offset:19
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:29
-; CHECK-NEXT: ds_read_u8 v16, v2 offset:20
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:28
-; CHECK-NEXT: ds_read_u8 v15, v2 offset:21
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:27
-; CHECK-NEXT: ds_read_u8 v14, v2 offset:22
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:26
-; CHECK-NEXT: ds_read_u8 v13, v2 offset:23
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:112
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:113
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:114
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:115
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:116
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:117
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:118
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:119
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:120
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:121
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:122
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:123
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:124
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:125
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:126
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:127
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:20
-; CHECK-NEXT: ds_read_u8 v16, v2 offset:14
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:19
-; CHECK-NEXT: ds_read_u8 v18, v2 offset:15
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:25
-; CHECK-NEXT: ds_read_u8 v12, v2 offset:8
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:24
-; CHECK-NEXT: ds_read_u8 v11, v2 offset:9
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:23
-; CHECK-NEXT: ds_read_u8 v13, v2 offset:10
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:22
-; CHECK-NEXT: ds_read_u8 v14, v2 offset:11
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:21
-; CHECK-NEXT: ds_read_u8 v15, v2 offset:12
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:18
-; CHECK-NEXT: ds_read_u8 v17, v2 offset:13
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:120
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:121
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:122
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:123
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:124
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:125
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:126
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:127
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:96
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:97
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:98
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:99
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:100
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:101
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:102
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:103
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:14
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:13
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:12
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:11
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:10
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:9
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:8
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:6
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:5
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:4
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:3
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:2
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:1
-; CHECK-NEXT: flat_store_byte v[0:1], v3
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:32
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:33
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:34
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:35
-; CHECK-NEXT: ds_read_u8 v7, v2 offset:36
-; CHECK-NEXT: ds_read_u8 v8, v2 offset:37
-; CHECK-NEXT: ds_read_u8 v9, v2 offset:38
-; CHECK-NEXT: ds_read_u8 v10, v2 offset:39
-; CHECK-NEXT: ds_read_u8 v11, v2 offset:56
-; CHECK-NEXT: ds_read_u8 v12, v2 offset:57
-; CHECK-NEXT: ds_read_u8 v13, v2 offset:58
-; CHECK-NEXT: ds_read_u8 v14, v2 offset:59
-; CHECK-NEXT: ds_read_u8 v15, v2 offset:60
-; CHECK-NEXT: ds_read_u8 v16, v2 offset:63
-; CHECK-NEXT: ds_read_u8 v17, v2 offset:61
-; CHECK-NEXT: ds_read_u8 v18, v2 offset:62
-; CHECK-NEXT: ds_read_u8 v19, v2 offset:48
-; CHECK-NEXT: ds_read_u8 v20, v2 offset:49
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:96
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:97
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:98
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:99
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:100
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:101
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:102
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:103
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:104
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:105
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:106
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:107
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:108
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:109
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:110
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:111
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:63
-; CHECK-NEXT: ds_read_u8 v16, v2 offset:50
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:62
-; CHECK-NEXT: ds_read_u8 v18, v2 offset:51
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:61
-; CHECK-NEXT: ds_read_u8 v17, v2 offset:52
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:60
-; CHECK-NEXT: ds_read_u8 v15, v2 offset:53
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:59
-; CHECK-NEXT: ds_read_u8 v14, v2 offset:54
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:58
-; CHECK-NEXT: ds_read_u8 v13, v2 offset:55
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:104
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:105
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:106
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:107
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:108
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:109
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:110
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:111
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:80
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:81
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:82
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:83
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:84
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:85
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:86
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:87
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:52
-; CHECK-NEXT: ds_read_u8 v17, v2 offset:46
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:51
-; CHECK-NEXT: ds_read_u8 v18, v2 offset:47
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:57
-; CHECK-NEXT: ds_read_u8 v12, v2 offset:40
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:56
-; CHECK-NEXT: ds_read_u8 v11, v2 offset:41
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:55
-; CHECK-NEXT: ds_read_u8 v13, v2 offset:42
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:54
-; CHECK-NEXT: ds_read_u8 v14, v2 offset:43
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:53
-; CHECK-NEXT: ds_read_u8 v15, v2 offset:44
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:50
-; CHECK-NEXT: ds_read_u8 v16, v2 offset:45
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:49
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:48
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:80
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:81
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:82
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:83
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:84
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:85
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:86
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:87
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:88
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:89
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:90
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:91
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:92
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:93
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:94
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:95
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:47
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:46
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:45
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:44
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:43
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:42
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:41
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:40
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:39
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:38
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:37
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:36
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:35
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:34
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:33
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:32
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:88
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:89
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:90
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:91
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:92
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:93
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:94
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:95
; CHECK-NEXT: ds_read_u8 v3, v2 offset:64
; CHECK-NEXT: ds_read_u8 v4, v2 offset:65
; CHECK-NEXT: ds_read_u8 v5, v2 offset:66
@@ -2367,132 +2331,166 @@ define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 {
; CHECK-NEXT: ds_read_u8 v8, v2 offset:69
; CHECK-NEXT: ds_read_u8 v9, v2 offset:70
; CHECK-NEXT: ds_read_u8 v10, v2 offset:71
-; CHECK-NEXT: ds_read_u8 v11, v2 offset:88
-; CHECK-NEXT: ds_read_u8 v12, v2 offset:89
-; CHECK-NEXT: ds_read_u8 v13, v2 offset:90
-; CHECK-NEXT: ds_read_u8 v14, v2 offset:91
-; CHECK-NEXT: ds_read_u8 v15, v2 offset:92
-; CHECK-NEXT: ds_read_u8 v16, v2 offset:95
-; CHECK-NEXT: ds_read_u8 v17, v2 offset:93
-; CHECK-NEXT: ds_read_u8 v18, v2 offset:94
-; CHECK-NEXT: ds_read_u8 v19, v2 offset:80
-; CHECK-NEXT: ds_read_u8 v20, v2 offset:81
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:95
-; CHECK-NEXT: ds_read_u8 v16, v2 offset:82
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:94
-; CHECK-NEXT: ds_read_u8 v18, v2 offset:83
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:93
-; CHECK-NEXT: ds_read_u8 v17, v2 offset:84
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:92
-; CHECK-NEXT: ds_read_u8 v15, v2 offset:85
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:91
-; CHECK-NEXT: ds_read_u8 v14, v2 offset:86
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:90
-; CHECK-NEXT: ds_read_u8 v13, v2 offset:87
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:64
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:65
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:66
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:67
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:68
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:69
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:70
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:71
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:72
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:73
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:74
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:75
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:76
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:77
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:78
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:79
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:84
-; CHECK-NEXT: ds_read_u8 v17, v2 offset:78
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:83
-; CHECK-NEXT: ds_read_u8 v18, v2 offset:79
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:89
-; CHECK-NEXT: ds_read_u8 v12, v2 offset:72
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:88
-; CHECK-NEXT: ds_read_u8 v11, v2 offset:73
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:87
-; CHECK-NEXT: ds_read_u8 v13, v2 offset:74
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:86
-; CHECK-NEXT: ds_read_u8 v14, v2 offset:75
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:85
-; CHECK-NEXT: ds_read_u8 v15, v2 offset:76
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:82
-; CHECK-NEXT: ds_read_u8 v16, v2 offset:77
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:81
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:80
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:72
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:73
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:74
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:75
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:76
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:77
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:78
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:79
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:48
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:49
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:50
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:51
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:52
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:53
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:54
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:55
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:79
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:78
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:77
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:76
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:75
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:74
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:73
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:72
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:71
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:70
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:69
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:68
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:67
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:66
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:65
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:64
-; CHECK-NEXT: ds_read_u8 v3, v2 offset:96
-; CHECK-NEXT: ds_read_u8 v4, v2 offset:97
-; CHECK-NEXT: ds_read_u8 v5, v2 offset:98
-; CHECK-NEXT: ds_read_u8 v6, v2 offset:99
-; CHECK-NEXT: ds_read_u8 v7, v2 offset:100
-; CHECK-NEXT: ds_read_u8 v8, v2 offset:101
-; CHECK-NEXT: ds_read_u8 v9, v2 offset:102
-; CHECK-NEXT: ds_read_u8 v10, v2 offset:103
-; CHECK-NEXT: ds_read_u8 v11, v2 offset:120
-; CHECK-NEXT: ds_read_u8 v12, v2 offset:121
-; CHECK-NEXT: ds_read_u8 v13, v2 offset:122
-; CHECK-NEXT: ds_read_u8 v14, v2 offset:123
-; CHECK-NEXT: ds_read_u8 v15, v2 offset:124
-; CHECK-NEXT: ds_read_u8 v16, v2 offset:127
-; CHECK-NEXT: ds_read_u8 v17, v2 offset:125
-; CHECK-NEXT: ds_read_u8 v18, v2 offset:126
-; CHECK-NEXT: ds_read_u8 v19, v2 offset:112
-; CHECK-NEXT: ds_read_u8 v20, v2 offset:113
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:48
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:49
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:50
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:51
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:52
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:53
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:54
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:55
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:56
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:57
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:58
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:59
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:60
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:61
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:62
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:63
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:127
-; CHECK-NEXT: ds_read_u8 v16, v2 offset:114
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:126
-; CHECK-NEXT: ds_read_u8 v18, v2 offset:115
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:125
-; CHECK-NEXT: ds_read_u8 v17, v2 offset:116
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:124
-; CHECK-NEXT: ds_read_u8 v15, v2 offset:117
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:123
-; CHECK-NEXT: ds_read_u8 v14, v2 offset:118
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:122
-; CHECK-NEXT: ds_read_u8 v13, v2 offset:119
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:121
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:120
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:56
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:57
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:58
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:59
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:60
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:61
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:62
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:63
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:32
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:33
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:34
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:35
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:36
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:37
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:38
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:39
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:118
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:117
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:119
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:116
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:115
-; CHECK-NEXT: ds_read_u8 v12, v2 offset:104
-; CHECK-NEXT: ds_read_u8 v11, v2 offset:105
-; CHECK-NEXT: ds_read_u8 v13, v2 offset:106
-; CHECK-NEXT: ds_read_u8 v14, v2 offset:107
-; CHECK-NEXT: ds_read_u8 v15, v2 offset:108
-; CHECK-NEXT: ds_read_u8 v17, v2 offset:109
-; CHECK-NEXT: ds_read_u8 v18, v2 offset:111
-; CHECK-NEXT: ds_read_u8 v2, v2 offset:110
-; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:114
-; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:113
-; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:112
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:32
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:33
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:34
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:35
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:36
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:37
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:38
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:39
+; CHECK-NEXT: ds_read_u8 v3, v2 offset:40
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:41
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:42
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:43
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:44
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:45
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:46
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:47
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:111
-; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:110
-; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:109
-; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:108
-; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:107
-; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:106
-; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:105
-; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:104
-; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:103
-; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:102
-; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:101
-; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:100
-; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:99
-; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:98
-; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:97
-; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:96
+; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:40
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:41
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:42
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:43
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:44
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:45
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:46
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:47
+; CHECK-NEXT: ds_read_u8 v3, v2
+; CHECK-NEXT: ds_read_u8 v4, v2 offset:1
+; CHECK-NEXT: ds_read_u8 v5, v2 offset:2
+; CHECK-NEXT: ds_read_u8 v6, v2 offset:3
+; CHECK-NEXT: ds_read_u8 v7, v2 offset:4
+; CHECK-NEXT: ds_read_u8 v8, v2 offset:5
+; CHECK-NEXT: ds_read_u8 v9, v2 offset:6
+; CHECK-NEXT: ds_read_u8 v10, v2 offset:7
+; CHECK-NEXT: ds_read_u8 v11, v2 offset:8
+; CHECK-NEXT: ds_read_u8 v12, v2 offset:9
+; CHECK-NEXT: ds_read_u8 v13, v2 offset:10
+; CHECK-NEXT: ds_read_u8 v14, v2 offset:11
+; CHECK-NEXT: ds_read_u8 v15, v2 offset:12
+; CHECK-NEXT: ds_read_u8 v16, v2 offset:13
+; CHECK-NEXT: ds_read_u8 v17, v2 offset:14
+; CHECK-NEXT: ds_read_u8 v18, v2 offset:15
+; CHECK-NEXT: ds_read_u8 v19, v2 offset:16
+; CHECK-NEXT: ds_read_u8 v20, v2 offset:17
+; CHECK-NEXT: ds_read_u8 v21, v2 offset:18
+; CHECK-NEXT: ds_read_u8 v22, v2 offset:19
+; CHECK-NEXT: ds_read_u8 v23, v2 offset:20
+; CHECK-NEXT: ds_read_u8 v24, v2 offset:21
+; CHECK-NEXT: ds_read_u8 v25, v2 offset:22
+; CHECK-NEXT: ds_read_u8 v26, v2 offset:23
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17
+; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:18
+; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:19
+; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:20
+; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:21
+; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:22
+; CHECK-NEXT: flat_store_byte v[0:1], v26 offset:23
+; CHECK-NEXT: ds_read_u8 v19, v2 offset:24
+; CHECK-NEXT: ds_read_u8 v20, v2 offset:25
+; CHECK-NEXT: ds_read_u8 v21, v2 offset:26
+; CHECK-NEXT: ds_read_u8 v22, v2 offset:27
+; CHECK-NEXT: ds_read_u8 v23, v2 offset:28
+; CHECK-NEXT: ds_read_u8 v24, v2 offset:29
+; CHECK-NEXT: ds_read_u8 v25, v2 offset:30
+; CHECK-NEXT: ds_read_u8 v2, v2 offset:31
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:24
+; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:25
+; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:26
+; CHECK-NEXT: flat_store_byte v[0:1], v22 offset:27
+; CHECK-NEXT: flat_store_byte v[0:1], v23 offset:28
+; CHECK-NEXT: flat_store_byte v[0:1], v24 offset:29
+; CHECK-NEXT: flat_store_byte v[0:1], v25 offset:30
+; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:31
+; CHECK-NEXT: flat_store_byte v[0:1], v3
+; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:1
+; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:2
+; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:3
+; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:4
+; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:5
+; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:6
+; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7
+; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8
+; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:9
+; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:10
+; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:11
+; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:12
+; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:13
+; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:14
+; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p0.p3.i64(ptr %generic, ptr addrspace(3) @shared, i64 128, i1 false)
More information about the llvm-commits
mailing list