[llvm] [AMDGPU][CodeGen] Improve handling of memcpy for -Os/-Oz compilations (PR #87632)

Shilei Tian via llvm-commits llvm-commits at lists.llvm.org
Mon Apr 15 09:44:22 PDT 2024


https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/87632

>From 4e38ff28e1303ddf042245a92178d91a32ef1d88 Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Mon, 15 Apr 2024 12:44:09 -0400
Subject: [PATCH] [AMDGPU][CodeGen] Improve handling of memcpy for -Os/-Oz
 compilations

We had some instances where LLVM would not inline fixed-count memcpy and ended up
attempting to lower it as a libcall, which would not work on AMDGPU as the
address space doesn't meet the requirement, causing a compiler crash.

The patch relaxes the threshold used for -Os compilation so we're always allowed
to inline memory copy functions.

This patch basically does the same thing as https://reviews.llvm.org/D158226 for
AMDGPU.
---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |    6 +
 llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll    | 2717 +++++++++++++++++
 2 files changed, 2723 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index f283af6fa07d3e..db69d50799e70b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -59,6 +59,12 @@ unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
 AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
                                            const AMDGPUSubtarget &STI)
     : TargetLowering(TM), Subtarget(&STI) {
+  // Always lower memset, memcpy, and memmove intrinsics to load/store
+  // instructions, rather than generating calls to memset, memcpy or memmove.
+  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = ~0U;
+  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = ~0U;
+  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = ~0U;
+
   // Lower floating point store/load to integer store/load to reduce the number
   // of patterns in tablegen.
   setOperationAction(ISD::LOAD, MVT::f32, Promote);
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
new file mode 100644
index 00000000000000..5a996da7537d80
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
@@ -0,0 +1,2717 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 %s -o - | FileCheck %s
+
+%struct.S = type { [32 x i32] }
+
+ at shared = addrspace(3) global %struct.S undef, align 4
+
+; Function Attrs: minsize
+define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0 {
+; CHECK-LABEL: memcpy_p0_p0_minsize:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1]
+; CHECK-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:1
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:2
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:2
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:3
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:3
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:4
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:4
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:5
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:5
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:6
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:6
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:7
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:7
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:8
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:8
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:9
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:9
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:10
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:10
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:11
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:11
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:12
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:12
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:13
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:13
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:14
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:14
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:15
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:15
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:16
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:17
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:17
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:18
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:18
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:19
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:19
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:20
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:20
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:21
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:21
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:22
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:22
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:23
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:23
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:24
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:24
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:25
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:25
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:26
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:26
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:27
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:27
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:28
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:28
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:29
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:29
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:30
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:30
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:31
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:31
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:32
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:32
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:33
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:33
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:34
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:34
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:35
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:35
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:36
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:36
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:37
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:37
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:38
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:38
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:39
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:39
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:40
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:40
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:41
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:41
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:42
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:42
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:43
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:43
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:44
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:44
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:45
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:45
+; CHECK-NEXT:    flat_load_ubyte v0, v[0:1] offset:46
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v0 offset:46
+; CHECK-NEXT:    s_endpgm
+entry:
+  tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false)
+  ret void
+}
+
+; Function Attrs: minsize
+define amdgpu_kernel void @memcpy_p1_p1_minsize(ptr addrspace(1) %dest, ptr addrspace(1) %src) #0 {
+; CHECK-LABEL: memcpy_p1_p1_minsize:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3] offset:32
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] offset:32
+; CHECK-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3] offset:39
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] offset:39
+; CHECK-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; CHECK-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; CHECK-NEXT:    s_endpgm
+entry:
+  tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 47, i1 false)
+  ret void
+}
+
+; Function Attrs: minsize
+define amdgpu_kernel void @memcpy_p1_p4_minsize(ptr addrspace(1) %global, ptr addrspace(4) %0) #0 {
+; CHECK-LABEL: memcpy_p1_p4_minsize:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; CHECK-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; CHECK-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3] offset:32
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
+; CHECK-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3] offset:48
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
+; CHECK-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3] offset:64
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:64
+; CHECK-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3] offset:80
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:80
+; CHECK-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3] offset:96
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:96
+; CHECK-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3] offset:112
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:112
+; CHECK-NEXT:    s_endpgm
+entry:
+  tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) %global, ptr addrspace(4) %0, i64 128, i1 false)
+  ret void
+}
+
+; Function Attrs: minsize
+define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr addrspace(4) %0) #0 {
+; CHECK-LABEL: memcpy_p5_p4_minsize:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; CHECK-NEXT:    s_mov_b64 s[8:9], s[0:1]
+; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; CHECK-NEXT:    s_load_dword s2, s[4:5], 0x0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    s_add_u32 s8, s8, s7
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1]
+; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:1
+; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:2
+; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:3
+; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:4
+; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:5
+; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:6
+; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:7
+; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:8
+; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:9
+; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:10
+; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:11
+; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:12
+; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:13
+; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:14
+; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1] offset:15
+; CHECK-NEXT:    s_addc_u32 s9, s9, 0
+; CHECK-NEXT:    v_mov_b32_e32 v1, s2
+; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:16
+; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:17
+; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:18
+; CHECK-NEXT:    s_waitcnt vmcnt(18)
+; CHECK-NEXT:    buffer_store_byte v2, v1, s[8:11], 0 offen
+; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:19
+; CHECK-NEXT:    s_waitcnt vmcnt(19)
+; CHECK-NEXT:    buffer_store_byte v3, v1, s[8:11], 0 offen offset:1
+; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:20
+; CHECK-NEXT:    s_waitcnt vmcnt(20)
+; CHECK-NEXT:    buffer_store_byte v4, v1, s[8:11], 0 offen offset:2
+; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:21
+; CHECK-NEXT:    s_waitcnt vmcnt(21)
+; CHECK-NEXT:    buffer_store_byte v5, v1, s[8:11], 0 offen offset:3
+; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:22
+; CHECK-NEXT:    s_waitcnt vmcnt(22)
+; CHECK-NEXT:    buffer_store_byte v6, v1, s[8:11], 0 offen offset:4
+; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:23
+; CHECK-NEXT:    s_waitcnt vmcnt(23)
+; CHECK-NEXT:    buffer_store_byte v7, v1, s[8:11], 0 offen offset:5
+; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:24
+; CHECK-NEXT:    s_waitcnt vmcnt(24)
+; CHECK-NEXT:    buffer_store_byte v8, v1, s[8:11], 0 offen offset:6
+; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:25
+; CHECK-NEXT:    s_waitcnt vmcnt(25)
+; CHECK-NEXT:    buffer_store_byte v9, v1, s[8:11], 0 offen offset:7
+; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:26
+; CHECK-NEXT:    s_waitcnt vmcnt(26)
+; CHECK-NEXT:    buffer_store_byte v10, v1, s[8:11], 0 offen offset:8
+; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:27
+; CHECK-NEXT:    s_waitcnt vmcnt(27)
+; CHECK-NEXT:    buffer_store_byte v11, v1, s[8:11], 0 offen offset:9
+; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:28
+; CHECK-NEXT:    s_waitcnt vmcnt(28)
+; CHECK-NEXT:    buffer_store_byte v12, v1, s[8:11], 0 offen offset:10
+; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:29
+; CHECK-NEXT:    s_waitcnt vmcnt(29)
+; CHECK-NEXT:    buffer_store_byte v13, v1, s[8:11], 0 offen offset:11
+; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:30
+; CHECK-NEXT:    s_waitcnt vmcnt(30)
+; CHECK-NEXT:    buffer_store_byte v14, v1, s[8:11], 0 offen offset:12
+; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:31
+; CHECK-NEXT:    s_waitcnt vmcnt(31)
+; CHECK-NEXT:    buffer_store_byte v15, v1, s[8:11], 0 offen offset:13
+; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:32
+; CHECK-NEXT:    s_waitcnt vmcnt(32)
+; CHECK-NEXT:    buffer_store_byte v16, v1, s[8:11], 0 offen offset:14
+; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:33
+; CHECK-NEXT:    s_waitcnt vmcnt(33)
+; CHECK-NEXT:    buffer_store_byte v17, v1, s[8:11], 0 offen offset:15
+; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1] offset:34
+; CHECK-NEXT:    s_waitcnt vmcnt(34)
+; CHECK-NEXT:    buffer_store_byte v18, v1, s[8:11], 0 offen offset:16
+; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:35
+; CHECK-NEXT:    s_waitcnt vmcnt(35)
+; CHECK-NEXT:    buffer_store_byte v19, v1, s[8:11], 0 offen offset:17
+; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:36
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v20, v1, s[8:11], 0 offen offset:18
+; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:37
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v2, v1, s[8:11], 0 offen offset:19
+; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:38
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v3, v1, s[8:11], 0 offen offset:20
+; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:39
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v4, v1, s[8:11], 0 offen offset:21
+; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:40
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v5, v1, s[8:11], 0 offen offset:22
+; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:41
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v6, v1, s[8:11], 0 offen offset:23
+; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:42
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v7, v1, s[8:11], 0 offen offset:24
+; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:43
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v8, v1, s[8:11], 0 offen offset:25
+; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:44
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v9, v1, s[8:11], 0 offen offset:26
+; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:45
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v10, v1, s[8:11], 0 offen offset:27
+; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:46
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v11, v1, s[8:11], 0 offen offset:28
+; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:47
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v12, v1, s[8:11], 0 offen offset:29
+; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:48
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v13, v1, s[8:11], 0 offen offset:30
+; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:49
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v14, v1, s[8:11], 0 offen offset:31
+; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:50
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v15, v1, s[8:11], 0 offen offset:32
+; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:51
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v16, v1, s[8:11], 0 offen offset:33
+; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:52
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v17, v1, s[8:11], 0 offen offset:34
+; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1] offset:53
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v18, v1, s[8:11], 0 offen offset:35
+; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:54
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v19, v1, s[8:11], 0 offen offset:36
+; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:55
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v20, v1, s[8:11], 0 offen offset:37
+; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:56
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v2, v1, s[8:11], 0 offen offset:38
+; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:57
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v3, v1, s[8:11], 0 offen offset:39
+; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:58
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v4, v1, s[8:11], 0 offen offset:40
+; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:59
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v5, v1, s[8:11], 0 offen offset:41
+; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:60
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v6, v1, s[8:11], 0 offen offset:42
+; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:61
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v7, v1, s[8:11], 0 offen offset:43
+; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:62
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v8, v1, s[8:11], 0 offen offset:44
+; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:63
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v9, v1, s[8:11], 0 offen offset:45
+; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:64
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v10, v1, s[8:11], 0 offen offset:46
+; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:65
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v11, v1, s[8:11], 0 offen offset:47
+; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:66
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v12, v1, s[8:11], 0 offen offset:48
+; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:67
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v13, v1, s[8:11], 0 offen offset:49
+; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:68
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v14, v1, s[8:11], 0 offen offset:50
+; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:69
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v15, v1, s[8:11], 0 offen offset:51
+; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:70
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v16, v1, s[8:11], 0 offen offset:52
+; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:71
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v17, v1, s[8:11], 0 offen offset:53
+; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1] offset:72
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v18, v1, s[8:11], 0 offen offset:54
+; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:73
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v19, v1, s[8:11], 0 offen offset:55
+; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:74
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v20, v1, s[8:11], 0 offen offset:56
+; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:75
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v2, v1, s[8:11], 0 offen offset:57
+; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:76
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v3, v1, s[8:11], 0 offen offset:58
+; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:77
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v4, v1, s[8:11], 0 offen offset:59
+; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:78
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v5, v1, s[8:11], 0 offen offset:60
+; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:79
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v6, v1, s[8:11], 0 offen offset:61
+; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:80
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v7, v1, s[8:11], 0 offen offset:62
+; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:81
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v8, v1, s[8:11], 0 offen offset:63
+; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:82
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v9, v1, s[8:11], 0 offen offset:64
+; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:83
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v10, v1, s[8:11], 0 offen offset:65
+; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:84
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v11, v1, s[8:11], 0 offen offset:66
+; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:85
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v12, v1, s[8:11], 0 offen offset:67
+; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:86
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v13, v1, s[8:11], 0 offen offset:68
+; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:87
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v14, v1, s[8:11], 0 offen offset:69
+; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:88
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v15, v1, s[8:11], 0 offen offset:70
+; CHECK-NEXT:    s_waitcnt vmcnt(35)
+; CHECK-NEXT:    buffer_store_byte v16, v1, s[8:11], 0 offen offset:71
+; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:89
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:90
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v17, v1, s[8:11], 0 offen offset:72
+; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1] offset:91
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v18, v1, s[8:11], 0 offen offset:73
+; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:92
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v19, v1, s[8:11], 0 offen offset:74
+; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:93
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v20, v1, s[8:11], 0 offen offset:75
+; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:94
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v2, v1, s[8:11], 0 offen offset:76
+; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:95
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v3, v1, s[8:11], 0 offen offset:77
+; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:96
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v4, v1, s[8:11], 0 offen offset:78
+; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:97
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v5, v1, s[8:11], 0 offen offset:79
+; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:98
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v6, v1, s[8:11], 0 offen offset:80
+; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:99
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v7, v1, s[8:11], 0 offen offset:81
+; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:100
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v8, v1, s[8:11], 0 offen offset:82
+; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:101
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v9, v1, s[8:11], 0 offen offset:83
+; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:102
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v10, v1, s[8:11], 0 offen offset:84
+; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:103
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v11, v1, s[8:11], 0 offen offset:85
+; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:104
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v12, v1, s[8:11], 0 offen offset:86
+; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:105
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v13, v1, s[8:11], 0 offen offset:87
+; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:106
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v14, v1, s[8:11], 0 offen offset:88
+; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:107
+; CHECK-NEXT:    s_waitcnt vmcnt(35)
+; CHECK-NEXT:    buffer_store_byte v15, v1, s[8:11], 0 offen offset:89
+; CHECK-NEXT:    s_waitcnt vmcnt(35)
+; CHECK-NEXT:    buffer_store_byte v16, v1, s[8:11], 0 offen offset:90
+; CHECK-NEXT:    s_waitcnt vmcnt(34)
+; CHECK-NEXT:    buffer_store_byte v17, v1, s[8:11], 0 offen offset:91
+; CHECK-NEXT:    s_waitcnt vmcnt(33)
+; CHECK-NEXT:    buffer_store_byte v18, v1, s[8:11], 0 offen offset:92
+; CHECK-NEXT:    s_waitcnt vmcnt(32)
+; CHECK-NEXT:    buffer_store_byte v19, v1, s[8:11], 0 offen offset:93
+; CHECK-NEXT:    s_waitcnt vmcnt(31)
+; CHECK-NEXT:    buffer_store_byte v20, v1, s[8:11], 0 offen offset:94
+; CHECK-NEXT:    s_waitcnt vmcnt(30)
+; CHECK-NEXT:    buffer_store_byte v2, v1, s[8:11], 0 offen offset:95
+; CHECK-NEXT:    s_waitcnt vmcnt(29)
+; CHECK-NEXT:    buffer_store_byte v3, v1, s[8:11], 0 offen offset:96
+; CHECK-NEXT:    s_waitcnt vmcnt(28)
+; CHECK-NEXT:    buffer_store_byte v4, v1, s[8:11], 0 offen offset:97
+; CHECK-NEXT:    s_waitcnt vmcnt(27)
+; CHECK-NEXT:    buffer_store_byte v5, v1, s[8:11], 0 offen offset:98
+; CHECK-NEXT:    s_waitcnt vmcnt(26)
+; CHECK-NEXT:    buffer_store_byte v6, v1, s[8:11], 0 offen offset:99
+; CHECK-NEXT:    s_waitcnt vmcnt(25)
+; CHECK-NEXT:    buffer_store_byte v7, v1, s[8:11], 0 offen offset:100
+; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:108
+; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:109
+; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:110
+; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:111
+; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:112
+; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:113
+; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:114
+; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:115
+; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1] offset:116
+; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:117
+; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:118
+; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:119
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v8, v1, s[8:11], 0 offen offset:101
+; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:120
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v9, v1, s[8:11], 0 offen offset:102
+; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:121
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v10, v1, s[8:11], 0 offen offset:103
+; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:122
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v11, v1, s[8:11], 0 offen offset:104
+; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:123
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v12, v1, s[8:11], 0 offen offset:105
+; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:124
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v13, v1, s[8:11], 0 offen offset:106
+; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:125
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v14, v1, s[8:11], 0 offen offset:107
+; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:126
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    global_load_ubyte v21, v0, s[0:1] offset:127
+; CHECK-NEXT:    s_waitcnt vmcnt(26)
+; CHECK-NEXT:    buffer_store_byte v2, v1, s[8:11], 0 offen offset:108
+; CHECK-NEXT:    s_waitcnt vmcnt(26)
+; CHECK-NEXT:    buffer_store_byte v3, v1, s[8:11], 0 offen offset:109
+; CHECK-NEXT:    s_waitcnt vmcnt(26)
+; CHECK-NEXT:    buffer_store_byte v4, v1, s[8:11], 0 offen offset:110
+; CHECK-NEXT:    s_waitcnt vmcnt(26)
+; CHECK-NEXT:    buffer_store_byte v5, v1, s[8:11], 0 offen offset:111
+; CHECK-NEXT:    s_waitcnt vmcnt(26)
+; CHECK-NEXT:    buffer_store_byte v6, v1, s[8:11], 0 offen offset:112
+; CHECK-NEXT:    s_waitcnt vmcnt(26)
+; CHECK-NEXT:    buffer_store_byte v7, v1, s[8:11], 0 offen offset:113
+; CHECK-NEXT:    s_waitcnt vmcnt(26)
+; CHECK-NEXT:    buffer_store_byte v15, v1, s[8:11], 0 offen offset:114
+; CHECK-NEXT:    s_waitcnt vmcnt(26)
+; CHECK-NEXT:    buffer_store_byte v16, v1, s[8:11], 0 offen offset:115
+; CHECK-NEXT:    s_waitcnt vmcnt(26)
+; CHECK-NEXT:    buffer_store_byte v17, v1, s[8:11], 0 offen offset:116
+; CHECK-NEXT:    s_waitcnt vmcnt(26)
+; CHECK-NEXT:    buffer_store_byte v18, v1, s[8:11], 0 offen offset:117
+; CHECK-NEXT:    s_waitcnt vmcnt(26)
+; CHECK-NEXT:    buffer_store_byte v19, v1, s[8:11], 0 offen offset:118
+; CHECK-NEXT:    s_waitcnt vmcnt(26)
+; CHECK-NEXT:    buffer_store_byte v20, v1, s[8:11], 0 offen offset:119
+; CHECK-NEXT:    s_waitcnt vmcnt(25)
+; CHECK-NEXT:    buffer_store_byte v8, v1, s[8:11], 0 offen offset:120
+; CHECK-NEXT:    s_waitcnt vmcnt(24)
+; CHECK-NEXT:    buffer_store_byte v9, v1, s[8:11], 0 offen offset:121
+; CHECK-NEXT:    s_waitcnt vmcnt(23)
+; CHECK-NEXT:    buffer_store_byte v10, v1, s[8:11], 0 offen offset:122
+; CHECK-NEXT:    s_waitcnt vmcnt(22)
+; CHECK-NEXT:    buffer_store_byte v11, v1, s[8:11], 0 offen offset:123
+; CHECK-NEXT:    s_waitcnt vmcnt(21)
+; CHECK-NEXT:    buffer_store_byte v12, v1, s[8:11], 0 offen offset:124
+; CHECK-NEXT:    s_waitcnt vmcnt(20)
+; CHECK-NEXT:    buffer_store_byte v13, v1, s[8:11], 0 offen offset:125
+; CHECK-NEXT:    s_waitcnt vmcnt(19)
+; CHECK-NEXT:    buffer_store_byte v14, v1, s[8:11], 0 offen offset:126
+; CHECK-NEXT:    s_waitcnt vmcnt(19)
+; CHECK-NEXT:    buffer_store_byte v21, v1, s[8:11], 0 offen offset:127
+; CHECK-NEXT:    s_endpgm
+entry:
+  tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false)
+  ret void
+}
+
+; Function Attrs: minsize -- copies 128 bytes from a private (addrspace 5) source to a generic pointer via llvm.memcpy; the checks expect the copy expanded inline as byte loads/stores (offsets 0..127) ending in s_endpgm, with no memcpy libcall.
+define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %src) #0 {
+; CHECK-LABEL: memcpy_p0_p5_minsize:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; CHECK-NEXT:    s_mov_b64 s[8:9], s[0:1]
+; CHECK-NEXT:    s_load_dword s0, s[4:5], 0x8
+; CHECK-NEXT:    s_add_u32 s8, s8, s7
+; CHECK-NEXT:    s_addc_u32 s9, s9, 0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[8:11], 0 offen
+; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:1
+; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:2
+; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:3
+; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:5
+; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:6
+; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:7
+; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:9
+; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:10
+; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:11
+; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:13
+; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:14
+; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:15
+; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:17
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-NEXT:    s_waitcnt vmcnt(17)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3
+; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:18
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:1
+; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:19
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:2
+; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:20
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:3
+; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:21
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:4
+; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:22
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:5
+; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:23
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:6
+; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:24
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:7
+; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:25
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:8
+; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:26
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:9
+; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:27
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:10
+; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:28
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:11
+; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:29
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:12
+; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:30
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:13
+; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:31
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:14
+; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:32
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:15
+; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:33
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:16
+; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:34
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:17
+; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:35
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:18
+; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:36
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:19
+; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:37
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:20
+; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:38
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:21
+; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:39
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:22
+; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:40
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:23
+; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:41
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:24
+; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:42
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:25
+; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:43
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:26
+; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:44
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:27
+; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:45
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:28
+; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:46
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:29
+; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:47
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:30
+; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:48
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:31
+; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:49
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:32
+; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:50
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:33
+; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:51
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:34
+; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:52
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:35
+; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:53
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:36
+; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:54
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:37
+; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:55
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:38
+; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:56
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:39
+; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:57
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:40
+; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:58
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:41
+; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:59
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:42
+; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:60
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:43
+; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:61
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:44
+; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:62
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:45
+; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:63
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:46
+; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:64
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:47
+; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:65
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:48
+; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:66
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:49
+; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:67
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:50
+; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:68
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:51
+; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:69
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:52
+; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:70
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:53
+; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:71
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:54
+; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:72
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:55
+; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:73
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:56
+; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:74
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:57
+; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:75
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:58
+; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:76
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:59
+; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:77
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:60
+; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:78
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:61
+; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:79
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:62
+; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:80
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:63
+; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:81
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:64
+; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:82
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:65
+; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:83
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:66
+; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:84
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:67
+; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:85
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:68
+; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:86
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:69
+; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:87
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:70
+; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:88
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:71
+; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:89
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:72
+; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:90
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:73
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:74
+; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:91
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:92
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:75
+; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:93
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:76
+; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:94
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:77
+; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:95
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:78
+; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:96
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:79
+; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:97
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:80
+; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:98
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:81
+; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:99
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:82
+; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:100
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:83
+; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:101
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:84
+; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:102
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:85
+; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:103
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:86
+; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:104
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:87
+; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:105
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:88
+; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:106
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:89
+; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:107
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:90
+; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:108
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:91
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:92
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:93
+; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:94
+; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:95
+; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:96
+; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:97
+; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:98
+; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:99
+; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:100
+; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:101
+; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:109
+; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:110
+; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:111
+; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:112
+; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:113
+; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:114
+; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:115
+; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:116
+; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:117
+; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:118
+; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:119
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:102
+; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:120
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:103
+; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:121
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:104
+; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:122
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:105
+; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:123
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:106
+; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:124
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:107
+; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:125
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:108
+; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:126
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[8:11], 0 offen offset:127
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:109
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:110
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:111
+; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:112
+; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:113
+; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:114
+; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:115
+; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:116
+; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:117
+; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:118
+; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:119
+; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:120
+; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:121
+; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:122
+; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:123
+; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:124
+; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:125
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:126
+; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:127
+; CHECK-NEXT:    s_endpgm
+entry:
+  tail call void @llvm.memcpy.p0.p5.i64(ptr %generic, ptr addrspace(5) %src, i64 128, i1 false)
+  ret void
+}
+
+; Function Attrs: minsize -- copies 128 bytes from a constant (addrspace 4) source into the addrspace(3) global @shared via llvm.memcpy; the checks expect the copy expanded inline as eight global_load_dwordx4 / ds_write2_b64 pairs ending in s_endpgm, with no memcpy libcall.
+define amdgpu_kernel void @memcpy_p3_p4_minsize(ptr addrspace(4) %0) #0 {
+; CHECK-LABEL: memcpy_p3_p4_minsize:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-NEXT:    v_mov_b32_e32 v24, 0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    global_load_dwordx4 v[0:3], v24, s[0:1] offset:112
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v24, s[0:1] offset:96
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v24, s[0:1] offset:80
+; CHECK-NEXT:    global_load_dwordx4 v[12:15], v24, s[0:1] offset:64
+; CHECK-NEXT:    global_load_dwordx4 v[16:19], v24, s[0:1] offset:48
+; CHECK-NEXT:    global_load_dwordx4 v[20:23], v24, s[0:1] offset:32
+; CHECK-NEXT:    s_waitcnt vmcnt(5)
+; CHECK-NEXT:    ds_write2_b64 v24, v[0:1], v[2:3] offset0:14 offset1:15
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
+; CHECK-NEXT:    ds_write2_b64 v24, v[4:5], v[6:7] offset0:12 offset1:13
+; CHECK-NEXT:    global_load_dwordx4 v[0:3], v24, s[0:1] offset:16
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v24, s[0:1]
+; CHECK-NEXT:    s_waitcnt vmcnt(5)
+; CHECK-NEXT:    ds_write2_b64 v24, v[8:9], v[10:11] offset0:10 offset1:11
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
+; CHECK-NEXT:    ds_write2_b64 v24, v[12:13], v[14:15] offset0:8 offset1:9
+; CHECK-NEXT:    s_waitcnt vmcnt(3)
+; CHECK-NEXT:    ds_write2_b64 v24, v[16:17], v[18:19] offset0:6 offset1:7
+; CHECK-NEXT:    s_waitcnt vmcnt(2)
+; CHECK-NEXT:    ds_write2_b64 v24, v[20:21], v[22:23] offset0:4 offset1:5
+; CHECK-NEXT:    s_waitcnt vmcnt(1)
+; CHECK-NEXT:    ds_write2_b64 v24, v[0:1], v[2:3] offset0:2 offset1:3
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    ds_write2_b64 v24, v[4:5], v[6:7] offset1:1
+; CHECK-NEXT:    s_endpgm
+entry:
+  tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) @shared, ptr addrspace(4) %0, i64 128, i1 false)
+  ret void
+}
+
+; Function Attrs: minsize
+define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 {
+; CHECK-LABEL: memcpy_p0_p3_minsize:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:127
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:126
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:125
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:124
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:127
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:126
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:123
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:125
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:124
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:122
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:121
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:123
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:120
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:119
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:122
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:121
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:118
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:120
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:119
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:117
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:116
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:118
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:115
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:114
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:117
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:116
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:113
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:115
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:114
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:112
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:111
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:113
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:110
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:109
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:112
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:111
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:108
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:110
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:109
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:107
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:106
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:108
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:105
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:104
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:107
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:106
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:103
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:105
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:104
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:102
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:101
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:103
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:100
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:99
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:102
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:101
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:98
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:100
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:99
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:97
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:96
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:98
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:95
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:94
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:97
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:96
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:93
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:95
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:94
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:92
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:91
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:93
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:90
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:89
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:92
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:91
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:88
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:90
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:89
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:87
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:86
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:88
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:85
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:84
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:87
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:86
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:83
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:85
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:84
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:82
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:81
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:83
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:80
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:79
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:82
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:81
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:78
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:80
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:79
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:77
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:76
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:78
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:75
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:74
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:77
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:76
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:73
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:75
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:74
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:72
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:71
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:73
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:70
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:69
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:72
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:71
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:68
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:70
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:69
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:67
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:66
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:68
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:65
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:64
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:67
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:66
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:63
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:65
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:64
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:62
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:61
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:63
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:60
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:59
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:62
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:61
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:58
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:60
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:59
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:57
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:56
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:58
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:55
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:54
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:57
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:56
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:53
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:55
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:54
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:52
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:51
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:53
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:50
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:49
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:52
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:51
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:48
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:50
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:49
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:47
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:46
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:48
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:45
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:44
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:47
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:46
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:43
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:45
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:44
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:42
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:41
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:43
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:40
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:39
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:42
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:41
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:38
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:40
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:39
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:37
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:36
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:38
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:35
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:34
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:37
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:36
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:33
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:35
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:34
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:32
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:31
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:33
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:30
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:29
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:32
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:31
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:28
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:30
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:29
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:27
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:26
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:28
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:25
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:24
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:27
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:26
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:23
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:25
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:24
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:22
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:21
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:23
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:20
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:19
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:22
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:21
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:18
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:20
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:19
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:16
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:17
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:18
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:8
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:9
+; CHECK-NEXT:    ds_read_u8 v7, v2 offset:10
+; CHECK-NEXT:    ds_read_u8 v8, v2 offset:11
+; CHECK-NEXT:    ds_read_u8 v9, v2 offset:12
+; CHECK-NEXT:    ds_read_u8 v10, v2 offset:13
+; CHECK-NEXT:    ds_read_u8 v11, v2 offset:14
+; CHECK-NEXT:    ds_read_u8 v12, v2 offset:15
+; CHECK-NEXT:    ds_read_u8 v13, v2
+; CHECK-NEXT:    ds_read_u8 v14, v2 offset:1
+; CHECK-NEXT:    ds_read_u8 v15, v2 offset:2
+; CHECK-NEXT:    ds_read_u8 v16, v2 offset:3
+; CHECK-NEXT:    ds_read_u8 v17, v2 offset:4
+; CHECK-NEXT:    ds_read_u8 v18, v2 offset:5
+; CHECK-NEXT:    ds_read_u8 v19, v2 offset:6
+; CHECK-NEXT:    ds_read_u8 v2, v2 offset:7
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:17
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:16
+; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:15
+; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:14
+; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:13
+; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:12
+; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:11
+; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:10
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:9
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:8
+; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:7
+; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:6
+; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:5
+; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:4
+; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:3
+; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:2
+; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:1
+; CHECK-NEXT:    flat_store_byte v[0:1], v13
+; CHECK-NEXT:    s_endpgm
+entry:
+  tail call void @llvm.memcpy.p0.p3.i64(ptr %generic, ptr addrspace(3) @shared, i64 128, i1 false)
+  ret void
+}
+
+; Function Attrs: optsize (presumably via #1, defined outside this view). 47-byte p0 -> p0 memcpy; expected output is inline per-byte flat_load_ubyte/flat_store_byte pairs at offsets 0-46.
+define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 {
+; CHECK-LABEL: memcpy_p0_p0_optsize:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1]
+; CHECK-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:1
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:2
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:2
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:3
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:3
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:4
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:4
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:5
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:5
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:6
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:6
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:7
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:7
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:8
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:8
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:9
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:9
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:10
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:10
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:11
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:11
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:12
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:12
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:13
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:13
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:14
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:14
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:15
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:15
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:16
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:17
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:17
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:18
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:18
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:19
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:19
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:20
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:20
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:21
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:21
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:22
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:22
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:23
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:23
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:24
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:24
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:25
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:25
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:26
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:26
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:27
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:27
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:28
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:28
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:29
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:29
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:30
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:30
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:31
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:31
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:32
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:32
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:33
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:33
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:34
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:34
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:35
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:35
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:36
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:36
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:37
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:37
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:38
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:38
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:39
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:39
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:40
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:40
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:41
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:41
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:42
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:42
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:43
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:43
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:44
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:44
+; CHECK-NEXT:    flat_load_ubyte v4, v[0:1] offset:45
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v4 offset:45
+; CHECK-NEXT:    flat_load_ubyte v0, v[0:1] offset:46
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v0 offset:46
+; CHECK-NEXT:    s_endpgm
+entry:
+  tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false)
+  ret void
+}
+
+; Function Attrs: optsize (presumably via #1, defined outside this view). 47-byte p1 -> p1 memcpy; expected output is inline dwordx4 copies at offsets 0 and 16 plus dwordx2 copies at offsets 32 and 39.
+define amdgpu_kernel void @memcpy_p1_p1_optsize(ptr addrspace(1) %dest, ptr addrspace(1) %src) #1 {
+; CHECK-LABEL: memcpy_p1_p1_optsize:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3] offset:32
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] offset:32
+; CHECK-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3] offset:39
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] offset:39
+; CHECK-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; CHECK-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; CHECK-NEXT:    s_endpgm
+entry:
+  tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 47, i1 false)
+  ret void
+}
+
+; Function Attrs: optsize (presumably via #1, defined outside this view). 128-byte p4 -> p1 memcpy; expected output is eight inline global_load_dwordx4/global_store_dwordx4 pairs at offsets 0-112.
+define amdgpu_kernel void @memcpy_p1_p4_optsize(ptr addrspace(1) %global, ptr addrspace(4) %0) #1 {
+; CHECK-LABEL: memcpy_p1_p4_optsize:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; CHECK-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; CHECK-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3] offset:32
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
+; CHECK-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3] offset:48
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
+; CHECK-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3] offset:64
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:64
+; CHECK-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3] offset:80
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:80
+; CHECK-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3] offset:96
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:96
+; CHECK-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3] offset:112
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:112
+; CHECK-NEXT:    s_endpgm
+entry:
+  tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) %global, ptr addrspace(4) %0, i64 128, i1 false)
+  ret void
+}
+
+; Function Attrs: optsize
+define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr addrspace(4) %0) #1 {
+; CHECK-LABEL: memcpy_p5_p4_optsize:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; CHECK-NEXT:    s_mov_b64 s[8:9], s[0:1]
+; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x8
+; CHECK-NEXT:    s_load_dword s2, s[4:5], 0x0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    s_add_u32 s8, s8, s7
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1]
+; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:1
+; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:2
+; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:3
+; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:4
+; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:5
+; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:6
+; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:7
+; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:8
+; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:9
+; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:10
+; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:11
+; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:12
+; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:13
+; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:14
+; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1] offset:15
+; CHECK-NEXT:    s_addc_u32 s9, s9, 0
+; CHECK-NEXT:    v_mov_b32_e32 v1, s2
+; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:16
+; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:17
+; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:18
+; CHECK-NEXT:    s_waitcnt vmcnt(18)
+; CHECK-NEXT:    buffer_store_byte v2, v1, s[8:11], 0 offen
+; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:19
+; CHECK-NEXT:    s_waitcnt vmcnt(19)
+; CHECK-NEXT:    buffer_store_byte v3, v1, s[8:11], 0 offen offset:1
+; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:20
+; CHECK-NEXT:    s_waitcnt vmcnt(20)
+; CHECK-NEXT:    buffer_store_byte v4, v1, s[8:11], 0 offen offset:2
+; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:21
+; CHECK-NEXT:    s_waitcnt vmcnt(21)
+; CHECK-NEXT:    buffer_store_byte v5, v1, s[8:11], 0 offen offset:3
+; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:22
+; CHECK-NEXT:    s_waitcnt vmcnt(22)
+; CHECK-NEXT:    buffer_store_byte v6, v1, s[8:11], 0 offen offset:4
+; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:23
+; CHECK-NEXT:    s_waitcnt vmcnt(23)
+; CHECK-NEXT:    buffer_store_byte v7, v1, s[8:11], 0 offen offset:5
+; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:24
+; CHECK-NEXT:    s_waitcnt vmcnt(24)
+; CHECK-NEXT:    buffer_store_byte v8, v1, s[8:11], 0 offen offset:6
+; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:25
+; CHECK-NEXT:    s_waitcnt vmcnt(25)
+; CHECK-NEXT:    buffer_store_byte v9, v1, s[8:11], 0 offen offset:7
+; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:26
+; CHECK-NEXT:    s_waitcnt vmcnt(26)
+; CHECK-NEXT:    buffer_store_byte v10, v1, s[8:11], 0 offen offset:8
+; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:27
+; CHECK-NEXT:    s_waitcnt vmcnt(27)
+; CHECK-NEXT:    buffer_store_byte v11, v1, s[8:11], 0 offen offset:9
+; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:28
+; CHECK-NEXT:    s_waitcnt vmcnt(28)
+; CHECK-NEXT:    buffer_store_byte v12, v1, s[8:11], 0 offen offset:10
+; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:29
+; CHECK-NEXT:    s_waitcnt vmcnt(29)
+; CHECK-NEXT:    buffer_store_byte v13, v1, s[8:11], 0 offen offset:11
+; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:30
+; CHECK-NEXT:    s_waitcnt vmcnt(30)
+; CHECK-NEXT:    buffer_store_byte v14, v1, s[8:11], 0 offen offset:12
+; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:31
+; CHECK-NEXT:    s_waitcnt vmcnt(31)
+; CHECK-NEXT:    buffer_store_byte v15, v1, s[8:11], 0 offen offset:13
+; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:32
+; CHECK-NEXT:    s_waitcnt vmcnt(32)
+; CHECK-NEXT:    buffer_store_byte v16, v1, s[8:11], 0 offen offset:14
+; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:33
+; CHECK-NEXT:    s_waitcnt vmcnt(33)
+; CHECK-NEXT:    buffer_store_byte v17, v1, s[8:11], 0 offen offset:15
+; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1] offset:34
+; CHECK-NEXT:    s_waitcnt vmcnt(34)
+; CHECK-NEXT:    buffer_store_byte v18, v1, s[8:11], 0 offen offset:16
+; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:35
+; CHECK-NEXT:    s_waitcnt vmcnt(35)
+; CHECK-NEXT:    buffer_store_byte v19, v1, s[8:11], 0 offen offset:17
+; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:36
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v20, v1, s[8:11], 0 offen offset:18
+; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:37
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v2, v1, s[8:11], 0 offen offset:19
+; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:38
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v3, v1, s[8:11], 0 offen offset:20
+; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:39
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v4, v1, s[8:11], 0 offen offset:21
+; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:40
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v5, v1, s[8:11], 0 offen offset:22
+; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:41
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v6, v1, s[8:11], 0 offen offset:23
+; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:42
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v7, v1, s[8:11], 0 offen offset:24
+; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:43
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v8, v1, s[8:11], 0 offen offset:25
+; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:44
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v9, v1, s[8:11], 0 offen offset:26
+; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:45
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v10, v1, s[8:11], 0 offen offset:27
+; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:46
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v11, v1, s[8:11], 0 offen offset:28
+; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:47
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v12, v1, s[8:11], 0 offen offset:29
+; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:48
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v13, v1, s[8:11], 0 offen offset:30
+; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:49
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v14, v1, s[8:11], 0 offen offset:31
+; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:50
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v15, v1, s[8:11], 0 offen offset:32
+; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:51
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v16, v1, s[8:11], 0 offen offset:33
+; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:52
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v17, v1, s[8:11], 0 offen offset:34
+; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1] offset:53
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v18, v1, s[8:11], 0 offen offset:35
+; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:54
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v19, v1, s[8:11], 0 offen offset:36
+; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:55
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v20, v1, s[8:11], 0 offen offset:37
+; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:56
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v2, v1, s[8:11], 0 offen offset:38
+; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:57
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v3, v1, s[8:11], 0 offen offset:39
+; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:58
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v4, v1, s[8:11], 0 offen offset:40
+; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:59
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v5, v1, s[8:11], 0 offen offset:41
+; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:60
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v6, v1, s[8:11], 0 offen offset:42
+; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:61
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v7, v1, s[8:11], 0 offen offset:43
+; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:62
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v8, v1, s[8:11], 0 offen offset:44
+; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:63
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v9, v1, s[8:11], 0 offen offset:45
+; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:64
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v10, v1, s[8:11], 0 offen offset:46
+; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:65
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v11, v1, s[8:11], 0 offen offset:47
+; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:66
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v12, v1, s[8:11], 0 offen offset:48
+; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:67
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v13, v1, s[8:11], 0 offen offset:49
+; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:68
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v14, v1, s[8:11], 0 offen offset:50
+; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:69
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v15, v1, s[8:11], 0 offen offset:51
+; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:70
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v16, v1, s[8:11], 0 offen offset:52
+; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:71
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v17, v1, s[8:11], 0 offen offset:53
+; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1] offset:72
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v18, v1, s[8:11], 0 offen offset:54
+; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:73
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v19, v1, s[8:11], 0 offen offset:55
+; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:74
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v20, v1, s[8:11], 0 offen offset:56
+; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:75
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v2, v1, s[8:11], 0 offen offset:57
+; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:76
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v3, v1, s[8:11], 0 offen offset:58
+; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:77
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v4, v1, s[8:11], 0 offen offset:59
+; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:78
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v5, v1, s[8:11], 0 offen offset:60
+; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:79
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v6, v1, s[8:11], 0 offen offset:61
+; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:80
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v7, v1, s[8:11], 0 offen offset:62
+; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:81
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v8, v1, s[8:11], 0 offen offset:63
+; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:82
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v9, v1, s[8:11], 0 offen offset:64
+; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:83
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v10, v1, s[8:11], 0 offen offset:65
+; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:84
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v11, v1, s[8:11], 0 offen offset:66
+; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:85
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v12, v1, s[8:11], 0 offen offset:67
+; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:86
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v13, v1, s[8:11], 0 offen offset:68
+; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:87
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v14, v1, s[8:11], 0 offen offset:69
+; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:88
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v15, v1, s[8:11], 0 offen offset:70
+; CHECK-NEXT:    s_waitcnt vmcnt(35)
+; CHECK-NEXT:    buffer_store_byte v16, v1, s[8:11], 0 offen offset:71
+; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:89
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:90
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v17, v1, s[8:11], 0 offen offset:72
+; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1] offset:91
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v18, v1, s[8:11], 0 offen offset:73
+; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:92
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v19, v1, s[8:11], 0 offen offset:74
+; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:93
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v20, v1, s[8:11], 0 offen offset:75
+; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:94
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v2, v1, s[8:11], 0 offen offset:76
+; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:95
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v3, v1, s[8:11], 0 offen offset:77
+; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:96
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v4, v1, s[8:11], 0 offen offset:78
+; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:97
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v5, v1, s[8:11], 0 offen offset:79
+; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:98
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v6, v1, s[8:11], 0 offen offset:80
+; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:99
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v7, v1, s[8:11], 0 offen offset:81
+; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:100
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v8, v1, s[8:11], 0 offen offset:82
+; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:101
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v9, v1, s[8:11], 0 offen offset:83
+; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:102
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v10, v1, s[8:11], 0 offen offset:84
+; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:103
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v11, v1, s[8:11], 0 offen offset:85
+; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:104
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v12, v1, s[8:11], 0 offen offset:86
+; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:105
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v13, v1, s[8:11], 0 offen offset:87
+; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:106
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v14, v1, s[8:11], 0 offen offset:88
+; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:107
+; CHECK-NEXT:    s_waitcnt vmcnt(35)
+; CHECK-NEXT:    buffer_store_byte v15, v1, s[8:11], 0 offen offset:89
+; CHECK-NEXT:    s_waitcnt vmcnt(35)
+; CHECK-NEXT:    buffer_store_byte v16, v1, s[8:11], 0 offen offset:90
+; CHECK-NEXT:    s_waitcnt vmcnt(34)
+; CHECK-NEXT:    buffer_store_byte v17, v1, s[8:11], 0 offen offset:91
+; CHECK-NEXT:    s_waitcnt vmcnt(33)
+; CHECK-NEXT:    buffer_store_byte v18, v1, s[8:11], 0 offen offset:92
+; CHECK-NEXT:    s_waitcnt vmcnt(32)
+; CHECK-NEXT:    buffer_store_byte v19, v1, s[8:11], 0 offen offset:93
+; CHECK-NEXT:    s_waitcnt vmcnt(31)
+; CHECK-NEXT:    buffer_store_byte v20, v1, s[8:11], 0 offen offset:94
+; CHECK-NEXT:    s_waitcnt vmcnt(30)
+; CHECK-NEXT:    buffer_store_byte v2, v1, s[8:11], 0 offen offset:95
+; CHECK-NEXT:    s_waitcnt vmcnt(29)
+; CHECK-NEXT:    buffer_store_byte v3, v1, s[8:11], 0 offen offset:96
+; CHECK-NEXT:    s_waitcnt vmcnt(28)
+; CHECK-NEXT:    buffer_store_byte v4, v1, s[8:11], 0 offen offset:97
+; CHECK-NEXT:    s_waitcnt vmcnt(27)
+; CHECK-NEXT:    buffer_store_byte v5, v1, s[8:11], 0 offen offset:98
+; CHECK-NEXT:    s_waitcnt vmcnt(26)
+; CHECK-NEXT:    buffer_store_byte v6, v1, s[8:11], 0 offen offset:99
+; CHECK-NEXT:    s_waitcnt vmcnt(25)
+; CHECK-NEXT:    buffer_store_byte v7, v1, s[8:11], 0 offen offset:100
+; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:108
+; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:109
+; CHECK-NEXT:    global_load_ubyte v4, v0, s[0:1] offset:110
+; CHECK-NEXT:    global_load_ubyte v5, v0, s[0:1] offset:111
+; CHECK-NEXT:    global_load_ubyte v6, v0, s[0:1] offset:112
+; CHECK-NEXT:    global_load_ubyte v7, v0, s[0:1] offset:113
+; CHECK-NEXT:    global_load_ubyte v15, v0, s[0:1] offset:114
+; CHECK-NEXT:    global_load_ubyte v16, v0, s[0:1] offset:115
+; CHECK-NEXT:    global_load_ubyte v17, v0, s[0:1] offset:116
+; CHECK-NEXT:    global_load_ubyte v18, v0, s[0:1] offset:117
+; CHECK-NEXT:    global_load_ubyte v19, v0, s[0:1] offset:118
+; CHECK-NEXT:    global_load_ubyte v20, v0, s[0:1] offset:119
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v8, v1, s[8:11], 0 offen offset:101
+; CHECK-NEXT:    global_load_ubyte v8, v0, s[0:1] offset:120
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v9, v1, s[8:11], 0 offen offset:102
+; CHECK-NEXT:    global_load_ubyte v9, v0, s[0:1] offset:121
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v10, v1, s[8:11], 0 offen offset:103
+; CHECK-NEXT:    global_load_ubyte v10, v0, s[0:1] offset:122
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v11, v1, s[8:11], 0 offen offset:104
+; CHECK-NEXT:    global_load_ubyte v11, v0, s[0:1] offset:123
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v12, v1, s[8:11], 0 offen offset:105
+; CHECK-NEXT:    global_load_ubyte v12, v0, s[0:1] offset:124
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v13, v1, s[8:11], 0 offen offset:106
+; CHECK-NEXT:    global_load_ubyte v13, v0, s[0:1] offset:125
+; CHECK-NEXT:    s_waitcnt vmcnt(36)
+; CHECK-NEXT:    buffer_store_byte v14, v1, s[8:11], 0 offen offset:107
+; CHECK-NEXT:    global_load_ubyte v14, v0, s[0:1] offset:126
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    global_load_ubyte v21, v0, s[0:1] offset:127
+; CHECK-NEXT:    s_waitcnt vmcnt(26)
+; CHECK-NEXT:    buffer_store_byte v2, v1, s[8:11], 0 offen offset:108
+; CHECK-NEXT:    s_waitcnt vmcnt(26)
+; CHECK-NEXT:    buffer_store_byte v3, v1, s[8:11], 0 offen offset:109
+; CHECK-NEXT:    s_waitcnt vmcnt(26)
+; CHECK-NEXT:    buffer_store_byte v4, v1, s[8:11], 0 offen offset:110
+; CHECK-NEXT:    s_waitcnt vmcnt(26)
+; CHECK-NEXT:    buffer_store_byte v5, v1, s[8:11], 0 offen offset:111
+; CHECK-NEXT:    s_waitcnt vmcnt(26)
+; CHECK-NEXT:    buffer_store_byte v6, v1, s[8:11], 0 offen offset:112
+; CHECK-NEXT:    s_waitcnt vmcnt(26)
+; CHECK-NEXT:    buffer_store_byte v7, v1, s[8:11], 0 offen offset:113
+; CHECK-NEXT:    s_waitcnt vmcnt(26)
+; CHECK-NEXT:    buffer_store_byte v15, v1, s[8:11], 0 offen offset:114
+; CHECK-NEXT:    s_waitcnt vmcnt(26)
+; CHECK-NEXT:    buffer_store_byte v16, v1, s[8:11], 0 offen offset:115
+; CHECK-NEXT:    s_waitcnt vmcnt(26)
+; CHECK-NEXT:    buffer_store_byte v17, v1, s[8:11], 0 offen offset:116
+; CHECK-NEXT:    s_waitcnt vmcnt(26)
+; CHECK-NEXT:    buffer_store_byte v18, v1, s[8:11], 0 offen offset:117
+; CHECK-NEXT:    s_waitcnt vmcnt(26)
+; CHECK-NEXT:    buffer_store_byte v19, v1, s[8:11], 0 offen offset:118
+; CHECK-NEXT:    s_waitcnt vmcnt(26)
+; CHECK-NEXT:    buffer_store_byte v20, v1, s[8:11], 0 offen offset:119
+; CHECK-NEXT:    s_waitcnt vmcnt(25)
+; CHECK-NEXT:    buffer_store_byte v8, v1, s[8:11], 0 offen offset:120
+; CHECK-NEXT:    s_waitcnt vmcnt(24)
+; CHECK-NEXT:    buffer_store_byte v9, v1, s[8:11], 0 offen offset:121
+; CHECK-NEXT:    s_waitcnt vmcnt(23)
+; CHECK-NEXT:    buffer_store_byte v10, v1, s[8:11], 0 offen offset:122
+; CHECK-NEXT:    s_waitcnt vmcnt(22)
+; CHECK-NEXT:    buffer_store_byte v11, v1, s[8:11], 0 offen offset:123
+; CHECK-NEXT:    s_waitcnt vmcnt(21)
+; CHECK-NEXT:    buffer_store_byte v12, v1, s[8:11], 0 offen offset:124
+; CHECK-NEXT:    s_waitcnt vmcnt(20)
+; CHECK-NEXT:    buffer_store_byte v13, v1, s[8:11], 0 offen offset:125
+; CHECK-NEXT:    s_waitcnt vmcnt(19)
+; CHECK-NEXT:    buffer_store_byte v14, v1, s[8:11], 0 offen offset:126
+; CHECK-NEXT:    s_waitcnt vmcnt(19)
+; CHECK-NEXT:    buffer_store_byte v21, v1, s[8:11], 0 offen offset:127
+; CHECK-NEXT:    s_endpgm
+entry:
+  tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false)
+  ret void
+}
+
+; Function Attrs: optsize. Test: 128-byte llvm.memcpy from addrspace(5) %src to the default-address-space pointer %generic; the CHECK lines expect an inline byte-wise buffer_load_ubyte/flat_store_byte expansion (offsets 0..127) with no call emitted before s_endpgm.
+define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %src) #1 {
+; CHECK-LABEL: memcpy_p0_p5_optsize:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[2:3]
+; CHECK-NEXT:    s_mov_b64 s[8:9], s[0:1]
+; CHECK-NEXT:    s_load_dword s0, s[4:5], 0x8
+; CHECK-NEXT:    s_add_u32 s8, s8, s7
+; CHECK-NEXT:    s_addc_u32 s9, s9, 0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v2, s0
+; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[8:11], 0 offen
+; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:1
+; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:2
+; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:3
+; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:5
+; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:6
+; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:7
+; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:9
+; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:10
+; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:11
+; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:12
+; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:13
+; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:14
+; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:15
+; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:16
+; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:17
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-NEXT:    s_waitcnt vmcnt(17)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3
+; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:18
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:1
+; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:19
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:2
+; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:20
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:3
+; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:21
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:4
+; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:22
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:5
+; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:23
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:6
+; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:24
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:7
+; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:25
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:8
+; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:26
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:9
+; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:27
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:10
+; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:28
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:11
+; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:29
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:12
+; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:30
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:13
+; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:31
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:14
+; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:32
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:15
+; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:33
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:16
+; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:34
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:17
+; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:35
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:18
+; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:36
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:19
+; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:37
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:20
+; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:38
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:21
+; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:39
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:22
+; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:40
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:23
+; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:41
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:24
+; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:42
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:25
+; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:43
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:26
+; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:44
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:27
+; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:45
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:28
+; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:46
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:29
+; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:47
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:30
+; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:48
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:31
+; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:49
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:32
+; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:50
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:33
+; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:51
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:34
+; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:52
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:35
+; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:53
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:36
+; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:54
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:37
+; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:55
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:38
+; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:56
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:39
+; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:57
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:40
+; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:58
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:41
+; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:59
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:42
+; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:60
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:43
+; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:61
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:44
+; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:62
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:45
+; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:63
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:46
+; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:64
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:47
+; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:65
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:48
+; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:66
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:49
+; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:67
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:50
+; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:68
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:51
+; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:69
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:52
+; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:70
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:53
+; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:71
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:54
+; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:72
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:55
+; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:73
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:56
+; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:74
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:57
+; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:75
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:58
+; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:76
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:59
+; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:77
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:60
+; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:78
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:61
+; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:79
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:62
+; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:80
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:63
+; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:81
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:64
+; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:82
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:65
+; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:83
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:66
+; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:84
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:67
+; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:85
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:68
+; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:86
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:69
+; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:87
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:70
+; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:88
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:71
+; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:89
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:72
+; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:90
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:73
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:74
+; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:91
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:92
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:75
+; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:93
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:76
+; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:94
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:77
+; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:95
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:78
+; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:96
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:79
+; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:97
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:80
+; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:98
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:81
+; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:99
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:82
+; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:100
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:83
+; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:101
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:84
+; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:102
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:85
+; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:103
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:86
+; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:104
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:87
+; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:105
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:88
+; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:106
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:89
+; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:107
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:90
+; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:108
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:91
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:92
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:93
+; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:94
+; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:95
+; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:96
+; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:97
+; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:98
+; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:99
+; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:100
+; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:101
+; CHECK-NEXT:    buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:109
+; CHECK-NEXT:    buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:110
+; CHECK-NEXT:    buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:111
+; CHECK-NEXT:    buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:112
+; CHECK-NEXT:    buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:113
+; CHECK-NEXT:    buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:114
+; CHECK-NEXT:    buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:115
+; CHECK-NEXT:    buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:116
+; CHECK-NEXT:    buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:117
+; CHECK-NEXT:    buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:118
+; CHECK-NEXT:    buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:119
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:102
+; CHECK-NEXT:    buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:120
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:103
+; CHECK-NEXT:    buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:121
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:104
+; CHECK-NEXT:    buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:122
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:105
+; CHECK-NEXT:    buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:123
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:106
+; CHECK-NEXT:    buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:124
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:107
+; CHECK-NEXT:    buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:125
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:108
+; CHECK-NEXT:    buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:126
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    buffer_load_ubyte v21, v2, s[8:11], 0 offen offset:127
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:109
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:110
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:111
+; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:112
+; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:113
+; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:114
+; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:115
+; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:116
+; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:117
+; CHECK-NEXT:    flat_store_byte v[0:1], v13 offset:118
+; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:119
+; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:120
+; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:121
+; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:122
+; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:123
+; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:124
+; CHECK-NEXT:    flat_store_byte v[0:1], v20 offset:125
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:126
+; CHECK-NEXT:    flat_store_byte v[0:1], v21 offset:127
+; CHECK-NEXT:    s_endpgm
+entry:
+  tail call void @llvm.memcpy.p0.p5.i64(ptr %generic, ptr addrspace(5) %src, i64 128, i1 false)
+  ret void
+}
+
+; Function Attrs: optsize. Test: 128-byte llvm.memcpy from addrspace(4) %0 to addrspace(3) @shared; the CHECK lines expect an inline expansion of eight global_load_dwordx4 loads and eight ds_write2_b64 stores (16 bytes each) with no call emitted before s_endpgm.
+define amdgpu_kernel void @memcpy_p3_p4_optsize(ptr addrspace(4) %0) #1 {
+; CHECK-LABEL: memcpy_p3_p4_optsize:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-NEXT:    v_mov_b32_e32 v24, 0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    global_load_dwordx4 v[0:3], v24, s[0:1] offset:112
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v24, s[0:1] offset:96
+; CHECK-NEXT:    global_load_dwordx4 v[8:11], v24, s[0:1] offset:80
+; CHECK-NEXT:    global_load_dwordx4 v[12:15], v24, s[0:1] offset:64
+; CHECK-NEXT:    global_load_dwordx4 v[16:19], v24, s[0:1] offset:48
+; CHECK-NEXT:    global_load_dwordx4 v[20:23], v24, s[0:1] offset:32
+; CHECK-NEXT:    s_waitcnt vmcnt(5)
+; CHECK-NEXT:    ds_write2_b64 v24, v[0:1], v[2:3] offset0:14 offset1:15
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
+; CHECK-NEXT:    ds_write2_b64 v24, v[4:5], v[6:7] offset0:12 offset1:13
+; CHECK-NEXT:    global_load_dwordx4 v[0:3], v24, s[0:1] offset:16
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v24, s[0:1]
+; CHECK-NEXT:    s_waitcnt vmcnt(5)
+; CHECK-NEXT:    ds_write2_b64 v24, v[8:9], v[10:11] offset0:10 offset1:11
+; CHECK-NEXT:    s_waitcnt vmcnt(4)
+; CHECK-NEXT:    ds_write2_b64 v24, v[12:13], v[14:15] offset0:8 offset1:9
+; CHECK-NEXT:    s_waitcnt vmcnt(3)
+; CHECK-NEXT:    ds_write2_b64 v24, v[16:17], v[18:19] offset0:6 offset1:7
+; CHECK-NEXT:    s_waitcnt vmcnt(2)
+; CHECK-NEXT:    ds_write2_b64 v24, v[20:21], v[22:23] offset0:4 offset1:5
+; CHECK-NEXT:    s_waitcnt vmcnt(1)
+; CHECK-NEXT:    ds_write2_b64 v24, v[0:1], v[2:3] offset0:2 offset1:3
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    ds_write2_b64 v24, v[4:5], v[6:7] offset1:1
+; CHECK-NEXT:    s_endpgm
+entry:
+  tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) @shared, ptr addrspace(4) %0, i64 128, i1 false)
+  ret void
+}
+
+; Function Attrs: optsize. Copies 128 bytes from @shared (addrspace(3)) to the generic pointer %generic; the checks expect the copy expanded into ds_read_u8/flat_store_byte pairs with no call emitted.
+define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 {
+; CHECK-LABEL: memcpy_p0_p3_optsize:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:127
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:126
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:125
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:124
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:127
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:126
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:123
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:125
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:124
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:122
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:121
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:123
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:120
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:119
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:122
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:121
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:118
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:120
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:119
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:117
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:116
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:118
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:115
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:114
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:117
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:116
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:113
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:115
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:114
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:112
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:111
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:113
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:110
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:109
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:112
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:111
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:108
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:110
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:109
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:107
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:106
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:108
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:105
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:104
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:107
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:106
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:103
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:105
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:104
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:102
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:101
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:103
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:100
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:99
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:102
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:101
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:98
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:100
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:99
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:97
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:96
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:98
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:95
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:94
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:97
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:96
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:93
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:95
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:94
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:92
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:91
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:93
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:90
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:89
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:92
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:91
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:88
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:90
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:89
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:87
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:86
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:88
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:85
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:84
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:87
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:86
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:83
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:85
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:84
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:82
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:81
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:83
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:80
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:79
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:82
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:81
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:78
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:80
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:79
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:77
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:76
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:78
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:75
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:74
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:77
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:76
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:73
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:75
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:74
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:72
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:71
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:73
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:70
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:69
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:72
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:71
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:68
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:70
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:69
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:67
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:66
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:68
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:65
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:64
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:67
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:66
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:63
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:65
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:64
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:62
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:61
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:63
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:60
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:59
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:62
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:61
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:58
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:60
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:59
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:57
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:56
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:58
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:55
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:54
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:57
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:56
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:53
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:55
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:54
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:52
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:51
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:53
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:50
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:49
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:52
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:51
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:48
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:50
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:49
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:47
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:46
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:48
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:45
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:44
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:47
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:46
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:43
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:45
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:44
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:42
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:41
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:43
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:40
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:39
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:42
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:41
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:38
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:40
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:39
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:37
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:36
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:38
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:35
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:34
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:37
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:36
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:33
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:35
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:34
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:32
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:31
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:33
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:30
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:29
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:32
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:31
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:28
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:30
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:29
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:27
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:26
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:28
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:25
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:24
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:27
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:26
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:23
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:25
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:24
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:22
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:21
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:23
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:20
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:19
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:22
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:21
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:18
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:20
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:19
+; CHECK-NEXT:    ds_read_u8 v3, v2 offset:16
+; CHECK-NEXT:    ds_read_u8 v5, v2 offset:17
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:18
+; CHECK-NEXT:    ds_read_u8 v4, v2 offset:8
+; CHECK-NEXT:    ds_read_u8 v6, v2 offset:9
+; CHECK-NEXT:    ds_read_u8 v7, v2 offset:10
+; CHECK-NEXT:    ds_read_u8 v8, v2 offset:11
+; CHECK-NEXT:    ds_read_u8 v9, v2 offset:12
+; CHECK-NEXT:    ds_read_u8 v10, v2 offset:13
+; CHECK-NEXT:    ds_read_u8 v11, v2 offset:14
+; CHECK-NEXT:    ds_read_u8 v12, v2 offset:15
+; CHECK-NEXT:    ds_read_u8 v13, v2
+; CHECK-NEXT:    ds_read_u8 v14, v2 offset:1
+; CHECK-NEXT:    ds_read_u8 v15, v2 offset:2
+; CHECK-NEXT:    ds_read_u8 v16, v2 offset:3
+; CHECK-NEXT:    ds_read_u8 v17, v2 offset:4
+; CHECK-NEXT:    ds_read_u8 v18, v2 offset:5
+; CHECK-NEXT:    ds_read_u8 v19, v2 offset:6
+; CHECK-NEXT:    ds_read_u8 v2, v2 offset:7
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v5 offset:17
+; CHECK-NEXT:    flat_store_byte v[0:1], v3 offset:16
+; CHECK-NEXT:    flat_store_byte v[0:1], v12 offset:15
+; CHECK-NEXT:    flat_store_byte v[0:1], v11 offset:14
+; CHECK-NEXT:    flat_store_byte v[0:1], v10 offset:13
+; CHECK-NEXT:    flat_store_byte v[0:1], v9 offset:12
+; CHECK-NEXT:    flat_store_byte v[0:1], v8 offset:11
+; CHECK-NEXT:    flat_store_byte v[0:1], v7 offset:10
+; CHECK-NEXT:    flat_store_byte v[0:1], v6 offset:9
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:8
+; CHECK-NEXT:    flat_store_byte v[0:1], v2 offset:7
+; CHECK-NEXT:    flat_store_byte v[0:1], v19 offset:6
+; CHECK-NEXT:    flat_store_byte v[0:1], v18 offset:5
+; CHECK-NEXT:    flat_store_byte v[0:1], v17 offset:4
+; CHECK-NEXT:    flat_store_byte v[0:1], v16 offset:3
+; CHECK-NEXT:    flat_store_byte v[0:1], v15 offset:2
+; CHECK-NEXT:    flat_store_byte v[0:1], v14 offset:1
+; CHECK-NEXT:    flat_store_byte v[0:1], v13
+; CHECK-NEXT:    s_endpgm
+entry:
+  tail call void @llvm.memcpy.p0.p3.i64(ptr %generic, ptr addrspace(3) @shared, i64 128, i1 false)
+  ret void
+}
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #2
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.memcpy.p0.p5.i64(ptr noalias nocapture writeonly, ptr addrspace(5) noalias nocapture readonly, i64, i1 immarg) #2
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noalias nocapture writeonly, ptr addrspace(1) noalias nocapture readonly, i64, i1 immarg) #2
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) #2
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) #2
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) #2
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.memcpy.p0.p3.i64(ptr noalias nocapture writeonly, ptr addrspace(3) noalias nocapture readonly, i64, i1 immarg) #2
+
+attributes #0 = { minsize }
+attributes #1 = { optsize }
+attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }



More information about the llvm-commits mailing list