[llvm] [AMDGPU][CodeGen] Improve handling of memcpy for -Os/-Oz compilations (PR #87632)

Shilei Tian via llvm-commits llvm-commits at lists.llvm.org
Sun Apr 7 08:07:06 PDT 2024


https://github.com/shiltian updated https://github.com/llvm/llvm-project/pull/87632

>From 91c79aa192480bce5cab41d60b5f725723f36c85 Mon Sep 17 00:00:00 2001
From: Shilei Tian <i at tianshilei.me>
Date: Sun, 7 Apr 2024 11:06:54 -0400
Subject: [PATCH] [AMDGPU][CodeGen] Improve handling of memcpy for -Os/-Oz
 compilations

We had some instances when LLVM would not inline fixed-count memcpy and ended up
attempting to lower it a a libcall, which would not work on AMDGPU as the
address space doesn't meet the requirement, causing compiler crash.

The patch relaxes the threshold used for -Os compilation so we're always allowed
to inline memory copy functions.

This patch basically does the same thing as https://reviews.llvm.org/D158226 for
AMDGPU.
---
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |   6 +
 llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll    | 973 ++++++++++++++++++
 2 files changed, 979 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index f283af6fa07d3e..db69d50799e70b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -59,6 +59,12 @@ unsigned AMDGPUTargetLowering::numBitsSigned(SDValue Op, SelectionDAG &DAG) {
 AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
                                            const AMDGPUSubtarget &STI)
     : TargetLowering(TM), Subtarget(&STI) {
+  // Always lower memset, memcpy, and memmove intrinsics to load/store
+  // instructions, rather then generating calls to memset, mempcy or memmove.
+  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = ~0U;
+  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = ~0U;
+  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = ~0U;
+
   // Lower floating point store/load to integer store/load to reduce the number
   // of patterns in tablegen.
   setOperationAction(ISD::LOAD, MVT::f32, Promote);
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
new file mode 100644
index 00000000000000..1a293e1fa95b26
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
@@ -0,0 +1,973 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 %s -o - | FileCheck %s
+
+%struct.S = type { [32 x i32] }
+
+ at shared = addrspace(3) global %struct.S undef, align 4
+
+; Function Attrs: minsize
+define void @_Z12copy_genericPvPKv__minsize(ptr nocapture noundef writeonly %dest, ptr nocapture noundef readonly %src) #0 {
+; CHECK-LABEL: _Z12copy_genericPvPKv__minsize:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:46
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:46
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:45
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:45
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:44
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:44
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:43
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:43
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:42
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:42
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:41
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:41
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:40
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:40
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:39
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:39
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:38
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:38
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:37
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:37
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:36
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:36
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:35
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:35
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:34
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:34
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:33
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:33
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:32
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:32
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:31
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:31
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:30
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:30
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:29
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:29
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:28
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:28
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:27
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:27
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:26
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:26
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:25
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:25
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:24
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:24
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:23
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:23
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:22
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:22
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:21
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:21
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:20
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:20
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:19
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:19
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:18
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:18
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:17
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:17
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:16
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:15
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:15
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:14
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:14
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:13
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:13
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:12
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:12
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:11
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:11
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:10
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:10
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:9
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:9
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:8
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:8
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:7
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:7
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:6
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:6
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:5
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:5
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:4
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:4
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:3
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:3
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:2
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:2
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:1
+; CHECK-NEXT:    flat_load_ubyte v2, v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v2
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  tail call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(47) %dest, ptr noundef nonnull align 1 dereferenceable(47) %src, i64 47, i1 false)
+  ret void
+}
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #1
+
+; Function Attrs: minsize
+define amdgpu_kernel void @_Z11copy_globalPvS__minsize(ptr addrspace(1) nocapture noundef writeonly %dest.coerce, ptr addrspace(1) nocapture noundef readonly %src.coerce) #0 {
+; CHECK-LABEL: _Z11copy_globalPvS__minsize:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3] offset:32
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] offset:32
+; CHECK-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3] offset:39
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] offset:39
+; CHECK-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; CHECK-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; CHECK-NEXT:    s_endpgm
+entry:
+  tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noundef align 1 dereferenceable(47) %dest.coerce, ptr addrspace(1) noundef align 1 dereferenceable(47) %src.coerce, i64 47, i1 false)
+  ret void
+}
+
+; Function Attrs: minsize
+define amdgpu_kernel void @_Z20copy_param_to_globalP1SS__minsize(ptr addrspace(1) nocapture noundef writeonly %global.coerce, ptr addrspace(4) nocapture noundef readonly byref(%struct.S) align 4 %0) #0 {
+; CHECK-LABEL: _Z20copy_param_to_globalP1SS__minsize:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x78
+; CHECK-NEXT:    s_load_dwordx2 s[28:29], s[4:5], 0x0
+; CHECK-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x8
+; CHECK-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x28
+; CHECK-NEXT:    s_load_dwordx4 s[24:27], s[4:5], 0x68
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-NEXT:    v_mov_b32_e32 v2, s2
+; CHECK-NEXT:    v_mov_b32_e32 v3, s3
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x58
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0
+; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x48
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[28:29] offset:112
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, s24
+; CHECK-NEXT:    v_mov_b32_e32 v1, s25
+; CHECK-NEXT:    v_mov_b32_e32 v2, s26
+; CHECK-NEXT:    v_mov_b32_e32 v3, s27
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[28:29] offset:96
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-NEXT:    v_mov_b32_e32 v2, s2
+; CHECK-NEXT:    v_mov_b32_e32 v3, s3
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[28:29] offset:80
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, s4
+; CHECK-NEXT:    v_mov_b32_e32 v1, s5
+; CHECK-NEXT:    v_mov_b32_e32 v2, s6
+; CHECK-NEXT:    v_mov_b32_e32 v3, s7
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[28:29] offset:64
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, s20
+; CHECK-NEXT:    v_mov_b32_e32 v1, s21
+; CHECK-NEXT:    v_mov_b32_e32 v2, s22
+; CHECK-NEXT:    v_mov_b32_e32 v3, s23
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[28:29] offset:48
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, s16
+; CHECK-NEXT:    v_mov_b32_e32 v1, s17
+; CHECK-NEXT:    v_mov_b32_e32 v2, s18
+; CHECK-NEXT:    v_mov_b32_e32 v3, s19
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[28:29] offset:32
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, s12
+; CHECK-NEXT:    v_mov_b32_e32 v1, s13
+; CHECK-NEXT:    v_mov_b32_e32 v2, s14
+; CHECK-NEXT:    v_mov_b32_e32 v3, s15
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[28:29] offset:16
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, s8
+; CHECK-NEXT:    v_mov_b32_e32 v1, s9
+; CHECK-NEXT:    v_mov_b32_e32 v2, s10
+; CHECK-NEXT:    v_mov_b32_e32 v3, s11
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[28:29]
+; CHECK-NEXT:    s_endpgm
+entry:
+  tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef align 4 dereferenceable(128) %global.coerce, ptr addrspace(4) noundef align 4 dereferenceable(128) %0, i64 128, i1 false)
+  ret void
+}
+
+; Function Attrs: minsize
+define amdgpu_kernel void @_Z19copy_param_to_localPU3AS51SS__minsize(ptr addrspace(5) nocapture noundef writeonly %local, ptr addrspace(4) nocapture noundef readonly byref(%struct.S) align 4 %0) #0 {
+; CHECK-LABEL: _Z19copy_param_to_localPU3AS51SS__minsize:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_mov_b64 s[38:39], s[2:3]
+; CHECK-NEXT:    s_mov_b64 s[36:37], s[0:1]
+; CHECK-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x64
+; CHECK-NEXT:    s_load_dword s0, s[4:5], 0x0
+; CHECK-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x4
+; CHECK-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x24
+; CHECK-NEXT:    s_add_u32 s36, s36, s7
+; CHECK-NEXT:    s_addc_u32 s37, s37, 0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, s31
+; CHECK-NEXT:    v_mov_b32_e32 v1, s0
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:124
+; CHECK-NEXT:    v_mov_b32_e32 v0, s30
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:120
+; CHECK-NEXT:    v_mov_b32_e32 v0, s29
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:116
+; CHECK-NEXT:    v_mov_b32_e32 v0, s28
+; CHECK-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x44
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:112
+; CHECK-NEXT:    v_mov_b32_e32 v0, s27
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:108
+; CHECK-NEXT:    v_mov_b32_e32 v0, s26
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:104
+; CHECK-NEXT:    v_mov_b32_e32 v0, s25
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:100
+; CHECK-NEXT:    v_mov_b32_e32 v0, s24
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:96
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, s7
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:92
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:88
+; CHECK-NEXT:    v_mov_b32_e32 v0, s5
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:84
+; CHECK-NEXT:    v_mov_b32_e32 v0, s4
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:80
+; CHECK-NEXT:    v_mov_b32_e32 v0, s3
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:76
+; CHECK-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:72
+; CHECK-NEXT:    v_mov_b32_e32 v0, s1
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:68
+; CHECK-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:64
+; CHECK-NEXT:    v_mov_b32_e32 v0, s23
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:60
+; CHECK-NEXT:    v_mov_b32_e32 v0, s22
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:56
+; CHECK-NEXT:    v_mov_b32_e32 v0, s21
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:52
+; CHECK-NEXT:    v_mov_b32_e32 v0, s20
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:48
+; CHECK-NEXT:    v_mov_b32_e32 v0, s19
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:44
+; CHECK-NEXT:    v_mov_b32_e32 v0, s18
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:40
+; CHECK-NEXT:    v_mov_b32_e32 v0, s17
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:36
+; CHECK-NEXT:    v_mov_b32_e32 v0, s16
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:32
+; CHECK-NEXT:    v_mov_b32_e32 v0, s15
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:28
+; CHECK-NEXT:    v_mov_b32_e32 v0, s14
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:24
+; CHECK-NEXT:    v_mov_b32_e32 v0, s13
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:20
+; CHECK-NEXT:    v_mov_b32_e32 v0, s12
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:16
+; CHECK-NEXT:    v_mov_b32_e32 v0, s11
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:12
+; CHECK-NEXT:    v_mov_b32_e32 v0, s10
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:8
+; CHECK-NEXT:    v_mov_b32_e32 v0, s9
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:4
+; CHECK-NEXT:    v_mov_b32_e32 v0, s8
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen
+; CHECK-NEXT:    s_endpgm
+entry:
+  tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef align 4 dereferenceable(128) %local, ptr addrspace(4) noundef align 4 dereferenceable(128) %0, i64 128, i1 false)
+  ret void
+}
+
+; Function Attrs: minsize
+define void @_Z21copy_local_to_genericP1SPU3AS5S__minsize(ptr nocapture noundef writeonly %generic, ptr addrspace(5) nocapture noundef readonly %src) #0 {
+; CHECK-LABEL: _Z21copy_local_to_genericP1SPU3AS5S__minsize:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:124
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:120
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:116
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:112
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:112
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:108
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:104
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:100
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:96
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:96
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:92
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:88
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:84
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:80
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:80
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:76
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:72
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:68
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:64
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:64
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:32
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:36
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:40
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:44
+; CHECK-NEXT:    buffer_load_dword v11, v2, s[0:3], 0 offen offset:48
+; CHECK-NEXT:    buffer_load_dword v12, v2, s[0:3], 0 offen offset:52
+; CHECK-NEXT:    buffer_load_dword v13, v2, s[0:3], 0 offen offset:56
+; CHECK-NEXT:    buffer_load_dword v14, v2, s[0:3], 0 offen offset:60
+; CHECK-NEXT:    buffer_load_dword v15, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v16, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v17, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v18, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[11:14] offset:48
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[7:10] offset:32
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:16
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[15:18]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  tail call void @llvm.memcpy.p0.p5.i64(ptr noundef nonnull align 4 dereferenceable(128) %generic, ptr addrspace(5) noundef align 4 dereferenceable(128) %src, i64 128, i1 false)
+  ret void
+}
+
+; Function Attrs: minsize
+define amdgpu_kernel void @_Z20copy_param_to_shared1S_minsize(ptr addrspace(4) nocapture noundef readonly byref(%struct.S) align 4 %0) #0 {
+; CHECK-LABEL: _Z20copy_param_to_shared1S_minsize:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x60
+; CHECK-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x40
+; CHECK-NEXT:    v_mov_b32_e32 v8, 0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, s12
+; CHECK-NEXT:    v_mov_b32_e32 v4, s8
+; CHECK-NEXT:    v_mov_b32_e32 v1, s13
+; CHECK-NEXT:    v_mov_b32_e32 v2, s14
+; CHECK-NEXT:    v_mov_b32_e32 v3, s15
+; CHECK-NEXT:    v_mov_b32_e32 v5, s9
+; CHECK-NEXT:    v_mov_b32_e32 v6, s10
+; CHECK-NEXT:    v_mov_b32_e32 v7, s11
+; CHECK-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
+; CHECK-NEXT:    ds_write_b128 v8, v[0:3] offset:112
+; CHECK-NEXT:    ds_write_b128 v8, v[4:7] offset:96
+; CHECK-NEXT:    v_mov_b32_e32 v0, s20
+; CHECK-NEXT:    v_mov_b32_e32 v1, s21
+; CHECK-NEXT:    v_mov_b32_e32 v2, s22
+; CHECK-NEXT:    v_mov_b32_e32 v3, s23
+; CHECK-NEXT:    ds_write_b128 v8, v[0:3] offset:80
+; CHECK-NEXT:    v_mov_b32_e32 v0, s16
+; CHECK-NEXT:    v_mov_b32_e32 v1, s17
+; CHECK-NEXT:    v_mov_b32_e32 v2, s18
+; CHECK-NEXT:    v_mov_b32_e32 v3, s19
+; CHECK-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x0
+; CHECK-NEXT:    ds_write_b128 v8, v[0:3] offset:64
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, s12
+; CHECK-NEXT:    v_mov_b32_e32 v1, s13
+; CHECK-NEXT:    v_mov_b32_e32 v2, s14
+; CHECK-NEXT:    v_mov_b32_e32 v3, s15
+; CHECK-NEXT:    ds_write_b128 v8, v[0:3] offset:48
+; CHECK-NEXT:    v_mov_b32_e32 v0, s8
+; CHECK-NEXT:    v_mov_b32_e32 v1, s9
+; CHECK-NEXT:    v_mov_b32_e32 v2, s10
+; CHECK-NEXT:    v_mov_b32_e32 v3, s11
+; CHECK-NEXT:    ds_write_b128 v8, v[0:3] offset:32
+; CHECK-NEXT:    v_mov_b32_e32 v0, s4
+; CHECK-NEXT:    v_mov_b32_e32 v1, s5
+; CHECK-NEXT:    v_mov_b32_e32 v2, s6
+; CHECK-NEXT:    v_mov_b32_e32 v3, s7
+; CHECK-NEXT:    ds_write_b128 v8, v[0:3] offset:16
+; CHECK-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-NEXT:    v_mov_b32_e32 v2, s2
+; CHECK-NEXT:    v_mov_b32_e32 v3, s3
+; CHECK-NEXT:    ds_write_b128 v8, v[0:3]
+; CHECK-NEXT:    s_endpgm
+entry:
+  tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noundef align 4 dereferenceable(128) @shared, ptr addrspace(4) noundef align 4 dereferenceable(128) %0, i64 128, i1 false)
+  ret void
+}
+
+; Function Attrs: minsize
+define void @_Z22copy_shared_to_genericP1S_minsize(ptr nocapture noundef writeonly %generic) #0 {
+; CHECK-LABEL: _Z22copy_shared_to_genericP1S_minsize:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    ds_read_b128 v[2:5], v0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5] offset:112
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5] offset:96
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5] offset:80
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5] offset:64
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5] offset:48
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5] offset:32
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5] offset:16
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT:    s_trap 2
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  tail call void @llvm.memcpy.p0.p3.i64(ptr noundef nonnull align 4 dereferenceable(128) %generic, ptr addrspace(3) noundef align 4 dereferenceable(128) @shared, i64 128, i1 false)
+  ret void
+}
+
+; Function Attrs: optsize
+define void @_Z12copy_genericPvPKv__optsize(ptr nocapture noundef writeonly %dest, ptr nocapture noundef readonly %src) #2 {
+; CHECK-LABEL: _Z12copy_genericPvPKv__optsize:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:46
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:46
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:45
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:45
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:44
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:44
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:43
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:43
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:42
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:42
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:41
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:41
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:40
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:40
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:39
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:39
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:38
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:38
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:37
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:37
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:36
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:36
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:35
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:35
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:34
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:34
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:33
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:33
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:32
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:32
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:31
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:31
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:30
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:30
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:29
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:29
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:28
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:28
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:27
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:27
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:26
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:26
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:25
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:25
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:24
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:24
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:23
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:23
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:22
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:22
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:21
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:21
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:20
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:20
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:19
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:19
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:18
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:18
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:17
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:17
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:16
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:15
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:15
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:14
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:14
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:13
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:13
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:12
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:12
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:11
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:11
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:10
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:10
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:9
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:9
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:8
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:8
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:7
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:7
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:6
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:6
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:5
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:5
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:4
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:4
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:3
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:3
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:2
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:2
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:1
+; CHECK-NEXT:    flat_load_ubyte v2, v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v2
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  tail call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(47) %dest, ptr noundef nonnull align 1 dereferenceable(47) %src, i64 47, i1 false)
+  ret void
+}
+
+; Function Attrs: optsize
+define amdgpu_kernel void @_Z11copy_globalPvS__optsize(ptr addrspace(1) nocapture noundef writeonly %dest.coerce, ptr addrspace(1) nocapture noundef readonly %src.coerce) #2 {
+; CHECK-LABEL: _Z11copy_globalPvS__optsize:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3] offset:32
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] offset:32
+; CHECK-NEXT:    global_load_dwordx2 v[0:1], v4, s[2:3] offset:39
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1] offset:39
+; CHECK-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; CHECK-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; CHECK-NEXT:    s_endpgm
+entry:
+  tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noundef align 1 dereferenceable(47) %dest.coerce, ptr addrspace(1) noundef align 1 dereferenceable(47) %src.coerce, i64 47, i1 false)
+  ret void
+}
+
+; Function Attrs: optsize
+define amdgpu_kernel void @_Z20copy_param_to_globalP1SS__optsize(ptr addrspace(1) nocapture noundef writeonly %global.coerce, ptr addrspace(4) nocapture noundef readonly byref(%struct.S) align 4 %0) #2 {
+; CHECK-LABEL: _Z20copy_param_to_globalP1SS__optsize:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x78
+; CHECK-NEXT:    s_load_dwordx2 s[28:29], s[4:5], 0x0
+; CHECK-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x8
+; CHECK-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x28
+; CHECK-NEXT:    s_load_dwordx4 s[24:27], s[4:5], 0x68
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-NEXT:    v_mov_b32_e32 v2, s2
+; CHECK-NEXT:    v_mov_b32_e32 v3, s3
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x58
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0
+; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x48
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[28:29] offset:112
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, s24
+; CHECK-NEXT:    v_mov_b32_e32 v1, s25
+; CHECK-NEXT:    v_mov_b32_e32 v2, s26
+; CHECK-NEXT:    v_mov_b32_e32 v3, s27
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[28:29] offset:96
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-NEXT:    v_mov_b32_e32 v2, s2
+; CHECK-NEXT:    v_mov_b32_e32 v3, s3
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[28:29] offset:80
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, s4
+; CHECK-NEXT:    v_mov_b32_e32 v1, s5
+; CHECK-NEXT:    v_mov_b32_e32 v2, s6
+; CHECK-NEXT:    v_mov_b32_e32 v3, s7
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[28:29] offset:64
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, s20
+; CHECK-NEXT:    v_mov_b32_e32 v1, s21
+; CHECK-NEXT:    v_mov_b32_e32 v2, s22
+; CHECK-NEXT:    v_mov_b32_e32 v3, s23
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[28:29] offset:48
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, s16
+; CHECK-NEXT:    v_mov_b32_e32 v1, s17
+; CHECK-NEXT:    v_mov_b32_e32 v2, s18
+; CHECK-NEXT:    v_mov_b32_e32 v3, s19
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[28:29] offset:32
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, s12
+; CHECK-NEXT:    v_mov_b32_e32 v1, s13
+; CHECK-NEXT:    v_mov_b32_e32 v2, s14
+; CHECK-NEXT:    v_mov_b32_e32 v3, s15
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[28:29] offset:16
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, s8
+; CHECK-NEXT:    v_mov_b32_e32 v1, s9
+; CHECK-NEXT:    v_mov_b32_e32 v2, s10
+; CHECK-NEXT:    v_mov_b32_e32 v3, s11
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[28:29]
+; CHECK-NEXT:    s_endpgm
+entry:
+  tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef align 4 dereferenceable(128) %global.coerce, ptr addrspace(4) noundef align 4 dereferenceable(128) %0, i64 128, i1 false)
+  ret void
+}
+
+; Function Attrs: optsize
+define amdgpu_kernel void @_Z19copy_param_to_localPU3AS51SS__optsize(ptr addrspace(5) nocapture noundef writeonly %local, ptr addrspace(4) nocapture noundef readonly byref(%struct.S) align 4 %0) #2 {
+; CHECK-LABEL: _Z19copy_param_to_localPU3AS51SS__optsize:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_mov_b64 s[38:39], s[2:3]
+; CHECK-NEXT:    s_mov_b64 s[36:37], s[0:1]
+; CHECK-NEXT:    s_load_dwordx8 s[24:31], s[4:5], 0x64
+; CHECK-NEXT:    s_load_dword s0, s[4:5], 0x0
+; CHECK-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x4
+; CHECK-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x24
+; CHECK-NEXT:    s_add_u32 s36, s36, s7
+; CHECK-NEXT:    s_addc_u32 s37, s37, 0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, s31
+; CHECK-NEXT:    v_mov_b32_e32 v1, s0
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:124
+; CHECK-NEXT:    v_mov_b32_e32 v0, s30
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:120
+; CHECK-NEXT:    v_mov_b32_e32 v0, s29
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:116
+; CHECK-NEXT:    v_mov_b32_e32 v0, s28
+; CHECK-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x44
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:112
+; CHECK-NEXT:    v_mov_b32_e32 v0, s27
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:108
+; CHECK-NEXT:    v_mov_b32_e32 v0, s26
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:104
+; CHECK-NEXT:    v_mov_b32_e32 v0, s25
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:100
+; CHECK-NEXT:    v_mov_b32_e32 v0, s24
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:96
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, s7
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:92
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:88
+; CHECK-NEXT:    v_mov_b32_e32 v0, s5
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:84
+; CHECK-NEXT:    v_mov_b32_e32 v0, s4
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:80
+; CHECK-NEXT:    v_mov_b32_e32 v0, s3
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:76
+; CHECK-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:72
+; CHECK-NEXT:    v_mov_b32_e32 v0, s1
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:68
+; CHECK-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:64
+; CHECK-NEXT:    v_mov_b32_e32 v0, s23
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:60
+; CHECK-NEXT:    v_mov_b32_e32 v0, s22
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:56
+; CHECK-NEXT:    v_mov_b32_e32 v0, s21
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:52
+; CHECK-NEXT:    v_mov_b32_e32 v0, s20
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:48
+; CHECK-NEXT:    v_mov_b32_e32 v0, s19
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:44
+; CHECK-NEXT:    v_mov_b32_e32 v0, s18
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:40
+; CHECK-NEXT:    v_mov_b32_e32 v0, s17
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:36
+; CHECK-NEXT:    v_mov_b32_e32 v0, s16
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:32
+; CHECK-NEXT:    v_mov_b32_e32 v0, s15
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:28
+; CHECK-NEXT:    v_mov_b32_e32 v0, s14
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:24
+; CHECK-NEXT:    v_mov_b32_e32 v0, s13
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:20
+; CHECK-NEXT:    v_mov_b32_e32 v0, s12
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:16
+; CHECK-NEXT:    v_mov_b32_e32 v0, s11
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:12
+; CHECK-NEXT:    v_mov_b32_e32 v0, s10
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:8
+; CHECK-NEXT:    v_mov_b32_e32 v0, s9
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:4
+; CHECK-NEXT:    v_mov_b32_e32 v0, s8
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen
+; CHECK-NEXT:    s_endpgm
+entry:
+  tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef align 4 dereferenceable(128) %local, ptr addrspace(4) noundef align 4 dereferenceable(128) %0, i64 128, i1 false)
+  ret void
+}
+
+; Function Attrs: optsize
+define void @_Z21copy_local_to_genericP1SPU3AS5S__optsize(ptr nocapture noundef writeonly %generic, ptr addrspace(5) nocapture noundef readonly %src) #2 {
+; CHECK-LABEL: _Z21copy_local_to_genericP1SPU3AS5S__optsize:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:124
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:120
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:116
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:112
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:112
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:108
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:104
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:100
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:96
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:96
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:92
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:88
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:84
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:80
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:80
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:76
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:72
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:68
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:64
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:64
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:32
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:36
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:40
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:44
+; CHECK-NEXT:    buffer_load_dword v11, v2, s[0:3], 0 offen offset:48
+; CHECK-NEXT:    buffer_load_dword v12, v2, s[0:3], 0 offen offset:52
+; CHECK-NEXT:    buffer_load_dword v13, v2, s[0:3], 0 offen offset:56
+; CHECK-NEXT:    buffer_load_dword v14, v2, s[0:3], 0 offen offset:60
+; CHECK-NEXT:    buffer_load_dword v15, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v16, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v17, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v18, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[11:14] offset:48
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[7:10] offset:32
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:16
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[15:18]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  tail call void @llvm.memcpy.p0.p5.i64(ptr noundef nonnull align 4 dereferenceable(128) %generic, ptr addrspace(5) noundef align 4 dereferenceable(128) %src, i64 128, i1 false)
+  ret void
+}
+
+; Function Attrs: optsize
+define amdgpu_kernel void @_Z20copy_param_to_shared1S_optsize(ptr addrspace(4) nocapture noundef readonly byref(%struct.S) align 4 %0) #2 {
+; CHECK-LABEL: _Z20copy_param_to_shared1S_optsize:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x60
+; CHECK-NEXT:    s_load_dwordx8 s[16:23], s[4:5], 0x40
+; CHECK-NEXT:    v_mov_b32_e32 v8, 0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, s12
+; CHECK-NEXT:    v_mov_b32_e32 v4, s8
+; CHECK-NEXT:    v_mov_b32_e32 v1, s13
+; CHECK-NEXT:    v_mov_b32_e32 v2, s14
+; CHECK-NEXT:    v_mov_b32_e32 v3, s15
+; CHECK-NEXT:    v_mov_b32_e32 v5, s9
+; CHECK-NEXT:    v_mov_b32_e32 v6, s10
+; CHECK-NEXT:    v_mov_b32_e32 v7, s11
+; CHECK-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
+; CHECK-NEXT:    ds_write_b128 v8, v[0:3] offset:112
+; CHECK-NEXT:    ds_write_b128 v8, v[4:7] offset:96
+; CHECK-NEXT:    v_mov_b32_e32 v0, s20
+; CHECK-NEXT:    v_mov_b32_e32 v1, s21
+; CHECK-NEXT:    v_mov_b32_e32 v2, s22
+; CHECK-NEXT:    v_mov_b32_e32 v3, s23
+; CHECK-NEXT:    ds_write_b128 v8, v[0:3] offset:80
+; CHECK-NEXT:    v_mov_b32_e32 v0, s16
+; CHECK-NEXT:    v_mov_b32_e32 v1, s17
+; CHECK-NEXT:    v_mov_b32_e32 v2, s18
+; CHECK-NEXT:    v_mov_b32_e32 v3, s19
+; CHECK-NEXT:    s_load_dwordx8 s[0:7], s[4:5], 0x0
+; CHECK-NEXT:    ds_write_b128 v8, v[0:3] offset:64
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, s12
+; CHECK-NEXT:    v_mov_b32_e32 v1, s13
+; CHECK-NEXT:    v_mov_b32_e32 v2, s14
+; CHECK-NEXT:    v_mov_b32_e32 v3, s15
+; CHECK-NEXT:    ds_write_b128 v8, v[0:3] offset:48
+; CHECK-NEXT:    v_mov_b32_e32 v0, s8
+; CHECK-NEXT:    v_mov_b32_e32 v1, s9
+; CHECK-NEXT:    v_mov_b32_e32 v2, s10
+; CHECK-NEXT:    v_mov_b32_e32 v3, s11
+; CHECK-NEXT:    ds_write_b128 v8, v[0:3] offset:32
+; CHECK-NEXT:    v_mov_b32_e32 v0, s4
+; CHECK-NEXT:    v_mov_b32_e32 v1, s5
+; CHECK-NEXT:    v_mov_b32_e32 v2, s6
+; CHECK-NEXT:    v_mov_b32_e32 v3, s7
+; CHECK-NEXT:    ds_write_b128 v8, v[0:3] offset:16
+; CHECK-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-NEXT:    v_mov_b32_e32 v2, s2
+; CHECK-NEXT:    v_mov_b32_e32 v3, s3
+; CHECK-NEXT:    ds_write_b128 v8, v[0:3]
+; CHECK-NEXT:    s_endpgm
+entry:
+  tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noundef align 4 dereferenceable(128) @shared, ptr addrspace(4) noundef align 4 dereferenceable(128) %0, i64 128, i1 false)
+  ret void
+}
+
+; Function Attrs: optsize
+define void @_Z22copy_shared_to_genericP1S_optsize(ptr nocapture noundef writeonly %generic) #2 {
+; CHECK-LABEL: _Z22copy_shared_to_genericP1S_optsize:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    ds_read_b128 v[2:5], v0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5] offset:112
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5] offset:96
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5] offset:80
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5] offset:64
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5] offset:48
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5] offset:32
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5] offset:16
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5]
+; CHECK-NEXT:    s_trap 2
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  tail call void @llvm.memcpy.p0.p3.i64(ptr noundef nonnull align 4 dereferenceable(128) %generic, ptr addrspace(3) noundef align 4 dereferenceable(128) @shared, i64 128, i1 false)
+  ret void
+}
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.memcpy.p0.p5.i64(ptr noalias nocapture writeonly, ptr addrspace(5) noalias nocapture readonly, i64, i1 immarg) #1
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noalias nocapture writeonly, ptr addrspace(1) noalias nocapture readonly, i64, i1 immarg) #1
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) #1
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) #1
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) #1
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.memcpy.p0.p3.i64(ptr noalias nocapture writeonly, ptr addrspace(3) noalias nocapture readonly, i64, i1 immarg) #1
+
+attributes #0 = { minsize }
+attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
+attributes #2 = { optsize }



More information about the llvm-commits mailing list