[llvm] [AMDGPU][CodeGen] Improve handling of memcpy for -Os/-Oz compilations (PR #87632)

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Sat Apr 6 12:37:13 PDT 2024


================
@@ -0,0 +1,642 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx908 %s -o - | FileCheck %s
+
+%struct.S = type { [32 x i32] } ; 32 x i32 = 128 bytes, the size used by the 128-byte copies below
+
+@shared = local_unnamed_addr addrspace(3) global %struct.S undef, align 4 ; addrspace(3) global used as a copy source and destination below
+
+define dso_local void @_Z12copy_genericPvPKv(ptr nocapture noundef writeonly %dest, ptr nocapture noundef readonly %src) local_unnamed_addr #0 {
+; CHECK-LABEL: _Z12copy_genericPvPKv:
+; CHECK:       _Z12copy_genericPvPKv$local:
+; CHECK-NEXT:    .type _Z12copy_genericPvPKv$local,@function
+; CHECK-NEXT:  ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:46
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:46
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:45
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:45
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:44
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:44
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:43
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:43
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:42
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:42
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:41
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:41
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:40
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:40
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:39
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:39
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:38
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:38
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:37
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:37
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:36
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:36
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:35
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:35
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:34
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:34
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:33
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:33
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:32
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:32
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:31
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:31
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:30
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:30
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:29
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:29
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:28
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:28
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:27
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:27
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:26
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:26
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:25
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:25
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:24
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:24
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:23
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:23
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:22
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:22
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:21
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:21
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:20
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:20
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:19
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:19
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:18
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:18
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:17
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:17
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:16
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:15
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:15
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:14
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:14
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:13
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:13
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:12
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:12
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:11
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:11
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:10
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:10
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:9
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:9
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:8
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:8
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:7
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:7
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:6
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:6
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:5
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:5
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:4
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:4
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:3
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:3
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:2
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:2
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:1
+; CHECK-NEXT:    flat_load_ubyte v2, v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v2
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry: ; 47-byte, align-1 copy between two addrspace(0) pointers; the checks expect one byte load/store pair per byte
+  tail call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(47) %dest, ptr noundef nonnull align 1 dereferenceable(47) %src, i64 47, i1 false)
+  ret void
+}
+
+declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #0 ; memcpy intrinsic, addrspace(0) destination and source; called by @_Z12copy_genericPvPKv
+
+define dso_local amdgpu_kernel void @_Z11copy_globalPvS_(ptr addrspace(1) nocapture noundef writeonly %dest.coerce, ptr addrspace(1) nocapture noundef readonly %src.coerce) local_unnamed_addr #0 {
+; CHECK-LABEL: _Z11copy_globalPvS_:
+; CHECK:       _Z11copy_globalPvS_$local:
+; CHECK-NEXT:    .type _Z11copy_globalPvS_$local,@function
+; CHECK-NEXT:  ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1]
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:1
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:1
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:2
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:2
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:3
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:3
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:4
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:4
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:5
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:5
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:6
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:6
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:7
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:7
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:8
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:8
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:9
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:9
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:10
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:10
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:11
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:11
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:12
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:12
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:13
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:13
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:14
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:14
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:15
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:15
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:16
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:17
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:17
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:18
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:18
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:19
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:19
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:20
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:20
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:21
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:21
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:22
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:22
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:23
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:23
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:24
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:24
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:25
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:25
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:26
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:26
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:27
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:27
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:28
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:28
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:29
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:29
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:30
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:30
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:31
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:31
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:32
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:32
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:33
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:33
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:34
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:34
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:35
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:35
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:36
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:36
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:37
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:37
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:38
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:38
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:39
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:39
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:40
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:40
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:41
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:41
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:42
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:42
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:43
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:43
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:44
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:44
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:45
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:45
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[2:3] offset:46
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_byte v0, v1, s[0:1] offset:46
+; CHECK-NEXT:    s_endpgm
+entry: ; 47-byte, align-1 copy between two addrspace(1) pointers; the checks expect one byte load/store pair per byte
+  tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noundef align 1 dereferenceable(47) %dest.coerce, ptr addrspace(1) noundef align 1 dereferenceable(47) %src.coerce, i64 47, i1 false)
+  ret void
+}
+
+define dso_local amdgpu_kernel void @_Z20copy_param_to_globalP1SS_(ptr addrspace(1) nocapture noundef writeonly %global.coerce, ptr addrspace(4) nocapture noundef readonly byref(%struct.S) align 4 %0) local_unnamed_addr #0 {
+; CHECK-LABEL: _Z20copy_param_to_globalP1SS_:
+; CHECK:       _Z20copy_param_to_globalP1SS_$local:
+; CHECK-NEXT:    .type _Z20copy_param_to_globalP1SS_$local,@function
+; CHECK-NEXT:  ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_dwordx4 s[20:23], s[0:1], 0x9c
+; CHECK-NEXT:    s_load_dwordx2 s[28:29], s[0:1], 0x24
+; CHECK-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x2c
+; CHECK-NEXT:    s_load_dwordx8 s[12:19], s[0:1], 0x4c
+; CHECK-NEXT:    s_load_dwordx4 s[24:27], s[0:1], 0x8c
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, s20
+; CHECK-NEXT:    v_mov_b32_e32 v1, s21
+; CHECK-NEXT:    v_mov_b32_e32 v2, s22
+; CHECK-NEXT:    v_mov_b32_e32 v3, s23
+; CHECK-NEXT:    s_load_dwordx4 s[20:23], s[0:1], 0x7c
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x6c
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[28:29] offset:112
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, s24
+; CHECK-NEXT:    v_mov_b32_e32 v1, s25
+; CHECK-NEXT:    v_mov_b32_e32 v2, s26
+; CHECK-NEXT:    v_mov_b32_e32 v3, s27
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[28:29] offset:96
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, s20
+; CHECK-NEXT:    v_mov_b32_e32 v1, s21
+; CHECK-NEXT:    v_mov_b32_e32 v2, s22
+; CHECK-NEXT:    v_mov_b32_e32 v3, s23
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[28:29] offset:80
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-NEXT:    v_mov_b32_e32 v2, s2
+; CHECK-NEXT:    v_mov_b32_e32 v3, s3
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[28:29] offset:64
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, s16
+; CHECK-NEXT:    v_mov_b32_e32 v1, s17
+; CHECK-NEXT:    v_mov_b32_e32 v2, s18
+; CHECK-NEXT:    v_mov_b32_e32 v3, s19
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[28:29] offset:48
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, s12
+; CHECK-NEXT:    v_mov_b32_e32 v1, s13
+; CHECK-NEXT:    v_mov_b32_e32 v2, s14
+; CHECK-NEXT:    v_mov_b32_e32 v3, s15
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[28:29] offset:32
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, s8
+; CHECK-NEXT:    v_mov_b32_e32 v1, s9
+; CHECK-NEXT:    v_mov_b32_e32 v2, s10
+; CHECK-NEXT:    v_mov_b32_e32 v3, s11
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[28:29] offset:16
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, s4
+; CHECK-NEXT:    v_mov_b32_e32 v1, s5
+; CHECK-NEXT:    v_mov_b32_e32 v2, s6
+; CHECK-NEXT:    v_mov_b32_e32 v3, s7
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[28:29]
+; CHECK-NEXT:    s_endpgm
+entry: ; 128-byte, align-4 copy from the byref addrspace(4) argument into an addrspace(1) pointer
+  tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noundef align 4 dereferenceable(128) %global.coerce, ptr addrspace(4) noundef align 4 dereferenceable(128) %0, i64 128, i1 false)
+  ret void
+}
+
+define dso_local amdgpu_kernel void @_Z19copy_param_to_localPU3AS51SS_(ptr addrspace(5) nocapture noundef writeonly %local, ptr addrspace(4) nocapture noundef readonly byref(%struct.S) align 4 %0) local_unnamed_addr #0 {
+; CHECK-LABEL: _Z19copy_param_to_localPU3AS51SS_:
+; CHECK:       _Z19copy_param_to_localPU3AS51SS_$local:
+; CHECK-NEXT:    .type _Z19copy_param_to_localPU3AS51SS_$local,@function
+; CHECK-NEXT:  ; %bb.0: ; %entry
+; CHECK-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; CHECK-NEXT:    s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; CHECK-NEXT:    s_mov_b32 s38, -1
+; CHECK-NEXT:    s_load_dwordx8 s[20:27], s[0:1], 0x88
+; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x24
+; CHECK-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x28
+; CHECK-NEXT:    s_load_dwordx8 s[12:19], s[0:1], 0x48
+; CHECK-NEXT:    s_mov_b32 s39, 0xe00000
+; CHECK-NEXT:    s_add_u32 s36, s36, s3
+; CHECK-NEXT:    s_addc_u32 s37, s37, 0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, s27
+; CHECK-NEXT:    v_mov_b32_e32 v1, s2
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:124
+; CHECK-NEXT:    v_mov_b32_e32 v0, s26
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:120
+; CHECK-NEXT:    v_mov_b32_e32 v0, s25
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:116
+; CHECK-NEXT:    v_mov_b32_e32 v0, s24
+; CHECK-NEXT:    s_load_dwordx8 s[24:31], s[0:1], 0x68
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:112
+; CHECK-NEXT:    v_mov_b32_e32 v0, s23
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:108
+; CHECK-NEXT:    v_mov_b32_e32 v0, s22
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:104
+; CHECK-NEXT:    v_mov_b32_e32 v0, s21
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:100
+; CHECK-NEXT:    v_mov_b32_e32 v0, s20
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:96
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, s31
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:92
+; CHECK-NEXT:    v_mov_b32_e32 v0, s30
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:88
+; CHECK-NEXT:    v_mov_b32_e32 v0, s29
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:84
+; CHECK-NEXT:    v_mov_b32_e32 v0, s28
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:80
+; CHECK-NEXT:    v_mov_b32_e32 v0, s27
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:76
+; CHECK-NEXT:    v_mov_b32_e32 v0, s26
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:72
+; CHECK-NEXT:    v_mov_b32_e32 v0, s25
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:68
+; CHECK-NEXT:    v_mov_b32_e32 v0, s24
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:64
+; CHECK-NEXT:    v_mov_b32_e32 v0, s19
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:60
+; CHECK-NEXT:    v_mov_b32_e32 v0, s18
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:56
+; CHECK-NEXT:    v_mov_b32_e32 v0, s17
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:52
+; CHECK-NEXT:    v_mov_b32_e32 v0, s16
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:48
+; CHECK-NEXT:    v_mov_b32_e32 v0, s15
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:44
+; CHECK-NEXT:    v_mov_b32_e32 v0, s14
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:40
+; CHECK-NEXT:    v_mov_b32_e32 v0, s13
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:36
+; CHECK-NEXT:    v_mov_b32_e32 v0, s12
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:32
+; CHECK-NEXT:    v_mov_b32_e32 v0, s11
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:28
+; CHECK-NEXT:    v_mov_b32_e32 v0, s10
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:24
+; CHECK-NEXT:    v_mov_b32_e32 v0, s9
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:20
+; CHECK-NEXT:    v_mov_b32_e32 v0, s8
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:16
+; CHECK-NEXT:    v_mov_b32_e32 v0, s7
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:12
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:8
+; CHECK-NEXT:    v_mov_b32_e32 v0, s5
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen offset:4
+; CHECK-NEXT:    v_mov_b32_e32 v0, s4
+; CHECK-NEXT:    buffer_store_dword v0, v1, s[36:39], 0 offen
+; CHECK-NEXT:    s_endpgm
+entry: ; 128-byte, align-4 copy from the byref addrspace(4) argument into an addrspace(5) pointer
+  tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noundef align 4 dereferenceable(128) %local, ptr addrspace(4) noundef align 4 dereferenceable(128) %0, i64 128, i1 false)
+  ret void
+}
+
+define dso_local void @_Z21copy_local_to_genericP1SPU3AS5S_(ptr nocapture noundef writeonly %generic, ptr addrspace(5) nocapture noundef readonly %src) local_unnamed_addr #0 {
+; CHECK-LABEL: _Z21copy_local_to_genericP1SPU3AS5S_:
+; CHECK:       _Z21copy_local_to_genericP1SPU3AS5S_$local:
+; CHECK-NEXT:    .type _Z21copy_local_to_genericP1SPU3AS5S_$local,@function
+; CHECK-NEXT:  ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:124
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:120
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:116
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:112
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:112
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:108
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:104
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:100
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:96
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:96
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:92
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:88
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:84
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:80
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:80
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:76
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:72
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:68
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:64
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:64
+; CHECK-NEXT:    buffer_load_dword v3, v2, s[0:3], 0 offen offset:16
+; CHECK-NEXT:    s_nop 0
+; CHECK-NEXT:    buffer_load_dword v4, v2, s[0:3], 0 offen offset:20
+; CHECK-NEXT:    buffer_load_dword v5, v2, s[0:3], 0 offen offset:24
+; CHECK-NEXT:    buffer_load_dword v6, v2, s[0:3], 0 offen offset:28
+; CHECK-NEXT:    buffer_load_dword v7, v2, s[0:3], 0 offen offset:32
+; CHECK-NEXT:    buffer_load_dword v8, v2, s[0:3], 0 offen offset:36
+; CHECK-NEXT:    buffer_load_dword v9, v2, s[0:3], 0 offen offset:40
+; CHECK-NEXT:    buffer_load_dword v10, v2, s[0:3], 0 offen offset:44
+; CHECK-NEXT:    buffer_load_dword v11, v2, s[0:3], 0 offen offset:48
+; CHECK-NEXT:    buffer_load_dword v12, v2, s[0:3], 0 offen offset:52
+; CHECK-NEXT:    buffer_load_dword v13, v2, s[0:3], 0 offen offset:56
+; CHECK-NEXT:    buffer_load_dword v14, v2, s[0:3], 0 offen offset:60
+; CHECK-NEXT:    buffer_load_dword v15, v2, s[0:3], 0 offen
+; CHECK-NEXT:    buffer_load_dword v16, v2, s[0:3], 0 offen offset:4
+; CHECK-NEXT:    buffer_load_dword v17, v2, s[0:3], 0 offen offset:8
+; CHECK-NEXT:    buffer_load_dword v18, v2, s[0:3], 0 offen offset:12
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[11:14] offset:48
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[7:10] offset:32
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[3:6] offset:16
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[15:18]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry: ; 128-byte, align-4 copy from an addrspace(5) pointer into an addrspace(0) pointer
+  tail call void @llvm.memcpy.p0.p5.i64(ptr noundef nonnull align 4 dereferenceable(128) %generic, ptr addrspace(5) noundef align 4 dereferenceable(128) %src, i64 128, i1 false)
+  ret void
+}
+
+define dso_local amdgpu_kernel void @_Z20copy_param_to_shared1S(ptr addrspace(4) nocapture noundef readonly byref(%struct.S) align 4 %0) local_unnamed_addr #0 {
+; CHECK-LABEL: _Z20copy_param_to_shared1S:
+; CHECK:       _Z20copy_param_to_shared1S$local:
+; CHECK-NEXT:    .type _Z20copy_param_to_shared1S$local,@function
+; CHECK-NEXT:  ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x84
+; CHECK-NEXT:    s_load_dwordx8 s[12:19], s[0:1], 0x64
+; CHECK-NEXT:    v_mov_b32_e32 v8, 0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, s8
+; CHECK-NEXT:    v_mov_b32_e32 v4, s4
+; CHECK-NEXT:    v_mov_b32_e32 v1, s9
+; CHECK-NEXT:    v_mov_b32_e32 v2, s10
+; CHECK-NEXT:    v_mov_b32_e32 v3, s11
+; CHECK-NEXT:    v_mov_b32_e32 v5, s5
+; CHECK-NEXT:    v_mov_b32_e32 v6, s6
+; CHECK-NEXT:    v_mov_b32_e32 v7, s7
+; CHECK-NEXT:    s_load_dwordx8 s[4:11], s[0:1], 0x44
+; CHECK-NEXT:    ds_write_b128 v8, v[0:3] offset:112
+; CHECK-NEXT:    ds_write_b128 v8, v[4:7] offset:96
+; CHECK-NEXT:    v_mov_b32_e32 v0, s16
+; CHECK-NEXT:    v_mov_b32_e32 v1, s17
+; CHECK-NEXT:    v_mov_b32_e32 v2, s18
+; CHECK-NEXT:    v_mov_b32_e32 v3, s19
+; CHECK-NEXT:    ds_write_b128 v8, v[0:3] offset:80
+; CHECK-NEXT:    v_mov_b32_e32 v0, s12
+; CHECK-NEXT:    v_mov_b32_e32 v1, s13
+; CHECK-NEXT:    v_mov_b32_e32 v2, s14
+; CHECK-NEXT:    v_mov_b32_e32 v3, s15
+; CHECK-NEXT:    s_load_dwordx8 s[12:19], s[0:1], 0x24
+; CHECK-NEXT:    ds_write_b128 v8, v[0:3] offset:64
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, s8
+; CHECK-NEXT:    v_mov_b32_e32 v1, s9
+; CHECK-NEXT:    v_mov_b32_e32 v2, s10
+; CHECK-NEXT:    v_mov_b32_e32 v3, s11
+; CHECK-NEXT:    ds_write_b128 v8, v[0:3] offset:48
+; CHECK-NEXT:    v_mov_b32_e32 v0, s4
+; CHECK-NEXT:    v_mov_b32_e32 v1, s5
+; CHECK-NEXT:    v_mov_b32_e32 v2, s6
+; CHECK-NEXT:    v_mov_b32_e32 v3, s7
+; CHECK-NEXT:    ds_write_b128 v8, v[0:3] offset:32
+; CHECK-NEXT:    v_mov_b32_e32 v0, s16
+; CHECK-NEXT:    v_mov_b32_e32 v1, s17
+; CHECK-NEXT:    v_mov_b32_e32 v2, s18
+; CHECK-NEXT:    v_mov_b32_e32 v3, s19
+; CHECK-NEXT:    ds_write_b128 v8, v[0:3] offset:16
+; CHECK-NEXT:    v_mov_b32_e32 v0, s12
+; CHECK-NEXT:    v_mov_b32_e32 v1, s13
+; CHECK-NEXT:    v_mov_b32_e32 v2, s14
+; CHECK-NEXT:    v_mov_b32_e32 v3, s15
+; CHECK-NEXT:    ds_write_b128 v8, v[0:3]
+; CHECK-NEXT:    s_endpgm
+entry: ; 128-byte, align-4 copy from the byref addrspace(4) argument into the addrspace(3) global @shared
+  tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noundef align 4 dereferenceable(128) @shared, ptr addrspace(4) noundef align 4 dereferenceable(128) %0, i64 128, i1 false)
+  ret void
+}
+
+define dso_local void @_Z22copy_shared_to_genericP1S(ptr nocapture noundef writeonly %generic) local_unnamed_addr #0 {
+; CHECK-LABEL: _Z22copy_shared_to_genericP1S:
+; CHECK:       _Z22copy_shared_to_genericP1S$local:
+; CHECK-NEXT:    .type _Z22copy_shared_to_genericP1S$local,@function
+; CHECK-NEXT:  ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v14, shared@abs32@lo
+; CHECK-NEXT:    ds_read_b128 v[2:5], v14 offset:112
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5] offset:112
+; CHECK-NEXT:    ds_read_b128 v[2:5], v14 offset:96
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5] offset:96
+; CHECK-NEXT:    ds_read_b128 v[2:5], v14 offset:80
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5] offset:80
+; CHECK-NEXT:    ds_read_b128 v[2:5], v14 offset:64
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5] offset:64
+; CHECK-NEXT:    ds_read_b128 v[2:5], v14 offset:48
+; CHECK-NEXT:    ds_read_b128 v[6:9], v14 offset:32
+; CHECK-NEXT:    ds_read_b128 v[10:13], v14 offset:16
+; CHECK-NEXT:    ds_read_b128 v[14:17], v14
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[2:5] offset:48
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[6:9] offset:32
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[10:13] offset:16
+; CHECK-NEXT:    flat_store_dwordx4 v[0:1], v[14:17]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry: ; 128-byte, align-4 copy from the addrspace(3) global @shared into an addrspace(0) pointer
+  tail call void @llvm.memcpy.p0.p3.i64(ptr noundef nonnull align 4 dereferenceable(128) %generic, ptr addrspace(3) noundef align 4 dereferenceable(128) @shared, i64 128, i1 false)
+  ret void
+}
+
+declare void @llvm.memcpy.p0.p5.i64(ptr noalias nocapture writeonly, ptr addrspace(5) noalias nocapture readonly, i64, i1 immarg) ; addrspace(5) -> addrspace(0)
+
+declare void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noalias nocapture writeonly, ptr addrspace(1) noalias nocapture readonly, i64, i1 immarg) ; addrspace(1) -> addrspace(1)
+
+declare void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) ; addrspace(4) -> addrspace(1)
+
+declare void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) ; addrspace(4) -> addrspace(5)
+
+declare void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) ; addrspace(4) -> addrspace(3)
+
+declare void @llvm.memcpy.p0.p3.i64(ptr noalias nocapture writeonly, ptr addrspace(3) noalias nocapture readonly, i64, i1 immarg) ; addrspace(3) -> addrspace(0)
+
+attributes #0 = { minsize } ; every function above uses #0, so only minsize is exercised (no optsize variant)
----------------
arsenm wrote:

Should you also test optsize? 

https://github.com/llvm/llvm-project/pull/87632


More information about the llvm-commits mailing list