[llvm] [AMDGPU][CodeGen] Improve handling of memcpy for -Os/-Oz compilations (PR #87632)

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Sat Apr 6 12:37:12 PDT 2024


================
@@ -0,0 +1,642 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx908 %s -o - | FileCheck %s
+
+%struct.S = type { [32 x i32] }
+
+@shared = local_unnamed_addr addrspace(3) global %struct.S undef, align 4
+
+define dso_local void @_Z12copy_genericPvPKv(ptr nocapture noundef writeonly %dest, ptr nocapture noundef readonly %src) local_unnamed_addr #0 {
+; CHECK-LABEL: _Z12copy_genericPvPKv:
+; CHECK:       _Z12copy_genericPvPKv$local:
+; CHECK-NEXT:    .type _Z12copy_genericPvPKv$local,@function
+; CHECK-NEXT:  ; %bb.0: ; %entry
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:46
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:46
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:45
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:45
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:44
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:44
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:43
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:43
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:42
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:42
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:41
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:41
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:40
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:40
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:39
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:39
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:38
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:38
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:37
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:37
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:36
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:36
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:35
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:35
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:34
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:34
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:33
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:33
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:32
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:32
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:31
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:31
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:30
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:30
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:29
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:29
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:28
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:28
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:27
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:27
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:26
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:26
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:25
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:25
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:24
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:24
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:23
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:23
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:22
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:22
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:21
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:21
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:20
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:20
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:19
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:19
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:18
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:18
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:17
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:17
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:16
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:16
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:15
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:15
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:14
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:14
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:13
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:13
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:12
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:12
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:11
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:11
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:10
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:10
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:9
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:9
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:8
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:8
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:7
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:7
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:6
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:6
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:5
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:5
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:4
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:4
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:3
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:3
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:2
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:2
+; CHECK-NEXT:    flat_load_ubyte v4, v[2:3] offset:1
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v4 offset:1
+; CHECK-NEXT:    flat_load_ubyte v2, v[2:3]
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[0:1], v2
+; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  tail call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(47) %dest, ptr noundef nonnull align 1 dereferenceable(47) %src, i64 47, i1 false)
+  ret void
+}
+
+declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #0
+
+define dso_local amdgpu_kernel void @_Z11copy_globalPvS_(ptr addrspace(1) nocapture noundef writeonly %dest.coerce, ptr addrspace(1) nocapture noundef readonly %src.coerce) local_unnamed_addr #0 {
----------------
arsenm wrote:

Drop all the dso_local/local_unnamed_addr qualifiers from the test IR.

https://github.com/llvm/llvm-project/pull/87632


More information about the llvm-commits mailing list