[llvm] [AMDGPU][NFC] Pre-commit memcpy test with complex constant length (PR #182170)

via llvm-commits llvm-commits at lists.llvm.org
Thu Feb 19 05:59:27 PST 2026


https://github.com/LU-JOHN updated https://github.com/llvm/llvm-project/pull/182170

>From 82e0e6853c914f880e9bc9344d299c9c2e84b384 Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Wed, 18 Feb 2026 16:26:53 -0600
Subject: [PATCH 1/2] Test memcpy lowering with complex constant length

Signed-off-by: John Lu <John.Lu at amd.com>
---
 .../CodeGen/AMDGPU/memcpy_const_compare.ll    | 68 +++++++++++++++++++
 1 file changed, 68 insertions(+)
 create mode 100644 llvm/test/CodeGen/AMDGPU/memcpy_const_compare.ll

diff --git a/llvm/test/CodeGen/AMDGPU/memcpy_const_compare.ll b/llvm/test/CodeGen/AMDGPU/memcpy_const_compare.ll
new file mode 100644
index 0000000000000..b1a8930512f71
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/memcpy_const_compare.ll
@@ -0,0 +1,68 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck %s
+
+; Test mempcy lowering where length is given by a complex but constant expression.
+; Loop guard should not be necessary since length is positive.
+
+define amdgpu_kernel void @_start(ptr %ptr) {
+; CHECK-LABEL: _start:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; CHECK-NEXT:    s_mov_b64 s[2:3], 0
+; CHECK-NEXT:  ; %bb.1: ; %dynamic-memcpy-expansion-main-body.preheader
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:  .LBB0_2: ; %dynamic-memcpy-expansion-main-body
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v0, s[2:3]
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_add_u32 s4, s0, s2
+; CHECK-NEXT:    s_addc_u32 s5, s1, s3
+; CHECK-NEXT:    s_add_u32 s2, s2, 16
+; CHECK-NEXT:    s_addc_u32 s3, s3, 0
+; CHECK-NEXT:    v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
+; CHECK-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[2:3], 16
+; CHECK-NEXT:    s_and_b64 vcc, exec, s[4:5]
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
+; CHECK-NEXT:    s_cbranch_vccnz .LBB0_2
+; CHECK-NEXT:  ; %bb.3: ; %dynamic-memcpy-expansion-residual-cond
+; FIXME: The compare below has two constant operands (13 vs. 0) and should be folded at compile time.
+; CHECK-NEXT:    s_cmp_eq_u64 13, 0
+; CHECK-NEXT:    s_cbranch_scc1 .LBB0_6
+; CHECK-NEXT:  ; %bb.4: ; %dynamic-memcpy-expansion-residual-body.preheader
+; CHECK-NEXT:    s_sub_u32 s2, 29, 13
+; CHECK-NEXT:    s_subb_u32 s3, 0, 0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_add_u32 s2, s0, s2
+; CHECK-NEXT:    s_addc_u32 s3, s1, s3
+; CHECK-NEXT:    s_sub_u32 s4, 0, 13
+; CHECK-NEXT:    s_mov_b64 s[0:1], 0
+; CHECK-NEXT:    s_subb_u32 s5, 0, 0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:  .LBB0_5: ; %dynamic-memcpy-expansion-residual-body
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    s_add_u32 s6, s4, s0
+; CHECK-NEXT:    s_addc_u32 s7, s5, s1
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[6:7] offset:29
+; CHECK-NEXT:    s_add_u32 s6, s2, s0
+; CHECK-NEXT:    s_addc_u32 s7, s3, s1
+; CHECK-NEXT:    s_add_u32 s0, s0, 1
+; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; CHECK-NEXT:    v_cmp_lt_u64_e64 s[6:7], s[0:1], 13
+; CHECK-NEXT:    s_and_b64 vcc, exec, s[6:7]
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    flat_store_byte v[2:3], v1
+; CHECK-NEXT:    s_cbranch_vccnz .LBB0_5
+; CHECK-NEXT:  .LBB0_6: ; %dynamic-memcpy-post-expansion
+; CHECK-NEXT:    s_endpgm
+  call void @llvm.memcpy.p0.p4.i64(ptr %ptr, ptr addrspace(4) null, i64 add (i64 sub (i64 16, i64 ptrtoint (ptr addrspacecast (ptr addrspace(4) null to ptr) to i64)), i64 13), i1 false)
+  ret void
+}
+
+; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.memcpy.p0.p4.i64(ptr noalias writeonly captures(none), ptr addrspace(4) noalias readonly captures(none), i64, i1 immarg) #0
+
+attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }

>From 1608b56e1efdcbf55ff80b0532126f3ca2179c2b Mon Sep 17 00:00:00 2001
From: John Lu <John.Lu at amd.com>
Date: Thu, 19 Feb 2026 07:59:11 -0600
Subject: [PATCH 2/2] Don't use null for src pointer

Signed-off-by: John Lu <John.Lu at amd.com>
---
 .../CodeGen/AMDGPU/memcpy_const_compare.ll    | 46 ++++++++++---------
 1 file changed, 25 insertions(+), 21 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/memcpy_const_compare.ll b/llvm/test/CodeGen/AMDGPU/memcpy_const_compare.ll
index b1a8930512f71..84f196ff26e97 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy_const_compare.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy_const_compare.ll
@@ -4,26 +4,28 @@
 ; Test mempcy lowering where length is given by a complex but constant expression.
 ; Loop guard should not be necessary since length is positive.
 
-define amdgpu_kernel void @_start(ptr %ptr) {
+define amdgpu_kernel void @_start(ptr %ptrDst, ptr addrspace(4) %ptrSrc) {
 ; CHECK-LABEL: _start:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
 ; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
 ; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
-; CHECK-NEXT:    s_mov_b64 s[2:3], 0
+; CHECK-NEXT:    s_mov_b64 s[4:5], 0
 ; CHECK-NEXT:  ; %bb.1: ; %dynamic-memcpy-expansion-main-body.preheader
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:  .LBB0_2: ; %dynamic-memcpy-expansion-main-body
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    global_load_dwordx4 v[4:7], v0, s[2:3]
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_add_u32 s4, s0, s2
-; CHECK-NEXT:    s_addc_u32 s5, s1, s3
-; CHECK-NEXT:    s_add_u32 s2, s2, 16
-; CHECK-NEXT:    s_addc_u32 s3, s3, 0
-; CHECK-NEXT:    v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
-; CHECK-NEXT:    v_cmp_lt_u64_e64 s[4:5], s[2:3], 16
-; CHECK-NEXT:    s_and_b64 vcc, exec, s[4:5]
+; CHECK-NEXT:    s_add_u32 s6, s2, s4
+; CHECK-NEXT:    s_addc_u32 s7, s3, s5
+; CHECK-NEXT:    global_load_dwordx4 v[4:7], v0, s[6:7]
+; CHECK-NEXT:    s_add_u32 s6, s0, s4
+; CHECK-NEXT:    s_addc_u32 s7, s1, s5
+; CHECK-NEXT:    s_add_u32 s4, s4, 16
+; CHECK-NEXT:    s_addc_u32 s5, s5, 0
+; CHECK-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; CHECK-NEXT:    v_cmp_lt_u64_e64 s[6:7], s[4:5], 16
+; CHECK-NEXT:    s_and_b64 vcc, exec, s[6:7]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
 ; CHECK-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
 ; CHECK-NEXT:    s_cbranch_vccnz .LBB0_2
@@ -32,22 +34,24 @@ define amdgpu_kernel void @_start(ptr %ptr) {
 ; CHECK-NEXT:    s_cmp_eq_u64 13, 0
 ; CHECK-NEXT:    s_cbranch_scc1 .LBB0_6
 ; CHECK-NEXT:  ; %bb.4: ; %dynamic-memcpy-expansion-residual-body.preheader
-; CHECK-NEXT:    s_sub_u32 s2, 29, 13
-; CHECK-NEXT:    s_subb_u32 s3, 0, 0
+; CHECK-NEXT:    s_sub_u32 s4, 29, 13
+; CHECK-NEXT:    s_subb_u32 s5, 0, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_add_u32 s2, s0, s2
-; CHECK-NEXT:    s_addc_u32 s3, s1, s3
-; CHECK-NEXT:    s_sub_u32 s4, 0, 13
+; CHECK-NEXT:    s_add_u32 s4, s0, s4
+; CHECK-NEXT:    s_addc_u32 s5, s1, s5
+; CHECK-NEXT:    s_sub_u32 s6, 0, 13
+; CHECK-NEXT:    s_subb_u32 s7, 0, 0
+; CHECK-NEXT:    s_add_u32 s2, s2, s6
 ; CHECK-NEXT:    s_mov_b64 s[0:1], 0
-; CHECK-NEXT:    s_subb_u32 s5, 0, 0
+; CHECK-NEXT:    s_addc_u32 s3, s3, s7
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-NEXT:  .LBB0_5: ; %dynamic-memcpy-expansion-residual-body
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    s_add_u32 s6, s4, s0
-; CHECK-NEXT:    s_addc_u32 s7, s5, s1
-; CHECK-NEXT:    global_load_ubyte v1, v0, s[6:7] offset:29
 ; CHECK-NEXT:    s_add_u32 s6, s2, s0
 ; CHECK-NEXT:    s_addc_u32 s7, s3, s1
+; CHECK-NEXT:    global_load_ubyte v1, v0, s[6:7] offset:29
+; CHECK-NEXT:    s_add_u32 s6, s4, s0
+; CHECK-NEXT:    s_addc_u32 s7, s5, s1
 ; CHECK-NEXT:    s_add_u32 s0, s0, 1
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
@@ -58,7 +62,7 @@ define amdgpu_kernel void @_start(ptr %ptr) {
 ; CHECK-NEXT:    s_cbranch_vccnz .LBB0_5
 ; CHECK-NEXT:  .LBB0_6: ; %dynamic-memcpy-post-expansion
 ; CHECK-NEXT:    s_endpgm
-  call void @llvm.memcpy.p0.p4.i64(ptr %ptr, ptr addrspace(4) null, i64 add (i64 sub (i64 16, i64 ptrtoint (ptr addrspacecast (ptr addrspace(4) null to ptr) to i64)), i64 13), i1 false)
+  call void @llvm.memcpy.p0.p4.i64(ptr %ptrDst, ptr addrspace(4) %ptrSrc, i64 add (i64 sub (i64 16, i64 ptrtoint (ptr addrspacecast (ptr addrspace(4) null to ptr) to i64)), i64 13), i1 false)
   ret void
 }
 



More information about the llvm-commits mailing list