[libc-commits] [libc] 55174f9 - [libc][SVE] add sve handling for memcpy with count less than 32b (#167446)
via libc-commits
libc-commits at lists.llvm.org
Tue Feb 3 07:29:49 PST 2026
Author: Schrodinger ZHU Yifan
Date: 2026-02-03T10:26:44-05:00
New Revision: 55174f936ba036f22f29c936aee69e4f4cda338a
URL: https://github.com/llvm/llvm-project/commit/55174f936ba036f22f29c936aee69e4f4cda338a
DIFF: https://github.com/llvm/llvm-project/commit/55174f936ba036f22f29c936aee69e4f4cda338a.diff
LOG: [libc][SVE] add sve handling for memcpy with count less than 32b (#167446)
Add an SVE-optimized path for small copies to the AArch64 memcpy. The idea is
to use predicate registers (predicated loads/stores) to avoid branching.
The microbenchmark in the repo shows considerable improvements on an NVIDIA
GB10 (pinned to the largest X925 core):
```
======================================================================
BENCHMARK STATISTICS (time in nanoseconds)
======================================================================
memcpy_Google_A:
Old - Mean: 3.1257 ns, Median: 3.1162 ns
New - Mean: 2.8402 ns, Median: 2.8265 ns
Improvement: +9.14% (mean), +9.30% (median)
memcpy_Google_B:
Old - Mean: 2.3171 ns, Median: 2.3159 ns
New - Mean: 1.6589 ns, Median: 1.6593 ns
Improvement: +28.40% (mean), +28.35% (median)
memcpy_Google_D:
Old - Mean: 8.7602 ns, Median: 8.7645 ns
New - Mean: 8.4307 ns, Median: 8.4308 ns
Improvement: +3.76% (mean), +3.81% (median)
memcpy_Google_L:
Old - Mean: 1.7137 ns, Median: 1.7091 ns
New - Mean: 1.4530 ns, Median: 1.4553 ns
Improvement: +15.22% (mean), +14.85% (median)
memcpy_Google_M:
Old - Mean: 1.9823 ns, Median: 1.9825 ns
New - Mean: 1.4826 ns, Median: 1.4840 ns
Improvement: +25.20% (mean), +25.15% (median)
memcpy_Google_Q:
Old - Mean: 1.6812 ns, Median: 1.6784 ns
New - Mean: 1.1538 ns, Median: 1.1517 ns
Improvement: +31.37% (mean), +31.38% (median)
memcpy_Google_S:
Old - Mean: 2.1816 ns, Median: 2.1786 ns
New - Mean: 1.6297 ns, Median: 1.6287 ns
Improvement: +25.29% (mean), +25.24% (median)
memcpy_Google_U:
Old - Mean: 2.2851 ns, Median: 2.2825 ns
New - Mean: 1.7219 ns, Median: 1.7187 ns
Improvement: +24.65% (mean), +24.70% (median)
memcpy_Google_W:
Old - Mean: 2.0408 ns, Median: 2.0361 ns
New - Mean: 1.5260 ns, Median: 1.5252 ns
Improvement: +25.23% (mean), +25.09% (median)
uniform_384_to_4096:
Old - Mean: 26.9067 ns, Median: 26.8845 ns
New - Mean: 26.8083 ns, Median: 26.8149 ns
Improvement: +0.37% (mean), +0.26% (median)
```
The beginning of the memcpy function looks like the following:
```
Dump of assembler code for function _ZN22__llvm_libc_22_0_0_git6memcpyEPvPKvm:
0x0000000000001340 <+0>: cbz x2, 0x143c <_ZN22__llvm_libc_22_0_0_git6memcpyEPvPKvm+252>
0x0000000000001344 <+4>: cbz x0, 0x1440 <_ZN22__llvm_libc_22_0_0_git6memcpyEPvPKvm+256>
0x0000000000001348 <+8>: cbz x1, 0x1444 <_ZN22__llvm_libc_22_0_0_git6memcpyEPvPKvm+260>
0x000000000000134c <+12>: subs x8, x2, #0x20
0x0000000000001350 <+16>: b.hi 0x1374 <_ZN22__llvm_libc_22_0_0_git6memcpyEPvPKvm+52> // b.pmore
0x0000000000001354 <+20>: rdvl x8, #1
0x0000000000001358 <+24>: whilelo p0.b, xzr, x2
0x000000000000135c <+28>: ld1b {z0.b}, p0/z, [x1]
0x0000000000001360 <+32>: whilelo p1.b, x8, x2
0x0000000000001364 <+36>: ld1b {z1.b}, p1/z, [x1, #1, mul vl]
0x0000000000001368 <+40>: st1b {z0.b}, p0, [x0]
0x000000000000136c <+44>: st1b {z1.b}, p1, [x0, #1, mul vl]
0x0000000000001370 <+48>: ret
```
---------
Co-authored-by: Guillaume Chatelet <chatelet.guillaume at gmail.com>
Added:
Modified:
libc/src/string/memory_utils/aarch64/inline_memcpy.h
Removed:
################################################################################
diff --git a/libc/src/string/memory_utils/aarch64/inline_memcpy.h b/libc/src/string/memory_utils/aarch64/inline_memcpy.h
index 11cf022e12b1f..0c9224010784f 100644
--- a/libc/src/string/memory_utils/aarch64/inline_memcpy.h
+++ b/libc/src/string/memory_utils/aarch64/inline_memcpy.h
@@ -9,17 +9,40 @@
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_MEMCPY_H
#include "src/__support/macros/attributes.h" // LIBC_INLINE
+#include "src/__support/macros/properties/cpu_features.h"
#include "src/string/memory_utils/op_builtin.h"
#include "src/string/memory_utils/utils.h"
#include <stddef.h> // size_t
+#if defined(LIBC_TARGET_CPU_HAS_SVE)
+#include <arm_sve.h>
+#endif
namespace LIBC_NAMESPACE_DECL {
-
[[maybe_unused]] LIBC_INLINE void
inline_memcpy_aarch64(Ptr __restrict dst, CPtr __restrict src, size_t count) {
+ // Always avoid emitting any memory operation if count == 0.
if (count == 0)
return;
+ // Use predicated loads/stores on SVE-capable targets to avoid branching
+ // for small sizes.
+#ifdef LIBC_TARGET_CPU_HAS_SVE
+ auto src_ptr = reinterpret_cast<const uint8_t *>(src);
+ auto dst_ptr = reinterpret_cast<uint8_t *>(dst);
+ if (count <= 16) {
+ const svbool_t mask = svwhilelt_b8_u64(0, count);
+ svst1_u8(mask, dst_ptr, svld1_u8(mask, src_ptr));
+ return;
+ }
+ if (count <= 32) {
+ const size_t vlen = svcntb();
+ svbool_t m0 = svwhilelt_b8_u64(0, count);
+ svbool_t m1 = svwhilelt_b8_u64(vlen, count);
+ svst1_u8(m0, dst_ptr, svld1_u8(m0, src_ptr));
+ svst1_u8(m1, dst_ptr + vlen, svld1_u8(m1, src_ptr + vlen));
+ return;
+ }
+#else
if (count == 1)
return builtin::Memcpy<1>::block(dst, src);
if (count == 2)
@@ -34,6 +57,7 @@ inline_memcpy_aarch64(Ptr __restrict dst, CPtr __restrict src, size_t count) {
return builtin::Memcpy<8>::head_tail(dst, src, count);
if (count < 32)
return builtin::Memcpy<16>::head_tail(dst, src, count);
+#endif
if (count < 64)
return builtin::Memcpy<32>::head_tail(dst, src, count);
if (count < 128)
More information about the libc-commits
mailing list