[libc-commits] [libc] 55174f9 - [libc][SVE] add sve handling for memcpy with count less than 32b (#167446)

via libc-commits libc-commits at lists.llvm.org
Tue Feb 3 07:29:49 PST 2026


Author: Schrodinger ZHU Yifan
Date: 2026-02-03T10:26:44-05:00
New Revision: 55174f936ba036f22f29c936aee69e4f4cda338a

URL: https://github.com/llvm/llvm-project/commit/55174f936ba036f22f29c936aee69e4f4cda338a
DIFF: https://github.com/llvm/llvm-project/commit/55174f936ba036f22f29c936aee69e4f4cda338a.diff

LOG: [libc][SVE] add sve handling for memcpy with count less than 32b (#167446)

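Add an SVE-optimized path to the AArch64 memcpy for copies of at most 32
bytes. The idea is to use predicated loads and stores (governed by predicate
registers) instead of branching on the exact size.
The microbenchmarks in the repo show considerable improvements on an NVIDIA
GB10 (locked to the largest X925 core):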

```
======================================================================
BENCHMARK STATISTICS (time in nanoseconds)
======================================================================

memcpy_Google_A:
  Old - Mean: 3.1257 ns, Median: 3.1162 ns
  New - Mean: 2.8402 ns, Median: 2.8265 ns
  Improvement: +9.14% (mean), +9.30% (median)

memcpy_Google_B:
  Old - Mean: 2.3171 ns, Median: 2.3159 ns
  New - Mean: 1.6589 ns, Median: 1.6593 ns
  Improvement: +28.40% (mean), +28.35% (median)

memcpy_Google_D:
  Old - Mean: 8.7602 ns, Median: 8.7645 ns
  New - Mean: 8.4307 ns, Median: 8.4308 ns
  Improvement: +3.76% (mean), +3.81% (median)

memcpy_Google_L:
  Old - Mean: 1.7137 ns, Median: 1.7091 ns
  New - Mean: 1.4530 ns, Median: 1.4553 ns
  Improvement: +15.22% (mean), +14.85% (median)

memcpy_Google_M:
  Old - Mean: 1.9823 ns, Median: 1.9825 ns
  New - Mean: 1.4826 ns, Median: 1.4840 ns
  Improvement: +25.20% (mean), +25.15% (median)

memcpy_Google_Q:
  Old - Mean: 1.6812 ns, Median: 1.6784 ns
  New - Mean: 1.1538 ns, Median: 1.1517 ns
  Improvement: +31.37% (mean), +31.38% (median)

memcpy_Google_S:
  Old - Mean: 2.1816 ns, Median: 2.1786 ns
  New - Mean: 1.6297 ns, Median: 1.6287 ns
  Improvement: +25.29% (mean), +25.24% (median)

memcpy_Google_U:
  Old - Mean: 2.2851 ns, Median: 2.2825 ns
  New - Mean: 1.7219 ns, Median: 1.7187 ns
  Improvement: +24.65% (mean), +24.70% (median)

memcpy_Google_W:
  Old - Mean: 2.0408 ns, Median: 2.0361 ns
  New - Mean: 1.5260 ns, Median: 1.5252 ns
  Improvement: +25.23% (mean), +25.09% (median)

uniform_384_to_4096:
  Old - Mean: 26.9067 ns, Median: 26.8845 ns
  New - Mean: 26.8083 ns, Median: 26.8149 ns
  Improvement: +0.37% (mean), +0.26% (median)
```
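With this change, the beginning of the compiled memcpy function looks like the following (both small-size cases collapse into a single branch-free predicated sequence):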
```
Dump of assembler code for function _ZN22__llvm_libc_22_0_0_git6memcpyEPvPKvm:
   0x0000000000001340 <+0>:     cbz     x2, 0x143c <_ZN22__llvm_libc_22_0_0_git6memcpyEPvPKvm+252>
   0x0000000000001344 <+4>:     cbz     x0, 0x1440 <_ZN22__llvm_libc_22_0_0_git6memcpyEPvPKvm+256>
   0x0000000000001348 <+8>:     cbz     x1, 0x1444 <_ZN22__llvm_libc_22_0_0_git6memcpyEPvPKvm+260>
   0x000000000000134c <+12>:    subs    x8, x2, #0x20
   0x0000000000001350 <+16>:    b.hi    0x1374 <_ZN22__llvm_libc_22_0_0_git6memcpyEPvPKvm+52>  // b.pmore
   0x0000000000001354 <+20>:    rdvl    x8, #1
   0x0000000000001358 <+24>:    whilelo p0.b, xzr, x2
   0x000000000000135c <+28>:    ld1b    {z0.b}, p0/z, [x1]
   0x0000000000001360 <+32>:    whilelo p1.b, x8, x2
   0x0000000000001364 <+36>:    ld1b    {z1.b}, p1/z, [x1, #1, mul vl]
   0x0000000000001368 <+40>:    st1b    {z0.b}, p0, [x0]
   0x000000000000136c <+44>:    st1b    {z1.b}, p1, [x0, #1, mul vl]
   0x0000000000001370 <+48>:    ret
```

---------

Co-authored-by: Guillaume Chatelet <chatelet.guillaume at gmail.com>

Added: 
    

Modified: 
    libc/src/string/memory_utils/aarch64/inline_memcpy.h

Removed: 
    


################################################################################
diff  --git a/libc/src/string/memory_utils/aarch64/inline_memcpy.h b/libc/src/string/memory_utils/aarch64/inline_memcpy.h
index 11cf022e12b1f..0c9224010784f 100644
--- a/libc/src/string/memory_utils/aarch64/inline_memcpy.h
+++ b/libc/src/string/memory_utils/aarch64/inline_memcpy.h
@@ -9,17 +9,40 @@
 #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_AARCH64_INLINE_MEMCPY_H
 
 #include "src/__support/macros/attributes.h" // LIBC_INLINE
+#include "src/__support/macros/properties/cpu_features.h"
 #include "src/string/memory_utils/op_builtin.h"
 #include "src/string/memory_utils/utils.h"
 
 #include <stddef.h> // size_t
 
+#if defined(LIBC_TARGET_CPU_HAS_SVE)
+#include <arm_sve.h>
+#endif
 namespace LIBC_NAMESPACE_DECL {
-
 [[maybe_unused]] LIBC_INLINE void
 inline_memcpy_aarch64(Ptr __restrict dst, CPtr __restrict src, size_t count) {
+  // Never emit any memory operation when count == 0.
   if (count == 0)
     return;
+  // On targets with SVE, use predicated loads/stores to avoid branching on the
+  // exact size for small copies.
+#ifdef LIBC_TARGET_CPU_HAS_SVE
+  auto src_ptr = reinterpret_cast<const uint8_t *>(src);
+  auto dst_ptr = reinterpret_cast<uint8_t *>(dst);
+  if (count <= 16) {
+    const svbool_t mask = svwhilelt_b8_u64(0, count);
+    svst1_u8(mask, dst_ptr, svld1_u8(mask, src_ptr));
+    return;
+  }
+  if (count <= 32) {
+    const size_t vlen = svcntb();
+    svbool_t m0 = svwhilelt_b8_u64(0, count);
+    svbool_t m1 = svwhilelt_b8_u64(vlen, count);
+    svst1_u8(m0, dst_ptr, svld1_u8(m0, src_ptr));
+    svst1_u8(m1, dst_ptr + vlen, svld1_u8(m1, src_ptr + vlen));
+    return;
+  }
+#else
   if (count == 1)
     return builtin::Memcpy<1>::block(dst, src);
   if (count == 2)
@@ -34,6 +57,7 @@ inline_memcpy_aarch64(Ptr __restrict dst, CPtr __restrict src, size_t count) {
     return builtin::Memcpy<8>::head_tail(dst, src, count);
   if (count < 32)
     return builtin::Memcpy<16>::head_tail(dst, src, count);
+#endif
   if (count < 64)
     return builtin::Memcpy<32>::head_tail(dst, src, count);
   if (count < 128)


        


More information about the libc-commits mailing list