[libc-commits] [libc] [libc][x86] Add Non-temporal code path for large memcpy (PR #187108)
Ilya Tokar via libc-commits
libc-commits at lists.llvm.org
Tue Mar 17 21:19:03 PDT 2026
https://github.com/TocarIP updated https://github.com/llvm/llvm-project/pull/187108
>From ed1fb7a5b8fecce4741b119a3cc6991115b03645 Mon Sep 17 00:00:00 2001
From: Ilya Tokar <tokarip at google.com>
Date: Mon, 16 Mar 2026 18:24:11 -0400
Subject: [PATCH] [libc][x86] Add Non-temporal code path for large memcpy
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Large memcpys are fairly rare, but are more common in ML
workloads (copying large matrices/tensors, often to/from the CPU host).
For large copies, non-temporal (NTA) stores can provide performance
advantages both for memcpy itself and for the rest of the workload (by
reducing cache pollution). Other runtimes already have an NTA path for
large copies, so add one to llvm-libc.
Internal whole-program load tests show a small but statistically
significant improvement of 0.1%. ML-specific benchmarks showed a 10-20%
performance gain, and fleetbench (https://github.com/google/fleetbench,
which has a more up-to-date version of the libc benchmarks) shows a ~3% gain
(ns/byte for size distributions taken from various applications).
[Memcpy_0]_L1 0.01950n ± 3% 0.01900n ± 5% ~ (p=0.390 n=20)
[Memcpy_0]_L2 0.02300n ± 0% 0.02300n ± 0% ~ (p=0.256 n=20)
[Memcpy_0]_LLC 0.1335n ± 1% 0.1310n ± 1% -1.87% (p=0.000 n=20)
[Memcpy_0]_Cold 0.1540n ± 2% 0.1520n ± 1% -1.30% (p=0.021 n=20)
[Memcpy_1]_L1 0.04300n ± 5% 0.04200n ± 2% -2.33% (p=0.000 n=20)
[Memcpy_1]_L2 0.05000n ± 2% 0.04800n ± 0% -4.00% (p=0.000 n=20)
[Memcpy_1]_LLC 0.2500n ± 2% 0.2390n ± 1% -4.40% (p=0.000 n=20)
[Memcpy_1]_Cold 0.2750n ± 1% 0.2640n ± 1% -4.00% (p=0.000 n=20)
[Memcpy_2]_L1 0.03800n ± 3% 0.03800n ± 3% ~ (p=0.420 n=20)
[Memcpy_2]_L2 0.04400n ± 2% 0.04300n ± 0% -2.27% (p=0.000 n=20)
[Memcpy_2]_LLC 0.2320n ± 1% 0.2220n ± 1% -4.31% (p=0.000 n=20)
[Memcpy_2]_Cold 0.2565n ± 1% 0.2460n ± 1% -4.09% (p=0.000 n=20)
[Memcpy_3]_L1 0.1380n ± 1% 0.1355n ± 2% ~ (p=0.095 n=20)
[Memcpy_3]_L2 0.1490n ± 1% 0.1430n ± 1% -4.03% (p=0.000 n=20)
[Memcpy_3]_LLC 0.7955n ± 1% 0.7450n ± 0% -6.35% (p=0.000 n=20)
[Memcpy_3]_Cold 0.8495n ± 1% 0.7935n ± 0% -6.59% (p=0.000 n=20)
[Memcpy_4]_L1 0.04000n ± 3% 0.03900n ± 3% ~ (p=0.466 n=20)
[Memcpy_4]_L2 0.04500n ± 2% 0.04400n ± 2% ~ (p=0.130 n=20)
[Memcpy_4]_LLC 0.2040n ± 1% 0.1950n ± 1% -4.41% (p=0.000 n=20)
[Memcpy_4]_Cold 0.2240n ± 1% 0.2150n ± 1% -4.02% (p=0.000 n=20)
[Memcpy_5]_L1 0.05800n ± 3% 0.06050n ± 1% +4.31% (p=0.000 n=20)
[Memcpy_5]_L2 0.06400n ± 0% 0.06400n ± 2% 0.00% (p=0.004 n=20)
[Memcpy_5]_LLC 0.3320n ± 1% 0.3140n ± 1% -5.42% (p=0.000 n=20)
[Memcpy_5]_Cold 0.3620n ± 1% 0.3430n ± 0% -5.25% (p=0.000 n=20)
[Memcpy_6]_L1 0.05700n ± 2% 0.05750n ± 3% ~ (p=0.403 n=20)
[Memcpy_6]_L2 0.06500n ± 0% 0.06250n ± 1% -3.85% (p=0.000 n=20)
[Memcpy_6]_LLC 0.3410n ± 1% 0.3205n ± 1% -6.01% (p=0.000 n=20)
[Memcpy_6]_Cold 0.3670n ± 1% 0.3470n ± 1% -5.45% (p=0.000 n=20)
[Memcpy_7]_L1 0.05900n ± 2% 0.05900n ± 2% ~ (p=0.296 n=20)
[Memcpy_7]_L2 0.06400n ± 2% 0.06400n ± 0% ~ (p=0.327 n=20)
[Memcpy_7]_LLC 0.3145n ± 1% 0.2965n ± 1% -5.72% (p=0.000 n=20)
[Memcpy_7]_Cold 0.3410n ± 1% 0.3220n ± 0% -5.57% (p=0.000 n=20)
[Memcpy_8]_L1 0.03600n ± 3% 0.03600n ± 3% ~ (p=0.804 n=20)
[Memcpy_8]_L2 0.04200n ± 0% 0.04100n ± 2% -2.38% (p=0.000 n=20)
[Memcpy_8]_LLC 0.2210n ± 1% 0.2090n ± 1% -5.43% (p=0.000 n=20)
[Memcpy_8]_Cold 0.2415n ± 1% 0.2300n ± 1% -4.76% (p=0.000 n=20)
geomean 0.1184n 0.1148n -3.03%
---
libc/src/string/memory_utils/op_x86.h | 5 +++
.../memory_utils/x86_64/inline_memcpy.h | 42 +++++++++++++++----
2 files changed, 39 insertions(+), 8 deletions(-)
diff --git a/libc/src/string/memory_utils/op_x86.h b/libc/src/string/memory_utils/op_x86.h
index 215cafb9fcfeb..d25f58c19b723 100644
--- a/libc/src/string/memory_utils/op_x86.h
+++ b/libc/src/string/memory_utils/op_x86.h
@@ -73,6 +73,11 @@ struct Memcpy {
namespace LIBC_NAMESPACE_DECL {
namespace generic {
+template <typename T> LIBC_INLINE void stream(Ptr dst, T value) {
+ __builtin_nontemporal_store(value, reinterpret_cast<T*>(dst));
+}
+template <typename T> LIBC_INLINE void fence() { _mm_sfence(); }
+
///////////////////////////////////////////////////////////////////////////////
// Specializations for uint16_t
template <> struct cmp_is_expensive<uint16_t> : public cpp::false_type {};
diff --git a/libc/src/string/memory_utils/x86_64/inline_memcpy.h b/libc/src/string/memory_utils/x86_64/inline_memcpy.h
index bf3aa1f755ad6..8c1e1b4157eb7 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memcpy.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memcpy.h
@@ -36,6 +36,15 @@ LIBC_INLINE_VAR constexpr size_t K_THREE_CACHELINES = 3 * K_ONE_CACHELINE;
LIBC_INLINE_VAR constexpr bool K_USE_SOFTWARE_PREFETCHING =
LLVM_LIBC_IS_DEFINED(LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING);
+// Whether to use NTA stores, and the size threshold for switching to them.
+#ifdef LIBC_COPT_MEMCPY_X86_USE_NTA_STORES
+// Mostly based on empirical data. Theoretical justification:
+// upper bound of L2 size is 1MB on most x86 machines.
+LIBC_INLINE_VAR constexpr size_t K_NTA_THRESHOLD = 1 << 20;
+#else
+LIBC_INLINE_VAR constexpr size_t K_NTA_THRESHOLD = 0;
+#endif
+
// Whether to use rep;movsb exclusively (0), not at all (SIZE_MAX), or only
// above a certain threshold. Defaults to "do not use rep;movsb".
#ifndef LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
@@ -143,14 +152,31 @@ inline_memcpy_x86_avx_ge64_sw_prefetching(Ptr __restrict dst,
// - we prefetched cachelines at 'src + 64', 'src + 128', and 'src + 196'
// - 'dst' is 32B aligned,
// - count >= 128.
- while (offset + K_THREE_CACHELINES + 64 <= count) {
- // Three cache lines at a time.
- inline_memcpy_prefetch(dst, src, offset + K_ONE_CACHELINE);
- inline_memcpy_prefetch(dst, src, offset + K_TWO_CACHELINES);
- inline_memcpy_prefetch(dst, src, offset + K_THREE_CACHELINES);
- // Copy one cache line at a time to prevent the use of `rep;movsb`.
- for (size_t i = 0; i < 3; ++i, offset += K_ONE_CACHELINE)
- builtin::Memcpy<K_ONE_CACHELINE>::block_offset(dst, src, offset);
+  // If we are using non-temporal stores, we don't need prefetching.
+ bool need_prefetch_run = true;
+ if constexpr (x86::K_NTA_THRESHOLD != 0) {
+ if (count >= x86::K_NTA_THRESHOLD) {
+ while (offset + K_THREE_CACHELINES + 64 <= count) {
+ for (size_t i = 0; i < 3; ++i, offset += K_ONE_CACHELINE) {
+ generic::stream(dst + offset, generic::load<__m256i>(src + offset));
+ generic::stream(dst + offset + 32,
+ generic::load<__m256i>(src + offset + 32));
+ }
+ }
+ generic::fence<__m256i>();
+ need_prefetch_run = false;
+ }
+ }
+ if (need_prefetch_run) {
+ while (offset + K_THREE_CACHELINES + 64 <= count) {
+ // Three cache lines at a time.
+ inline_memcpy_prefetch(dst, src, offset + K_ONE_CACHELINE);
+ inline_memcpy_prefetch(dst, src, offset + K_TWO_CACHELINES);
+ inline_memcpy_prefetch(dst, src, offset + K_THREE_CACHELINES);
+ // Copy one cache line at a time to prevent the use of `rep;movsb`.
+ for (size_t i = 0; i < 3; ++i, offset += K_ONE_CACHELINE)
+ builtin::Memcpy<K_ONE_CACHELINE>::block_offset(dst, src, offset);
+ }
}
// We don't use 'loop_and_tail_offset' because it assumes at least one
// iteration of the loop.