[libc-commits] [libc] [libc][x86] Add Non-temporal code path for large memcpy (PR #187108)

Alexey Samsonov via libc-commits libc-commits at lists.llvm.org
Tue Mar 17 14:02:15 PDT 2026


================
@@ -143,14 +152,33 @@ inline_memcpy_x86_avx_ge64_sw_prefetching(Ptr __restrict dst,
   // - we prefetched cachelines at 'src + 64', 'src + 128', and 'src + 192'
   // - 'dst' is 32B aligned,
   // - count >= 128.
-  while (offset + K_THREE_CACHELINES + 64 <= count) {
-    // Three cache lines at a time.
-    inline_memcpy_prefetch(dst, src, offset + K_ONE_CACHELINE);
-    inline_memcpy_prefetch(dst, src, offset + K_TWO_CACHELINES);
-    inline_memcpy_prefetch(dst, src, offset + K_THREE_CACHELINES);
-    // Copy one cache line at a time to prevent the use of `rep;movsb`.
-    for (size_t i = 0; i < 3; ++i, offset += K_ONE_CACHELINE)
-      builtin::Memcpy<K_ONE_CACHELINE>::block_offset(dst, src, offset);
+  // If we are using non-temporal stores, we don't need prefetching.
+  bool need_prefetch_run = true;
+  if constexpr (x86::K_NTA_THRESHOLD != 0 && x86::K_AVX) {
+    if (count >= x86::K_NTA_THRESHOLD) {
+      while (offset + K_THREE_CACHELINES + 64 <= count) {
+        for (size_t i = 0; i < 3; ++i, offset += K_ONE_CACHELINE) {
+          generic::stream(dst + offset, generic::load<__m256i>(src + offset));
+          generic::stream(dst + offset + 32,
+                          generic::load<__m256i>(src + offset + 32));
+        }
+      }
+      generic::fence<__m256i>();
+      need_prefetch_run = false;
+    } else {
+      need_prefetch_run = true;
----------------
vonosmas wrote:

nit: this `else` branch is not needed, since `need_prefetch_run` is already initialized to `true` above

https://github.com/llvm/llvm-project/pull/187108

