[libc-commits] [libc] [libc][x86] Use prefetch for write for memcpy (PR #90450)

Guillaume Chatelet via libc-commits libc-commits at lists.llvm.org
Mon Apr 29 03:41:36 PDT 2024


https://github.com/gchatelet created https://github.com/llvm/llvm-project/pull/90450

Currently when `LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING` is set we prefetch memory for read on the source buffer. This patch adds prefetch for write on the destination buffer.


>From 88b4b3e7710ba8393d1be6a8e64181e2688a4c78 Mon Sep 17 00:00:00 2001
From: Guillaume Chatelet <gchatelet at google.com>
Date: Mon, 29 Apr 2024 10:41:15 +0000
Subject: [PATCH] [libc][x86] Use prefetch for write for memcpy

Currently when `LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING` is set we prefetch memory for read on the source buffer. This patch adds prefetch for write on the destination buffer.
---
 .../memory_utils/x86_64/inline_memcpy.h       | 33 +++++++++++--------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/libc/src/string/memory_utils/x86_64/inline_memcpy.h b/libc/src/string/memory_utils/x86_64/inline_memcpy.h
index ae61b1235bd08c..7a4e70bdf2d150 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memcpy.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memcpy.h
@@ -69,14 +69,21 @@ inline_memcpy_x86_avx_ge64(Ptr __restrict dst, CPtr __restrict src,
   return builtin::Memcpy<64>::loop_and_tail(dst, src, count);
 }
 
+[[maybe_unused]] LIBC_INLINE void inline_memcpy_prefetch(Ptr __restrict dst,
+                                                         CPtr __restrict src,
+                                                         size_t distance) {
+  prefetch_to_local_cache(src + distance);
+  prefetch_for_write(dst + distance);
+}
+
 [[maybe_unused]] LIBC_INLINE void
 inline_memcpy_x86_sse2_ge64_sw_prefetching(Ptr __restrict dst,
                                            CPtr __restrict src, size_t count) {
   using namespace LIBC_NAMESPACE::x86;
-  prefetch_to_local_cache(src + K_ONE_CACHELINE);
+  inline_memcpy_prefetch(dst, src, K_ONE_CACHELINE);
   if (count <= 128)
     return builtin::Memcpy<64>::head_tail(dst, src, count);
-  prefetch_to_local_cache(src + K_TWO_CACHELINES);
+  inline_memcpy_prefetch(dst, src, K_TWO_CACHELINES);
   // Aligning 'dst' on a 32B boundary.
   builtin::Memcpy<32>::block(dst, src);
   align_to_next_boundary<32, Arg::Dst>(dst, src, count);
@@ -90,17 +97,17 @@ inline_memcpy_x86_sse2_ge64_sw_prefetching(Ptr __restrict dst,
   if (count < 352) {
     // Two cache lines at a time.
     while (offset + K_TWO_CACHELINES + 32 <= count) {
-      prefetch_to_local_cache(src + offset + K_ONE_CACHELINE);
-      prefetch_to_local_cache(src + offset + K_TWO_CACHELINES);
+      inline_memcpy_prefetch(dst, src, offset + K_ONE_CACHELINE);
+      inline_memcpy_prefetch(dst, src, offset + K_TWO_CACHELINES);
       builtin::Memcpy<K_TWO_CACHELINES>::block_offset(dst, src, offset);
       offset += K_TWO_CACHELINES;
     }
   } else {
     // Three cache lines at a time.
     while (offset + K_THREE_CACHELINES + 32 <= count) {
-      prefetch_to_local_cache(src + offset + K_ONE_CACHELINE);
-      prefetch_to_local_cache(src + offset + K_TWO_CACHELINES);
-      prefetch_to_local_cache(src + offset + K_THREE_CACHELINES);
+      inline_memcpy_prefetch(dst, src, offset + K_ONE_CACHELINE);
+      inline_memcpy_prefetch(dst, src, offset + K_TWO_CACHELINES);
+      inline_memcpy_prefetch(dst, src, offset + K_THREE_CACHELINES);
       // It is likely that this copy will be turned into a 'rep;movsb' on
       // non-AVX machines.
       builtin::Memcpy<K_THREE_CACHELINES>::block_offset(dst, src, offset);
@@ -114,11 +121,11 @@ inline_memcpy_x86_sse2_ge64_sw_prefetching(Ptr __restrict dst,
 inline_memcpy_x86_avx_ge64_sw_prefetching(Ptr __restrict dst,
                                           CPtr __restrict src, size_t count) {
   using namespace LIBC_NAMESPACE::x86;
-  prefetch_to_local_cache(src + K_ONE_CACHELINE);
+  inline_memcpy_prefetch(dst, src, K_ONE_CACHELINE);
   if (count <= 128)
     return builtin::Memcpy<64>::head_tail(dst, src, count);
-  prefetch_to_local_cache(src + K_TWO_CACHELINES);
-  prefetch_to_local_cache(src + K_THREE_CACHELINES);
+  inline_memcpy_prefetch(dst, src, K_TWO_CACHELINES);
+  inline_memcpy_prefetch(dst, src, K_THREE_CACHELINES);
   if (count < 256)
     return builtin::Memcpy<128>::head_tail(dst, src, count);
   // Aligning 'dst' on a 32B boundary.
@@ -133,9 +140,9 @@ inline_memcpy_x86_avx_ge64_sw_prefetching(Ptr __restrict dst,
   // - count >= 128.
   while (offset + K_THREE_CACHELINES + 64 <= count) {
     // Three cache lines at a time.
-    prefetch_to_local_cache(src + offset + K_ONE_CACHELINE);
-    prefetch_to_local_cache(src + offset + K_TWO_CACHELINES);
-    prefetch_to_local_cache(src + offset + K_THREE_CACHELINES);
+    inline_memcpy_prefetch(dst, src, offset + K_ONE_CACHELINE);
+    inline_memcpy_prefetch(dst, src, offset + K_TWO_CACHELINES);
+    inline_memcpy_prefetch(dst, src, offset + K_THREE_CACHELINES);
     builtin::Memcpy<K_THREE_CACHELINES>::block_offset(dst, src, offset);
     offset += K_THREE_CACHELINES;
   }



More information about the libc-commits mailing list