[libc-commits] [libc] [libc][x86] Use prefetch for write for memcpy (PR #90450)
Guillaume Chatelet via libc-commits
libc-commits at lists.llvm.org
Mon Apr 29 03:41:36 PDT 2024
https://github.com/gchatelet created https://github.com/llvm/llvm-project/pull/90450
Currently, when `LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING` is set, we only issue a prefetch-for-read on the source buffer. This patch additionally issues a prefetch-for-write on the destination buffer.
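For readers less familiar with the distinction, the sketch below illustrates prefetch-for-read vs. prefetch-for-write in terms of the GCC/Clang `__builtin_prefetch(addr, rw, locality)` builtin. It is a standalone illustration, not the libc implementation: the `demo_*` helpers are hypothetical names, and the assumption that `prefetch_to_local_cache` / `prefetch_for_write` map onto this builtin is mine, not stated in the patch.

```cpp
#include <cstddef>

// Prefetch for read: rw = 0 requests the cache line in a read-friendly
// (shared) state, which is what the existing code does on the source buffer.
static inline void demo_prefetch_for_read(const void *addr) {
  __builtin_prefetch(addr, /*rw=*/0, /*locality=*/3);
}

// Prefetch for write: rw = 1 hints that the line will be written, so the
// hardware can acquire it in an exclusive state up front instead of paying
// a read-for-ownership when the store later executes.
static inline void demo_prefetch_for_write(const void *addr) {
  __builtin_prefetch(addr, /*rw=*/1, /*locality=*/3);
}

// Mirrors the shape of the helper added by the patch: prefetch the source
// for read and the destination for write, 'distance' bytes ahead of the copy.
static inline void demo_prefetch_pair(char *dst, const char *src,
                                      size_t distance) {
  demo_prefetch_for_read(src + distance);
  demo_prefetch_for_write(dst + distance);
}
```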
From 88b4b3e7710ba8393d1be6a8e64181e2688a4c78 Mon Sep 17 00:00:00 2001
From: Guillaume Chatelet <gchatelet at google.com>
Date: Mon, 29 Apr 2024 10:41:15 +0000
Subject: [PATCH] [libc][x86] Use prefetch for write for memcpy
Currently, when `LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING` is set, we only issue a prefetch-for-read on the source buffer. This patch additionally issues a prefetch-for-write on the destination buffer.
---
.../memory_utils/x86_64/inline_memcpy.h | 33 +++++++++++--------
1 file changed, 20 insertions(+), 13 deletions(-)
diff --git a/libc/src/string/memory_utils/x86_64/inline_memcpy.h b/libc/src/string/memory_utils/x86_64/inline_memcpy.h
index ae61b1235bd08c..7a4e70bdf2d150 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memcpy.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memcpy.h
@@ -69,14 +69,21 @@ inline_memcpy_x86_avx_ge64(Ptr __restrict dst, CPtr __restrict src,
return builtin::Memcpy<64>::loop_and_tail(dst, src, count);
}
+[[maybe_unused]] LIBC_INLINE void inline_memcpy_prefetch(Ptr __restrict dst,
+ CPtr __restrict src,
+ size_t distance) {
+ prefetch_to_local_cache(src + distance);
+ prefetch_for_write(dst + distance);
+}
+
[[maybe_unused]] LIBC_INLINE void
inline_memcpy_x86_sse2_ge64_sw_prefetching(Ptr __restrict dst,
CPtr __restrict src, size_t count) {
using namespace LIBC_NAMESPACE::x86;
- prefetch_to_local_cache(src + K_ONE_CACHELINE);
+ inline_memcpy_prefetch(dst, src, K_ONE_CACHELINE);
if (count <= 128)
return builtin::Memcpy<64>::head_tail(dst, src, count);
- prefetch_to_local_cache(src + K_TWO_CACHELINES);
+ inline_memcpy_prefetch(dst, src, K_TWO_CACHELINES);
// Aligning 'dst' on a 32B boundary.
builtin::Memcpy<32>::block(dst, src);
align_to_next_boundary<32, Arg::Dst>(dst, src, count);
@@ -90,17 +97,17 @@ inline_memcpy_x86_sse2_ge64_sw_prefetching(Ptr __restrict dst,
if (count < 352) {
// Two cache lines at a time.
while (offset + K_TWO_CACHELINES + 32 <= count) {
- prefetch_to_local_cache(src + offset + K_ONE_CACHELINE);
- prefetch_to_local_cache(src + offset + K_TWO_CACHELINES);
+ inline_memcpy_prefetch(dst, src, offset + K_ONE_CACHELINE);
+ inline_memcpy_prefetch(dst, src, offset + K_TWO_CACHELINES);
builtin::Memcpy<K_TWO_CACHELINES>::block_offset(dst, src, offset);
offset += K_TWO_CACHELINES;
}
} else {
// Three cache lines at a time.
while (offset + K_THREE_CACHELINES + 32 <= count) {
- prefetch_to_local_cache(src + offset + K_ONE_CACHELINE);
- prefetch_to_local_cache(src + offset + K_TWO_CACHELINES);
- prefetch_to_local_cache(src + offset + K_THREE_CACHELINES);
+ inline_memcpy_prefetch(dst, src, offset + K_ONE_CACHELINE);
+ inline_memcpy_prefetch(dst, src, offset + K_TWO_CACHELINES);
+ inline_memcpy_prefetch(dst, src, offset + K_THREE_CACHELINES);
// It is likely that this copy will be turned into a 'rep;movsb' on
// non-AVX machines.
builtin::Memcpy<K_THREE_CACHELINES>::block_offset(dst, src, offset);
@@ -114,11 +121,11 @@ inline_memcpy_x86_sse2_ge64_sw_prefetching(Ptr __restrict dst,
inline_memcpy_x86_avx_ge64_sw_prefetching(Ptr __restrict dst,
CPtr __restrict src, size_t count) {
using namespace LIBC_NAMESPACE::x86;
- prefetch_to_local_cache(src + K_ONE_CACHELINE);
+ inline_memcpy_prefetch(dst, src, K_ONE_CACHELINE);
if (count <= 128)
return builtin::Memcpy<64>::head_tail(dst, src, count);
- prefetch_to_local_cache(src + K_TWO_CACHELINES);
- prefetch_to_local_cache(src + K_THREE_CACHELINES);
+ inline_memcpy_prefetch(dst, src, K_TWO_CACHELINES);
+ inline_memcpy_prefetch(dst, src, K_THREE_CACHELINES);
if (count < 256)
return builtin::Memcpy<128>::head_tail(dst, src, count);
// Aligning 'dst' on a 32B boundary.
@@ -133,9 +140,9 @@ inline_memcpy_x86_avx_ge64_sw_prefetching(Ptr __restrict dst,
// - count >= 128.
while (offset + K_THREE_CACHELINES + 64 <= count) {
// Three cache lines at a time.
- prefetch_to_local_cache(src + offset + K_ONE_CACHELINE);
- prefetch_to_local_cache(src + offset + K_TWO_CACHELINES);
- prefetch_to_local_cache(src + offset + K_THREE_CACHELINES);
+ inline_memcpy_prefetch(dst, src, offset + K_ONE_CACHELINE);
+ inline_memcpy_prefetch(dst, src, offset + K_TWO_CACHELINES);
+ inline_memcpy_prefetch(dst, src, offset + K_THREE_CACHELINES);
builtin::Memcpy<K_THREE_CACHELINES>::block_offset(dst, src, offset);
offset += K_THREE_CACHELINES;
}