[libc-commits] [libc] [libc] Optimize memcpy size thresholds (PR #70049)
Dmitry Vyukov via libc-commits
libc-commits at lists.llvm.org
Fri Oct 27 22:35:25 PDT 2023
https://github.com/dvyukov updated https://github.com/llvm/llvm-project/pull/70049
From 4092097c698b64e23bf65e3fc89fa882ab20e783 Mon Sep 17 00:00:00 2001
From: Dmitry Vyukov <dvyukov at google.com>
Date: Tue, 24 Oct 2023 16:23:48 +0200
Subject: [PATCH] [libc] Optimize memcpy size thresholds
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Adjust the boundary conditions for sizes 16/32/64.
See the comment added in the patch for the explanation.
Results are from a machine with AVX2, so the size 64/128 boundaries are affected:
                 │  baseline   │            adjusted            │
                 │   sec/op    │   sec/op     vs base           │
memcpy/Google_A    5.701n ± 0%   5.551n ± 1%   -2.63% (n=100)
memcpy/Google_B    3.817n ± 0%   3.776n ± 0%   -1.07% (p=0.000 n=100)
memcpy/Google_D    11.35n ± 1%   11.32n ± 0%        ~ (p=0.066 n=100)
memcpy/Google_U    3.874n ± 1%   3.821n ± 1%   -1.37% (p=0.001 n=100)
memcpy/64          3.843n ± 0%   3.105n ± 3%  -19.22% (n=50)
memcpy/128         4.842n ± 0%   3.818n ± 0%  -21.15% (p=0.000 n=50)
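For readers unfamiliar with the head-tail trick used throughout this file,
here is a minimal model of Memcpy<N>::head_tail (a hypothetical
simplification built on plain memcpy; the real builtin::Memcpy in
libc/src/string/memory_utils uses typed block loads/stores):

#include <stddef.h>
#include <string.h>

// Hypothetical model of Memcpy<N>::head_tail, valid for N <= count <= 2*N:
// copy one fixed-size block at the front and one ending at the back. The
// blocks overlap when count < 2*N and collapse onto the same bytes when
// count == N, so a single branch covers a whole range of sizes.
template <size_t N>
void head_tail_model(char *dst, const char *src, size_t count) {
  memcpy(dst, src, N);                         // head: bytes [0, N)
  memcpy(dst + count - N, src + count - N, N); // tail: bytes [count-N, count)
}

For example, head_tail_model<32> handles any count in [32, 64]: at
count == 64 the two blocks are exactly adjacent, and at count == 32 they
are the same block. A power-of-2 count can therefore be served by either
the smaller or the larger block size, which is what the threshold
adjustment below exploits.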
---
.../memory_utils/x86_64/inline_memcpy.h | 32 +++++++++++++++----
1 file changed, 25 insertions(+), 7 deletions(-)
diff --git a/libc/src/string/memory_utils/x86_64/inline_memcpy.h b/libc/src/string/memory_utils/x86_64/inline_memcpy.h
index f43230ffd8ad125..a1e2985fc1f89ed 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memcpy.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memcpy.h
@@ -55,7 +55,7 @@ LIBC_INLINE_VAR constexpr size_t kRepMovsbThreshold =
[[maybe_unused]] LIBC_INLINE void
inline_memcpy_x86_sse2_ge64(Ptr __restrict dst, CPtr __restrict src,
size_t count) {
- if (count < 128)
+ if (count <= 128)
return builtin::Memcpy<64>::head_tail(dst, src, count);
builtin::Memcpy<32>::block(dst, src);
align_to_next_boundary<32, Arg::Dst>(dst, src, count);
@@ -65,7 +65,7 @@ inline_memcpy_x86_sse2_ge64(Ptr __restrict dst, CPtr __restrict src,
[[maybe_unused]] LIBC_INLINE void
inline_memcpy_x86_avx_ge64(Ptr __restrict dst, CPtr __restrict src,
size_t count) {
- if (count < 128)
+ if (count <= 128)
return builtin::Memcpy<64>::head_tail(dst, src, count);
if (count < 256)
return builtin::Memcpy<128>::head_tail(dst, src, count);
@@ -79,7 +79,7 @@ inline_memcpy_x86_sse2_ge64_sw_prefetching(Ptr __restrict dst,
CPtr __restrict src, size_t count) {
using namespace LIBC_NAMESPACE::x86;
prefetch_to_local_cache(src + kOneCacheline);
- if (count < 128)
+ if (count <= 128)
return builtin::Memcpy<64>::head_tail(dst, src, count);
prefetch_to_local_cache(src + kTwoCachelines);
// Aligning 'dst' on a 32B boundary.
@@ -120,7 +120,7 @@ inline_memcpy_x86_avx_ge64_sw_prefetching(Ptr __restrict dst,
CPtr __restrict src, size_t count) {
using namespace LIBC_NAMESPACE::x86;
prefetch_to_local_cache(src + kOneCacheline);
- if (count < 128)
+ if (count <= 128)
return builtin::Memcpy<64>::head_tail(dst, src, count);
prefetch_to_local_cache(src + kTwoCachelines);
prefetch_to_local_cache(src + kThreeCachelines);
@@ -149,6 +149,15 @@ inline_memcpy_x86_avx_ge64_sw_prefetching(Ptr __restrict dst,
[[maybe_unused]] LIBC_INLINE void
inline_memcpy_x86(Ptr __restrict dst, CPtr __restrict src, size_t count) {
+#if defined(__AVX512F__)
+ constexpr size_t vector_size = 64;
+#elif defined(__AVX__)
+ constexpr size_t vector_size = 32;
+#elif defined(__SSE2__)
+ constexpr size_t vector_size = 16;
+#else
+ constexpr size_t vector_size = 8;
+#endif
if (count == 0)
return;
if (count == 1)
@@ -161,11 +170,20 @@ inline_memcpy_x86(Ptr __restrict dst, CPtr __restrict src, size_t count) {
return builtin::Memcpy<4>::block(dst, src);
if (count < 8)
return builtin::Memcpy<4>::head_tail(dst, src, count);
- if (count < 16)
+ // If count is equal to a power of 2, we can handle it as head-tail
+ // of both smaller size and larger size (head-tail are either
+ // non-overlapping for smaller size, or completely collapsed
+ // for larger size). It seems to be more profitable to do the copy
+ // with the larger size, if it's natively supported (e.g. doing
+ // 2 collapsed 32-byte moves for count=64 if AVX2 is supported).
+ // But it's not profitable to use larger size if it's not natively
+ // supported: we will both use more instructions and handle fewer
+ // sizes in earlier branches.
+ if (vector_size >= 16 ? count < 16 : count <= 16)
return builtin::Memcpy<8>::head_tail(dst, src, count);
- if (count < 32)
+ if (vector_size >= 32 ? count < 32 : count <= 32)
return builtin::Memcpy<16>::head_tail(dst, src, count);
- if (count < 64)
+ if (vector_size >= 64 ? count < 64 : count <= 64)
return builtin::Memcpy<32>::head_tail(dst, src, count);
if constexpr (x86::kAvx) {
if constexpr (x86::kUseSoftwarePrefetching) {
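To make the adjusted conditions concrete, here is a hypothetical condensed
model of the small-size dispatch above (pick_block_size is an illustrative
helper, not part of the patch; vector_size mirrors the constant the patch
introduces):

#include <stddef.h>

// For a power-of-2 count N, head_tail<N/2> does two adjacent N/2-byte
// block copies while head_tail<N> does two fully collapsed N-byte block
// copies; the patch routes count == N to the larger block only when the
// hardware has a native vector at least N bytes wide.
size_t pick_block_size(size_t count, size_t vector_size) {
  if (vector_size >= 16 ? count < 16 : count <= 16)
    return 8;
  if (vector_size >= 32 ? count < 32 : count <= 32)
    return 16;
  if (vector_size >= 64 ? count < 64 : count <= 64)
    return 32;
  return 64; // handled by the ge64 paths in the real code
}

For example, pick_block_size(32, 32) yields 32 on an AVX2 machine
(count == 32 becomes two fully collapsed native 32-byte moves), while
pick_block_size(32, 16) yields 16 on SSE2 (two adjacent native 16-byte
moves rather than four moves for a 32-byte head-tail).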