[libc-commits] [libc] cb1468d - [libc] Adding a version of memcpy w/ software prefetching
Guillaume Chatelet via libc-commits
libc-commits at lists.llvm.org
Fri Jul 7 03:37:43 PDT 2023
Author: Guillaume Chatelet
Date: 2023-07-07T10:37:32Z
New Revision: cb1468d3cbb7774332647dee3475d4e4f85c21e2
URL: https://github.com/llvm/llvm-project/commit/cb1468d3cbb7774332647dee3475d4e4f85c21e2
DIFF: https://github.com/llvm/llvm-project/commit/cb1468d3cbb7774332647dee3475d4e4f85c21e2.diff
LOG: [libc] Adding a version of memcpy w/ software prefetching
On machines with many cores, hardware prefetchers can saturate the memory bus when utilization is high.
In that case it is desirable to turn the hardware prefetcher off completely.
Doing so has a big impact on the performance of memory functions such as `memcpy` that rely on the next cache line being readily available.
This patch adds the `LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING` compile-time option, which generates a version of memcpy that prefetches in software. While this does not fully restore the original performance, it mitigates the impact to an acceptable level.
Reviewed By: rtenneti
Differential Revision: https://reviews.llvm.org/D154494
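For context, software prefetching means issuing explicit prefetch hints from the copy loop instead of relying on the hardware prefetcher. A minimal standalone sketch of the idea (not the patch's actual code; the cache-line size and one-line prefetch distance are illustrative assumptions, the real implementation tunes both per ISA):

    #include <cstddef>
    #include <cstring>

    constexpr size_t kCacheline = 64; // assumed line size

    // Ask the CPU to start fetching the next cache line before the copy
    // loop needs it. __builtin_prefetch's second argument selects read (0)
    // vs. write (1); the third is temporal locality (3 = keep the line in
    // all cache levels).
    void copy_with_sw_prefetch(char *dst, const char *src, size_t count) {
      size_t offset = 0;
      for (; offset + kCacheline <= count; offset += kCacheline) {
        __builtin_prefetch(src + offset + kCacheline, /*rw=*/0, /*locality=*/3);
        std::memcpy(dst + offset, src + offset, kCacheline);
      }
      std::memcpy(dst + offset, src + offset, count - offset); // copy the tail
    }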
Added:
Modified:
libc/src/string/CMakeLists.txt
libc/src/string/memory_utils/op_builtin.h
libc/src/string/memory_utils/x86_64/memcpy_implementations.h
utils/bazel/llvm-project-overlay/libc/BUILD.bazel
Removed:
################################################################################
diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt
index 330c50ea740dad..b010190d67c1f0 100644
--- a/libc/src/string/CMakeLists.txt
+++ b/libc/src/string/CMakeLists.txt
@@ -572,6 +572,8 @@ if(${LIBC_TARGET_ARCHITECTURE_IS_X86})
add_memcpy(memcpy_x86_64_opt_sse4 COMPILE_OPTIONS -march=nehalem REQUIRE SSE4_2)
add_memcpy(memcpy_x86_64_opt_avx COMPILE_OPTIONS -march=sandybridge REQUIRE AVX)
add_memcpy(memcpy_x86_64_opt_avx512 COMPILE_OPTIONS -march=skylake-avx512 REQUIRE AVX512F)
+ add_memcpy(memcpy_x86_64_opt_sw_prefetch_sse4 COMPILE_OPTIONS -DLIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING -march=nehalem REQUIRE SSE4_2)
+ add_memcpy(memcpy_x86_64_opt_sw_prefetch_avx COMPILE_OPTIONS -DLIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING -march=sandybridge REQUIRE AVX)
add_memcpy(memcpy_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE})
add_memcpy(memcpy)
elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64})
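The two new add_memcpy entries above only add test/benchmark variants; the stock memcpy is unchanged. A downstream build opts in by defining the flag itself, along the lines of (compiler and target flags are illustrative):

    clang++ -O2 -march=sandybridge \
      -DLIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING \
      -c libc/src/string/memcpy.cpp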
diff --git a/libc/src/string/memory_utils/op_builtin.h b/libc/src/string/memory_utils/op_builtin.h
index cf90c02ef1dc16..210ba601920018 100644
--- a/libc/src/string/memory_utils/op_builtin.h
+++ b/libc/src/string/memory_utils/op_builtin.h
@@ -23,19 +23,24 @@ namespace __llvm_libc::builtin {
// Memcpy
template <size_t Size> struct Memcpy {
static constexpr size_t SIZE = Size;
- LIBC_INLINE static void block(Ptr __restrict dst, CPtr __restrict src) {
+ LIBC_INLINE static void block_offset(Ptr __restrict dst, CPtr __restrict src,
+ size_t offset) {
#ifdef LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE
- return __builtin_memcpy_inline(dst, src, SIZE);
+ return __builtin_memcpy_inline(dst + offset, src + offset, SIZE);
#else
// The codegen may be suboptimal.
for (size_t i = 0; i < Size; ++i)
- dst[i] = src[i];
+ dst[i + offset] = src[i + offset];
#endif
}
+ LIBC_INLINE static void block(Ptr __restrict dst, CPtr __restrict src) {
+ block_offset(dst, src, 0);
+ }
+
LIBC_INLINE static void tail(Ptr __restrict dst, CPtr __restrict src,
size_t count) {
- block(dst + count - SIZE, src + count - SIZE);
+ block_offset(dst, src, count - SIZE);
}
LIBC_INLINE static void head_tail(Ptr __restrict dst, CPtr __restrict src,
@@ -44,16 +49,21 @@ template <size_t Size> struct Memcpy {
tail(dst, src, count);
}
- LIBC_INLINE static void loop_and_tail(Ptr __restrict dst, CPtr __restrict src,
- size_t count) {
+ LIBC_INLINE static void loop_and_tail_offset(Ptr __restrict dst,
+ CPtr __restrict src,
+ size_t count, size_t offset) {
static_assert(Size > 1, "a loop of size 1 does not need tail");
- size_t offset = 0;
do {
- block(dst + offset, src + offset);
+ block_offset(dst, src, offset);
offset += SIZE;
} while (offset < count - SIZE);
tail(dst, src, count);
}
+
+ LIBC_INLINE static void loop_and_tail(Ptr __restrict dst, CPtr __restrict src,
+ size_t count) {
+ return loop_and_tail_offset(dst, src, count, 0);
+ }
};
///////////////////////////////////////////////////////////////////////////////
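The new *_offset entry points let the software-prefetching loops further down run a custom-stride copy and then hand their running offset straight to the generic 32B/64B tail loop. A standalone sketch of the same contract on plain char pointers (the real code uses __builtin_memcpy_inline and the internal Ptr/CPtr aliases):

    #include <cstddef>

    template <size_t Size> struct MemcpySketch {
      // Copy Size bytes located 'offset' bytes into both buffers.
      static void block_offset(char *dst, const char *src, size_t offset) {
        for (size_t i = 0; i < Size; ++i)
          dst[i + offset] = src[i + offset];
      }
      // Copy the last Size bytes; overlaps previously copied bytes when
      // count is not a multiple of Size.
      static void tail(char *dst, const char *src, size_t count) {
        block_offset(dst, src, count - Size);
      }
      // Resume a Size-byte copy loop at 'offset', then finish with tail().
      static void loop_and_tail_offset(char *dst, const char *src,
                                       size_t count, size_t offset) {
        static_assert(Size > 1, "a loop of size 1 does not need tail");
        do {
          block_offset(dst, src, offset);
          offset += Size;
        } while (offset < count - Size);
        tail(dst, src, count);
      }
    };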
diff --git a/libc/src/string/memory_utils/x86_64/memcpy_implementations.h b/libc/src/string/memory_utils/x86_64/memcpy_implementations.h
index be870e72365c04..0d6e3710aebf40 100644
--- a/libc/src/string/memory_utils/x86_64/memcpy_implementations.h
+++ b/libc/src/string/memory_utils/x86_64/memcpy_implementations.h
@@ -8,6 +8,7 @@
#ifndef LIBC_SRC_STRING_MEMORY_UTILS_X86_64_MEMCPY_IMPLEMENTATIONS_H
#define LIBC_SRC_STRING_MEMORY_UTILS_X86_64_MEMCPY_IMPLEMENTATIONS_H
+#include "src/__support/macros/attributes.h" // LIBC_INLINE_VAR
#include "src/__support/macros/config.h" // LIBC_INLINE
#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
#include "src/string/memory_utils/op_builtin.h"
@@ -17,28 +18,53 @@
#include <stddef.h> // size_t
#include <stdint.h> // SIZE_MAX
+#ifdef LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB
+#error LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB is deprecated, use LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE=0 instead.
+#endif // LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB
+
+#ifdef LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
+#error LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE is deprecated, use LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE instead.
+#endif // LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
+
namespace __llvm_libc {
+namespace x86 {
+
+LIBC_INLINE_VAR constexpr size_t kOneCacheline = 64;
+LIBC_INLINE_VAR constexpr size_t kTwoCachelines = 2 * kOneCacheline;
+LIBC_INLINE_VAR constexpr size_t kThreeCachelines = 3 * kOneCacheline;
+
+LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetching =
+ LLVM_LIBC_IS_DEFINED(LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING);
+
+// Whether to use rep;movsb exclusively (0), not at all (SIZE_MAX), or only
+// above a certain threshold. Defaults to "do not use rep;movsb".
+#ifndef LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
+#define LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE SIZE_MAX
+#endif
+LIBC_INLINE_VAR constexpr size_t kRepMovsbThreshold =
+ LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE;
+
+} // namespace x86
+
+// TODO: Move to a shared header when appropriate.
+[[maybe_unused]] LIBC_INLINE void prefetch_to_local_cache(const void *addr) {
+ __builtin_prefetch(addr, 0, 3);
+}
+
[[maybe_unused]] LIBC_INLINE void
-inline_memcpy_x86_avx(Ptr __restrict dst, CPtr __restrict src, size_t count) {
- if (count == 0)
- return;
- if (count == 1)
- return builtin::Memcpy<1>::block(dst, src);
- if (count == 2)
- return builtin::Memcpy<2>::block(dst, src);
- if (count == 3)
- return builtin::Memcpy<3>::block(dst, src);
- if (count == 4)
- return builtin::Memcpy<4>::block(dst, src);
- if (count < 8)
- return builtin::Memcpy<4>::head_tail(dst, src, count);
- if (count < 16)
- return builtin::Memcpy<8>::head_tail(dst, src, count);
- if (count < 32)
- return builtin::Memcpy<16>::head_tail(dst, src, count);
- if (count < 64)
- return builtin::Memcpy<32>::head_tail(dst, src, count);
+inline_memcpy_x86_sse2_ge64(Ptr __restrict dst, CPtr __restrict src,
+ size_t count) {
+ if (count < 128)
+ return builtin::Memcpy<64>::head_tail(dst, src, count);
+ builtin::Memcpy<32>::block(dst, src);
+ align_to_next_boundary<32, Arg::Dst>(dst, src, count);
+ return builtin::Memcpy<32>::loop_and_tail(dst, src, count);
+}
+
+[[maybe_unused]] LIBC_INLINE void
+inline_memcpy_x86_avx_ge64(Ptr __restrict dst, CPtr __restrict src,
+ size_t count) {
if (count < 128)
return builtin::Memcpy<64>::head_tail(dst, src, count);
if (count < 256)
@@ -48,9 +74,81 @@ inline_memcpy_x86_avx(Ptr __restrict dst, CPtr __restrict src, size_t count) {
return builtin::Memcpy<64>::loop_and_tail(dst, src, count);
}
-[[maybe_unused]] LIBC_INLINE void inline_memcpy_x86_no_avx(Ptr __restrict dst,
- CPtr __restrict src,
- size_t count) {
+[[maybe_unused]] LIBC_INLINE void
+inline_memcpy_x86_sse2_ge64_sw_prefetching(Ptr __restrict dst,
+ CPtr __restrict src, size_t count) {
+ using namespace __llvm_libc::x86;
+ prefetch_to_local_cache(src + kOneCacheline);
+ if (count < 128)
+ return builtin::Memcpy<64>::head_tail(dst, src, count);
+ prefetch_to_local_cache(src + kTwoCachelines);
+ // Aligning 'dst' on a 32B boundary.
+ builtin::Memcpy<32>::block(dst, src);
+ align_to_next_boundary<32, Arg::Dst>(dst, src, count);
+ builtin::Memcpy<96>::block(dst, src);
+ size_t offset = 96;
+ // At this point:
+ // - we copied between 96B and 128B,
+ // - we prefetched cachelines at 'src + 64' and 'src + 128',
+ // - 'dst' is 32B aligned,
+ // - count >= 128.
+ if (count < 352) {
+ // Two cache lines at a time.
+ while (offset + kTwoCachelines + 32 <= count) {
+ prefetch_to_local_cache(src + offset + kOneCacheline);
+ prefetch_to_local_cache(src + offset + kTwoCachelines);
+ builtin::Memcpy<kTwoCachelines>::block_offset(dst, src, offset);
+ offset += kTwoCachelines;
+ }
+ } else {
+ // Three cache lines at a time.
+ while (offset + kThreeCachelines + 32 <= count) {
+ prefetch_to_local_cache(src + offset + kOneCacheline);
+ prefetch_to_local_cache(src + offset + kTwoCachelines);
+ prefetch_to_local_cache(src + offset + kThreeCachelines);
+ // It is likely that this copy will be turned into a 'rep;movsb' on
+ // non-AVX machines.
+ builtin::Memcpy<kThreeCachelines>::block_offset(dst, src, offset);
+ offset += kThreeCachelines;
+ }
+ }
+ return builtin::Memcpy<32>::loop_and_tail_offset(dst, src, count, offset);
+}
+
+[[maybe_unused]] LIBC_INLINE void
+inline_memcpy_x86_avx_ge64_sw_prefetching(Ptr __restrict dst,
+ CPtr __restrict src, size_t count) {
+ using namespace __llvm_libc::x86;
+ prefetch_to_local_cache(src + kOneCacheline);
+ if (count < 128)
+ return builtin::Memcpy<64>::head_tail(dst, src, count);
+ prefetch_to_local_cache(src + kTwoCachelines);
+ prefetch_to_local_cache(src + kThreeCachelines);
+ if (count < 256)
+ return builtin::Memcpy<128>::head_tail(dst, src, count);
+ // Aligning 'dst' on a 32B boundary.
+ builtin::Memcpy<32>::block(dst, src);
+ align_to_next_boundary<32, Arg::Dst>(dst, src, count);
+ builtin::Memcpy<224>::block(dst, src);
+ size_t offset = 224;
+ // At this point:
+ // - we copied between 224B and 256B,
+ // - we prefetched cachelines at 'src + 64', 'src + 128', and 'src + 192',
+ // - 'dst' is 32B aligned,
+ // - count >= 256.
+ while (offset + kThreeCachelines + 64 <= count) {
+ // Three cache lines at a time.
+ prefetch_to_local_cache(src + offset + kOneCacheline);
+ prefetch_to_local_cache(src + offset + kTwoCachelines);
+ prefetch_to_local_cache(src + offset + kThreeCachelines);
+ builtin::Memcpy<kThreeCachelines>::block_offset(dst, src, offset);
+ offset += kThreeCachelines;
+ }
+ return builtin::Memcpy<64>::loop_and_tail_offset(dst, src, count, offset);
+}
+
+[[maybe_unused]] LIBC_INLINE void
+inline_memcpy_x86(Ptr __restrict dst, CPtr __restrict src, size_t count) {
if (count == 0)
return;
if (count == 1)
@@ -69,46 +167,30 @@ inline_memcpy_x86_avx(Ptr __restrict dst, CPtr __restrict src, size_t count) {
return builtin::Memcpy<16>::head_tail(dst, src, count);
if (count < 64)
return builtin::Memcpy<32>::head_tail(dst, src, count);
- if (count < 128)
- return builtin::Memcpy<64>::head_tail(dst, src, count);
- builtin::Memcpy<32>::block(dst, src);
- align_to_next_boundary<32, Arg::Dst>(dst, src, count);
- return builtin::Memcpy<32>::loop_and_tail(dst, src, count);
-}
-
-[[maybe_unused]] LIBC_INLINE void
-inline_memcpy_x86(Ptr __restrict dst, CPtr __restrict src, size_t count) {
- if constexpr (x86::kAvx)
- return inline_memcpy_x86_avx(dst, src, count);
- else
- return inline_memcpy_x86_no_avx(dst, src, count);
+ if constexpr (x86::kAvx) {
+ if constexpr (x86::kUseSoftwarePrefetching) {
+ return inline_memcpy_x86_avx_ge64_sw_prefetching(dst, src, count);
+ } else {
+ return inline_memcpy_x86_avx_ge64(dst, src, count);
+ }
+ } else {
+ if constexpr (x86::kUseSoftwarePrefetching) {
+ return inline_memcpy_x86_sse2_ge64_sw_prefetching(dst, src, count);
+ } else {
+ return inline_memcpy_x86_sse2_ge64(dst, src, count);
+ }
+ }
}
[[maybe_unused]] LIBC_INLINE void
inline_memcpy_x86_maybe_interpose_repmovsb(Ptr __restrict dst,
CPtr __restrict src, size_t count) {
- // Whether to use rep;movsb exclusively, not at all, or only above a certain
- // threshold.
-#ifndef LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
-#define LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE SIZE_MAX
-#endif
-
-#ifdef LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB
-#error LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB is deprecated use LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE=0 instead.
-#endif // LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB
-
-#ifdef LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
-#error LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE is deprecated use LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE=0 instead.
-#endif // LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
-
- static constexpr size_t kRepMovsbThreshold =
- LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE;
- if constexpr (kRepMovsbThreshold == 0) {
+ if constexpr (x86::kRepMovsbThreshold == 0) {
return x86::Memcpy::repmovsb(dst, src, count);
- } else if constexpr (kRepMovsbThreshold == SIZE_MAX) {
+ } else if constexpr (x86::kRepMovsbThreshold == SIZE_MAX) {
return inline_memcpy_x86(dst, src, count);
} else {
- if (LIBC_UNLIKELY(count >= kRepMovsbThreshold))
+ if (LIBC_UNLIKELY(count >= x86::kRepMovsbThreshold))
return x86::Memcpy::repmovsb(dst, src, count);
else
return inline_memcpy_x86(dst, src, count);
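Because x86::kRepMovsbThreshold is constexpr, the two degenerate configurations (0 and SIZE_MAX) fold away at compile time and only the intermediate configuration keeps a runtime branch. A standalone sketch of this dispatch pattern, with assumed helper names and the usual x86-64 rep;movsb inline-asm idiom:

    #include <cstddef>
    #include <cstdint> // SIZE_MAX

    constexpr size_t kThreshold = SIZE_MAX; // "never use rep;movsb"

    [[maybe_unused]] static void copy_loop(char *d, const char *s, size_t n) {
      for (size_t i = 0; i < n; ++i)
        d[i] = s[i];
    }
    [[maybe_unused]] static void copy_repmovsb(char *d, const char *s,
                                               size_t n) {
      asm volatile("rep movsb" : "+D"(d), "+S"(s), "+c"(n) : : "memory");
    }

    void copy(char *dst, const char *src, size_t count) {
      if constexpr (kThreshold == 0) {
        copy_repmovsb(dst, src, count); // always rep;movsb
      } else if constexpr (kThreshold == SIZE_MAX) {
        copy_loop(dst, src, count); // never rep;movsb
      } else {
        if (count >= kThreshold) // the only configuration with a runtime test
          copy_repmovsb(dst, src, count);
        else
          copy_loop(dst, src, count);
      }
    }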
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index f267d39da6b39e..7ace0fb45a660c 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -26,6 +26,11 @@ PRINTF_COPTS = [
"LIBC_COPT_PRINTF_DISABLE_WRITE_INT",
]
+MEMORY_COPTS = [
+ # "LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE=0",
+ # "LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING",
+]
+
# A flag to pick which `mpfr` to use for math tests.
# Usage: `--@llvm-project//libc:mpfr=<disable|external|system>`.
# Flag documentation: https://bazel.build/extending/config
@@ -448,10 +453,10 @@ libc_support_library(
":__support_cpp_limits",
":__support_cpp_optional",
":__support_ctype_utils",
+ ":__support_fputil_dyadic_float",
":__support_fputil_fenv_impl",
":__support_fputil_fp_bits",
":__support_fputil_rounding_mode",
- ":__support_fputil_dyadic_float",
":__support_str_to_integer",
":__support_str_to_num_result",
":__support_uint128",
@@ -1147,8 +1152,8 @@ libc_support_library(
hdrs = ["src/math/generic/log_range_reduction.h"],
deps = [
":__support_common",
- ":__support_uint128",
":__support_fputil_dyadic_float",
+ ":__support_uint128",
":common_constants",
],
)
@@ -2002,6 +2007,7 @@ libc_support_library(
"src/string/memory_utils/op_x86.h",
"src/string/memory_utils/utils.h",
],
+ defines = MEMORY_COPTS,
textual_hdrs = [
"src/string/memory_utils/aarch64/memcmp_implementations.h",
"src/string/memory_utils/aarch64/memcpy_implementations.h",