[libc-commits] [libc] cb1468d - [libc] Adding a version of memcpy w/ software prefetching

Guillaume Chatelet via libc-commits libc-commits at lists.llvm.org
Fri Jul 7 03:37:43 PDT 2023


Author: Guillaume Chatelet
Date: 2023-07-07T10:37:32Z
New Revision: cb1468d3cbb7774332647dee3475d4e4f85c21e2

URL: https://github.com/llvm/llvm-project/commit/cb1468d3cbb7774332647dee3475d4e4f85c21e2
DIFF: https://github.com/llvm/llvm-project/commit/cb1468d3cbb7774332647dee3475d4e4f85c21e2.diff

LOG: [libc] Adding a version of memcpy w/ software prefetching

For machines with a lot of cores, hardware prefetchers can saturate the memory bus when utilization is high.
In this case it is desirable to turn off the hardware prefetcher completely.
This has a big impact on the performance of memory functions such as `memcpy` that rely on the fact that the next cache line will be readily available.

This patch adds the 'LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING' compile-time option that generates a version of memcpy with software prefetching. While not fully restoring the original performance, it mitigates the impact to an acceptable level.

Reviewed By: rtenneti

Differential Revision: https://reviews.llvm.org/D154494

Added: 
    

Modified: 
    libc/src/string/CMakeLists.txt
    libc/src/string/memory_utils/op_builtin.h
    libc/src/string/memory_utils/x86_64/memcpy_implementations.h
    utils/bazel/llvm-project-overlay/libc/BUILD.bazel

Removed: 
    


################################################################################
diff  --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt
index 330c50ea740dad..b010190d67c1f0 100644
--- a/libc/src/string/CMakeLists.txt
+++ b/libc/src/string/CMakeLists.txt
@@ -572,6 +572,8 @@ if(${LIBC_TARGET_ARCHITECTURE_IS_X86})
   add_memcpy(memcpy_x86_64_opt_sse4   COMPILE_OPTIONS -march=nehalem        REQUIRE SSE4_2)
   add_memcpy(memcpy_x86_64_opt_avx    COMPILE_OPTIONS -march=sandybridge    REQUIRE AVX)
   add_memcpy(memcpy_x86_64_opt_avx512 COMPILE_OPTIONS -march=skylake-avx512 REQUIRE AVX512F)
+  add_memcpy(memcpy_x86_64_opt_sw_prefetch_sse4   COMPILE_OPTIONS -DLIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING -march=nehalem        REQUIRE SSE4_2)
+  add_memcpy(memcpy_x86_64_opt_sw_prefetch_avx    COMPILE_OPTIONS -DLIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING -march=sandybridge    REQUIRE AVX)
   add_memcpy(memcpy_opt_host          COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE})
   add_memcpy(memcpy)
 elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64})

diff  --git a/libc/src/string/memory_utils/op_builtin.h b/libc/src/string/memory_utils/op_builtin.h
index cf90c02ef1dc16..210ba601920018 100644
--- a/libc/src/string/memory_utils/op_builtin.h
+++ b/libc/src/string/memory_utils/op_builtin.h
@@ -23,19 +23,24 @@ namespace __llvm_libc::builtin {
 // Memcpy
 template <size_t Size> struct Memcpy {
   static constexpr size_t SIZE = Size;
-  LIBC_INLINE static void block(Ptr __restrict dst, CPtr __restrict src) {
+  LIBC_INLINE static void block_offset(Ptr __restrict dst, CPtr __restrict src,
+                                       size_t offset) {
 #ifdef LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE
-    return __builtin_memcpy_inline(dst, src, SIZE);
+    return __builtin_memcpy_inline(dst + offset, src + offset, SIZE);
 #else
     // The codegen may be suboptimal.
     for (size_t i = 0; i < Size; ++i)
-      dst[i] = src[i];
+      dst[i + offset] = src[i + offset];
 #endif
   }
 
+  LIBC_INLINE static void block(Ptr __restrict dst, CPtr __restrict src) {
+    block_offset(dst, src, 0);
+  }
+
   LIBC_INLINE static void tail(Ptr __restrict dst, CPtr __restrict src,
                                size_t count) {
-    block(dst + count - SIZE, src + count - SIZE);
+    block_offset(dst, src, count - SIZE);
   }
 
   LIBC_INLINE static void head_tail(Ptr __restrict dst, CPtr __restrict src,
@@ -44,16 +49,21 @@ template <size_t Size> struct Memcpy {
     tail(dst, src, count);
   }
 
-  LIBC_INLINE static void loop_and_tail(Ptr __restrict dst, CPtr __restrict src,
-                                        size_t count) {
+  LIBC_INLINE static void loop_and_tail_offset(Ptr __restrict dst,
+                                               CPtr __restrict src,
+                                               size_t count, size_t offset) {
     static_assert(Size > 1, "a loop of size 1 does not need tail");
-    size_t offset = 0;
     do {
-      block(dst + offset, src + offset);
+      block_offset(dst, src, offset);
       offset += SIZE;
     } while (offset < count - SIZE);
     tail(dst, src, count);
   }
+
+  LIBC_INLINE static void loop_and_tail(Ptr __restrict dst, CPtr __restrict src,
+                                        size_t count) {
+    return loop_and_tail_offset(dst, src, count, 0);
+  }
 };
 
 ///////////////////////////////////////////////////////////////////////////////

diff  --git a/libc/src/string/memory_utils/x86_64/memcpy_implementations.h b/libc/src/string/memory_utils/x86_64/memcpy_implementations.h
index be870e72365c04..0d6e3710aebf40 100644
--- a/libc/src/string/memory_utils/x86_64/memcpy_implementations.h
+++ b/libc/src/string/memory_utils/x86_64/memcpy_implementations.h
@@ -8,6 +8,7 @@
 #ifndef LIBC_SRC_STRING_MEMORY_UTILS_X86_64_MEMCPY_IMPLEMENTATIONS_H
 #define LIBC_SRC_STRING_MEMORY_UTILS_X86_64_MEMCPY_IMPLEMENTATIONS_H
 
+#include "src/__support/macros/attributes.h"   // LIBC_INLINE_VAR
 #include "src/__support/macros/config.h"       // LIBC_INLINE
 #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
 #include "src/string/memory_utils/op_builtin.h"
@@ -17,28 +18,53 @@
 #include <stddef.h> // size_t
 #include <stdint.h> // SIZE_MAX
 
+#ifdef LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB
+#error LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB is deprecated use LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE=0 instead.
+#endif // LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB
+
+#ifdef LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
+#error LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE is deprecated use LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE=0 instead.
+#endif // LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
+
 namespace __llvm_libc {
 
+namespace x86 {
+
+LIBC_INLINE_VAR constexpr size_t kOneCacheline = 64;
+LIBC_INLINE_VAR constexpr size_t kTwoCachelines = 2 * kOneCacheline;
+LIBC_INLINE_VAR constexpr size_t kThreeCachelines = 3 * kOneCacheline;
+
+LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetching =
+    LLVM_LIBC_IS_DEFINED(LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING);
+
+// Whether to use rep;movsb exclusively (0), not at all (SIZE_MAX), or only
+// above a certain threshold. Defaults to "do not use rep;movsb".
+#ifndef LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
+#define LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE SIZE_MAX
+#endif
+LIBC_INLINE_VAR constexpr size_t kRepMovsbThreshold =
+    LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE;
+
+} // namespace x86
+
+// TODO: Move to a shared header when appropriate.
+[[maybe_unused]] LIBC_INLINE void prefetch_to_local_cache(const void *addr) {
+  __builtin_prefetch(addr, 0, 3);
+}
+
 [[maybe_unused]] LIBC_INLINE void
-inline_memcpy_x86_avx(Ptr __restrict dst, CPtr __restrict src, size_t count) {
-  if (count == 0)
-    return;
-  if (count == 1)
-    return builtin::Memcpy<1>::block(dst, src);
-  if (count == 2)
-    return builtin::Memcpy<2>::block(dst, src);
-  if (count == 3)
-    return builtin::Memcpy<3>::block(dst, src);
-  if (count == 4)
-    return builtin::Memcpy<4>::block(dst, src);
-  if (count < 8)
-    return builtin::Memcpy<4>::head_tail(dst, src, count);
-  if (count < 16)
-    return builtin::Memcpy<8>::head_tail(dst, src, count);
-  if (count < 32)
-    return builtin::Memcpy<16>::head_tail(dst, src, count);
-  if (count < 64)
-    return builtin::Memcpy<32>::head_tail(dst, src, count);
+inline_memcpy_x86_sse2_ge64(Ptr __restrict dst, CPtr __restrict src,
+                            size_t count) {
+  if (count < 128)
+    return builtin::Memcpy<64>::head_tail(dst, src, count);
+  builtin::Memcpy<32>::block(dst, src);
+  align_to_next_boundary<32, Arg::Dst>(dst, src, count);
+  return builtin::Memcpy<32>::loop_and_tail(dst, src, count);
+}
+
+[[maybe_unused]] LIBC_INLINE void
+inline_memcpy_x86_avx_ge64(Ptr __restrict dst, CPtr __restrict src,
+                           size_t count) {
   if (count < 128)
     return builtin::Memcpy<64>::head_tail(dst, src, count);
   if (count < 256)
@@ -48,9 +74,81 @@ inline_memcpy_x86_avx(Ptr __restrict dst, CPtr __restrict src, size_t count) {
   return builtin::Memcpy<64>::loop_and_tail(dst, src, count);
 }
 
-[[maybe_unused]] LIBC_INLINE void inline_memcpy_x86_no_avx(Ptr __restrict dst,
-                                                           CPtr __restrict src,
-                                                           size_t count) {
+[[maybe_unused]] LIBC_INLINE void
+inline_memcpy_x86_sse2_ge64_sw_prefetching(Ptr __restrict dst,
+                                           CPtr __restrict src, size_t count) {
+  using namespace __llvm_libc::x86;
+  prefetch_to_local_cache(src + kOneCacheline);
+  if (count < 128)
+    return builtin::Memcpy<64>::head_tail(dst, src, count);
+  prefetch_to_local_cache(src + kTwoCachelines);
+  // Aligning 'dst' on a 32B boundary.
+  builtin::Memcpy<32>::block(dst, src);
+  align_to_next_boundary<32, Arg::Dst>(dst, src, count);
+  builtin::Memcpy<96>::block(dst, src);
+  size_t offset = 96;
+  // At this point:
+  // - we copied between 96B and 128B,
+  // - we prefetched cachelines at 'src + 64' and 'src + 128',
+  // - 'dst' is 32B aligned,
+  // - count >= 128.
+  if (count < 352) {
+    // Two cache lines at a time.
+    while (offset + kTwoCachelines + 32 <= count) {
+      prefetch_to_local_cache(src + offset + kOneCacheline);
+      prefetch_to_local_cache(src + offset + kTwoCachelines);
+      builtin::Memcpy<kTwoCachelines>::block_offset(dst, src, offset);
+      offset += kTwoCachelines;
+    }
+  } else {
+    // Three cache lines at a time.
+    while (offset + kThreeCachelines + 32 <= count) {
+      prefetch_to_local_cache(src + offset + kOneCacheline);
+      prefetch_to_local_cache(src + offset + kTwoCachelines);
+      prefetch_to_local_cache(src + offset + kThreeCachelines);
+      // It is likely that this copy will be turned into a 'rep;movsb' on
+      // non-AVX machines.
+      builtin::Memcpy<kThreeCachelines>::block_offset(dst, src, offset);
+      offset += kThreeCachelines;
+    }
+  }
+  return builtin::Memcpy<32>::loop_and_tail_offset(dst, src, count, offset);
+}
+
+[[maybe_unused]] LIBC_INLINE void
+inline_memcpy_x86_avx_ge64_sw_prefetching(Ptr __restrict dst,
+                                          CPtr __restrict src, size_t count) {
+  using namespace __llvm_libc::x86;
+  prefetch_to_local_cache(src + kOneCacheline);
+  if (count < 128)
+    return builtin::Memcpy<64>::head_tail(dst, src, count);
+  prefetch_to_local_cache(src + kTwoCachelines);
+  prefetch_to_local_cache(src + kThreeCachelines);
+  if (count < 256)
+    return builtin::Memcpy<128>::head_tail(dst, src, count);
+  // Aligning 'dst' on a 32B boundary.
+  builtin::Memcpy<32>::block(dst, src);
+  align_to_next_boundary<32, Arg::Dst>(dst, src, count);
+  builtin::Memcpy<224>::block(dst, src);
+  size_t offset = 224;
+  // At this point:
+  // - we copied between 224B and 256B,
+  // - we prefetched cachelines at 'src + 64', 'src + 128', and 'src + 196'
+  // - 'dst' is 32B aligned,
+  // - count >= 128.
+  while (offset + kThreeCachelines + 64 <= count) {
+    // Three cache lines at a time.
+    prefetch_to_local_cache(src + offset + kOneCacheline);
+    prefetch_to_local_cache(src + offset + kTwoCachelines);
+    prefetch_to_local_cache(src + offset + kThreeCachelines);
+    builtin::Memcpy<kThreeCachelines>::block_offset(dst, src, offset);
+    offset += kThreeCachelines;
+  }
+  return builtin::Memcpy<64>::loop_and_tail_offset(dst, src, count, offset);
+}
+
+[[maybe_unused]] LIBC_INLINE void
+inline_memcpy_x86(Ptr __restrict dst, CPtr __restrict src, size_t count) {
   if (count == 0)
     return;
   if (count == 1)
@@ -69,46 +167,30 @@ inline_memcpy_x86_avx(Ptr __restrict dst, CPtr __restrict src, size_t count) {
     return builtin::Memcpy<16>::head_tail(dst, src, count);
   if (count < 64)
     return builtin::Memcpy<32>::head_tail(dst, src, count);
-  if (count < 128)
-    return builtin::Memcpy<64>::head_tail(dst, src, count);
-  builtin::Memcpy<32>::block(dst, src);
-  align_to_next_boundary<32, Arg::Dst>(dst, src, count);
-  return builtin::Memcpy<32>::loop_and_tail(dst, src, count);
-}
-
-[[maybe_unused]] LIBC_INLINE void
-inline_memcpy_x86(Ptr __restrict dst, CPtr __restrict src, size_t count) {
-  if constexpr (x86::kAvx)
-    return inline_memcpy_x86_avx(dst, src, count);
-  else
-    return inline_memcpy_x86_no_avx(dst, src, count);
+  if constexpr (x86::kAvx) {
+    if constexpr (x86::kUseSoftwarePrefetching) {
+      return inline_memcpy_x86_avx_ge64_sw_prefetching(dst, src, count);
+    } else {
+      return inline_memcpy_x86_avx_ge64(dst, src, count);
+    }
+  } else {
+    if constexpr (x86::kUseSoftwarePrefetching) {
+      return inline_memcpy_x86_sse2_ge64_sw_prefetching(dst, src, count);
+    } else {
+      return inline_memcpy_x86_sse2_ge64(dst, src, count);
+    }
+  }
 }
 
 [[maybe_unused]] LIBC_INLINE void
 inline_memcpy_x86_maybe_interpose_repmovsb(Ptr __restrict dst,
                                            CPtr __restrict src, size_t count) {
-  // Whether to use rep;movsb exclusively, not at all, or only above a certain
-  // threshold.
-#ifndef LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
-#define LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE SIZE_MAX
-#endif
-
-#ifdef LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB
-#error LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB is deprecated use LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE=0 instead.
-#endif // LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB
-
-#ifdef LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
-#error LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE is deprecated use LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE=0 instead.
-#endif // LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
-
-  static constexpr size_t kRepMovsbThreshold =
-      LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE;
-  if constexpr (kRepMovsbThreshold == 0) {
+  if constexpr (x86::kRepMovsbThreshold == 0) {
     return x86::Memcpy::repmovsb(dst, src, count);
-  } else if constexpr (kRepMovsbThreshold == SIZE_MAX) {
+  } else if constexpr (x86::kRepMovsbThreshold == SIZE_MAX) {
     return inline_memcpy_x86(dst, src, count);
   } else {
-    if (LIBC_UNLIKELY(count >= kRepMovsbThreshold))
+    if (LIBC_UNLIKELY(count >= x86::kRepMovsbThreshold))
       return x86::Memcpy::repmovsb(dst, src, count);
     else
       return inline_memcpy_x86(dst, src, count);

diff  --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index f267d39da6b39e..7ace0fb45a660c 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -26,6 +26,11 @@ PRINTF_COPTS = [
     "LIBC_COPT_PRINTF_DISABLE_WRITE_INT",
 ]
 
+MEMORY_COPTS = [
+    # "LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE=0",
+    # "LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING",
+]
+
 # A flag to pick which `mpfr` to use for math tests.
 # Usage: `-- at llvm-project//libc:mpfr=<disable|external|system>`.
 # Flag documentation: https://bazel.build/extending/config
@@ -448,10 +453,10 @@ libc_support_library(
         ":__support_cpp_limits",
         ":__support_cpp_optional",
         ":__support_ctype_utils",
+        ":__support_fputil_dyadic_float",
         ":__support_fputil_fenv_impl",
         ":__support_fputil_fp_bits",
         ":__support_fputil_rounding_mode",
-        ":__support_fputil_dyadic_float",
         ":__support_str_to_integer",
         ":__support_str_to_num_result",
         ":__support_uint128",
@@ -1147,8 +1152,8 @@ libc_support_library(
     hdrs = ["src/math/generic/log_range_reduction.h"],
     deps = [
         ":__support_common",
-        ":__support_uint128",
         ":__support_fputil_dyadic_float",
+        ":__support_uint128",
         ":common_constants",
     ],
 )
@@ -2002,6 +2007,7 @@ libc_support_library(
         "src/string/memory_utils/op_x86.h",
         "src/string/memory_utils/utils.h",
     ],
+    defines = MEMORY_COPTS,
     textual_hdrs = [
         "src/string/memory_utils/aarch64/memcmp_implementations.h",
         "src/string/memory_utils/aarch64/memcpy_implementations.h",


        


More information about the libc-commits mailing list