[libc-commits] [libc] 3153aa4 - [libc] Adding a version of memset with software prefetching (#70857)

via libc-commits libc-commits at lists.llvm.org
Fri Nov 10 01:56:20 PST 2023


Author: doshimili
Date: 2023-11-10T10:56:16+01:00
New Revision: 3153aa4c959a65ac8e7f911ffa04b7ccb4641bb5

URL: https://github.com/llvm/llvm-project/commit/3153aa4c959a65ac8e7f911ffa04b7ccb4641bb5
DIFF: https://github.com/llvm/llvm-project/commit/3153aa4c959a65ac8e7f911ffa04b7ccb4641bb5.diff

LOG: [libc] Adding a version of memset with software prefetching (#70857)

Software prefetching helps recover performance when hardware prefetching
is disabled. Defining the 'LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING'
compile-time option enables this memset variant.

Added: 
    

Modified: 
    libc/config/config.json
    libc/src/string/CMakeLists.txt
    libc/src/string/memory_utils/op_generic.h
    libc/src/string/memory_utils/utils.h
    libc/src/string/memory_utils/x86_64/inline_memcpy.h
    libc/src/string/memory_utils/x86_64/inline_memset.h
    utils/bazel/llvm-project-overlay/libc/BUILD.bazel

Removed: 
    


################################################################################
diff  --git a/libc/config/config.json b/libc/config/config.json
index 3c74e0ed1eddf47..77d10d75f364679 100644
--- a/libc/config/config.json
+++ b/libc/config/config.json
@@ -21,6 +21,10 @@
     "LIBC_CONF_STRING_UNSAFE_WIDE_READ": {
       "value": false,
       "doc": "Read more than a byte at a time to perform byte-string operations like strlen."
+    },
+    "LIBC_CONF_MEMSET_X86_USE_SOFTWARE_PREFETCHING": {
+      "value": false,
+      "doc": "Inserts prefetch for write instructions (PREFETCHW) for memset on x86 to recover performance when hardware prefetcher is disabled."
     }
   }
 }

diff  --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt
index 67675b682081c67..6daaf1998ea7bc2 100644
--- a/libc/src/string/CMakeLists.txt
+++ b/libc/src/string/CMakeLists.txt
@@ -3,6 +3,9 @@ add_subdirectory(memory_utils)
 if(LIBC_CONF_STRING_UNSAFE_WIDE_READ)
   list(APPEND string_config_options "-DLIBC_COPT_STRING_UNSAFE_WIDE_READ")
 endif()
+if(LIBC_CONF_MEMSET_X86_USE_SOFTWARE_PREFETCHING)
+  list(APPEND string_config_options "-DLIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING")
+endif()
 if(string_config_options)
   list(PREPEND string_config_options "COMPILE_OPTIONS")
 endif()
@@ -656,6 +659,7 @@ if(${LIBC_TARGET_ARCHITECTURE_IS_X86})
   add_memset(memset_x86_64_opt_sse4   COMPILE_OPTIONS -march=nehalem        REQUIRE SSE4_2)
   add_memset(memset_x86_64_opt_avx2   COMPILE_OPTIONS -march=haswell        REQUIRE AVX2)
   add_memset(memset_x86_64_opt_avx512 COMPILE_OPTIONS -march=skylake-avx512 REQUIRE AVX512F)
+  add_memset(memset_x86_64_opt_sw_prefetch COMPILE_OPTIONS -DLIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING)
   add_memset(memset_opt_host          COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE})
   add_memset(memset)
 elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64})

diff  --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
index fd71ca30e24b936..db218f8577ab58d 100644
--- a/libc/src/string/memory_utils/op_generic.h
+++ b/libc/src/string/memory_utils/op_generic.h
@@ -154,15 +154,19 @@ template <typename T> struct Memset {
     tail(dst, value, count);
   }
 
-  LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
+  LIBC_INLINE static void loop_and_tail_offset(Ptr dst, uint8_t value,
+                                               size_t count, size_t offset) {
     static_assert(SIZE > 1, "a loop of size 1 does not need tail");
-    size_t offset = 0;
     do {
       block(dst + offset, value);
       offset += SIZE;
     } while (offset < count - SIZE);
     tail(dst, value, count);
   }
+
+  LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
+    return loop_and_tail_offset(dst, value, count, 0);
+  }
 };
 
 template <typename T, typename... TS> struct MemsetSequence {

diff  --git a/libc/src/string/memory_utils/utils.h b/libc/src/string/memory_utils/utils.h
index 85677e51fad0e09..f70880ee853d307 100644
--- a/libc/src/string/memory_utils/utils.h
+++ b/libc/src/string/memory_utils/utils.h
@@ -374,6 +374,14 @@ template <size_t SIZE> struct AlignHelper {
   uintptr_t offset_;
 };
 
+LIBC_INLINE void prefetch_for_write(CPtr dst) {
+  __builtin_prefetch(dst, /*write*/ 1, /*max locality*/ 3);
+}
+
+LIBC_INLINE void prefetch_to_local_cache(CPtr dst) {
+  __builtin_prefetch(dst, /*read*/ 0, /*max locality*/ 3);
+}
+
 } // namespace LIBC_NAMESPACE
 
 #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_UTILS_H

diff  --git a/libc/src/string/memory_utils/x86_64/inline_memcpy.h b/libc/src/string/memory_utils/x86_64/inline_memcpy.h
index a1e2985fc1f89ed..dd09d4f3e812b0c 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memcpy.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memcpy.h
@@ -47,11 +47,6 @@ LIBC_INLINE_VAR constexpr size_t kRepMovsbThreshold =
 
 } // namespace x86
 
-// TODO: Move to a shared header when appropriate.
-[[maybe_unused]] LIBC_INLINE void prefetch_to_local_cache(const void *addr) {
-  __builtin_prefetch(addr, 0, 3);
-}
-
 [[maybe_unused]] LIBC_INLINE void
 inline_memcpy_x86_sse2_ge64(Ptr __restrict dst, CPtr __restrict src,
                             size_t count) {

diff  --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index 6436594856b0eaf..41eadf2dcc00cc1 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -16,27 +16,67 @@
 #include <stddef.h> // size_t
 
 namespace LIBC_NAMESPACE {
+namespace x86 {
+// Size of one cache line for software prefetching
+LIBC_INLINE_VAR constexpr size_t kOneCachelineSize = 64;
+LIBC_INLINE_VAR constexpr size_t kTwoCachelinesSize = kOneCachelineSize * 2;
+LIBC_INLINE_VAR constexpr size_t kFiveCachelinesSize = kOneCachelineSize * 5;
+
+LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset =
+    LLVM_LIBC_IS_DEFINED(LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING);
+
+} // namespace x86
 
-[[maybe_unused]] LIBC_INLINE static void
-inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
 #if defined(__AVX512F__)
-  using uint128_t = generic_v128;
-  using uint256_t = generic_v256;
-  using uint512_t = generic_v512;
+using uint128_t = generic_v128;
+using uint256_t = generic_v256;
+using uint512_t = generic_v512;
 #elif defined(__AVX__)
-  using uint128_t = generic_v128;
-  using uint256_t = generic_v256;
-  using uint512_t = cpp::array<generic_v256, 2>;
+using uint128_t = generic_v128;
+using uint256_t = generic_v256;
+using uint512_t = cpp::array<generic_v256, 2>;
 #elif defined(__SSE2__)
-  using uint128_t = generic_v128;
-  using uint256_t = cpp::array<generic_v128, 2>;
-  using uint512_t = cpp::array<generic_v128, 4>;
+using uint128_t = generic_v128;
+using uint256_t = cpp::array<generic_v128, 2>;
+using uint512_t = cpp::array<generic_v128, 4>;
 #else
-  using uint128_t = cpp::array<uint64_t, 2>;
-  using uint256_t = cpp::array<uint64_t, 4>;
-  using uint512_t = cpp::array<uint64_t, 8>;
+using uint128_t = cpp::array<uint64_t, 2>;
+using uint256_t = cpp::array<uint64_t, 4>;
+using uint512_t = cpp::array<uint64_t, 8>;
 #endif
 
+[[maybe_unused]] LIBC_INLINE static void
+inline_memset_x86_gt64_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
+  constexpr size_t PREFETCH_DISTANCE = x86::kFiveCachelinesSize;
+  constexpr size_t PREFETCH_DEGREE = x86::kTwoCachelinesSize;
+  constexpr size_t SIZE = sizeof(uint256_t);
+  // Prefetch one cache line
+  prefetch_for_write(dst + x86::kOneCachelineSize);
+  if (count <= 128)
+    return generic::Memset<uint512_t>::head_tail(dst, value, count);
+  // Prefetch the second cache line
+  prefetch_for_write(dst + x86::kTwoCachelinesSize);
+  // Aligned loop
+  generic::Memset<uint256_t>::block(dst, value);
+  align_to_next_boundary<32>(dst, count);
+  if (count <= 192) {
+    return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
+  } else {
+    generic::MemsetSequence<uint512_t, uint256_t>::block(dst, value);
+    size_t offset = 96;
+    while (offset + PREFETCH_DEGREE + SIZE <= count) {
+      prefetch_for_write(dst + offset + PREFETCH_DISTANCE);
+      prefetch_for_write(dst + offset + PREFETCH_DISTANCE +
+                         x86::kOneCachelineSize);
+      for (size_t i = 0; i < PREFETCH_DEGREE; i += SIZE, offset += SIZE)
+        generic::Memset<uint256_t>::block(dst + offset, value);
+    }
+    generic::Memset<uint256_t>::loop_and_tail_offset(dst, value, count, offset);
+  }
+}
+
+[[maybe_unused]] LIBC_INLINE static void
+inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
   if (count == 0)
     return;
   if (count == 1)
@@ -53,6 +93,8 @@ inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
     return generic::Memset<uint128_t>::head_tail(dst, value, count);
   if (count <= 64)
     return generic::Memset<uint256_t>::head_tail(dst, value, count);
+  if constexpr (x86::kUseSoftwarePrefetchingMemset)
+    return inline_memset_x86_gt64_sw_prefetching(dst, value, count);
   if (count <= 128)
     return generic::Memset<uint512_t>::head_tail(dst, value, count);
   // Aligned loop

diff  --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 612d5a87eaf73e2..d9252208e9b8d55 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -32,6 +32,7 @@ PRINTF_COPTS = [
 MEMORY_COPTS = [
     # "LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE=0",
     # "LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING",
+    # "LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING",
 ]
 
 # A flag to pick which `mpfr` to use for math tests.


        


More information about the libc-commits mailing list