[llvm] [libc] Adding a version of memset with software prefetching (PR #70857)

via llvm-commits llvm-commits at lists.llvm.org
Thu Nov 9 07:31:42 PST 2023


https://github.com/doshimili updated https://github.com/llvm/llvm-project/pull/70857

>From 1110c4e6237dcae26351d7678ee91de0e7fe3791 Mon Sep 17 00:00:00 2001
From: doshimili <milidoshi at google.com>
Date: Tue, 31 Oct 2023 15:42:32 -0400
Subject: [PATCH 01/13] Sw prefetch in memset (#2)

* Add software prefetching to memset

* Add software prefetching to memset

* Fix formatting

* Fix build errors

* Fix build errors

* Fix formatting

* Fix formatting

* Fix formatting

* Fix formatting

* Fix formatting
---
 libc/src/string/CMakeLists.txt                |  1 +
 libc/src/string/memory_utils/op_generic.h     | 26 +++++++
 .../memory_utils/x86_64/inline_memset.h       | 75 ++++++++++++-------
 .../llvm-project-overlay/libc/BUILD.bazel     |  1 +
 4 files changed, 78 insertions(+), 25 deletions(-)

diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt
index 67675b682081c67..aa69bff7a8cfada 100644
--- a/libc/src/string/CMakeLists.txt
+++ b/libc/src/string/CMakeLists.txt
@@ -656,6 +656,7 @@ if(${LIBC_TARGET_ARCHITECTURE_IS_X86})
   add_memset(memset_x86_64_opt_sse4   COMPILE_OPTIONS -march=nehalem        REQUIRE SSE4_2)
   add_memset(memset_x86_64_opt_avx2   COMPILE_OPTIONS -march=haswell        REQUIRE AVX2)
   add_memset(memset_x86_64_opt_avx512 COMPILE_OPTIONS -march=skylake-avx512 REQUIRE AVX512F)
+  add_memset(memset_x86_64_opt_sw_prefetch COMPILE_OPTIONS -DLIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING)
   add_memset(memset_opt_host          COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE})
   add_memset(memset)
 elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64})
diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
index fd71ca30e24b936..4063de1d5f5832a 100644
--- a/libc/src/string/memory_utils/op_generic.h
+++ b/libc/src/string/memory_utils/op_generic.h
@@ -48,6 +48,13 @@ using generic_v256 = uint8_t __attribute__((__vector_size__(32)));
 using generic_v512 = uint8_t __attribute__((__vector_size__(64)));
 } // namespace LIBC_NAMESPACE
 
+namespace LIBC_NAMESPACE::sw_prefetch {
+// Size of a cacheline for software prefetching
+static constexpr size_t kCachelineSize = 64;
+// prefetch for write
+static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); }
+} // namespace LIBC_NAMESPACE::sw_prefetch
+
 namespace LIBC_NAMESPACE::generic {
 
 // We accept three types of values as elements for generic operations:
@@ -163,6 +170,25 @@ template <typename T> struct Memset {
     } while (offset < count - SIZE);
     tail(dst, value, count);
   }
+
+  template <size_t prefetch_distance, size_t prefetch_degree>
+  LIBC_INLINE static void loop_and_tail_prefetch(Ptr dst, uint8_t value,
+                                                 size_t count) {
+    size_t offset = 0;
+
+    while (offset + prefetch_degree + SIZE <= count) {
+      for (size_t i = 0; i < prefetch_degree / sw_prefetch::kCachelineSize; ++i)
+        sw_prefetch::PrefetchW(dst + offset + prefetch_distance +
+                               sw_prefetch::kCachelineSize * i);
+      for (size_t i = 0; i < prefetch_degree; i += SIZE, offset += SIZE)
+        block(dst + offset, value);
+    }
+    while (offset + SIZE < count) {
+      block(dst + offset, value);
+      offset += SIZE;
+    }
+    tail(dst, value, count);
+  }
 };
 
 template <typename T, typename... TS> struct MemsetSequence {
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index 6436594856b0eaf..4834968c0b99f38 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -16,9 +16,12 @@
 #include <stddef.h> // size_t
 
 namespace LIBC_NAMESPACE {
+namespace x86 {
+LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset =
+    LLVM_LIBC_IS_DEFINED(LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING);
+
+} // namespace x86
 
-[[maybe_unused]] LIBC_INLINE static void
-inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
 #if defined(__AVX512F__)
   using uint128_t = generic_v128;
   using uint256_t = generic_v256;
@@ -37,29 +40,51 @@ inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
   using uint512_t = cpp::array<uint64_t, 8>;
 #endif
 
-  if (count == 0)
-    return;
-  if (count == 1)
-    return generic::Memset<uint8_t>::block(dst, value);
-  if (count == 2)
-    return generic::Memset<uint16_t>::block(dst, value);
-  if (count == 3)
-    return generic::MemsetSequence<uint16_t, uint8_t>::block(dst, value);
-  if (count <= 8)
-    return generic::Memset<uint32_t>::head_tail(dst, value, count);
-  if (count <= 16)
-    return generic::Memset<uint64_t>::head_tail(dst, value, count);
-  if (count <= 32)
-    return generic::Memset<uint128_t>::head_tail(dst, value, count);
-  if (count <= 64)
-    return generic::Memset<uint256_t>::head_tail(dst, value, count);
-  if (count <= 128)
-    return generic::Memset<uint512_t>::head_tail(dst, value, count);
-  // Aligned loop
-  generic::Memset<uint256_t>::block(dst, value);
-  align_to_next_boundary<32>(dst, count);
-  return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
-}
+  [[maybe_unused]] LIBC_INLINE static void
+  inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
+    sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize);
+    if (count <= 128)
+      return generic::Memset<uint512_t>::head_tail(dst, value, count);
+    sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize * 2);
+    // Aligned loop
+    generic::Memset<uint256_t>::block(dst, value);
+    align_to_next_boundary<32>(dst, count);
+    if (count <= 192) {
+      return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
+    } else {
+      return generic::Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(
+          dst, value, count);
+    }
+  }
+
+  [[maybe_unused]] LIBC_INLINE static void
+  inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
+    if (count == 0)
+      return;
+    if (count == 1)
+      return generic::Memset<uint8_t>::block(dst, value);
+    if (count == 2)
+      return generic::Memset<uint16_t>::block(dst, value);
+    if (count == 3)
+      return generic::MemsetSequence<uint16_t, uint8_t>::block(dst, value);
+    if (count <= 8)
+      return generic::Memset<uint32_t>::head_tail(dst, value, count);
+    if (count <= 16)
+      return generic::Memset<uint64_t>::head_tail(dst, value, count);
+    if (count <= 32)
+      return generic::Memset<uint128_t>::head_tail(dst, value, count);
+    if (count <= 64)
+      return generic::Memset<uint256_t>::head_tail(dst, value, count);
+    if constexpr (x86::kUseSoftwarePrefetchingMemset) {
+      return inline_memset_x86_sw_prefetching(dst, value, count);
+    }
+    if (count <= 128)
+      return generic::Memset<uint512_t>::head_tail(dst, value, count);
+    // Aligned loop
+    generic::Memset<uint256_t>::block(dst, value);
+    align_to_next_boundary<32>(dst, count);
+    return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
+  }
 } // namespace LIBC_NAMESPACE
 
 #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_MEMSET_H
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 3ae68193dccd2b2..dea21fd77182605 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -33,6 +33,7 @@ PRINTF_COPTS = [
 MEMORY_COPTS = [
     # "LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE=0",
     # "LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING",
+    # "LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING",
 ]
 
 # A flag to pick which `mpfr` to use for math tests.

>From 01be692503dc4e913b00b5d074b2cbb4c63347f6 Mon Sep 17 00:00:00 2001
From: doshimili <milidoshi at google.com>
Date: Tue, 31 Oct 2023 17:05:57 -0400
Subject: [PATCH 02/13] Add software prefetch instructions to memset

* Add software prefetching to memset

* Add software prefetching to memset

* Fix formatting

* Fix build errors

* Fix build errors

* Fix formatting

* Fix formatting

* Fix formatting

* Fix formatting

* Fix formatting

* Add warmup to memset
---
 libc/src/string/memory_utils/op_generic.h           | 3 +--
 libc/src/string/memory_utils/x86_64/inline_memset.h | 4 ++++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
index 4063de1d5f5832a..2844501a7459044 100644
--- a/libc/src/string/memory_utils/op_generic.h
+++ b/libc/src/string/memory_utils/op_generic.h
@@ -174,8 +174,7 @@ template <typename T> struct Memset {
   template <size_t prefetch_distance, size_t prefetch_degree>
   LIBC_INLINE static void loop_and_tail_prefetch(Ptr dst, uint8_t value,
                                                  size_t count) {
-    size_t offset = 0;
-
+    size_t offset = 96;
     while (offset + prefetch_degree + SIZE <= count) {
       for (size_t i = 0; i < prefetch_degree / sw_prefetch::kCachelineSize; ++i)
         sw_prefetch::PrefetchW(dst + offset + prefetch_distance +
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index 4834968c0b99f38..98f559bca875a3a 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -42,9 +42,11 @@ LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset =
 
   [[maybe_unused]] LIBC_INLINE static void
   inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
+    // Prefetch one cacheline
     sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize);
     if (count <= 128)
       return generic::Memset<uint512_t>::head_tail(dst, value, count);
+    // Prefetch the next cacheline
     sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize * 2);
     // Aligned loop
     generic::Memset<uint256_t>::block(dst, value);
@@ -52,6 +54,8 @@ LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset =
     if (count <= 192) {
       return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
     } else {
+      generic::Memset<uint512_t>::block(dst, value);
+      generic::Memset<uint256_t>::block(dst + sizeof(uint512_t), value);
       return generic::Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(
           dst, value, count);
     }

>From 50ffede6c6c40f2b97eca84d57ca9765ef552fd1 Mon Sep 17 00:00:00 2001
From: doshimili <milidoshi at google.com>
Date: Tue, 7 Nov 2023 11:52:48 -0500
Subject: [PATCH 03/13] Move implementation to
 src/string/memory_utils/x86_64/inline_memset.h and other minor changes (#4)

* Add software prefetching to memset

* Add software prefetching to memset

* Fix formatting

* Fix build errors

* Fix build errors

* Fix formatting

* Fix formatting

* Fix formatting

* Fix formatting

* Fix formatting

* Add warmup to memset

* SW Prefetching in Memset

* Move implementation to src/string/memory_utils/x86_64/inline_memset.h and other minor changes

* Fix formatting
---
 libc/src/string/memory_utils/op_generic.h     |  36 ++---
 libc/src/string/memory_utils/utils.h          |   6 +
 .../memory_utils/x86_64/inline_memcpy.h       |   5 -
 .../memory_utils/x86_64/inline_memset.h       | 132 ++++++++++--------
 4 files changed, 93 insertions(+), 86 deletions(-)

diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
index 2844501a7459044..833ab9a6624d679 100644
--- a/libc/src/string/memory_utils/op_generic.h
+++ b/libc/src/string/memory_utils/op_generic.h
@@ -141,19 +141,23 @@ template <typename T> struct Memset {
   static_assert(is_element_type_v<T>);
   static constexpr size_t SIZE = sizeof(T);
 
-  LIBC_INLINE static void block(Ptr dst, uint8_t value) {
+  LIBC_INLINE static void block_offset(Ptr dst, uint8_t value, size_t offset) {
     if constexpr (is_scalar_v<T> || is_vector_v<T>) {
-      store<T>(dst, splat<T>(value));
+      store<T>(dst + offset, splat<T>(value));
     } else if constexpr (is_array_v<T>) {
       using value_type = typename T::value_type;
       const auto Splat = splat<value_type>(value);
       for (size_t I = 0; I < array_size_v<T>; ++I)
-        store<value_type>(dst + (I * sizeof(value_type)), Splat);
+        store<value_type>(dst + offset + (I * sizeof(value_type)), Splat);
     }
   }
 
+  LIBC_INLINE static void block(Ptr dst, uint8_t value) {
+    block_offset(dst, value, 0);
+  }
+
   LIBC_INLINE static void tail(Ptr dst, uint8_t value, size_t count) {
-    block(dst + count - SIZE, value);
+    block_offset(dst, value, count - SIZE);
   }
 
   LIBC_INLINE static void head_tail(Ptr dst, uint8_t value, size_t count) {
@@ -161,32 +165,18 @@ template <typename T> struct Memset {
     tail(dst, value, count);
   }
 
-  LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
+  LIBC_INLINE static void loop_and_tail_offset(Ptr dst, uint8_t value,
+                                               size_t count, size_t offset) {
     static_assert(SIZE > 1, "a loop of size 1 does not need tail");
-    size_t offset = 0;
     do {
-      block(dst + offset, value);
+      block_offset(dst, value, offset);
       offset += SIZE;
     } while (offset < count - SIZE);
     tail(dst, value, count);
   }
 
-  template <size_t prefetch_distance, size_t prefetch_degree>
-  LIBC_INLINE static void loop_and_tail_prefetch(Ptr dst, uint8_t value,
-                                                 size_t count) {
-    size_t offset = 96;
-    while (offset + prefetch_degree + SIZE <= count) {
-      for (size_t i = 0; i < prefetch_degree / sw_prefetch::kCachelineSize; ++i)
-        sw_prefetch::PrefetchW(dst + offset + prefetch_distance +
-                               sw_prefetch::kCachelineSize * i);
-      for (size_t i = 0; i < prefetch_degree; i += SIZE, offset += SIZE)
-        block(dst + offset, value);
-    }
-    while (offset + SIZE < count) {
-      block(dst + offset, value);
-      offset += SIZE;
-    }
-    tail(dst, value, count);
+  LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
+    return loop_and_tail_offset(dst, value, count, 0);
   }
 };
 
diff --git a/libc/src/string/memory_utils/utils.h b/libc/src/string/memory_utils/utils.h
index 85677e51fad0e09..62b3b7a0d728bd5 100644
--- a/libc/src/string/memory_utils/utils.h
+++ b/libc/src/string/memory_utils/utils.h
@@ -374,6 +374,12 @@ template <size_t SIZE> struct AlignHelper {
   uintptr_t offset_;
 };
 
+LIBC_INLINE void prefetch_for_write(CPtr dst) { __builtin_prefetch(dst, 1, 3); }
+
+LIBC_INLINE void prefetch_to_local_cache(CPtr dst) {
+  __builtin_prefetch(dst, 0, 3);
+}
+
 } // namespace LIBC_NAMESPACE
 
 #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_UTILS_H
diff --git a/libc/src/string/memory_utils/x86_64/inline_memcpy.h b/libc/src/string/memory_utils/x86_64/inline_memcpy.h
index f43230ffd8ad125..f851bcec09650d3 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memcpy.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memcpy.h
@@ -47,11 +47,6 @@ LIBC_INLINE_VAR constexpr size_t kRepMovsbThreshold =
 
 } // namespace x86
 
-// TODO: Move to a shared header when appropriate.
-[[maybe_unused]] LIBC_INLINE void prefetch_to_local_cache(const void *addr) {
-  __builtin_prefetch(addr, 0, 3);
-}
-
 [[maybe_unused]] LIBC_INLINE void
 inline_memcpy_x86_sse2_ge64(Ptr __restrict dst, CPtr __restrict src,
                             size_t count) {
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index 98f559bca875a3a..b6d3d5a0b65cbb9 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -12,83 +12,99 @@
 #include "src/string/memory_utils/op_generic.h"
 #include "src/string/memory_utils/op_x86.h"
 #include "src/string/memory_utils/utils.h" // Ptr, CPtr
+#include "third_party/llvm/llvm-project/libc/src/string/memory_utils/inline_memcpy.h"
 
 #include <stddef.h> // size_t
 
 namespace LIBC_NAMESPACE {
 namespace x86 {
+// Size of one cache line for software prefetching
+LIBC_INLINE_VAR constexpr size_t kOneCachelineSize = 64;
+LIBC_INLINE_VAR constexpr size_t kTwoCachelinesSize = kOneCachelineSize * 2;
+LIBC_INLINE_VAR constexpr size_t kFiveCachelinesSize = kOneCachelineSize * 5;
+
 LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset =
     LLVM_LIBC_IS_DEFINED(LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING);
 
 } // namespace x86
 
 #if defined(__AVX512F__)
-  using uint128_t = generic_v128;
-  using uint256_t = generic_v256;
-  using uint512_t = generic_v512;
+using uint128_t = generic_v128;
+using uint256_t = generic_v256;
+using uint512_t = generic_v512;
 #elif defined(__AVX__)
-  using uint128_t = generic_v128;
-  using uint256_t = generic_v256;
-  using uint512_t = cpp::array<generic_v256, 2>;
+using uint128_t = generic_v128;
+using uint256_t = generic_v256;
+using uint512_t = cpp::array<generic_v256, 2>;
 #elif defined(__SSE2__)
-  using uint128_t = generic_v128;
-  using uint256_t = cpp::array<generic_v128, 2>;
-  using uint512_t = cpp::array<generic_v128, 4>;
+using uint128_t = generic_v128;
+using uint256_t = cpp::array<generic_v128, 2>;
+using uint512_t = cpp::array<generic_v128, 4>;
 #else
-  using uint128_t = cpp::array<uint64_t, 2>;
-  using uint256_t = cpp::array<uint64_t, 4>;
-  using uint512_t = cpp::array<uint64_t, 8>;
+using uint128_t = cpp::array<uint64_t, 2>;
+using uint256_t = cpp::array<uint64_t, 4>;
+using uint512_t = cpp::array<uint64_t, 8>;
 #endif
 
-  [[maybe_unused]] LIBC_INLINE static void
-  inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
-    // Prefetch one cacheline
-    sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize);
-    if (count <= 128)
-      return generic::Memset<uint512_t>::head_tail(dst, value, count);
-    // Prefetch the next cacheline
-    sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize * 2);
-    // Aligned loop
-    generic::Memset<uint256_t>::block(dst, value);
-    align_to_next_boundary<32>(dst, count);
-    if (count <= 192) {
-      return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
-    } else {
-      generic::Memset<uint512_t>::block(dst, value);
-      generic::Memset<uint256_t>::block(dst + sizeof(uint512_t), value);
-      return generic::Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(
-          dst, value, count);
+[[maybe_unused]] LIBC_INLINE static void
+inline_memset_x86_gt64_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
+  size_t prefetch_distance = x86::kFiveCachelinesSize;
+  size_t prefetch_degree = x86::kTwoCachelinesSize;
+  size_t SIZE = sizeof(uint256_t);
+  // Prefetch one cache line
+  prefetch_for_write(dst + x86::kOneCachelineSize);
+  if (count <= 128)
+    return generic::Memset<uint512_t>::head_tail(dst, value, count);
+  // Prefetch the second cache line
+  prefetch_for_write(dst + x86::kTwoCachelinesSize);
+  // Aligned loop
+  generic::Memset<uint256_t>::block(dst, value);
+  align_to_next_boundary<32>(dst, count);
+  if (count <= 192) {
+    return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
+  } else {
+    generic::Memset<uint512_t>::block(dst, value);
+    generic::Memset<uint256_t>::block_offset(dst, value, SIZE);
+    size_t offset = 96;
+    while (offset + prefetch_degree + SIZE <= count) {
+      for (size_t i = 0; i < prefetch_degree / x86::kOneCachelineSize; ++i)
+        prefetch_for_write(dst + offset + prefetch_distance +
+                           x86::kOneCachelineSize * i);
+      for (size_t i = 0; i < prefetch_degree; i += SIZE, offset += SIZE)
+        generic::Memset<uint256_t>::block_offset(dst, value, offset);
     }
+    generic::Memset<uint256_t>::loop_and_tail_offset(dst, value, count, offset);
   }
+}
+
+[[maybe_unused]] LIBC_INLINE static void
+inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
+  if (count == 0)
+    return;
+  if (count == 1)
+    return generic::Memset<uint8_t>::block(dst, value);
+  if (count == 2)
+    return generic::Memset<uint16_t>::block(dst, value);
+  if (count == 3)
+    return generic::MemsetSequence<uint16_t, uint8_t>::block(dst, value);
+  if (count <= 8)
+    return generic::Memset<uint32_t>::head_tail(dst, value, count);
+  if (count <= 16)
+    return generic::Memset<uint64_t>::head_tail(dst, value, count);
+  if (count <= 32)
+    return generic::Memset<uint128_t>::head_tail(dst, value, count);
+  if (count <= 64)
+    return generic::Memset<uint256_t>::head_tail(dst, value, count);
+  if constexpr (x86::kUseSoftwarePrefetchingMemset)
+    return inline_memset_x86_gt64_sw_prefetching(dst, value, count);
+  if (count <= 128)
+    return generic::Memset<uint512_t>::head_tail(dst, value, count);
+  // Aligned loop
+  generic::Memset<uint256_t>::block(dst, value);
+  align_to_next_boundary<32>(dst, count);
+  return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
+}
 
-  [[maybe_unused]] LIBC_INLINE static void
-  inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
-    if (count == 0)
-      return;
-    if (count == 1)
-      return generic::Memset<uint8_t>::block(dst, value);
-    if (count == 2)
-      return generic::Memset<uint16_t>::block(dst, value);
-    if (count == 3)
-      return generic::MemsetSequence<uint16_t, uint8_t>::block(dst, value);
-    if (count <= 8)
-      return generic::Memset<uint32_t>::head_tail(dst, value, count);
-    if (count <= 16)
-      return generic::Memset<uint64_t>::head_tail(dst, value, count);
-    if (count <= 32)
-      return generic::Memset<uint128_t>::head_tail(dst, value, count);
-    if (count <= 64)
-      return generic::Memset<uint256_t>::head_tail(dst, value, count);
-    if constexpr (x86::kUseSoftwarePrefetchingMemset) {
-      return inline_memset_x86_sw_prefetching(dst, value, count);
-    }
-    if (count <= 128)
-      return generic::Memset<uint512_t>::head_tail(dst, value, count);
-    // Aligned loop
-    generic::Memset<uint256_t>::block(dst, value);
-    align_to_next_boundary<32>(dst, count);
-    return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
-  }
 } // namespace LIBC_NAMESPACE
 
 #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_MEMSET_H

>From fbb1f23c8e8e026178a6b2489307dbe9097298d5 Mon Sep 17 00:00:00 2001
From: doshimili <milidoshi at google.com>
Date: Tue, 7 Nov 2023 19:16:56 +0000
Subject: [PATCH 06/13] Remove wrong include

---
 libc/src/string/memory_utils/x86_64/inline_memset.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index b6d3d5a0b65cbb9..9b92cd130bc60b4 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -12,7 +12,6 @@
 #include "src/string/memory_utils/op_generic.h"
 #include "src/string/memory_utils/op_x86.h"
 #include "src/string/memory_utils/utils.h" // Ptr, CPtr
-#include "third_party/llvm/llvm-project/libc/src/string/memory_utils/inline_memcpy.h"
 
 #include <stddef.h> // size_t
 

>From 9cd1f2350059cfd243c79edc95b5148b6299896c Mon Sep 17 00:00:00 2001
From: doshimili <milidoshi at google.com>
Date: Tue, 7 Nov 2023 20:27:50 +0000
Subject: [PATCH 07/13] Fix memset warmup

---
 libc/src/string/memory_utils/x86_64/inline_memset.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index 9b92cd130bc60b4..90e8104257703a4 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -63,7 +63,7 @@ inline_memset_x86_gt64_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
     return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
   } else {
     generic::Memset<uint512_t>::block(dst, value);
-    generic::Memset<uint256_t>::block_offset(dst, value, SIZE);
+    generic::Memset<uint256_t>::block_offset(dst, value, sizeof(uint512_t));
     size_t offset = 96;
     while (offset + prefetch_degree + SIZE <= count) {
       for (size_t i = 0; i < prefetch_degree / x86::kOneCachelineSize; ++i)

>From 45e51a3aa663765b61cac68e177da738296accea Mon Sep 17 00:00:00 2001
From: doshimili <milidoshi at google.com>
Date: Wed, 8 Nov 2023 16:33:14 +0000
Subject: [PATCH 08/13] Remove block_offset and other minor changes

---
 libc/src/string/memory_utils/op_generic.h     | 21 +++++--------------
 libc/src/string/memory_utils/utils.h          |  6 ++++--
 .../memory_utils/x86_64/inline_memset.h       | 20 ++++++++----------
 3 files changed, 18 insertions(+), 29 deletions(-)

diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
index 833ab9a6624d679..db218f8577ab58d 100644
--- a/libc/src/string/memory_utils/op_generic.h
+++ b/libc/src/string/memory_utils/op_generic.h
@@ -48,13 +48,6 @@ using generic_v256 = uint8_t __attribute__((__vector_size__(32)));
 using generic_v512 = uint8_t __attribute__((__vector_size__(64)));
 } // namespace LIBC_NAMESPACE
 
-namespace LIBC_NAMESPACE::sw_prefetch {
-// Size of a cacheline for software prefetching
-static constexpr size_t kCachelineSize = 64;
-// prefetch for write
-static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); }
-} // namespace LIBC_NAMESPACE::sw_prefetch
-
 namespace LIBC_NAMESPACE::generic {
 
 // We accept three types of values as elements for generic operations:
@@ -141,23 +134,19 @@ template <typename T> struct Memset {
   static_assert(is_element_type_v<T>);
   static constexpr size_t SIZE = sizeof(T);
 
-  LIBC_INLINE static void block_offset(Ptr dst, uint8_t value, size_t offset) {
+  LIBC_INLINE static void block(Ptr dst, uint8_t value) {
     if constexpr (is_scalar_v<T> || is_vector_v<T>) {
-      store<T>(dst + offset, splat<T>(value));
+      store<T>(dst, splat<T>(value));
     } else if constexpr (is_array_v<T>) {
       using value_type = typename T::value_type;
       const auto Splat = splat<value_type>(value);
       for (size_t I = 0; I < array_size_v<T>; ++I)
-        store<value_type>(dst + offset + (I * sizeof(value_type)), Splat);
+        store<value_type>(dst + (I * sizeof(value_type)), Splat);
     }
   }
 
-  LIBC_INLINE static void block(Ptr dst, uint8_t value) {
-    block_offset(dst, value, 0);
-  }
-
   LIBC_INLINE static void tail(Ptr dst, uint8_t value, size_t count) {
-    block_offset(dst, value, count - SIZE);
+    block(dst + count - SIZE, value);
   }
 
   LIBC_INLINE static void head_tail(Ptr dst, uint8_t value, size_t count) {
@@ -169,7 +158,7 @@ template <typename T> struct Memset {
                                                size_t count, size_t offset) {
     static_assert(SIZE > 1, "a loop of size 1 does not need tail");
     do {
-      block_offset(dst, value, offset);
+      block(dst + offset, value);
       offset += SIZE;
     } while (offset < count - SIZE);
     tail(dst, value, count);
diff --git a/libc/src/string/memory_utils/utils.h b/libc/src/string/memory_utils/utils.h
index 62b3b7a0d728bd5..f70880ee853d307 100644
--- a/libc/src/string/memory_utils/utils.h
+++ b/libc/src/string/memory_utils/utils.h
@@ -374,10 +374,12 @@ template <size_t SIZE> struct AlignHelper {
   uintptr_t offset_;
 };
 
-LIBC_INLINE void prefetch_for_write(CPtr dst) { __builtin_prefetch(dst, 1, 3); }
+LIBC_INLINE void prefetch_for_write(CPtr dst) {
+  __builtin_prefetch(dst, /*write*/ 1, /*max locality*/ 3);
+}
 
 LIBC_INLINE void prefetch_to_local_cache(CPtr dst) {
-  __builtin_prefetch(dst, 0, 3);
+  __builtin_prefetch(dst, /*read*/ 0, /*max locality*/ 3);
 }
 
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index 90e8104257703a4..9b95df663393535 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -47,9 +47,9 @@ using uint512_t = cpp::array<uint64_t, 8>;
 
 [[maybe_unused]] LIBC_INLINE static void
 inline_memset_x86_gt64_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
-  size_t prefetch_distance = x86::kFiveCachelinesSize;
-  size_t prefetch_degree = x86::kTwoCachelinesSize;
-  size_t SIZE = sizeof(uint256_t);
+  constexpr size_t PREFETCH_DISTANCE = x86::kFiveCachelinesSize;
+  constexpr size_t PREFETCH_DEGREE = x86::kTwoCachelinesSize;
+  constexpr size_t SIZE = sizeof(uint256_t);
   // Prefetch one cache line
   prefetch_for_write(dst + x86::kOneCachelineSize);
   if (count <= 128)
@@ -62,15 +62,13 @@ inline_memset_x86_gt64_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
   if (count <= 192) {
     return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
   } else {
-    generic::Memset<uint512_t>::block(dst, value);
-    generic::Memset<uint256_t>::block_offset(dst, value, sizeof(uint512_t));
+    generic::MemsetSequence<uint512_t, uint256_t>::block(dst, value);
     size_t offset = 96;
-    while (offset + prefetch_degree + SIZE <= count) {
-      for (size_t i = 0; i < prefetch_degree / x86::kOneCachelineSize; ++i)
-        prefetch_for_write(dst + offset + prefetch_distance +
-                           x86::kOneCachelineSize * i);
-      for (size_t i = 0; i < prefetch_degree; i += SIZE, offset += SIZE)
-        generic::Memset<uint256_t>::block_offset(dst, value, offset);
+    while (offset + PREFETCH_DEGREE + SIZE <= count) {
+      prefetch_for_write(dst + offset + PREFETCH_DISTANCE);
+      prefetch_for_write(dst + offset + PREFETCH_DISTANCE + kOneCachelineSize);
+      for (size_t i = 0; i < PREFETCH_DEGREE; i += SIZE, offset += SIZE)
+        generic::Memset<uint256_t>::block(dst + offset, value);
     }
     generic::Memset<uint256_t>::loop_and_tail_offset(dst, value, count, offset);
   }

>From 13adbd113d47dd18de72737a27c0251b9ac98513 Mon Sep 17 00:00:00 2001
From: doshimili <milidoshi at google.com>
Date: Wed, 8 Nov 2023 16:42:38 +0000
Subject: [PATCH 09/13] Bug fixes

---
 libc/src/string/memory_utils/x86_64/inline_memset.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index 9b95df663393535..c980a1cde7b36da 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -66,7 +66,7 @@ inline_memset_x86_gt64_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
     size_t offset = 96;
     while (offset + PREFETCH_DEGREE + SIZE <= count) {
       prefetch_for_write(dst + offset + PREFETCH_DISTANCE);
-      prefetch_for_write(dst + offset + PREFETCH_DISTANCE + kOneCachelineSize);
+      prefetch_for_write(dst + offset + PREFETCH_DISTANCE + x86::kOneCachelineSize);
       for (size_t i = 0; i < PREFETCH_DEGREE; i += SIZE, offset += SIZE)
         generic::Memset<uint256_t>::block(dst + offset, value);
     }
@@ -101,7 +101,6 @@ inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
   align_to_next_boundary<32>(dst, count);
   return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
 }
-
 } // namespace LIBC_NAMESPACE
 
 #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_MEMSET_H

>From c15871bdd519490249d49c96bac3c8c158147ccf Mon Sep 17 00:00:00 2001
From: doshimili <milidoshi at google.com>
Date: Wed, 8 Nov 2023 16:59:26 +0000
Subject: [PATCH 10/13] Formatting fixes

---
 libc/src/string/memory_utils/x86_64/inline_memset.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index c980a1cde7b36da..42559b6ffa9bf56 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -66,7 +66,8 @@ inline_memset_x86_gt64_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
     size_t offset = 96;
     while (offset + PREFETCH_DEGREE + SIZE <= count) {
       prefetch_for_write(dst + offset + PREFETCH_DISTANCE);
-      prefetch_for_write(dst + offset + PREFETCH_DISTANCE + x86::kOneCachelineSize);
+      prefetch_for_write(dst + offset + PREFETCH_DISTANCE +
+                           x86::kOneCachelineSize);
       for (size_t i = 0; i < PREFETCH_DEGREE; i += SIZE, offset += SIZE)
         generic::Memset<uint256_t>::block(dst + offset, value);
     }

>From c08bea25eea629ea2a25b2caf483c970bbe26969 Mon Sep 17 00:00:00 2001
From: doshimili <milidoshi at google.com>
Date: Wed, 8 Nov 2023 17:20:44 +0000
Subject: [PATCH 11/13] Formatting fixes

---
 libc/src/string/memory_utils/x86_64/inline_memset.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index 42559b6ffa9bf56..41eadf2dcc00cc1 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -67,7 +67,7 @@ inline_memset_x86_gt64_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
     while (offset + PREFETCH_DEGREE + SIZE <= count) {
       prefetch_for_write(dst + offset + PREFETCH_DISTANCE);
       prefetch_for_write(dst + offset + PREFETCH_DISTANCE +
-                           x86::kOneCachelineSize);
+                         x86::kOneCachelineSize);
       for (size_t i = 0; i < PREFETCH_DEGREE; i += SIZE, offset += SIZE)
         generic::Memset<uint256_t>::block(dst + offset, value);
     }

>From a5ab2993207ac5bd789109f41d522ddb46216baa Mon Sep 17 00:00:00 2001
From: doshimili <milidoshi at google.com>
Date: Thu, 9 Nov 2023 15:07:42 +0000
Subject: [PATCH 12/13] Add memset option to config.json

---
 libc/config/config.json | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/libc/config/config.json b/libc/config/config.json
index 3c74e0ed1eddf47..a7f2e5113d66010 100644
--- a/libc/config/config.json
+++ b/libc/config/config.json
@@ -21,6 +21,10 @@
     "LIBC_CONF_STRING_UNSAFE_WIDE_READ": {
       "value": false,
       "doc": "Read more than a byte at a time to perform byte-string operations like strlen."
+    },
+    "LIBC_CONF_MEMSET_X86_USE_SOFTWARE_PREFETCHING": {
+      "value": false,
+      "doc": "Use software prefetching in memset to increase performance."
     }
   }
 }

>From 9f594586d222515a8139f98a862107e7e65d416d Mon Sep 17 00:00:00 2001
From: doshimili <milidoshi at google.com>
Date: Thu, 9 Nov 2023 15:31:14 +0000
Subject: [PATCH 13/13] Add configuration to CMakeLists

---
 libc/src/string/CMakeLists.txt | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt
index aa69bff7a8cfada..6daaf1998ea7bc2 100644
--- a/libc/src/string/CMakeLists.txt
+++ b/libc/src/string/CMakeLists.txt
@@ -3,6 +3,9 @@ add_subdirectory(memory_utils)
 if(LIBC_CONF_STRING_UNSAFE_WIDE_READ)
   list(APPEND string_config_options "-DLIBC_COPT_STRING_UNSAFE_WIDE_READ")
 endif()
+if(LIBC_CONF_MEMSET_X86_USE_SOFTWARE_PREFETCHING)
+  list(APPEND string_config_options "-DLIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING")
+endif()
 if(string_config_options)
   list(PREPEND string_config_options "COMPILE_OPTIONS")
 endif()



More information about the llvm-commits mailing list