[libc-commits] [llvm] [libc] [libc] Adding a version of memset with software prefetching (PR #70857)

via libc-commits libc-commits at lists.llvm.org
Tue Oct 31 14:13:05 PDT 2023


https://github.com/doshimili created https://github.com/llvm/llvm-project/pull/70857

Software prefetching helps recover performance when hardware prefetching is disabled. This patch adds a software-prefetching variant of memset for x86; users opt in to it by defining the 'LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING' compile-time option.

>From 1110c4e6237dcae26351d7678ee91de0e7fe3791 Mon Sep 17 00:00:00 2001
From: doshimili <milidoshi at google.com>
Date: Tue, 31 Oct 2023 15:42:32 -0400
Subject: [PATCH 1/2] Sw prefetch in memset (#2)

* Add software prefetching to memset

* Add software prefetching to memset

* Fix formatting

* Fix build errors

* Fix build errors

* Fix formatting

* Fix formatting

* Fix formatting

* Fix formatting

* Fix formatting
---
 libc/src/string/CMakeLists.txt                |  1 +
 libc/src/string/memory_utils/op_generic.h     | 26 +++++++
 .../memory_utils/x86_64/inline_memset.h       | 75 ++++++++++++-------
 .../llvm-project-overlay/libc/BUILD.bazel     |  1 +
 4 files changed, 78 insertions(+), 25 deletions(-)

diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt
index 67675b682081c67..aa69bff7a8cfada 100644
--- a/libc/src/string/CMakeLists.txt
+++ b/libc/src/string/CMakeLists.txt
@@ -656,6 +656,7 @@ if(${LIBC_TARGET_ARCHITECTURE_IS_X86})
   add_memset(memset_x86_64_opt_sse4   COMPILE_OPTIONS -march=nehalem        REQUIRE SSE4_2)
   add_memset(memset_x86_64_opt_avx2   COMPILE_OPTIONS -march=haswell        REQUIRE AVX2)
   add_memset(memset_x86_64_opt_avx512 COMPILE_OPTIONS -march=skylake-avx512 REQUIRE AVX512F)
+  add_memset(memset_x86_64_opt_sw_prefetch COMPILE_OPTIONS -DLIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING)
   add_memset(memset_opt_host          COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE})
   add_memset(memset)
 elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64})
diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
index fd71ca30e24b936..4063de1d5f5832a 100644
--- a/libc/src/string/memory_utils/op_generic.h
+++ b/libc/src/string/memory_utils/op_generic.h
@@ -48,6 +48,13 @@ using generic_v256 = uint8_t __attribute__((__vector_size__(32)));
 using generic_v512 = uint8_t __attribute__((__vector_size__(64)));
 } // namespace LIBC_NAMESPACE
 
+namespace LIBC_NAMESPACE::sw_prefetch {
+// Size of a cacheline for software prefetching
+static constexpr size_t kCachelineSize = 64;
+// prefetch for write
+static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); }
+} // namespace LIBC_NAMESPACE::sw_prefetch
+
 namespace LIBC_NAMESPACE::generic {
 
 // We accept three types of values as elements for generic operations:
@@ -163,6 +170,25 @@ template <typename T> struct Memset {
     } while (offset < count - SIZE);
     tail(dst, value, count);
   }
+
+  template <size_t prefetch_distance, size_t prefetch_degree>
+  LIBC_INLINE static void loop_and_tail_prefetch(Ptr dst, uint8_t value,
+                                                 size_t count) {
+    size_t offset = 0;
+
+    while (offset + prefetch_degree + SIZE <= count) {
+      for (size_t i = 0; i < prefetch_degree / sw_prefetch::kCachelineSize; ++i)
+        sw_prefetch::PrefetchW(dst + offset + prefetch_distance +
+                               sw_prefetch::kCachelineSize * i);
+      for (size_t i = 0; i < prefetch_degree; i += SIZE, offset += SIZE)
+        block(dst + offset, value);
+    }
+    while (offset + SIZE < count) {
+      block(dst + offset, value);
+      offset += SIZE;
+    }
+    tail(dst, value, count);
+  }
 };
 
 template <typename T, typename... TS> struct MemsetSequence {
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index 6436594856b0eaf..4834968c0b99f38 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -16,9 +16,12 @@
 #include <stddef.h> // size_t
 
 namespace LIBC_NAMESPACE {
+namespace x86 {
+LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset =
+    LLVM_LIBC_IS_DEFINED(LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING);
+
+} // namespace x86
 
-[[maybe_unused]] LIBC_INLINE static void
-inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
 #if defined(__AVX512F__)
   using uint128_t = generic_v128;
   using uint256_t = generic_v256;
@@ -37,29 +40,51 @@ inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
   using uint512_t = cpp::array<uint64_t, 8>;
 #endif
 
-  if (count == 0)
-    return;
-  if (count == 1)
-    return generic::Memset<uint8_t>::block(dst, value);
-  if (count == 2)
-    return generic::Memset<uint16_t>::block(dst, value);
-  if (count == 3)
-    return generic::MemsetSequence<uint16_t, uint8_t>::block(dst, value);
-  if (count <= 8)
-    return generic::Memset<uint32_t>::head_tail(dst, value, count);
-  if (count <= 16)
-    return generic::Memset<uint64_t>::head_tail(dst, value, count);
-  if (count <= 32)
-    return generic::Memset<uint128_t>::head_tail(dst, value, count);
-  if (count <= 64)
-    return generic::Memset<uint256_t>::head_tail(dst, value, count);
-  if (count <= 128)
-    return generic::Memset<uint512_t>::head_tail(dst, value, count);
-  // Aligned loop
-  generic::Memset<uint256_t>::block(dst, value);
-  align_to_next_boundary<32>(dst, count);
-  return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
-}
+  [[maybe_unused]] LIBC_INLINE static void
+  inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
+    sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize);
+    if (count <= 128)
+      return generic::Memset<uint512_t>::head_tail(dst, value, count);
+    sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize * 2);
+    // Aligned loop
+    generic::Memset<uint256_t>::block(dst, value);
+    align_to_next_boundary<32>(dst, count);
+    if (count <= 192) {
+      return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
+    } else {
+      return generic::Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(
+          dst, value, count);
+    }
+  }
+
+  [[maybe_unused]] LIBC_INLINE static void
+  inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
+    if (count == 0)
+      return;
+    if (count == 1)
+      return generic::Memset<uint8_t>::block(dst, value);
+    if (count == 2)
+      return generic::Memset<uint16_t>::block(dst, value);
+    if (count == 3)
+      return generic::MemsetSequence<uint16_t, uint8_t>::block(dst, value);
+    if (count <= 8)
+      return generic::Memset<uint32_t>::head_tail(dst, value, count);
+    if (count <= 16)
+      return generic::Memset<uint64_t>::head_tail(dst, value, count);
+    if (count <= 32)
+      return generic::Memset<uint128_t>::head_tail(dst, value, count);
+    if (count <= 64)
+      return generic::Memset<uint256_t>::head_tail(dst, value, count);
+    if constexpr (x86::kUseSoftwarePrefetchingMemset) {
+      return inline_memset_x86_sw_prefetching(dst, value, count);
+    }
+    if (count <= 128)
+      return generic::Memset<uint512_t>::head_tail(dst, value, count);
+    // Aligned loop
+    generic::Memset<uint256_t>::block(dst, value);
+    align_to_next_boundary<32>(dst, count);
+    return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
+  }
 } // namespace LIBC_NAMESPACE
 
 #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_MEMSET_H
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 3ae68193dccd2b2..dea21fd77182605 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -33,6 +33,7 @@ PRINTF_COPTS = [
 MEMORY_COPTS = [
     # "LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE=0",
     # "LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING",
+    # "LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING",
 ]
 
 # A flag to pick which `mpfr` to use for math tests.

>From 01be692503dc4e913b00b5d074b2cbb4c63347f6 Mon Sep 17 00:00:00 2001
From: doshimili <milidoshi at google.com>
Date: Tue, 31 Oct 2023 17:05:57 -0400
Subject: [PATCH 2/2] Add software prefetch instructions to memset

* Add software prefetching to memset

* Add software prefetching to memset

* Fix formatting

* Fix build errors

* Fix build errors

* Fix formatting

* Fix formatting

* Fix formatting

* Fix formatting

* Fix formatting

* Add warmup to memset
---
 libc/src/string/memory_utils/op_generic.h           | 3 +--
 libc/src/string/memory_utils/x86_64/inline_memset.h | 4 ++++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
index 4063de1d5f5832a..2844501a7459044 100644
--- a/libc/src/string/memory_utils/op_generic.h
+++ b/libc/src/string/memory_utils/op_generic.h
@@ -174,8 +174,7 @@ template <typename T> struct Memset {
   template <size_t prefetch_distance, size_t prefetch_degree>
   LIBC_INLINE static void loop_and_tail_prefetch(Ptr dst, uint8_t value,
                                                  size_t count) {
-    size_t offset = 0;
-
+    size_t offset = 96;
     while (offset + prefetch_degree + SIZE <= count) {
       for (size_t i = 0; i < prefetch_degree / sw_prefetch::kCachelineSize; ++i)
         sw_prefetch::PrefetchW(dst + offset + prefetch_distance +
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index 4834968c0b99f38..98f559bca875a3a 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -42,9 +42,11 @@ LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset =
 
   [[maybe_unused]] LIBC_INLINE static void
   inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
+    // Prefetch one cacheline
     sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize);
     if (count <= 128)
       return generic::Memset<uint512_t>::head_tail(dst, value, count);
+    // Prefetch the next cacheline
     sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize * 2);
     // Aligned loop
     generic::Memset<uint256_t>::block(dst, value);
@@ -52,6 +54,8 @@ LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset =
     if (count <= 192) {
       return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
     } else {
+      generic::Memset<uint512_t>::block(dst, value);
+      generic::Memset<uint256_t>::block(dst + sizeof(uint512_t), value);
       return generic::Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(
           dst, value, count);
     }



More information about the libc-commits mailing list