[clang] [libc] Adding a version of memset with software prefetching (PR #70493)

via cfe-commits cfe-commits at lists.llvm.org
Mon Oct 30 10:21:57 PDT 2023


https://github.com/doshimili updated https://github.com/llvm/llvm-project/pull/70493

>From 6c313955185c0d59564f6535b6f1580dca168bea Mon Sep 17 00:00:00 2001
From: Your Name <milidoshi at google.com>
Date: Tue, 24 Oct 2023 21:15:23 +0000
Subject: [PATCH 1/5] Add software prefetching to memset

---
 libc/src/string/memory_utils/op_generic.h     | 19 +++++++++++++++++++
 .../memory_utils/x86_64/inline_memset.h       | 12 +++++++++++-
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
index fd71ca30e24b936..54af7ea10e25e46 100644
--- a/libc/src/string/memory_utils/op_generic.h
+++ b/libc/src/string/memory_utils/op_generic.h
@@ -163,6 +163,25 @@ template <typename T> struct Memset {
     } while (offset < count - SIZE);
     tail(dst, value, count);
   }
+
+  template <size_t prefetch_distance, size_t prefetch_degree>
+  LIBC_INLINE static void loop_and_tail_prefetch(Ptr dst, uint8_t value,
+                                                 size_t count) {
+    Memset<uint512_t>::block(dst, value);
+    Memset<uint256_t>::block(dst + 64, value);
+    size_t offset = 96;
+    while (offset + prefetch_degree + kSize <= count) {
+      for (size_t i = 0; i < prefetch_degree / kCachelineSize; ++i)
+        PrefetchW(dst + offset + prefetch_distance + kCachelineSize * i);
+      for (size_t i = 0; i < prefetch_degree; i += kSize, offset += kSize)
+        block(dst + offset, value);
+    }
+    while (offset + kSize < count) {
+      block(dst + offset, value);
+      offset += kSize;
+    }
+    tail(dst, value, count);
+  }
 };
 
 template <typename T, typename... TS> struct MemsetSequence {
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index 6436594856b0eaf..da463bc0029f9aa 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -17,6 +17,11 @@
 
 namespace LIBC_NAMESPACE {
 
+static constexpr size_t kCachelineSize = 64;
+
+// prefetch for write
+static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); }
+
 [[maybe_unused]] LIBC_INLINE static void
 inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
 #if defined(__AVX512F__)
@@ -53,12 +58,17 @@ inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
     return generic::Memset<uint128_t>::head_tail(dst, value, count);
   if (count <= 64)
     return generic::Memset<uint256_t>::head_tail(dst, value, count);
+  PrefetchW(dst + kCachelineSize);
   if (count <= 128)
     return generic::Memset<uint512_t>::head_tail(dst, value, count);
+  PrefetchW(dst + kCachelineSize * 2);
   // Aligned loop
   generic::Memset<uint256_t>::block(dst, value);
   align_to_next_boundary<32>(dst, count);
-  return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
+  if (count <= 192) {
+    return Memset<uint256_t>::loop_and_tail(dst, value, count);
+  }
+  return Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(dst, value, count);
 }
 } // namespace LIBC_NAMESPACE
 

>From 15cbd0a0c851fa3ac5315e796bb69c1bf791e956 Mon Sep 17 00:00:00 2001
From: Your Name <milidoshi at google.com>
Date: Tue, 24 Oct 2023 21:15:23 +0000
Subject: [PATCH 2/5] Add software prefetching to memset

---
 libc/src/string/CMakeLists.txt                |  1 +
 .../memory_utils/x86_64/inline_memset.h       | 32 ++++++++++++++++---
 .../llvm-project-overlay/libc/BUILD.bazel     |  1 +
 3 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt
index 67675b682081c67..aa69bff7a8cfada 100644
--- a/libc/src/string/CMakeLists.txt
+++ b/libc/src/string/CMakeLists.txt
@@ -656,6 +656,7 @@ if(${LIBC_TARGET_ARCHITECTURE_IS_X86})
   add_memset(memset_x86_64_opt_sse4   COMPILE_OPTIONS -march=nehalem        REQUIRE SSE4_2)
   add_memset(memset_x86_64_opt_avx2   COMPILE_OPTIONS -march=haswell        REQUIRE AVX2)
   add_memset(memset_x86_64_opt_avx512 COMPILE_OPTIONS -march=skylake-avx512 REQUIRE AVX512F)
+  add_memset(memset_x86_64_opt_sw_prefetch COMPILE_OPTIONS -DLIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING)
   add_memset(memset_opt_host          COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE})
   add_memset(memset)
 elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64})
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index da463bc0029f9aa..f3ad04930c52c64 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -16,12 +16,34 @@
 #include <stddef.h> // size_t
 
 namespace LIBC_NAMESPACE {
+namespace x86 {
 
 static constexpr size_t kCachelineSize = 64;
 
+LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetching =
+    LLVM_LIBC_IS_DEFINED(LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING);
+
+} // namespace x86
+
 // prefetch for write
 static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); }
 
+[[maybe_unused]] LIBC_INLINE static void inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
+  PrefetchW(dst + kCachelineSize);
+  if (count <= 128)
+    return generic::Memset<uint512_t>::head_tail(dst, value, count);
+  PrefetchW(dst + kCachelineSize * 2);
+  // Aligned loop
+  generic::Memset<uint256_t>::block(dst, value);
+  align_to_next_boundary<32>(dst, count);
+  if (count <= 192) {
+    return Memset<uint256_t>::loop_and_tail(dst, value, count);
+  }
+  else {
+    return Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(dst, value, count);
+  }
+}
+
 [[maybe_unused]] LIBC_INLINE static void
 inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
 #if defined(__AVX512F__)
@@ -58,17 +80,17 @@ inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
     return generic::Memset<uint128_t>::head_tail(dst, value, count);
   if (count <= 64)
     return generic::Memset<uint256_t>::head_tail(dst, value, count);
-  PrefetchW(dst + kCachelineSize);
+  if constexpr (x86::kUseSoftwarePrefetching) {
+    return inline_memset_x86_sw_prefetching(dst, value, count);
+  }
   if (count <= 128)
     return generic::Memset<uint512_t>::head_tail(dst, value, count);
-  PrefetchW(dst + kCachelineSize * 2);
   // Aligned loop
   generic::Memset<uint256_t>::block(dst, value);
   align_to_next_boundary<32>(dst, count);
-  if (count <= 192) {
-    return Memset<uint256_t>::loop_and_tail(dst, value, count);
+  else {
+    return Memset<uint256_t>::loop_and_tail(dst, value, count); 
   }
-  return Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(dst, value, count);
 }
 } // namespace LIBC_NAMESPACE
 
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 3ae68193dccd2b2..dea21fd77182605 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -33,6 +33,7 @@ PRINTF_COPTS = [
 MEMORY_COPTS = [
     # "LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE=0",
     # "LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING",
+    # "LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING",
 ]
 
 # A flag to pick which `mpfr` to use for math tests.

>From abb9debc49b7e171eae14a98320b9a49779c808c Mon Sep 17 00:00:00 2001
From: Your Name <milidoshi at google.com>
Date: Fri, 27 Oct 2023 17:55:47 +0000
Subject: [PATCH 3/5] Fix formatting

---
 libc/src/string/memory_utils/x86_64/inline_memset.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index f3ad04930c52c64..e82b600bf66ab96 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -28,7 +28,8 @@ LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetching =
 // prefetch for write
 static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); }
 
-[[maybe_unused]] LIBC_INLINE static void inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
+[[maybe_unused]] LIBC_INLINE static void
+inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
   PrefetchW(dst + kCachelineSize);
   if (count <= 128)
     return generic::Memset<uint512_t>::head_tail(dst, value, count);
@@ -38,9 +39,9 @@ static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); }
   align_to_next_boundary<32>(dst, count);
   if (count <= 192) {
     return Memset<uint256_t>::loop_and_tail(dst, value, count);
-  }
-  else {
-    return Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(dst, value, count);
+  } else {
+    return Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(dst, value,
+                                                               count);
   }
 }
 
@@ -89,7 +90,7 @@ inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
   generic::Memset<uint256_t>::block(dst, value);
   align_to_next_boundary<32>(dst, count);
   else {
-    return Memset<uint256_t>::loop_and_tail(dst, value, count); 
+    return Memset<uint256_t>::loop_and_tail(dst, value, count);
   }
 }
 } // namespace LIBC_NAMESPACE

>From 2155db70066c2c220160c4178bd73237e1372d45 Mon Sep 17 00:00:00 2001
From: Your Name <milidoshi at google.com>
Date: Mon, 30 Oct 2023 14:53:56 +0000
Subject: [PATCH 4/5] Fix build errors

---
 libc/src/string/memory_utils/op_generic.h         | 15 +++++++++------
 .../string/memory_utils/x86_64/inline_memset.h    |  7 ++-----
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
index 54af7ea10e25e46..4ba137c97ec9a9a 100644
--- a/libc/src/string/memory_utils/op_generic.h
+++ b/libc/src/string/memory_utils/op_generic.h
@@ -87,6 +87,9 @@ template <class T, size_t N>
 struct array_size<cpp::array<T, N>> : cpp::integral_constant<size_t, N> {};
 template <typename T> constexpr size_t array_size_v = array_size<T>::value;
 
+// Size of a cacheline for software prefetching
+static constexpr size_t kCachelineSize = 64;
+
 // Generic operations for the above type categories.
 
 template <typename T> T load(CPtr src) {
@@ -167,18 +170,18 @@ template <typename T> struct Memset {
   template <size_t prefetch_distance, size_t prefetch_degree>
   LIBC_INLINE static void loop_and_tail_prefetch(Ptr dst, uint8_t value,
                                                  size_t count) {
-    Memset<uint512_t>::block(dst, value);
-    Memset<uint256_t>::block(dst + 64, value);
+    Memset<64>::block(dst, value);
+    Memset<32>::block(dst + 64, value);
     size_t offset = 96;
-    while (offset + prefetch_degree + kSize <= count) {
+    while (offset + prefetch_degree + SIZE <= count) {
       for (size_t i = 0; i < prefetch_degree / kCachelineSize; ++i)
         PrefetchW(dst + offset + prefetch_distance + kCachelineSize * i);
-      for (size_t i = 0; i < prefetch_degree; i += kSize, offset += kSize)
+      for (size_t i = 0; i < prefetch_degree; i += SIZE, offset += SIZE)
         block(dst + offset, value);
     }
-    while (offset + kSize < count) {
+    while (offset + SIZE < count) {
       block(dst + offset, value);
-      offset += kSize;
+      offset += SIZE;
     }
     tail(dst, value, count);
   }
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index e82b600bf66ab96..fca48e9658a752d 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -17,9 +17,6 @@
 
 namespace LIBC_NAMESPACE {
 namespace x86 {
-
-static constexpr size_t kCachelineSize = 64;
-
 LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetching =
     LLVM_LIBC_IS_DEFINED(LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING);
 
@@ -30,10 +27,10 @@ static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); }
 
 [[maybe_unused]] LIBC_INLINE static void
 inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
-  PrefetchW(dst + kCachelineSize);
+  PrefetchW(dst + generic::kCachelineSize);
   if (count <= 128)
     return generic::Memset<uint512_t>::head_tail(dst, value, count);
-  PrefetchW(dst + kCachelineSize * 2);
+  PrefetchW(dst + generic::kCachelineSize * 2);
   // Aligned loop
   generic::Memset<uint256_t>::block(dst, value);
   align_to_next_boundary<32>(dst, count);

>From 9fe0041c2bb8ba1d522538c79ac1ebae7d0632bb Mon Sep 17 00:00:00 2001
From: Your Name <milidoshi at google.com>
Date: Mon, 30 Oct 2023 17:09:15 +0000
Subject: [PATCH 5/5] Fix build errors

---
 libc/src/string/memory_utils/op_generic.h     | 15 +++---
 .../memory_utils/x86_64/inline_memset.h       | 50 +++++++++----------
 2 files changed, 33 insertions(+), 32 deletions(-)

diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
index 4ba137c97ec9a9a..12eeb65a1edc52e 100644
--- a/libc/src/string/memory_utils/op_generic.h
+++ b/libc/src/string/memory_utils/op_generic.h
@@ -48,6 +48,13 @@ using generic_v256 = uint8_t __attribute__((__vector_size__(32)));
 using generic_v512 = uint8_t __attribute__((__vector_size__(64)));
 } // namespace LIBC_NAMESPACE
 
+namespace sw_prefetch {
+  // Size of a cacheline for software prefetching
+static constexpr size_t kCachelineSize = 64;
+  // prefetch for write
+static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); }
+}
+
 namespace LIBC_NAMESPACE::generic {
 
 // We accept three types of values as elements for generic operations:
@@ -87,9 +94,6 @@ template <class T, size_t N>
 struct array_size<cpp::array<T, N>> : cpp::integral_constant<size_t, N> {};
 template <typename T> constexpr size_t array_size_v = array_size<T>::value;
 
-// Size of a cacheline for software prefetching
-static constexpr size_t kCachelineSize = 64;
-
 // Generic operations for the above type categories.
 
 template <typename T> T load(CPtr src) {
@@ -167,12 +171,9 @@ template <typename T> struct Memset {
     tail(dst, value, count);
   }
 
-  template <size_t prefetch_distance, size_t prefetch_degree>
+  template <size_t prefetch_distance, size_t prefetch_degree, size_t offset>
   LIBC_INLINE static void loop_and_tail_prefetch(Ptr dst, uint8_t value,
                                                  size_t count) {
-    Memset<64>::block(dst, value);
-    Memset<32>::block(dst + 64, value);
-    size_t offset = 96;
     while (offset + prefetch_degree + SIZE <= count) {
       for (size_t i = 0; i < prefetch_degree / kCachelineSize; ++i)
         PrefetchW(dst + offset + prefetch_distance + kCachelineSize * i);
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index fca48e9658a752d..bc7a6162f77b9cd 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -17,33 +17,11 @@
 
 namespace LIBC_NAMESPACE {
 namespace x86 {
-LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetching =
+LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset =
     LLVM_LIBC_IS_DEFINED(LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING);
 
 } // namespace x86
 
-// prefetch for write
-static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); }
-
-[[maybe_unused]] LIBC_INLINE static void
-inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
-  PrefetchW(dst + generic::kCachelineSize);
-  if (count <= 128)
-    return generic::Memset<uint512_t>::head_tail(dst, value, count);
-  PrefetchW(dst + generic::kCachelineSize * 2);
-  // Aligned loop
-  generic::Memset<uint256_t>::block(dst, value);
-  align_to_next_boundary<32>(dst, count);
-  if (count <= 192) {
-    return Memset<uint256_t>::loop_and_tail(dst, value, count);
-  } else {
-    return Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(dst, value,
-                                                               count);
-  }
-}
-
-[[maybe_unused]] LIBC_INLINE static void
-inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
 #if defined(__AVX512F__)
   using uint128_t = generic_v128;
   using uint256_t = generic_v256;
@@ -62,6 +40,28 @@ inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
   using uint512_t = cpp::array<uint64_t, 8>;
 #endif
 
+[[maybe_unused]] LIBC_INLINE static void
+inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
+  sw_prefetch::PrefetchW(dst + generic::kCachelineSize);
+  if (count <= 128)
+    return generic::Memset<uint512_t>::head_tail(dst, value, count);
+  sw_prefetch::PrefetchW(dst + generic::kCachelineSize * 2);
+  // Aligned loop
+  generic::Memset<uint256_t>::block(dst, value);
+  align_to_next_boundary<32>(dst, count);
+  if (count <= 192) {
+    return Memset<uint256_t>::loop_and_tail(dst, value, count);
+  } else {
+    // Warm up memset
+    generic::Memset<uint256_t>::block(dst, value);
+    generic::Memset<uint128_t>::block(dst + 64, value);
+    return generic::Memset<uint256_t>::loop_and_tail_prefetch<320, 128, 96>(dst, value,
+                                                               count);
+  }
+}
+
+[[maybe_unused]] LIBC_INLINE static void
+inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
   if (count == 0)
     return;
   if (count == 1)
@@ -78,7 +78,7 @@ inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
     return generic::Memset<uint128_t>::head_tail(dst, value, count);
   if (count <= 64)
     return generic::Memset<uint256_t>::head_tail(dst, value, count);
-  if constexpr (x86::kUseSoftwarePrefetching) {
+  if constexpr (x86::kUseSoftwarePrefetchingMemset) {
     return inline_memset_x86_sw_prefetching(dst, value, count);
   }
   if (count <= 128)
@@ -87,7 +87,7 @@ inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
   generic::Memset<uint256_t>::block(dst, value);
   align_to_next_boundary<32>(dst, count);
   else {
-    return Memset<uint256_t>::loop_and_tail(dst, value, count);
+    return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
   }
 }
 } // namespace LIBC_NAMESPACE



More information about the cfe-commits mailing list