[libc-commits] [llvm] [libc] [libc] Adding a version of memset with software prefetching (PR #70857)
via libc-commits
libc-commits at lists.llvm.org
Thu Nov 9 07:38:25 PST 2023
https://github.com/doshimili updated https://github.com/llvm/llvm-project/pull/70857
>From 1110c4e6237dcae26351d7678ee91de0e7fe3791 Mon Sep 17 00:00:00 2001
From: doshimili <milidoshi at google.com>
Date: Tue, 31 Oct 2023 15:42:32 -0400
Subject: [PATCH 01/14] Sw prefetch in memset (#2)
* Add software prefetching to memset
* Add software prefetching to memset
* Fix formatting
* Fix build errors
* Fix build errors
* Fix formatting
* Fix formatting
* Fix formatting
* Fix formatting
* Fix formatting
---
libc/src/string/CMakeLists.txt | 1 +
libc/src/string/memory_utils/op_generic.h | 26 +++++++
.../memory_utils/x86_64/inline_memset.h | 75 ++++++++++++-------
.../llvm-project-overlay/libc/BUILD.bazel | 1 +
4 files changed, 78 insertions(+), 25 deletions(-)
diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt
index 67675b682081c67..aa69bff7a8cfada 100644
--- a/libc/src/string/CMakeLists.txt
+++ b/libc/src/string/CMakeLists.txt
@@ -656,6 +656,7 @@ if(${LIBC_TARGET_ARCHITECTURE_IS_X86})
add_memset(memset_x86_64_opt_sse4 COMPILE_OPTIONS -march=nehalem REQUIRE SSE4_2)
add_memset(memset_x86_64_opt_avx2 COMPILE_OPTIONS -march=haswell REQUIRE AVX2)
add_memset(memset_x86_64_opt_avx512 COMPILE_OPTIONS -march=skylake-avx512 REQUIRE AVX512F)
+ add_memset(memset_x86_64_opt_sw_prefetch COMPILE_OPTIONS -DLIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING)
add_memset(memset_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE})
add_memset(memset)
elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64})
diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
index fd71ca30e24b936..4063de1d5f5832a 100644
--- a/libc/src/string/memory_utils/op_generic.h
+++ b/libc/src/string/memory_utils/op_generic.h
@@ -48,6 +48,13 @@ using generic_v256 = uint8_t __attribute__((__vector_size__(32)));
using generic_v512 = uint8_t __attribute__((__vector_size__(64)));
} // namespace LIBC_NAMESPACE
+namespace LIBC_NAMESPACE::sw_prefetch {
+// Size of a cacheline for software prefetching
+static constexpr size_t kCachelineSize = 64;
+// prefetch for write
+static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); }
+} // namespace LIBC_NAMESPACE::sw_prefetch
+
namespace LIBC_NAMESPACE::generic {
// We accept three types of values as elements for generic operations:
@@ -163,6 +170,25 @@ template <typename T> struct Memset {
} while (offset < count - SIZE);
tail(dst, value, count);
}
+
+ template <size_t prefetch_distance, size_t prefetch_degree>
+ LIBC_INLINE static void loop_and_tail_prefetch(Ptr dst, uint8_t value,
+ size_t count) {
+ size_t offset = 0;
+
+ while (offset + prefetch_degree + SIZE <= count) {
+ for (size_t i = 0; i < prefetch_degree / sw_prefetch::kCachelineSize; ++i)
+ sw_prefetch::PrefetchW(dst + offset + prefetch_distance +
+ sw_prefetch::kCachelineSize * i);
+ for (size_t i = 0; i < prefetch_degree; i += SIZE, offset += SIZE)
+ block(dst + offset, value);
+ }
+ while (offset + SIZE < count) {
+ block(dst + offset, value);
+ offset += SIZE;
+ }
+ tail(dst, value, count);
+ }
};
template <typename T, typename... TS> struct MemsetSequence {
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index 6436594856b0eaf..4834968c0b99f38 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -16,9 +16,12 @@
#include <stddef.h> // size_t
namespace LIBC_NAMESPACE {
+namespace x86 {
+LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset =
+ LLVM_LIBC_IS_DEFINED(LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING);
+
+} // namespace x86
-[[maybe_unused]] LIBC_INLINE static void
-inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
#if defined(__AVX512F__)
using uint128_t = generic_v128;
using uint256_t = generic_v256;
@@ -37,29 +40,51 @@ inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
using uint512_t = cpp::array<uint64_t, 8>;
#endif
- if (count == 0)
- return;
- if (count == 1)
- return generic::Memset<uint8_t>::block(dst, value);
- if (count == 2)
- return generic::Memset<uint16_t>::block(dst, value);
- if (count == 3)
- return generic::MemsetSequence<uint16_t, uint8_t>::block(dst, value);
- if (count <= 8)
- return generic::Memset<uint32_t>::head_tail(dst, value, count);
- if (count <= 16)
- return generic::Memset<uint64_t>::head_tail(dst, value, count);
- if (count <= 32)
- return generic::Memset<uint128_t>::head_tail(dst, value, count);
- if (count <= 64)
- return generic::Memset<uint256_t>::head_tail(dst, value, count);
- if (count <= 128)
- return generic::Memset<uint512_t>::head_tail(dst, value, count);
- // Aligned loop
- generic::Memset<uint256_t>::block(dst, value);
- align_to_next_boundary<32>(dst, count);
- return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
-}
+ [[maybe_unused]] LIBC_INLINE static void
+ inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
+ sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize);
+ if (count <= 128)
+ return generic::Memset<uint512_t>::head_tail(dst, value, count);
+ sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize * 2);
+ // Aligned loop
+ generic::Memset<uint256_t>::block(dst, value);
+ align_to_next_boundary<32>(dst, count);
+ if (count <= 192) {
+ return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
+ } else {
+ return generic::Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(
+ dst, value, count);
+ }
+ }
+
+ [[maybe_unused]] LIBC_INLINE static void
+ inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
+ if (count == 0)
+ return;
+ if (count == 1)
+ return generic::Memset<uint8_t>::block(dst, value);
+ if (count == 2)
+ return generic::Memset<uint16_t>::block(dst, value);
+ if (count == 3)
+ return generic::MemsetSequence<uint16_t, uint8_t>::block(dst, value);
+ if (count <= 8)
+ return generic::Memset<uint32_t>::head_tail(dst, value, count);
+ if (count <= 16)
+ return generic::Memset<uint64_t>::head_tail(dst, value, count);
+ if (count <= 32)
+ return generic::Memset<uint128_t>::head_tail(dst, value, count);
+ if (count <= 64)
+ return generic::Memset<uint256_t>::head_tail(dst, value, count);
+ if constexpr (x86::kUseSoftwarePrefetchingMemset) {
+ return inline_memset_x86_sw_prefetching(dst, value, count);
+ }
+ if (count <= 128)
+ return generic::Memset<uint512_t>::head_tail(dst, value, count);
+ // Aligned loop
+ generic::Memset<uint256_t>::block(dst, value);
+ align_to_next_boundary<32>(dst, count);
+ return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
+ }
} // namespace LIBC_NAMESPACE
#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_MEMSET_H
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 3ae68193dccd2b2..dea21fd77182605 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -33,6 +33,7 @@ PRINTF_COPTS = [
MEMORY_COPTS = [
# "LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE=0",
# "LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING",
+ # "LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING",
]
# A flag to pick which `mpfr` to use for math tests.
>From 01be692503dc4e913b00b5d074b2cbb4c63347f6 Mon Sep 17 00:00:00 2001
From: doshimili <milidoshi at google.com>
Date: Tue, 31 Oct 2023 17:05:57 -0400
Subject: [PATCH 02/14] Add software prefetch instructions to memset
* Add software prefetching to memset
* Add software prefetching to memset
* Fix formatting
* Fix build errors
* Fix build errors
* Fix formatting
* Fix formatting
* Fix formatting
* Fix formatting
* Fix formatting
* Add warmup to memset
---
libc/src/string/memory_utils/op_generic.h | 3 +--
libc/src/string/memory_utils/x86_64/inline_memset.h | 4 ++++
2 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
index 4063de1d5f5832a..2844501a7459044 100644
--- a/libc/src/string/memory_utils/op_generic.h
+++ b/libc/src/string/memory_utils/op_generic.h
@@ -174,8 +174,7 @@ template <typename T> struct Memset {
template <size_t prefetch_distance, size_t prefetch_degree>
LIBC_INLINE static void loop_and_tail_prefetch(Ptr dst, uint8_t value,
size_t count) {
- size_t offset = 0;
-
+ size_t offset = 96;
while (offset + prefetch_degree + SIZE <= count) {
for (size_t i = 0; i < prefetch_degree / sw_prefetch::kCachelineSize; ++i)
sw_prefetch::PrefetchW(dst + offset + prefetch_distance +
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index 4834968c0b99f38..98f559bca875a3a 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -42,9 +42,11 @@ LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset =
[[maybe_unused]] LIBC_INLINE static void
inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
+ // Prefetch one cacheline
sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize);
if (count <= 128)
return generic::Memset<uint512_t>::head_tail(dst, value, count);
+ // Prefetch the next cacheline
sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize * 2);
// Aligned loop
generic::Memset<uint256_t>::block(dst, value);
@@ -52,6 +54,8 @@ LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset =
if (count <= 192) {
return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
} else {
+ generic::Memset<uint512_t>::block(dst, value);
+ generic::Memset<uint256_t>::block(dst + sizeof(uint512_t), value);
return generic::Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(
dst, value, count);
}
>From 50ffede6c6c40f2b97eca84d57ca9765ef552fd1 Mon Sep 17 00:00:00 2001
From: doshimili <milidoshi at google.com>
Date: Tue, 7 Nov 2023 11:52:48 -0500
Subject: [PATCH 03/14] Move implementation to
src/string/memory_utils/x86_64/inline_memset.h and other minor changes (#4)
* Add software prefetching to memset
* Add software prefetching to memset
* Fix formatting
* Fix build errors
* Fix build errors
* Fix formatting
* Fix formatting
* Fix formatting
* Fix formatting
* Fix formatting
* Add warmup to memset
* SW Prefetching in Memset
* Move implementation to src/string/memory_utils/x86_64/inline_memset.h and other minor changes
* Fix formatting
---
libc/src/string/memory_utils/op_generic.h | 36 ++---
libc/src/string/memory_utils/utils.h | 6 +
.../memory_utils/x86_64/inline_memcpy.h | 5 -
.../memory_utils/x86_64/inline_memset.h | 132 ++++++++++--------
4 files changed, 93 insertions(+), 86 deletions(-)
diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
index 2844501a7459044..833ab9a6624d679 100644
--- a/libc/src/string/memory_utils/op_generic.h
+++ b/libc/src/string/memory_utils/op_generic.h
@@ -141,19 +141,23 @@ template <typename T> struct Memset {
static_assert(is_element_type_v<T>);
static constexpr size_t SIZE = sizeof(T);
- LIBC_INLINE static void block(Ptr dst, uint8_t value) {
+ LIBC_INLINE static void block_offset(Ptr dst, uint8_t value, size_t offset) {
if constexpr (is_scalar_v<T> || is_vector_v<T>) {
- store<T>(dst, splat<T>(value));
+ store<T>(dst + offset, splat<T>(value));
} else if constexpr (is_array_v<T>) {
using value_type = typename T::value_type;
const auto Splat = splat<value_type>(value);
for (size_t I = 0; I < array_size_v<T>; ++I)
- store<value_type>(dst + (I * sizeof(value_type)), Splat);
+ store<value_type>(dst + offset + (I * sizeof(value_type)), Splat);
}
}
+ LIBC_INLINE static void block(Ptr dst, uint8_t value) {
+ block_offset(dst, value, 0);
+ }
+
LIBC_INLINE static void tail(Ptr dst, uint8_t value, size_t count) {
- block(dst + count - SIZE, value);
+ block_offset(dst, value, count - SIZE);
}
LIBC_INLINE static void head_tail(Ptr dst, uint8_t value, size_t count) {
@@ -161,32 +165,18 @@ template <typename T> struct Memset {
tail(dst, value, count);
}
- LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
+ LIBC_INLINE static void loop_and_tail_offset(Ptr dst, uint8_t value,
+ size_t count, size_t offset) {
static_assert(SIZE > 1, "a loop of size 1 does not need tail");
- size_t offset = 0;
do {
- block(dst + offset, value);
+ block_offset(dst, value, offset);
offset += SIZE;
} while (offset < count - SIZE);
tail(dst, value, count);
}
- template <size_t prefetch_distance, size_t prefetch_degree>
- LIBC_INLINE static void loop_and_tail_prefetch(Ptr dst, uint8_t value,
- size_t count) {
- size_t offset = 96;
- while (offset + prefetch_degree + SIZE <= count) {
- for (size_t i = 0; i < prefetch_degree / sw_prefetch::kCachelineSize; ++i)
- sw_prefetch::PrefetchW(dst + offset + prefetch_distance +
- sw_prefetch::kCachelineSize * i);
- for (size_t i = 0; i < prefetch_degree; i += SIZE, offset += SIZE)
- block(dst + offset, value);
- }
- while (offset + SIZE < count) {
- block(dst + offset, value);
- offset += SIZE;
- }
- tail(dst, value, count);
+ LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
+ return loop_and_tail_offset(dst, value, count, 0);
}
};
diff --git a/libc/src/string/memory_utils/utils.h b/libc/src/string/memory_utils/utils.h
index 85677e51fad0e09..62b3b7a0d728bd5 100644
--- a/libc/src/string/memory_utils/utils.h
+++ b/libc/src/string/memory_utils/utils.h
@@ -374,6 +374,12 @@ template <size_t SIZE> struct AlignHelper {
uintptr_t offset_;
};
+LIBC_INLINE void prefetch_for_write(CPtr dst) { __builtin_prefetch(dst, 1, 3); }
+
+LIBC_INLINE void prefetch_to_local_cache(CPtr dst) {
+ __builtin_prefetch(dst, 0, 3);
+}
+
} // namespace LIBC_NAMESPACE
#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_UTILS_H
diff --git a/libc/src/string/memory_utils/x86_64/inline_memcpy.h b/libc/src/string/memory_utils/x86_64/inline_memcpy.h
index f43230ffd8ad125..f851bcec09650d3 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memcpy.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memcpy.h
@@ -47,11 +47,6 @@ LIBC_INLINE_VAR constexpr size_t kRepMovsbThreshold =
} // namespace x86
-// TODO: Move to a shared header when appropriate.
-[[maybe_unused]] LIBC_INLINE void prefetch_to_local_cache(const void *addr) {
- __builtin_prefetch(addr, 0, 3);
-}
-
[[maybe_unused]] LIBC_INLINE void
inline_memcpy_x86_sse2_ge64(Ptr __restrict dst, CPtr __restrict src,
size_t count) {
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index 98f559bca875a3a..b6d3d5a0b65cbb9 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -12,83 +12,99 @@
#include "src/string/memory_utils/op_generic.h"
#include "src/string/memory_utils/op_x86.h"
#include "src/string/memory_utils/utils.h" // Ptr, CPtr
+#include "third_party/llvm/llvm-project/libc/src/string/memory_utils/inline_memcpy.h"
#include <stddef.h> // size_t
namespace LIBC_NAMESPACE {
namespace x86 {
+// Size of one cache line for software prefetching
+LIBC_INLINE_VAR constexpr size_t kOneCachelineSize = 64;
+LIBC_INLINE_VAR constexpr size_t kTwoCachelinesSize = kOneCachelineSize * 2;
+LIBC_INLINE_VAR constexpr size_t kFiveCachelinesSize = kOneCachelineSize * 5;
+
LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset =
LLVM_LIBC_IS_DEFINED(LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING);
} // namespace x86
#if defined(__AVX512F__)
- using uint128_t = generic_v128;
- using uint256_t = generic_v256;
- using uint512_t = generic_v512;
+using uint128_t = generic_v128;
+using uint256_t = generic_v256;
+using uint512_t = generic_v512;
#elif defined(__AVX__)
- using uint128_t = generic_v128;
- using uint256_t = generic_v256;
- using uint512_t = cpp::array<generic_v256, 2>;
+using uint128_t = generic_v128;
+using uint256_t = generic_v256;
+using uint512_t = cpp::array<generic_v256, 2>;
#elif defined(__SSE2__)
- using uint128_t = generic_v128;
- using uint256_t = cpp::array<generic_v128, 2>;
- using uint512_t = cpp::array<generic_v128, 4>;
+using uint128_t = generic_v128;
+using uint256_t = cpp::array<generic_v128, 2>;
+using uint512_t = cpp::array<generic_v128, 4>;
#else
- using uint128_t = cpp::array<uint64_t, 2>;
- using uint256_t = cpp::array<uint64_t, 4>;
- using uint512_t = cpp::array<uint64_t, 8>;
+using uint128_t = cpp::array<uint64_t, 2>;
+using uint256_t = cpp::array<uint64_t, 4>;
+using uint512_t = cpp::array<uint64_t, 8>;
#endif
- [[maybe_unused]] LIBC_INLINE static void
- inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
- // Prefetch one cacheline
- sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize);
- if (count <= 128)
- return generic::Memset<uint512_t>::head_tail(dst, value, count);
- // Prefetch the next cacheline
- sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize * 2);
- // Aligned loop
- generic::Memset<uint256_t>::block(dst, value);
- align_to_next_boundary<32>(dst, count);
- if (count <= 192) {
- return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
- } else {
- generic::Memset<uint512_t>::block(dst, value);
- generic::Memset<uint256_t>::block(dst + sizeof(uint512_t), value);
- return generic::Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(
- dst, value, count);
+[[maybe_unused]] LIBC_INLINE static void
+inline_memset_x86_gt64_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
+ size_t prefetch_distance = x86::kFiveCachelinesSize;
+ size_t prefetch_degree = x86::kTwoCachelinesSize;
+ size_t SIZE = sizeof(uint256_t);
+ // Prefetch one cache line
+ prefetch_for_write(dst + x86::kOneCachelineSize);
+ if (count <= 128)
+ return generic::Memset<uint512_t>::head_tail(dst, value, count);
+ // Prefetch the second cache line
+ prefetch_for_write(dst + x86::kTwoCachelinesSize);
+ // Aligned loop
+ generic::Memset<uint256_t>::block(dst, value);
+ align_to_next_boundary<32>(dst, count);
+ if (count <= 192) {
+ return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
+ } else {
+ generic::Memset<uint512_t>::block(dst, value);
+ generic::Memset<uint256_t>::block_offset(dst, value, SIZE);
+ size_t offset = 96;
+ while (offset + prefetch_degree + SIZE <= count) {
+ for (size_t i = 0; i < prefetch_degree / x86::kOneCachelineSize; ++i)
+ prefetch_for_write(dst + offset + prefetch_distance +
+ x86::kOneCachelineSize * i);
+ for (size_t i = 0; i < prefetch_degree; i += SIZE, offset += SIZE)
+ generic::Memset<uint256_t>::block_offset(dst, value, offset);
}
+ generic::Memset<uint256_t>::loop_and_tail_offset(dst, value, count, offset);
}
+}
+
+[[maybe_unused]] LIBC_INLINE static void
+inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
+ if (count == 0)
+ return;
+ if (count == 1)
+ return generic::Memset<uint8_t>::block(dst, value);
+ if (count == 2)
+ return generic::Memset<uint16_t>::block(dst, value);
+ if (count == 3)
+ return generic::MemsetSequence<uint16_t, uint8_t>::block(dst, value);
+ if (count <= 8)
+ return generic::Memset<uint32_t>::head_tail(dst, value, count);
+ if (count <= 16)
+ return generic::Memset<uint64_t>::head_tail(dst, value, count);
+ if (count <= 32)
+ return generic::Memset<uint128_t>::head_tail(dst, value, count);
+ if (count <= 64)
+ return generic::Memset<uint256_t>::head_tail(dst, value, count);
+ if constexpr (x86::kUseSoftwarePrefetchingMemset)
+ return inline_memset_x86_gt64_sw_prefetching(dst, value, count);
+ if (count <= 128)
+ return generic::Memset<uint512_t>::head_tail(dst, value, count);
+ // Aligned loop
+ generic::Memset<uint256_t>::block(dst, value);
+ align_to_next_boundary<32>(dst, count);
+ return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
+}
- [[maybe_unused]] LIBC_INLINE static void
- inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
- if (count == 0)
- return;
- if (count == 1)
- return generic::Memset<uint8_t>::block(dst, value);
- if (count == 2)
- return generic::Memset<uint16_t>::block(dst, value);
- if (count == 3)
- return generic::MemsetSequence<uint16_t, uint8_t>::block(dst, value);
- if (count <= 8)
- return generic::Memset<uint32_t>::head_tail(dst, value, count);
- if (count <= 16)
- return generic::Memset<uint64_t>::head_tail(dst, value, count);
- if (count <= 32)
- return generic::Memset<uint128_t>::head_tail(dst, value, count);
- if (count <= 64)
- return generic::Memset<uint256_t>::head_tail(dst, value, count);
- if constexpr (x86::kUseSoftwarePrefetchingMemset) {
- return inline_memset_x86_sw_prefetching(dst, value, count);
- }
- if (count <= 128)
- return generic::Memset<uint512_t>::head_tail(dst, value, count);
- // Aligned loop
- generic::Memset<uint256_t>::block(dst, value);
- align_to_next_boundary<32>(dst, count);
- return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
- }
} // namespace LIBC_NAMESPACE
#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_MEMSET_H
>From fbb1f23c8e8e026178a6b2489307dbe9097298d5 Mon Sep 17 00:00:00 2001
From: doshimili <milidoshi at google.com>
Date: Tue, 7 Nov 2023 19:16:56 +0000
Subject: [PATCH 06/14] Remove wrong include
---
libc/src/string/memory_utils/x86_64/inline_memset.h | 1 -
1 file changed, 1 deletion(-)
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index b6d3d5a0b65cbb9..9b92cd130bc60b4 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -12,7 +12,6 @@
#include "src/string/memory_utils/op_generic.h"
#include "src/string/memory_utils/op_x86.h"
#include "src/string/memory_utils/utils.h" // Ptr, CPtr
-#include "third_party/llvm/llvm-project/libc/src/string/memory_utils/inline_memcpy.h"
#include <stddef.h> // size_t
>From 9cd1f2350059cfd243c79edc95b5148b6299896c Mon Sep 17 00:00:00 2001
From: doshimili <milidoshi at google.com>
Date: Tue, 7 Nov 2023 20:27:50 +0000
Subject: [PATCH 07/14] Fix memset warmup
---
libc/src/string/memory_utils/x86_64/inline_memset.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index 9b92cd130bc60b4..90e8104257703a4 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -63,7 +63,7 @@ inline_memset_x86_gt64_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
} else {
generic::Memset<uint512_t>::block(dst, value);
- generic::Memset<uint256_t>::block_offset(dst, value, SIZE);
+ generic::Memset<uint256_t>::block_offset(dst, value, sizeof(uint512_t));
size_t offset = 96;
while (offset + prefetch_degree + SIZE <= count) {
for (size_t i = 0; i < prefetch_degree / x86::kOneCachelineSize; ++i)
>From 45e51a3aa663765b61cac68e177da738296accea Mon Sep 17 00:00:00 2001
From: doshimili <milidoshi at google.com>
Date: Wed, 8 Nov 2023 16:33:14 +0000
Subject: [PATCH 08/14] Remove block_offset and other minor changes
---
libc/src/string/memory_utils/op_generic.h | 21 +++++--------------
libc/src/string/memory_utils/utils.h | 6 ++++--
.../memory_utils/x86_64/inline_memset.h | 20 ++++++++----------
3 files changed, 18 insertions(+), 29 deletions(-)
diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
index 833ab9a6624d679..db218f8577ab58d 100644
--- a/libc/src/string/memory_utils/op_generic.h
+++ b/libc/src/string/memory_utils/op_generic.h
@@ -48,13 +48,6 @@ using generic_v256 = uint8_t __attribute__((__vector_size__(32)));
using generic_v512 = uint8_t __attribute__((__vector_size__(64)));
} // namespace LIBC_NAMESPACE
-namespace LIBC_NAMESPACE::sw_prefetch {
-// Size of a cacheline for software prefetching
-static constexpr size_t kCachelineSize = 64;
-// prefetch for write
-static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); }
-} // namespace LIBC_NAMESPACE::sw_prefetch
-
namespace LIBC_NAMESPACE::generic {
// We accept three types of values as elements for generic operations:
@@ -141,23 +134,19 @@ template <typename T> struct Memset {
static_assert(is_element_type_v<T>);
static constexpr size_t SIZE = sizeof(T);
- LIBC_INLINE static void block_offset(Ptr dst, uint8_t value, size_t offset) {
+ LIBC_INLINE static void block(Ptr dst, uint8_t value) {
if constexpr (is_scalar_v<T> || is_vector_v<T>) {
- store<T>(dst + offset, splat<T>(value));
+ store<T>(dst, splat<T>(value));
} else if constexpr (is_array_v<T>) {
using value_type = typename T::value_type;
const auto Splat = splat<value_type>(value);
for (size_t I = 0; I < array_size_v<T>; ++I)
- store<value_type>(dst + offset + (I * sizeof(value_type)), Splat);
+ store<value_type>(dst + (I * sizeof(value_type)), Splat);
}
}
- LIBC_INLINE static void block(Ptr dst, uint8_t value) {
- block_offset(dst, value, 0);
- }
-
LIBC_INLINE static void tail(Ptr dst, uint8_t value, size_t count) {
- block_offset(dst, value, count - SIZE);
+ block(dst + count - SIZE, value);
}
LIBC_INLINE static void head_tail(Ptr dst, uint8_t value, size_t count) {
@@ -169,7 +158,7 @@ template <typename T> struct Memset {
size_t count, size_t offset) {
static_assert(SIZE > 1, "a loop of size 1 does not need tail");
do {
- block_offset(dst, value, offset);
+ block(dst + offset, value);
offset += SIZE;
} while (offset < count - SIZE);
tail(dst, value, count);
diff --git a/libc/src/string/memory_utils/utils.h b/libc/src/string/memory_utils/utils.h
index 62b3b7a0d728bd5..f70880ee853d307 100644
--- a/libc/src/string/memory_utils/utils.h
+++ b/libc/src/string/memory_utils/utils.h
@@ -374,10 +374,12 @@ template <size_t SIZE> struct AlignHelper {
uintptr_t offset_;
};
-LIBC_INLINE void prefetch_for_write(CPtr dst) { __builtin_prefetch(dst, 1, 3); }
+LIBC_INLINE void prefetch_for_write(CPtr dst) {
+ __builtin_prefetch(dst, /*write*/ 1, /*max locality*/ 3);
+}
LIBC_INLINE void prefetch_to_local_cache(CPtr dst) {
- __builtin_prefetch(dst, 0, 3);
+ __builtin_prefetch(dst, /*read*/ 0, /*max locality*/ 3);
}
} // namespace LIBC_NAMESPACE
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index 90e8104257703a4..9b95df663393535 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -47,9 +47,9 @@ using uint512_t = cpp::array<uint64_t, 8>;
[[maybe_unused]] LIBC_INLINE static void
inline_memset_x86_gt64_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
- size_t prefetch_distance = x86::kFiveCachelinesSize;
- size_t prefetch_degree = x86::kTwoCachelinesSize;
- size_t SIZE = sizeof(uint256_t);
+ constexpr size_t PREFETCH_DISTANCE = x86::kFiveCachelinesSize;
+ constexpr size_t PREFETCH_DEGREE = x86::kTwoCachelinesSize;
+ constexpr size_t SIZE = sizeof(uint256_t);
// Prefetch one cache line
prefetch_for_write(dst + x86::kOneCachelineSize);
if (count <= 128)
@@ -62,15 +62,13 @@ inline_memset_x86_gt64_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
if (count <= 192) {
return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
} else {
- generic::Memset<uint512_t>::block(dst, value);
- generic::Memset<uint256_t>::block_offset(dst, value, sizeof(uint512_t));
+ generic::MemsetSequence<uint512_t, uint256_t>::block(dst, value);
size_t offset = 96;
- while (offset + prefetch_degree + SIZE <= count) {
- for (size_t i = 0; i < prefetch_degree / x86::kOneCachelineSize; ++i)
- prefetch_for_write(dst + offset + prefetch_distance +
- x86::kOneCachelineSize * i);
- for (size_t i = 0; i < prefetch_degree; i += SIZE, offset += SIZE)
- generic::Memset<uint256_t>::block_offset(dst, value, offset);
+ while (offset + PREFETCH_DEGREE + SIZE <= count) {
+ prefetch_for_write(dst + offset + PREFETCH_DISTANCE);
+ prefetch_for_write(dst + offset + PREFETCH_DISTANCE + kOneCachelineSize);
+ for (size_t i = 0; i < PREFETCH_DEGREE; i += SIZE, offset += SIZE)
+ generic::Memset<uint256_t>::block(dst + offset, value);
}
generic::Memset<uint256_t>::loop_and_tail_offset(dst, value, count, offset);
}
>From 13adbd113d47dd18de72737a27c0251b9ac98513 Mon Sep 17 00:00:00 2001
From: doshimili <milidoshi at google.com>
Date: Wed, 8 Nov 2023 16:42:38 +0000
Subject: [PATCH 09/14] Bug fixes
---
libc/src/string/memory_utils/x86_64/inline_memset.h | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index 9b95df663393535..c980a1cde7b36da 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -66,7 +66,7 @@ inline_memset_x86_gt64_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
size_t offset = 96;
while (offset + PREFETCH_DEGREE + SIZE <= count) {
prefetch_for_write(dst + offset + PREFETCH_DISTANCE);
- prefetch_for_write(dst + offset + PREFETCH_DISTANCE + kOneCachelineSize);
+ prefetch_for_write(dst + offset + PREFETCH_DISTANCE + x86::kOneCachelineSize);
for (size_t i = 0; i < PREFETCH_DEGREE; i += SIZE, offset += SIZE)
generic::Memset<uint256_t>::block(dst + offset, value);
}
@@ -101,7 +101,6 @@ inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
align_to_next_boundary<32>(dst, count);
return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
}
-
} // namespace LIBC_NAMESPACE
#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_MEMSET_H
>From c15871bdd519490249d49c96bac3c8c158147ccf Mon Sep 17 00:00:00 2001
From: doshimili <milidoshi at google.com>
Date: Wed, 8 Nov 2023 16:59:26 +0000
Subject: [PATCH 10/14] Formatting fixes
---
libc/src/string/memory_utils/x86_64/inline_memset.h | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index c980a1cde7b36da..42559b6ffa9bf56 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -66,7 +66,8 @@ inline_memset_x86_gt64_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
size_t offset = 96;
while (offset + PREFETCH_DEGREE + SIZE <= count) {
prefetch_for_write(dst + offset + PREFETCH_DISTANCE);
- prefetch_for_write(dst + offset + PREFETCH_DISTANCE + x86::kOneCachelineSize);
+ prefetch_for_write(dst + offset + PREFETCH_DISTANCE +
+ x86::kOneCachelineSize);
for (size_t i = 0; i < PREFETCH_DEGREE; i += SIZE, offset += SIZE)
generic::Memset<uint256_t>::block(dst + offset, value);
}
>From c08bea25eea629ea2a25b2caf483c970bbe26969 Mon Sep 17 00:00:00 2001
From: doshimili <milidoshi at google.com>
Date: Wed, 8 Nov 2023 17:20:44 +0000
Subject: [PATCH 11/14] Formatting fixes
---
libc/src/string/memory_utils/x86_64/inline_memset.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index 42559b6ffa9bf56..41eadf2dcc00cc1 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -67,7 +67,7 @@ inline_memset_x86_gt64_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
while (offset + PREFETCH_DEGREE + SIZE <= count) {
prefetch_for_write(dst + offset + PREFETCH_DISTANCE);
prefetch_for_write(dst + offset + PREFETCH_DISTANCE +
- x86::kOneCachelineSize);
+ x86::kOneCachelineSize);
for (size_t i = 0; i < PREFETCH_DEGREE; i += SIZE, offset += SIZE)
generic::Memset<uint256_t>::block(dst + offset, value);
}
>From a5ab2993207ac5bd789109f41d522ddb46216baa Mon Sep 17 00:00:00 2001
From: doshimili <milidoshi at google.com>
Date: Thu, 9 Nov 2023 15:07:42 +0000
Subject: [PATCH 12/14] Add memset option to config.json
---
libc/config/config.json | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/libc/config/config.json b/libc/config/config.json
index 3c74e0ed1eddf47..a7f2e5113d66010 100644
--- a/libc/config/config.json
+++ b/libc/config/config.json
@@ -21,6 +21,10 @@
"LIBC_CONF_STRING_UNSAFE_WIDE_READ": {
"value": false,
"doc": "Read more than a byte at a time to perform byte-string operations like strlen."
+ },
+ "LIBC_CONF_MEMSET_X86_USE_SOFTWARE_PREFETCHING": {
+ "value": false,
+ "doc": "Use software prefetching in memset to increase performance."
}
}
}
>From 9f594586d222515a8139f98a862107e7e65d416d Mon Sep 17 00:00:00 2001
From: doshimili <milidoshi at google.com>
Date: Thu, 9 Nov 2023 15:31:14 +0000
Subject: [PATCH 13/14] Add configuration to CMakeLists
---
libc/src/string/CMakeLists.txt | 3 +++
1 file changed, 3 insertions(+)
diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt
index aa69bff7a8cfada..6daaf1998ea7bc2 100644
--- a/libc/src/string/CMakeLists.txt
+++ b/libc/src/string/CMakeLists.txt
@@ -3,6 +3,9 @@ add_subdirectory(memory_utils)
if(LIBC_CONF_STRING_UNSAFE_WIDE_READ)
list(APPEND string_config_options "-DLIBC_COPT_STRING_UNSAFE_WIDE_READ")
endif()
+if(LIBC_CONF_MEMSET_X86_USE_SOFTWARE_PREFETCHING)
+ list(APPEND string_config_options "-DLIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING")
+endif()
if(string_config_options)
list(PREPEND string_config_options "COMPILE_OPTIONS")
endif()
>From b7e154774d54f5c8c6c73d6db9e7c81e232499b0 Mon Sep 17 00:00:00 2001
From: doshimili <milidoshi at google.com>
Date: Thu, 9 Nov 2023 15:38:06 +0000
Subject: [PATCH 14/14] Modify docstring to describe the configuration better
---
libc/config/config.json | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libc/config/config.json b/libc/config/config.json
index a7f2e5113d66010..77d10d75f364679 100644
--- a/libc/config/config.json
+++ b/libc/config/config.json
@@ -24,7 +24,7 @@
},
"LIBC_CONF_MEMSET_X86_USE_SOFTWARE_PREFETCHING": {
"value": false,
- "doc": "Use software prefetching in memset to increase performance."
+ "doc": "Inserts prefetch for write instructions (PREFETCHW) for memset on x86 to recover performance when hardware prefetcher is disabled."
}
}
}
More information about the libc-commits mailing list