[libc-commits] [llvm] [libc] [libc] Adding a version of memset with software prefetching (PR #70857)
via libc-commits
libc-commits at lists.llvm.org
Tue Oct 31 14:13:05 PDT 2023
https://github.com/doshimili created https://github.com/llvm/llvm-project/pull/70857
Software prefetching helps recover performance when hardware prefetching is disabled. Defining the compile-time option 'LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING' enables the software-prefetching memset implementation added by this patch.
From 1110c4e6237dcae26351d7678ee91de0e7fe3791 Mon Sep 17 00:00:00 2001
From: doshimili <milidoshi at google.com>
Date: Tue, 31 Oct 2023 15:42:32 -0400
Subject: [PATCH 1/2] Sw prefetch in memset (#2)
* Add software prefetching to memset
* Add software prefetching to memset
* Fix formatting
* Fix build errors
* Fix build errors
* Fix formatting
* Fix formatting
* Fix formatting
* Fix formatting
* Fix formatting
---
libc/src/string/CMakeLists.txt | 1 +
libc/src/string/memory_utils/op_generic.h | 26 +++++++
.../memory_utils/x86_64/inline_memset.h | 75 ++++++++++++-------
.../llvm-project-overlay/libc/BUILD.bazel | 1 +
4 files changed, 78 insertions(+), 25 deletions(-)
diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt
index 67675b682081c67..aa69bff7a8cfada 100644
--- a/libc/src/string/CMakeLists.txt
+++ b/libc/src/string/CMakeLists.txt
@@ -656,6 +656,7 @@ if(${LIBC_TARGET_ARCHITECTURE_IS_X86})
add_memset(memset_x86_64_opt_sse4 COMPILE_OPTIONS -march=nehalem REQUIRE SSE4_2)
add_memset(memset_x86_64_opt_avx2 COMPILE_OPTIONS -march=haswell REQUIRE AVX2)
add_memset(memset_x86_64_opt_avx512 COMPILE_OPTIONS -march=skylake-avx512 REQUIRE AVX512F)
+ add_memset(memset_x86_64_opt_sw_prefetch COMPILE_OPTIONS -DLIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING)
add_memset(memset_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE})
add_memset(memset)
elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64})
diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
index fd71ca30e24b936..4063de1d5f5832a 100644
--- a/libc/src/string/memory_utils/op_generic.h
+++ b/libc/src/string/memory_utils/op_generic.h
@@ -48,6 +48,13 @@ using generic_v256 = uint8_t __attribute__((__vector_size__(32)));
using generic_v512 = uint8_t __attribute__((__vector_size__(64)));
} // namespace LIBC_NAMESPACE
+namespace LIBC_NAMESPACE::sw_prefetch {
+// Size of a cacheline for software prefetching
+static constexpr size_t kCachelineSize = 64;
+// prefetch for write
+static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); }
+} // namespace LIBC_NAMESPACE::sw_prefetch
+
namespace LIBC_NAMESPACE::generic {
// We accept three types of values as elements for generic operations:
@@ -163,6 +170,25 @@ template <typename T> struct Memset {
} while (offset < count - SIZE);
tail(dst, value, count);
}
+
+ template <size_t prefetch_distance, size_t prefetch_degree>
+ LIBC_INLINE static void loop_and_tail_prefetch(Ptr dst, uint8_t value,
+ size_t count) {
+ size_t offset = 0;
+
+ while (offset + prefetch_degree + SIZE <= count) {
+ for (size_t i = 0; i < prefetch_degree / sw_prefetch::kCachelineSize; ++i)
+ sw_prefetch::PrefetchW(dst + offset + prefetch_distance +
+ sw_prefetch::kCachelineSize * i);
+ for (size_t i = 0; i < prefetch_degree; i += SIZE, offset += SIZE)
+ block(dst + offset, value);
+ }
+ while (offset + SIZE < count) {
+ block(dst + offset, value);
+ offset += SIZE;
+ }
+ tail(dst, value, count);
+ }
};
template <typename T, typename... TS> struct MemsetSequence {
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index 6436594856b0eaf..4834968c0b99f38 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -16,9 +16,12 @@
#include <stddef.h> // size_t
namespace LIBC_NAMESPACE {
+namespace x86 {
+LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset =
+ LLVM_LIBC_IS_DEFINED(LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING);
+
+} // namespace x86
-[[maybe_unused]] LIBC_INLINE static void
-inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
#if defined(__AVX512F__)
using uint128_t = generic_v128;
using uint256_t = generic_v256;
@@ -37,29 +40,51 @@ inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
using uint512_t = cpp::array<uint64_t, 8>;
#endif
- if (count == 0)
- return;
- if (count == 1)
- return generic::Memset<uint8_t>::block(dst, value);
- if (count == 2)
- return generic::Memset<uint16_t>::block(dst, value);
- if (count == 3)
- return generic::MemsetSequence<uint16_t, uint8_t>::block(dst, value);
- if (count <= 8)
- return generic::Memset<uint32_t>::head_tail(dst, value, count);
- if (count <= 16)
- return generic::Memset<uint64_t>::head_tail(dst, value, count);
- if (count <= 32)
- return generic::Memset<uint128_t>::head_tail(dst, value, count);
- if (count <= 64)
- return generic::Memset<uint256_t>::head_tail(dst, value, count);
- if (count <= 128)
- return generic::Memset<uint512_t>::head_tail(dst, value, count);
- // Aligned loop
- generic::Memset<uint256_t>::block(dst, value);
- align_to_next_boundary<32>(dst, count);
- return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
-}
+ [[maybe_unused]] LIBC_INLINE static void
+ inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
+ sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize);
+ if (count <= 128)
+ return generic::Memset<uint512_t>::head_tail(dst, value, count);
+ sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize * 2);
+ // Aligned loop
+ generic::Memset<uint256_t>::block(dst, value);
+ align_to_next_boundary<32>(dst, count);
+ if (count <= 192) {
+ return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
+ } else {
+ return generic::Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(
+ dst, value, count);
+ }
+ }
+
+ [[maybe_unused]] LIBC_INLINE static void
+ inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
+ if (count == 0)
+ return;
+ if (count == 1)
+ return generic::Memset<uint8_t>::block(dst, value);
+ if (count == 2)
+ return generic::Memset<uint16_t>::block(dst, value);
+ if (count == 3)
+ return generic::MemsetSequence<uint16_t, uint8_t>::block(dst, value);
+ if (count <= 8)
+ return generic::Memset<uint32_t>::head_tail(dst, value, count);
+ if (count <= 16)
+ return generic::Memset<uint64_t>::head_tail(dst, value, count);
+ if (count <= 32)
+ return generic::Memset<uint128_t>::head_tail(dst, value, count);
+ if (count <= 64)
+ return generic::Memset<uint256_t>::head_tail(dst, value, count);
+ if constexpr (x86::kUseSoftwarePrefetchingMemset) {
+ return inline_memset_x86_sw_prefetching(dst, value, count);
+ }
+ if (count <= 128)
+ return generic::Memset<uint512_t>::head_tail(dst, value, count);
+ // Aligned loop
+ generic::Memset<uint256_t>::block(dst, value);
+ align_to_next_boundary<32>(dst, count);
+ return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
+ }
} // namespace LIBC_NAMESPACE
#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_MEMSET_H
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 3ae68193dccd2b2..dea21fd77182605 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -33,6 +33,7 @@ PRINTF_COPTS = [
MEMORY_COPTS = [
# "LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE=0",
# "LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING",
+ # "LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING",
]
# A flag to pick which `mpfr` to use for math tests.
From 01be692503dc4e913b00b5d074b2cbb4c63347f6 Mon Sep 17 00:00:00 2001
From: doshimili <milidoshi at google.com>
Date: Tue, 31 Oct 2023 17:05:57 -0400
Subject: [PATCH 2/2] Add software prefetch instructions to memset
* Add software prefetching to memset
* Add software prefetching to memset
* Fix formatting
* Fix build errors
* Fix build errors
* Fix formatting
* Fix formatting
* Fix formatting
* Fix formatting
* Fix formatting
* Add warmup to memset
---
libc/src/string/memory_utils/op_generic.h | 3 +--
libc/src/string/memory_utils/x86_64/inline_memset.h | 4 ++++
2 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
index 4063de1d5f5832a..2844501a7459044 100644
--- a/libc/src/string/memory_utils/op_generic.h
+++ b/libc/src/string/memory_utils/op_generic.h
@@ -174,8 +174,7 @@ template <typename T> struct Memset {
template <size_t prefetch_distance, size_t prefetch_degree>
LIBC_INLINE static void loop_and_tail_prefetch(Ptr dst, uint8_t value,
size_t count) {
- size_t offset = 0;
-
+ size_t offset = 96;
while (offset + prefetch_degree + SIZE <= count) {
for (size_t i = 0; i < prefetch_degree / sw_prefetch::kCachelineSize; ++i)
sw_prefetch::PrefetchW(dst + offset + prefetch_distance +
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index 4834968c0b99f38..98f559bca875a3a 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -42,9 +42,11 @@ LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset =
[[maybe_unused]] LIBC_INLINE static void
inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
+ // Prefetch one cacheline
sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize);
if (count <= 128)
return generic::Memset<uint512_t>::head_tail(dst, value, count);
+ // Prefetch the next cacheline
sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize * 2);
// Aligned loop
generic::Memset<uint256_t>::block(dst, value);
@@ -52,6 +54,8 @@ LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset =
if (count <= 192) {
return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
} else {
+ generic::Memset<uint512_t>::block(dst, value);
+ generic::Memset<uint256_t>::block(dst + sizeof(uint512_t), value);
return generic::Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(
dst, value, count);
}
More information about the libc-commits
mailing list