[clang-tools-extra] [libc] Adding a version of memset with software prefetching (PR #70493)
via cfe-commits
cfe-commits at lists.llvm.org
Fri Oct 27 11:58:36 PDT 2023
https://github.com/doshimili created https://github.com/llvm/llvm-project/pull/70493
Software prefetching helps recover performance when hardware prefetching is disabled. The compile-time option 'LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING' lets users opt in to the software-prefetching memset implementation added by this patch.
>From 6c313955185c0d59564f6535b6f1580dca168bea Mon Sep 17 00:00:00 2001
From: Your Name <milidoshi at google.com>
Date: Tue, 24 Oct 2023 21:15:23 +0000
Subject: [PATCH 1/3] Add software prefetching to memset
---
libc/src/string/memory_utils/op_generic.h | 19 +++++++++++++++++++
.../memory_utils/x86_64/inline_memset.h | 12 +++++++++++-
2 files changed, 30 insertions(+), 1 deletion(-)
diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
index fd71ca30e24b936..54af7ea10e25e46 100644
--- a/libc/src/string/memory_utils/op_generic.h
+++ b/libc/src/string/memory_utils/op_generic.h
@@ -163,6 +163,25 @@ template <typename T> struct Memset {
} while (offset < count - SIZE);
tail(dst, value, count);
}
+
+ template <size_t prefetch_distance, size_t prefetch_degree>
+ LIBC_INLINE static void loop_and_tail_prefetch(Ptr dst, uint8_t value,
+ size_t count) {
+ Memset<uint512_t>::block(dst, value);
+ Memset<uint256_t>::block(dst + 64, value);
+ size_t offset = 96;
+ while (offset + prefetch_degree + kSize <= count) {
+ for (size_t i = 0; i < prefetch_degree / kCachelineSize; ++i)
+ PrefetchW(dst + offset + prefetch_distance + kCachelineSize * i);
+ for (size_t i = 0; i < prefetch_degree; i += kSize, offset += kSize)
+ block(dst + offset, value);
+ }
+ while (offset + kSize < count) {
+ block(dst + offset, value);
+ offset += kSize;
+ }
+ tail(dst, value, count);
+ }
};
template <typename T, typename... TS> struct MemsetSequence {
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index 6436594856b0eaf..da463bc0029f9aa 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -17,6 +17,11 @@
namespace LIBC_NAMESPACE {
+static constexpr size_t kCachelineSize = 64;
+
+// prefetch for write
+static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); }
+
[[maybe_unused]] LIBC_INLINE static void
inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
#if defined(__AVX512F__)
@@ -53,12 +58,17 @@ inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
return generic::Memset<uint128_t>::head_tail(dst, value, count);
if (count <= 64)
return generic::Memset<uint256_t>::head_tail(dst, value, count);
+ PrefetchW(dst + kCachelineSize);
if (count <= 128)
return generic::Memset<uint512_t>::head_tail(dst, value, count);
+ PrefetchW(dst + kCachelineSize * 2);
// Aligned loop
generic::Memset<uint256_t>::block(dst, value);
align_to_next_boundary<32>(dst, count);
- return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
+ if (count <= 192) {
+ return Memset<uint256_t>::loop_and_tail(dst, value, count);
+ }
+ return Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(dst, value, count);
}
} // namespace LIBC_NAMESPACE
>From 15cbd0a0c851fa3ac5315e796bb69c1bf791e956 Mon Sep 17 00:00:00 2001
From: Your Name <milidoshi at google.com>
Date: Tue, 24 Oct 2023 21:15:23 +0000
Subject: [PATCH 2/3] Add software prefetching to memset
---
libc/src/string/CMakeLists.txt | 1 +
.../memory_utils/x86_64/inline_memset.h | 32 ++++++++++++++++---
.../llvm-project-overlay/libc/BUILD.bazel | 1 +
3 files changed, 29 insertions(+), 5 deletions(-)
diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt
index 67675b682081c67..aa69bff7a8cfada 100644
--- a/libc/src/string/CMakeLists.txt
+++ b/libc/src/string/CMakeLists.txt
@@ -656,6 +656,7 @@ if(${LIBC_TARGET_ARCHITECTURE_IS_X86})
add_memset(memset_x86_64_opt_sse4 COMPILE_OPTIONS -march=nehalem REQUIRE SSE4_2)
add_memset(memset_x86_64_opt_avx2 COMPILE_OPTIONS -march=haswell REQUIRE AVX2)
add_memset(memset_x86_64_opt_avx512 COMPILE_OPTIONS -march=skylake-avx512 REQUIRE AVX512F)
+ add_memset(memset_x86_64_opt_sw_prefetch COMPILE_OPTIONS -DLIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING)
add_memset(memset_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE})
add_memset(memset)
elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64})
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index da463bc0029f9aa..f3ad04930c52c64 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -16,12 +16,34 @@
#include <stddef.h> // size_t
namespace LIBC_NAMESPACE {
+namespace x86 {
static constexpr size_t kCachelineSize = 64;
+LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetching =
+ LLVM_LIBC_IS_DEFINED(LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING);
+
+} // namespace x86
+
// prefetch for write
static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); }
+[[maybe_unused]] LIBC_INLINE static void inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
+ PrefetchW(dst + kCachelineSize);
+ if (count <= 128)
+ return generic::Memset<uint512_t>::head_tail(dst, value, count);
+ PrefetchW(dst + kCachelineSize * 2);
+ // Aligned loop
+ generic::Memset<uint256_t>::block(dst, value);
+ align_to_next_boundary<32>(dst, count);
+ if (count <= 192) {
+ return Memset<uint256_t>::loop_and_tail(dst, value, count);
+ }
+ else {
+ return Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(dst, value, count);
+ }
+}
+
[[maybe_unused]] LIBC_INLINE static void
inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
#if defined(__AVX512F__)
@@ -58,17 +80,17 @@ inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
return generic::Memset<uint128_t>::head_tail(dst, value, count);
if (count <= 64)
return generic::Memset<uint256_t>::head_tail(dst, value, count);
- PrefetchW(dst + kCachelineSize);
+ if constexpr (x86::kUseSoftwarePrefetching) {
+ return inline_memset_x86_sw_prefetching(dst, value, count);
+ }
if (count <= 128)
return generic::Memset<uint512_t>::head_tail(dst, value, count);
- PrefetchW(dst + kCachelineSize * 2);
// Aligned loop
generic::Memset<uint256_t>::block(dst, value);
align_to_next_boundary<32>(dst, count);
- if (count <= 192) {
- return Memset<uint256_t>::loop_and_tail(dst, value, count);
+ else {
+ return Memset<uint256_t>::loop_and_tail(dst, value, count);
}
- return Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(dst, value, count);
}
} // namespace LIBC_NAMESPACE
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 3ae68193dccd2b2..dea21fd77182605 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -33,6 +33,7 @@ PRINTF_COPTS = [
MEMORY_COPTS = [
# "LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE=0",
# "LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING",
+ # "LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING",
]
# A flag to pick which `mpfr` to use for math tests.
>From abb9debc49b7e171eae14a98320b9a49779c808c Mon Sep 17 00:00:00 2001
From: Your Name <milidoshi at google.com>
Date: Fri, 27 Oct 2023 17:55:47 +0000
Subject: [PATCH 3/3] Fix formatting
---
libc/src/string/memory_utils/x86_64/inline_memset.h | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index f3ad04930c52c64..e82b600bf66ab96 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -28,7 +28,8 @@ LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetching =
// prefetch for write
static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); }
-[[maybe_unused]] LIBC_INLINE static void inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
+[[maybe_unused]] LIBC_INLINE static void
+inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
PrefetchW(dst + kCachelineSize);
if (count <= 128)
return generic::Memset<uint512_t>::head_tail(dst, value, count);
@@ -38,9 +39,9 @@ static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); }
align_to_next_boundary<32>(dst, count);
if (count <= 192) {
return Memset<uint256_t>::loop_and_tail(dst, value, count);
- }
- else {
- return Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(dst, value, count);
+ } else {
+ return Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(dst, value,
+ count);
}
}
@@ -89,7 +90,7 @@ inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
generic::Memset<uint256_t>::block(dst, value);
align_to_next_boundary<32>(dst, count);
else {
- return Memset<uint256_t>::loop_and_tail(dst, value, count);
+ return Memset<uint256_t>::loop_and_tail(dst, value, count);
}
}
} // namespace LIBC_NAMESPACE
More information about the cfe-commits
mailing list