[libc-commits] [libc] 3153aa4 - [libc] Adding a version of memset with software prefetching (#70857)
via libc-commits
libc-commits at lists.llvm.org
Fri Nov 10 01:56:20 PST 2023
Author: doshimili
Date: 2023-11-10T10:56:16+01:00
New Revision: 3153aa4c959a65ac8e7f911ffa04b7ccb4641bb5
URL: https://github.com/llvm/llvm-project/commit/3153aa4c959a65ac8e7f911ffa04b7ccb4641bb5
DIFF: https://github.com/llvm/llvm-project/commit/3153aa4c959a65ac8e7f911ffa04b7ccb4641bb5.diff
LOG: [libc] Adding a version of memset with software prefetching (#70857)
Software prefetching helps recover performance when hardware prefetching
is disabled. The 'LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING'
compile-time option enables this software-prefetching memset implementation.
Added:
Modified:
libc/config/config.json
libc/src/string/CMakeLists.txt
libc/src/string/memory_utils/op_generic.h
libc/src/string/memory_utils/utils.h
libc/src/string/memory_utils/x86_64/inline_memcpy.h
libc/src/string/memory_utils/x86_64/inline_memset.h
utils/bazel/llvm-project-overlay/libc/BUILD.bazel
Removed:
################################################################################
diff --git a/libc/config/config.json b/libc/config/config.json
index 3c74e0ed1eddf47..77d10d75f364679 100644
--- a/libc/config/config.json
+++ b/libc/config/config.json
@@ -21,6 +21,10 @@
"LIBC_CONF_STRING_UNSAFE_WIDE_READ": {
"value": false,
"doc": "Read more than a byte at a time to perform byte-string operations like strlen."
+ },
+ "LIBC_CONF_MEMSET_X86_USE_SOFTWARE_PREFETCHING": {
+ "value": false,
+ "doc": "Inserts prefetch for write instructions (PREFETCHW) for memset on x86 to recover performance when hardware prefetcher is disabled."
}
}
}
diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt
index 67675b682081c67..6daaf1998ea7bc2 100644
--- a/libc/src/string/CMakeLists.txt
+++ b/libc/src/string/CMakeLists.txt
@@ -3,6 +3,9 @@ add_subdirectory(memory_utils)
if(LIBC_CONF_STRING_UNSAFE_WIDE_READ)
list(APPEND string_config_options "-DLIBC_COPT_STRING_UNSAFE_WIDE_READ")
endif()
+if(LIBC_CONF_MEMSET_X86_USE_SOFTWARE_PREFETCHING)
+ list(APPEND string_config_options "-DLIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING")
+endif()
if(string_config_options)
list(PREPEND string_config_options "COMPILE_OPTIONS")
endif()
@@ -656,6 +659,7 @@ if(${LIBC_TARGET_ARCHITECTURE_IS_X86})
add_memset(memset_x86_64_opt_sse4 COMPILE_OPTIONS -march=nehalem REQUIRE SSE4_2)
add_memset(memset_x86_64_opt_avx2 COMPILE_OPTIONS -march=haswell REQUIRE AVX2)
add_memset(memset_x86_64_opt_avx512 COMPILE_OPTIONS -march=skylake-avx512 REQUIRE AVX512F)
+ add_memset(memset_x86_64_opt_sw_prefetch COMPILE_OPTIONS -DLIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING)
add_memset(memset_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE})
add_memset(memset)
elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64})
diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
index fd71ca30e24b936..db218f8577ab58d 100644
--- a/libc/src/string/memory_utils/op_generic.h
+++ b/libc/src/string/memory_utils/op_generic.h
@@ -154,15 +154,19 @@ template <typename T> struct Memset {
tail(dst, value, count);
}
- LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
+ LIBC_INLINE static void loop_and_tail_offset(Ptr dst, uint8_t value,
+ size_t count, size_t offset) {
static_assert(SIZE > 1, "a loop of size 1 does not need tail");
- size_t offset = 0;
do {
block(dst + offset, value);
offset += SIZE;
} while (offset < count - SIZE);
tail(dst, value, count);
}
+
+ LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
+ return loop_and_tail_offset(dst, value, count, 0);
+ }
};
template <typename T, typename... TS> struct MemsetSequence {
diff --git a/libc/src/string/memory_utils/utils.h b/libc/src/string/memory_utils/utils.h
index 85677e51fad0e09..f70880ee853d307 100644
--- a/libc/src/string/memory_utils/utils.h
+++ b/libc/src/string/memory_utils/utils.h
@@ -374,6 +374,14 @@ template <size_t SIZE> struct AlignHelper {
uintptr_t offset_;
};
+LIBC_INLINE void prefetch_for_write(CPtr dst) {
+ __builtin_prefetch(dst, /*write*/ 1, /*max locality*/ 3);
+}
+
+LIBC_INLINE void prefetch_to_local_cache(CPtr dst) {
+ __builtin_prefetch(dst, /*read*/ 0, /*max locality*/ 3);
+}
+
} // namespace LIBC_NAMESPACE
#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_UTILS_H
diff --git a/libc/src/string/memory_utils/x86_64/inline_memcpy.h b/libc/src/string/memory_utils/x86_64/inline_memcpy.h
index a1e2985fc1f89ed..dd09d4f3e812b0c 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memcpy.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memcpy.h
@@ -47,11 +47,6 @@ LIBC_INLINE_VAR constexpr size_t kRepMovsbThreshold =
} // namespace x86
-// TODO: Move to a shared header when appropriate.
-[[maybe_unused]] LIBC_INLINE void prefetch_to_local_cache(const void *addr) {
- __builtin_prefetch(addr, 0, 3);
-}
-
[[maybe_unused]] LIBC_INLINE void
inline_memcpy_x86_sse2_ge64(Ptr __restrict dst, CPtr __restrict src,
size_t count) {
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index 6436594856b0eaf..41eadf2dcc00cc1 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -16,27 +16,67 @@
#include <stddef.h> // size_t
namespace LIBC_NAMESPACE {
+namespace x86 {
+// Size of one cache line for software prefetching
+LIBC_INLINE_VAR constexpr size_t kOneCachelineSize = 64;
+LIBC_INLINE_VAR constexpr size_t kTwoCachelinesSize = kOneCachelineSize * 2;
+LIBC_INLINE_VAR constexpr size_t kFiveCachelinesSize = kOneCachelineSize * 5;
+
+LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset =
+ LLVM_LIBC_IS_DEFINED(LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING);
+
+} // namespace x86
-[[maybe_unused]] LIBC_INLINE static void
-inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
#if defined(__AVX512F__)
- using uint128_t = generic_v128;
- using uint256_t = generic_v256;
- using uint512_t = generic_v512;
+using uint128_t = generic_v128;
+using uint256_t = generic_v256;
+using uint512_t = generic_v512;
#elif defined(__AVX__)
- using uint128_t = generic_v128;
- using uint256_t = generic_v256;
- using uint512_t = cpp::array<generic_v256, 2>;
+using uint128_t = generic_v128;
+using uint256_t = generic_v256;
+using uint512_t = cpp::array<generic_v256, 2>;
#elif defined(__SSE2__)
- using uint128_t = generic_v128;
- using uint256_t = cpp::array<generic_v128, 2>;
- using uint512_t = cpp::array<generic_v128, 4>;
+using uint128_t = generic_v128;
+using uint256_t = cpp::array<generic_v128, 2>;
+using uint512_t = cpp::array<generic_v128, 4>;
#else
- using uint128_t = cpp::array<uint64_t, 2>;
- using uint256_t = cpp::array<uint64_t, 4>;
- using uint512_t = cpp::array<uint64_t, 8>;
+using uint128_t = cpp::array<uint64_t, 2>;
+using uint256_t = cpp::array<uint64_t, 4>;
+using uint512_t = cpp::array<uint64_t, 8>;
#endif
+[[maybe_unused]] LIBC_INLINE static void
+inline_memset_x86_gt64_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
+ constexpr size_t PREFETCH_DISTANCE = x86::kFiveCachelinesSize;
+ constexpr size_t PREFETCH_DEGREE = x86::kTwoCachelinesSize;
+ constexpr size_t SIZE = sizeof(uint256_t);
+ // Prefetch one cache line
+ prefetch_for_write(dst + x86::kOneCachelineSize);
+ if (count <= 128)
+ return generic::Memset<uint512_t>::head_tail(dst, value, count);
+ // Prefetch the second cache line
+ prefetch_for_write(dst + x86::kTwoCachelinesSize);
+ // Aligned loop
+ generic::Memset<uint256_t>::block(dst, value);
+ align_to_next_boundary<32>(dst, count);
+ if (count <= 192) {
+ return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
+ } else {
+ generic::MemsetSequence<uint512_t, uint256_t>::block(dst, value);
+ size_t offset = 96;
+ while (offset + PREFETCH_DEGREE + SIZE <= count) {
+ prefetch_for_write(dst + offset + PREFETCH_DISTANCE);
+ prefetch_for_write(dst + offset + PREFETCH_DISTANCE +
+ x86::kOneCachelineSize);
+ for (size_t i = 0; i < PREFETCH_DEGREE; i += SIZE, offset += SIZE)
+ generic::Memset<uint256_t>::block(dst + offset, value);
+ }
+ generic::Memset<uint256_t>::loop_and_tail_offset(dst, value, count, offset);
+ }
+}
+
+[[maybe_unused]] LIBC_INLINE static void
+inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
if (count == 0)
return;
if (count == 1)
@@ -53,6 +93,8 @@ inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
return generic::Memset<uint128_t>::head_tail(dst, value, count);
if (count <= 64)
return generic::Memset<uint256_t>::head_tail(dst, value, count);
+ if constexpr (x86::kUseSoftwarePrefetchingMemset)
+ return inline_memset_x86_gt64_sw_prefetching(dst, value, count);
if (count <= 128)
return generic::Memset<uint512_t>::head_tail(dst, value, count);
// Aligned loop
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 612d5a87eaf73e2..d9252208e9b8d55 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -32,6 +32,7 @@ PRINTF_COPTS = [
MEMORY_COPTS = [
# "LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE=0",
# "LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING",
+ # "LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING",
]
# A flag to pick which `mpfr` to use for math tests.
More information about the libc-commits
mailing list