[libc-commits] [libc] [llvm] [libc] Adding a version of memset with software prefetching (PR #70857)
via libc-commits
libc-commits at lists.llvm.org
Tue Nov 7 12:28:11 PST 2023
https://github.com/doshimili updated https://github.com/llvm/llvm-project/pull/70857
>From 1110c4e6237dcae26351d7678ee91de0e7fe3791 Mon Sep 17 00:00:00 2001
From: doshimili <milidoshi at google.com>
Date: Tue, 31 Oct 2023 15:42:32 -0400
Subject: [PATCH 1/7] Sw prefetch in memset (#2)
* Add software prefetching to memset
* Add software prefetching to memset
* Fix formatting
* Fix build errors
* Fix build errors
* Fix formatting
* Fix formatting
* Fix formatting
* Fix formatting
* Fix formatting
---
libc/src/string/CMakeLists.txt | 1 +
libc/src/string/memory_utils/op_generic.h | 26 +++++++
.../memory_utils/x86_64/inline_memset.h | 75 ++++++++++++-------
.../llvm-project-overlay/libc/BUILD.bazel | 1 +
4 files changed, 78 insertions(+), 25 deletions(-)
diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt
index 67675b682081c67..aa69bff7a8cfada 100644
--- a/libc/src/string/CMakeLists.txt
+++ b/libc/src/string/CMakeLists.txt
@@ -656,6 +656,7 @@ if(${LIBC_TARGET_ARCHITECTURE_IS_X86})
add_memset(memset_x86_64_opt_sse4 COMPILE_OPTIONS -march=nehalem REQUIRE SSE4_2)
add_memset(memset_x86_64_opt_avx2 COMPILE_OPTIONS -march=haswell REQUIRE AVX2)
add_memset(memset_x86_64_opt_avx512 COMPILE_OPTIONS -march=skylake-avx512 REQUIRE AVX512F)
+ add_memset(memset_x86_64_opt_sw_prefetch COMPILE_OPTIONS -DLIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING)
add_memset(memset_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE})
add_memset(memset)
elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64})
diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
index fd71ca30e24b936..4063de1d5f5832a 100644
--- a/libc/src/string/memory_utils/op_generic.h
+++ b/libc/src/string/memory_utils/op_generic.h
@@ -48,6 +48,13 @@ using generic_v256 = uint8_t __attribute__((__vector_size__(32)));
using generic_v512 = uint8_t __attribute__((__vector_size__(64)));
} // namespace LIBC_NAMESPACE
+namespace LIBC_NAMESPACE::sw_prefetch {
+// Size of a cacheline for software prefetching
+static constexpr size_t kCachelineSize = 64;
+// prefetch for write
+static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); }
+} // namespace LIBC_NAMESPACE::sw_prefetch
+
namespace LIBC_NAMESPACE::generic {
// We accept three types of values as elements for generic operations:
@@ -163,6 +170,25 @@ template <typename T> struct Memset {
} while (offset < count - SIZE);
tail(dst, value, count);
}
+
+ template <size_t prefetch_distance, size_t prefetch_degree>
+ LIBC_INLINE static void loop_and_tail_prefetch(Ptr dst, uint8_t value,
+ size_t count) {
+ size_t offset = 0;
+
+ while (offset + prefetch_degree + SIZE <= count) {
+ for (size_t i = 0; i < prefetch_degree / sw_prefetch::kCachelineSize; ++i)
+ sw_prefetch::PrefetchW(dst + offset + prefetch_distance +
+ sw_prefetch::kCachelineSize * i);
+ for (size_t i = 0; i < prefetch_degree; i += SIZE, offset += SIZE)
+ block(dst + offset, value);
+ }
+ while (offset + SIZE < count) {
+ block(dst + offset, value);
+ offset += SIZE;
+ }
+ tail(dst, value, count);
+ }
};
template <typename T, typename... TS> struct MemsetSequence {
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index 6436594856b0eaf..4834968c0b99f38 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -16,9 +16,12 @@
#include <stddef.h> // size_t
namespace LIBC_NAMESPACE {
+namespace x86 {
+LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset =
+ LLVM_LIBC_IS_DEFINED(LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING);
+
+} // namespace x86
-[[maybe_unused]] LIBC_INLINE static void
-inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
#if defined(__AVX512F__)
using uint128_t = generic_v128;
using uint256_t = generic_v256;
@@ -37,29 +40,51 @@ inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
using uint512_t = cpp::array<uint64_t, 8>;
#endif
- if (count == 0)
- return;
- if (count == 1)
- return generic::Memset<uint8_t>::block(dst, value);
- if (count == 2)
- return generic::Memset<uint16_t>::block(dst, value);
- if (count == 3)
- return generic::MemsetSequence<uint16_t, uint8_t>::block(dst, value);
- if (count <= 8)
- return generic::Memset<uint32_t>::head_tail(dst, value, count);
- if (count <= 16)
- return generic::Memset<uint64_t>::head_tail(dst, value, count);
- if (count <= 32)
- return generic::Memset<uint128_t>::head_tail(dst, value, count);
- if (count <= 64)
- return generic::Memset<uint256_t>::head_tail(dst, value, count);
- if (count <= 128)
- return generic::Memset<uint512_t>::head_tail(dst, value, count);
- // Aligned loop
- generic::Memset<uint256_t>::block(dst, value);
- align_to_next_boundary<32>(dst, count);
- return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
-}
+ [[maybe_unused]] LIBC_INLINE static void
+ inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
+ sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize);
+ if (count <= 128)
+ return generic::Memset<uint512_t>::head_tail(dst, value, count);
+ sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize * 2);
+ // Aligned loop
+ generic::Memset<uint256_t>::block(dst, value);
+ align_to_next_boundary<32>(dst, count);
+ if (count <= 192) {
+ return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
+ } else {
+ return generic::Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(
+ dst, value, count);
+ }
+ }
+
+ [[maybe_unused]] LIBC_INLINE static void
+ inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
+ if (count == 0)
+ return;
+ if (count == 1)
+ return generic::Memset<uint8_t>::block(dst, value);
+ if (count == 2)
+ return generic::Memset<uint16_t>::block(dst, value);
+ if (count == 3)
+ return generic::MemsetSequence<uint16_t, uint8_t>::block(dst, value);
+ if (count <= 8)
+ return generic::Memset<uint32_t>::head_tail(dst, value, count);
+ if (count <= 16)
+ return generic::Memset<uint64_t>::head_tail(dst, value, count);
+ if (count <= 32)
+ return generic::Memset<uint128_t>::head_tail(dst, value, count);
+ if (count <= 64)
+ return generic::Memset<uint256_t>::head_tail(dst, value, count);
+ if constexpr (x86::kUseSoftwarePrefetchingMemset) {
+ return inline_memset_x86_sw_prefetching(dst, value, count);
+ }
+ if (count <= 128)
+ return generic::Memset<uint512_t>::head_tail(dst, value, count);
+ // Aligned loop
+ generic::Memset<uint256_t>::block(dst, value);
+ align_to_next_boundary<32>(dst, count);
+ return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
+ }
} // namespace LIBC_NAMESPACE
#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_MEMSET_H
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 3ae68193dccd2b2..dea21fd77182605 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -33,6 +33,7 @@ PRINTF_COPTS = [
MEMORY_COPTS = [
# "LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE=0",
# "LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING",
+ # "LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING",
]
# A flag to pick which `mpfr` to use for math tests.
>From 01be692503dc4e913b00b5d074b2cbb4c63347f6 Mon Sep 17 00:00:00 2001
From: doshimili <milidoshi at google.com>
Date: Tue, 31 Oct 2023 17:05:57 -0400
Subject: [PATCH 2/7] Add software prefetch instructions to memset
* Add software prefetching to memset
* Add software prefetching to memset
* Fix formatting
* Fix build errors
* Fix build errors
* Fix formatting
* Fix formatting
* Fix formatting
* Fix formatting
* Fix formatting
* Add warmup to memset
---
libc/src/string/memory_utils/op_generic.h | 3 +--
libc/src/string/memory_utils/x86_64/inline_memset.h | 4 ++++
2 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
index 4063de1d5f5832a..2844501a7459044 100644
--- a/libc/src/string/memory_utils/op_generic.h
+++ b/libc/src/string/memory_utils/op_generic.h
@@ -174,8 +174,7 @@ template <typename T> struct Memset {
template <size_t prefetch_distance, size_t prefetch_degree>
LIBC_INLINE static void loop_and_tail_prefetch(Ptr dst, uint8_t value,
size_t count) {
- size_t offset = 0;
-
+ size_t offset = 96;
while (offset + prefetch_degree + SIZE <= count) {
for (size_t i = 0; i < prefetch_degree / sw_prefetch::kCachelineSize; ++i)
sw_prefetch::PrefetchW(dst + offset + prefetch_distance +
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index 4834968c0b99f38..98f559bca875a3a 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -42,9 +42,11 @@ LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset =
[[maybe_unused]] LIBC_INLINE static void
inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
+ // Prefetch one cacheline
sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize);
if (count <= 128)
return generic::Memset<uint512_t>::head_tail(dst, value, count);
+ // Prefetch the next cacheline
sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize * 2);
// Aligned loop
generic::Memset<uint256_t>::block(dst, value);
@@ -52,6 +54,8 @@ LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset =
if (count <= 192) {
return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
} else {
+ generic::Memset<uint512_t>::block(dst, value);
+ generic::Memset<uint256_t>::block(dst + sizeof(uint512_t), value);
return generic::Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(
dst, value, count);
}
>From 50ffede6c6c40f2b97eca84d57ca9765ef552fd1 Mon Sep 17 00:00:00 2001
From: doshimili <milidoshi at google.com>
Date: Tue, 7 Nov 2023 11:52:48 -0500
Subject: [PATCH 3/7] Move implementation to
src/string/memory_utils/x86_64/inline_memset.h and other minor changes (#4)
* Add software prefetching to memset
* Add software prefetching to memset
* Fix formatting
* Fix build errors
* Fix build errors
* Fix formatting
* Fix formatting
* Fix formatting
* Fix formatting
* Fix formatting
* Add warmup to memset
* SW Prefetching in Memset
* Move implementation to src/string/memory_utils/x86_64/inline_memset.h and other minor changes
* Fix formatting
---
libc/src/string/memory_utils/op_generic.h | 36 ++---
libc/src/string/memory_utils/utils.h | 6 +
.../memory_utils/x86_64/inline_memcpy.h | 5 -
.../memory_utils/x86_64/inline_memset.h | 132 ++++++++++--------
4 files changed, 93 insertions(+), 86 deletions(-)
diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
index 2844501a7459044..833ab9a6624d679 100644
--- a/libc/src/string/memory_utils/op_generic.h
+++ b/libc/src/string/memory_utils/op_generic.h
@@ -141,19 +141,23 @@ template <typename T> struct Memset {
static_assert(is_element_type_v<T>);
static constexpr size_t SIZE = sizeof(T);
- LIBC_INLINE static void block(Ptr dst, uint8_t value) {
+ LIBC_INLINE static void block_offset(Ptr dst, uint8_t value, size_t offset) {
if constexpr (is_scalar_v<T> || is_vector_v<T>) {
- store<T>(dst, splat<T>(value));
+ store<T>(dst + offset, splat<T>(value));
} else if constexpr (is_array_v<T>) {
using value_type = typename T::value_type;
const auto Splat = splat<value_type>(value);
for (size_t I = 0; I < array_size_v<T>; ++I)
- store<value_type>(dst + (I * sizeof(value_type)), Splat);
+ store<value_type>(dst + offset + (I * sizeof(value_type)), Splat);
}
}
+ LIBC_INLINE static void block(Ptr dst, uint8_t value) {
+ block_offset(dst, value, 0);
+ }
+
LIBC_INLINE static void tail(Ptr dst, uint8_t value, size_t count) {
- block(dst + count - SIZE, value);
+ block_offset(dst, value, count - SIZE);
}
LIBC_INLINE static void head_tail(Ptr dst, uint8_t value, size_t count) {
@@ -161,32 +165,18 @@ template <typename T> struct Memset {
tail(dst, value, count);
}
- LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
+ LIBC_INLINE static void loop_and_tail_offset(Ptr dst, uint8_t value,
+ size_t count, size_t offset) {
static_assert(SIZE > 1, "a loop of size 1 does not need tail");
- size_t offset = 0;
do {
- block(dst + offset, value);
+ block_offset(dst, value, offset);
offset += SIZE;
} while (offset < count - SIZE);
tail(dst, value, count);
}
- template <size_t prefetch_distance, size_t prefetch_degree>
- LIBC_INLINE static void loop_and_tail_prefetch(Ptr dst, uint8_t value,
- size_t count) {
- size_t offset = 96;
- while (offset + prefetch_degree + SIZE <= count) {
- for (size_t i = 0; i < prefetch_degree / sw_prefetch::kCachelineSize; ++i)
- sw_prefetch::PrefetchW(dst + offset + prefetch_distance +
- sw_prefetch::kCachelineSize * i);
- for (size_t i = 0; i < prefetch_degree; i += SIZE, offset += SIZE)
- block(dst + offset, value);
- }
- while (offset + SIZE < count) {
- block(dst + offset, value);
- offset += SIZE;
- }
- tail(dst, value, count);
+ LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
+ return loop_and_tail_offset(dst, value, count, 0);
}
};
diff --git a/libc/src/string/memory_utils/utils.h b/libc/src/string/memory_utils/utils.h
index 85677e51fad0e09..62b3b7a0d728bd5 100644
--- a/libc/src/string/memory_utils/utils.h
+++ b/libc/src/string/memory_utils/utils.h
@@ -374,6 +374,12 @@ template <size_t SIZE> struct AlignHelper {
uintptr_t offset_;
};
+LIBC_INLINE void prefetch_for_write(CPtr dst) { __builtin_prefetch(dst, 1, 3); }
+
+LIBC_INLINE void prefetch_to_local_cache(CPtr dst) {
+ __builtin_prefetch(dst, 0, 3);
+}
+
} // namespace LIBC_NAMESPACE
#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_UTILS_H
diff --git a/libc/src/string/memory_utils/x86_64/inline_memcpy.h b/libc/src/string/memory_utils/x86_64/inline_memcpy.h
index f43230ffd8ad125..f851bcec09650d3 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memcpy.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memcpy.h
@@ -47,11 +47,6 @@ LIBC_INLINE_VAR constexpr size_t kRepMovsbThreshold =
} // namespace x86
-// TODO: Move to a shared header when appropriate.
-[[maybe_unused]] LIBC_INLINE void prefetch_to_local_cache(const void *addr) {
- __builtin_prefetch(addr, 0, 3);
-}
-
[[maybe_unused]] LIBC_INLINE void
inline_memcpy_x86_sse2_ge64(Ptr __restrict dst, CPtr __restrict src,
size_t count) {
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index 98f559bca875a3a..b6d3d5a0b65cbb9 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -12,83 +12,99 @@
#include "src/string/memory_utils/op_generic.h"
#include "src/string/memory_utils/op_x86.h"
#include "src/string/memory_utils/utils.h" // Ptr, CPtr
+#include "third_party/llvm/llvm-project/libc/src/string/memory_utils/inline_memcpy.h"
#include <stddef.h> // size_t
namespace LIBC_NAMESPACE {
namespace x86 {
+// Size of one cache line for software prefetching
+LIBC_INLINE_VAR constexpr size_t kOneCachelineSize = 64;
+LIBC_INLINE_VAR constexpr size_t kTwoCachelinesSize = kOneCachelineSize * 2;
+LIBC_INLINE_VAR constexpr size_t kFiveCachelinesSize = kOneCachelineSize * 5;
+
LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset =
LLVM_LIBC_IS_DEFINED(LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING);
} // namespace x86
#if defined(__AVX512F__)
- using uint128_t = generic_v128;
- using uint256_t = generic_v256;
- using uint512_t = generic_v512;
+using uint128_t = generic_v128;
+using uint256_t = generic_v256;
+using uint512_t = generic_v512;
#elif defined(__AVX__)
- using uint128_t = generic_v128;
- using uint256_t = generic_v256;
- using uint512_t = cpp::array<generic_v256, 2>;
+using uint128_t = generic_v128;
+using uint256_t = generic_v256;
+using uint512_t = cpp::array<generic_v256, 2>;
#elif defined(__SSE2__)
- using uint128_t = generic_v128;
- using uint256_t = cpp::array<generic_v128, 2>;
- using uint512_t = cpp::array<generic_v128, 4>;
+using uint128_t = generic_v128;
+using uint256_t = cpp::array<generic_v128, 2>;
+using uint512_t = cpp::array<generic_v128, 4>;
#else
- using uint128_t = cpp::array<uint64_t, 2>;
- using uint256_t = cpp::array<uint64_t, 4>;
- using uint512_t = cpp::array<uint64_t, 8>;
+using uint128_t = cpp::array<uint64_t, 2>;
+using uint256_t = cpp::array<uint64_t, 4>;
+using uint512_t = cpp::array<uint64_t, 8>;
#endif
- [[maybe_unused]] LIBC_INLINE static void
- inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
- // Prefetch one cacheline
- sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize);
- if (count <= 128)
- return generic::Memset<uint512_t>::head_tail(dst, value, count);
- // Prefetch the next cacheline
- sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize * 2);
- // Aligned loop
- generic::Memset<uint256_t>::block(dst, value);
- align_to_next_boundary<32>(dst, count);
- if (count <= 192) {
- return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
- } else {
- generic::Memset<uint512_t>::block(dst, value);
- generic::Memset<uint256_t>::block(dst + sizeof(uint512_t), value);
- return generic::Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(
- dst, value, count);
+[[maybe_unused]] LIBC_INLINE static void
+inline_memset_x86_gt64_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
+ size_t prefetch_distance = x86::kFiveCachelinesSize;
+ size_t prefetch_degree = x86::kTwoCachelinesSize;
+ size_t SIZE = sizeof(uint256_t);
+ // Prefetch one cache line
+ prefetch_for_write(dst + x86::kOneCachelineSize);
+ if (count <= 128)
+ return generic::Memset<uint512_t>::head_tail(dst, value, count);
+ // Prefetch the second cache line
+ prefetch_for_write(dst + x86::kTwoCachelinesSize);
+ // Aligned loop
+ generic::Memset<uint256_t>::block(dst, value);
+ align_to_next_boundary<32>(dst, count);
+ if (count <= 192) {
+ return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
+ } else {
+ generic::Memset<uint512_t>::block(dst, value);
+ generic::Memset<uint256_t>::block_offset(dst, value, SIZE);
+ size_t offset = 96;
+ while (offset + prefetch_degree + SIZE <= count) {
+ for (size_t i = 0; i < prefetch_degree / x86::kOneCachelineSize; ++i)
+ prefetch_for_write(dst + offset + prefetch_distance +
+ x86::kOneCachelineSize * i);
+ for (size_t i = 0; i < prefetch_degree; i += SIZE, offset += SIZE)
+ generic::Memset<uint256_t>::block_offset(dst, value, offset);
}
+ generic::Memset<uint256_t>::loop_and_tail_offset(dst, value, count, offset);
}
+}
+
+[[maybe_unused]] LIBC_INLINE static void
+inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
+ if (count == 0)
+ return;
+ if (count == 1)
+ return generic::Memset<uint8_t>::block(dst, value);
+ if (count == 2)
+ return generic::Memset<uint16_t>::block(dst, value);
+ if (count == 3)
+ return generic::MemsetSequence<uint16_t, uint8_t>::block(dst, value);
+ if (count <= 8)
+ return generic::Memset<uint32_t>::head_tail(dst, value, count);
+ if (count <= 16)
+ return generic::Memset<uint64_t>::head_tail(dst, value, count);
+ if (count <= 32)
+ return generic::Memset<uint128_t>::head_tail(dst, value, count);
+ if (count <= 64)
+ return generic::Memset<uint256_t>::head_tail(dst, value, count);
+ if constexpr (x86::kUseSoftwarePrefetchingMemset)
+ return inline_memset_x86_gt64_sw_prefetching(dst, value, count);
+ if (count <= 128)
+ return generic::Memset<uint512_t>::head_tail(dst, value, count);
+ // Aligned loop
+ generic::Memset<uint256_t>::block(dst, value);
+ align_to_next_boundary<32>(dst, count);
+ return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
+}
- [[maybe_unused]] LIBC_INLINE static void
- inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
- if (count == 0)
- return;
- if (count == 1)
- return generic::Memset<uint8_t>::block(dst, value);
- if (count == 2)
- return generic::Memset<uint16_t>::block(dst, value);
- if (count == 3)
- return generic::MemsetSequence<uint16_t, uint8_t>::block(dst, value);
- if (count <= 8)
- return generic::Memset<uint32_t>::head_tail(dst, value, count);
- if (count <= 16)
- return generic::Memset<uint64_t>::head_tail(dst, value, count);
- if (count <= 32)
- return generic::Memset<uint128_t>::head_tail(dst, value, count);
- if (count <= 64)
- return generic::Memset<uint256_t>::head_tail(dst, value, count);
- if constexpr (x86::kUseSoftwarePrefetchingMemset) {
- return inline_memset_x86_sw_prefetching(dst, value, count);
- }
- if (count <= 128)
- return generic::Memset<uint512_t>::head_tail(dst, value, count);
- // Aligned loop
- generic::Memset<uint256_t>::block(dst, value);
- align_to_next_boundary<32>(dst, count);
- return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
- }
} // namespace LIBC_NAMESPACE
#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_MEMSET_H
>From fbb1f23c8e8e026178a6b2489307dbe9097298d5 Mon Sep 17 00:00:00 2001
From: doshimili <milidoshi at google.com>
Date: Tue, 7 Nov 2023 19:16:56 +0000
Subject: [PATCH 6/7] Remove wrong include
---
libc/src/string/memory_utils/x86_64/inline_memset.h | 1 -
1 file changed, 1 deletion(-)
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index b6d3d5a0b65cbb9..9b92cd130bc60b4 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -12,7 +12,6 @@
#include "src/string/memory_utils/op_generic.h"
#include "src/string/memory_utils/op_x86.h"
#include "src/string/memory_utils/utils.h" // Ptr, CPtr
-#include "third_party/llvm/llvm-project/libc/src/string/memory_utils/inline_memcpy.h"
#include <stddef.h> // size_t
>From 9cd1f2350059cfd243c79edc95b5148b6299896c Mon Sep 17 00:00:00 2001
From: doshimili <milidoshi at google.com>
Date: Tue, 7 Nov 2023 20:27:50 +0000
Subject: [PATCH 7/7] Fix memset warmup
---
libc/src/string/memory_utils/x86_64/inline_memset.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index 9b92cd130bc60b4..90e8104257703a4 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -63,7 +63,7 @@ inline_memset_x86_gt64_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
} else {
generic::Memset<uint512_t>::block(dst, value);
- generic::Memset<uint256_t>::block_offset(dst, value, SIZE);
+ generic::Memset<uint256_t>::block_offset(dst, value, sizeof(uint512_t));
size_t offset = 96;
while (offset + prefetch_degree + SIZE <= count) {
for (size_t i = 0; i < prefetch_degree / x86::kOneCachelineSize; ++i)
More information about the libc-commits mailing list