[libc-commits] [llvm] [libc] [libc] Adding a version of memset with software prefetching (PR #70857)
Guillaume Chatelet via libc-commits
libc-commits at lists.llvm.org
Wed Nov 8 06:55:23 PST 2023
@@ -16,27 +16,68 @@
#include <stddef.h> // size_t
namespace LIBC_NAMESPACE {
+namespace x86 {
+// Size of one cache line for software prefetching
+LIBC_INLINE_VAR constexpr size_t kOneCachelineSize = 64;
+LIBC_INLINE_VAR constexpr size_t kTwoCachelinesSize = kOneCachelineSize * 2;
+LIBC_INLINE_VAR constexpr size_t kFiveCachelinesSize = kOneCachelineSize * 5;
+LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset =
+} // namespace x86
-[[maybe_unused]] LIBC_INLINE static void
-inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
#if defined(__AVX512F__)
- using uint128_t = generic_v128;
- using uint256_t = generic_v256;
- using uint512_t = generic_v512;
+using uint128_t = generic_v128;
+using uint256_t = generic_v256;
+using uint512_t = generic_v512;
#elif defined(__AVX__)
- using uint128_t = generic_v128;
- using uint256_t = generic_v256;
- using uint512_t = cpp::array<generic_v256, 2>;
+using uint128_t = generic_v128;
+using uint256_t = generic_v256;
+using uint512_t = cpp::array<generic_v256, 2>;
#elif defined(__SSE2__)
- using uint128_t = generic_v128;
- using uint256_t = cpp::array<generic_v128, 2>;
- using uint512_t = cpp::array<generic_v128, 4>;
+using uint128_t = generic_v128;
+using uint256_t = cpp::array<generic_v128, 2>;
+using uint512_t = cpp::array<generic_v128, 4>;
- using uint128_t = cpp::array<uint64_t, 2>;
- using uint256_t = cpp::array<uint64_t, 4>;
- using uint512_t = cpp::array<uint64_t, 8>;
+using uint128_t = cpp::array<uint64_t, 2>;
+using uint256_t = cpp::array<uint64_t, 4>;
+using uint512_t = cpp::array<uint64_t, 8>;
+[[maybe_unused]] LIBC_INLINE static void
+inline_memset_x86_gt64_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
+ size_t prefetch_distance = x86::kFiveCachelinesSize;
+ size_t prefetch_degree = x86::kTwoCachelinesSize;
+ size_t SIZE = sizeof(uint256_t);
+ // Prefetch one cache line
+ prefetch_for_write(dst + x86::kOneCachelineSize);
+ if (count <= 128)
+ return generic::Memset<uint512_t>::head_tail(dst, value, count);
+ // Prefetch the second cache line
+ prefetch_for_write(dst + x86::kTwoCachelinesSize);
+ // Aligned loop
+ generic::Memset<uint256_t>::block(dst, value);
+ align_to_next_boundary<32>(dst, count);
+ if (count <= 192) {
+ return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
+ } else {
+ generic::Memset<uint512_t>::block(dst, value);
+ generic::Memset<uint256_t>::block_offset(dst, value, sizeof(uint512_t));
+ size_t offset = 96;
+ while (offset + prefetch_degree + SIZE <= count) {
+ for (size_t i = 0; i < prefetch_degree / x86::kOneCachelineSize; ++i)
+ prefetch_for_write(dst + offset + prefetch_distance +
+ x86::kOneCachelineSize * i);
gchatelet wrote:
It might be clearer to unroll the loop, as I did in `memcpy`, since everything is constant.
The `for` loop is literally just these two calls:
prefetch_for_write(dst + offset + prefetch_distance);
prefetch_for_write(dst + offset + prefetch_distance + x86::kOneCachelineSize);
which is clearer in my opinion.
More information about the libc-commits mailing list