[flang-commits] [libc] [compiler-rt] [clang-tools-extra] [clang] [flang] [libcxx] [llvm] Move implementation to src/string/memory_utils/x86_64/inline_memset.h and other minor changes (PR #71558)
via flang-commits
flang-commits at lists.llvm.org
Tue Nov 7 08:38:34 PST 2023
https://github.com/doshimili updated https://github.com/llvm/llvm-project/pull/71558
>From 6c313955185c0d59564f6535b6f1580dca168bea Mon Sep 17 00:00:00 2001
From: Your Name <milidoshi at google.com>
Date: Tue, 24 Oct 2023 21:15:23 +0000
Subject: [PATCH 01/16] Add software prefetching to memset
---
libc/src/string/memory_utils/op_generic.h | 19 +++++++++++++++++++
.../memory_utils/x86_64/inline_memset.h | 12 +++++++++++-
2 files changed, 30 insertions(+), 1 deletion(-)
diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
index fd71ca30e24b936..54af7ea10e25e46 100644
--- a/libc/src/string/memory_utils/op_generic.h
+++ b/libc/src/string/memory_utils/op_generic.h
@@ -163,6 +163,25 @@ template <typename T> struct Memset {
} while (offset < count - SIZE);
tail(dst, value, count);
}
+
+ template <size_t prefetch_distance, size_t prefetch_degree>
+ LIBC_INLINE static void loop_and_tail_prefetch(Ptr dst, uint8_t value,
+ size_t count) {
+ Memset<uint512_t>::block(dst, value);
+ Memset<uint256_t>::block(dst + 64, value);
+ size_t offset = 96;
+ while (offset + prefetch_degree + kSize <= count) {
+ for (size_t i = 0; i < prefetch_degree / kCachelineSize; ++i)
+ PrefetchW(dst + offset + prefetch_distance + kCachelineSize * i);
+ for (size_t i = 0; i < prefetch_degree; i += kSize, offset += kSize)
+ block(dst + offset, value);
+ }
+ while (offset + kSize < count) {
+ block(dst + offset, value);
+ offset += kSize;
+ }
+ tail(dst, value, count);
+ }
};
template <typename T, typename... TS> struct MemsetSequence {
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index 6436594856b0eaf..da463bc0029f9aa 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -17,6 +17,11 @@
namespace LIBC_NAMESPACE {
+static constexpr size_t kCachelineSize = 64;
+
+// prefetch for write
+static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); }
+
[[maybe_unused]] LIBC_INLINE static void
inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
#if defined(__AVX512F__)
@@ -53,12 +58,17 @@ inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
return generic::Memset<uint128_t>::head_tail(dst, value, count);
if (count <= 64)
return generic::Memset<uint256_t>::head_tail(dst, value, count);
+ PrefetchW(dst + kCachelineSize);
if (count <= 128)
return generic::Memset<uint512_t>::head_tail(dst, value, count);
+ PrefetchW(dst + kCachelineSize * 2);
// Aligned loop
generic::Memset<uint256_t>::block(dst, value);
align_to_next_boundary<32>(dst, count);
- return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
+ if (count <= 192) {
+ return Memset<uint256_t>::loop_and_tail(dst, value, count);
+ }
+ return Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(dst, value, count);
}
} // namespace LIBC_NAMESPACE
>From 15cbd0a0c851fa3ac5315e796bb69c1bf791e956 Mon Sep 17 00:00:00 2001
From: Your Name <milidoshi at google.com>
Date: Tue, 24 Oct 2023 21:15:23 +0000
Subject: [PATCH 02/16] Add software prefetching to memset
---
libc/src/string/CMakeLists.txt | 1 +
.../memory_utils/x86_64/inline_memset.h | 32 ++++++++++++++++---
.../llvm-project-overlay/libc/BUILD.bazel | 1 +
3 files changed, 29 insertions(+), 5 deletions(-)
diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt
index 67675b682081c67..aa69bff7a8cfada 100644
--- a/libc/src/string/CMakeLists.txt
+++ b/libc/src/string/CMakeLists.txt
@@ -656,6 +656,7 @@ if(${LIBC_TARGET_ARCHITECTURE_IS_X86})
add_memset(memset_x86_64_opt_sse4 COMPILE_OPTIONS -march=nehalem REQUIRE SSE4_2)
add_memset(memset_x86_64_opt_avx2 COMPILE_OPTIONS -march=haswell REQUIRE AVX2)
add_memset(memset_x86_64_opt_avx512 COMPILE_OPTIONS -march=skylake-avx512 REQUIRE AVX512F)
+ add_memset(memset_x86_64_opt_sw_prefetch COMPILE_OPTIONS -DLIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING)
add_memset(memset_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE})
add_memset(memset)
elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64})
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index da463bc0029f9aa..f3ad04930c52c64 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -16,12 +16,34 @@
#include <stddef.h> // size_t
namespace LIBC_NAMESPACE {
+namespace x86 {
static constexpr size_t kCachelineSize = 64;
+LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetching =
+ LLVM_LIBC_IS_DEFINED(LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING);
+
+} // namespace x86
+
// prefetch for write
static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); }
+[[maybe_unused]] LIBC_INLINE static void inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
+ PrefetchW(dst + kCachelineSize);
+ if (count <= 128)
+ return generic::Memset<uint512_t>::head_tail(dst, value, count);
+ PrefetchW(dst + kCachelineSize * 2);
+ // Aligned loop
+ generic::Memset<uint256_t>::block(dst, value);
+ align_to_next_boundary<32>(dst, count);
+ if (count <= 192) {
+ return Memset<uint256_t>::loop_and_tail(dst, value, count);
+ }
+ else {
+ return Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(dst, value, count);
+ }
+}
+
[[maybe_unused]] LIBC_INLINE static void
inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
#if defined(__AVX512F__)
@@ -58,17 +80,17 @@ inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
return generic::Memset<uint128_t>::head_tail(dst, value, count);
if (count <= 64)
return generic::Memset<uint256_t>::head_tail(dst, value, count);
- PrefetchW(dst + kCachelineSize);
+ if constexpr (x86::kUseSoftwarePrefetching) {
+ return inline_memset_x86_sw_prefetching(dst, value, count);
+ }
if (count <= 128)
return generic::Memset<uint512_t>::head_tail(dst, value, count);
- PrefetchW(dst + kCachelineSize * 2);
// Aligned loop
generic::Memset<uint256_t>::block(dst, value);
align_to_next_boundary<32>(dst, count);
- if (count <= 192) {
- return Memset<uint256_t>::loop_and_tail(dst, value, count);
+ else {
+ return Memset<uint256_t>::loop_and_tail(dst, value, count);
}
- return Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(dst, value, count);
}
} // namespace LIBC_NAMESPACE
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 3ae68193dccd2b2..dea21fd77182605 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -33,6 +33,7 @@ PRINTF_COPTS = [
MEMORY_COPTS = [
# "LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE=0",
# "LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING",
+ # "LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING",
]
# A flag to pick which `mpfr` to use for math tests.
>From abb9debc49b7e171eae14a98320b9a49779c808c Mon Sep 17 00:00:00 2001
From: Your Name <milidoshi at google.com>
Date: Fri, 27 Oct 2023 17:55:47 +0000
Subject: [PATCH 03/16] Fix formatting
---
libc/src/string/memory_utils/x86_64/inline_memset.h | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index f3ad04930c52c64..e82b600bf66ab96 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -28,7 +28,8 @@ LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetching =
// prefetch for write
static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); }
-[[maybe_unused]] LIBC_INLINE static void inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
+[[maybe_unused]] LIBC_INLINE static void
+inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
PrefetchW(dst + kCachelineSize);
if (count <= 128)
return generic::Memset<uint512_t>::head_tail(dst, value, count);
@@ -38,9 +39,9 @@ static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); }
align_to_next_boundary<32>(dst, count);
if (count <= 192) {
return Memset<uint256_t>::loop_and_tail(dst, value, count);
- }
- else {
- return Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(dst, value, count);
+ } else {
+ return Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(dst, value,
+ count);
}
}
@@ -89,7 +90,7 @@ inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
generic::Memset<uint256_t>::block(dst, value);
align_to_next_boundary<32>(dst, count);
else {
- return Memset<uint256_t>::loop_and_tail(dst, value, count);
+ return Memset<uint256_t>::loop_and_tail(dst, value, count);
}
}
} // namespace LIBC_NAMESPACE
>From 2155db70066c2c220160c4178bd73237e1372d45 Mon Sep 17 00:00:00 2001
From: Your Name <milidoshi at google.com>
Date: Mon, 30 Oct 2023 14:53:56 +0000
Subject: [PATCH 04/16] Fix build errors
---
libc/src/string/memory_utils/op_generic.h | 15 +++++++++------
.../string/memory_utils/x86_64/inline_memset.h | 7 ++-----
2 files changed, 11 insertions(+), 11 deletions(-)
diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
index 54af7ea10e25e46..4ba137c97ec9a9a 100644
--- a/libc/src/string/memory_utils/op_generic.h
+++ b/libc/src/string/memory_utils/op_generic.h
@@ -87,6 +87,9 @@ template <class T, size_t N>
struct array_size<cpp::array<T, N>> : cpp::integral_constant<size_t, N> {};
template <typename T> constexpr size_t array_size_v = array_size<T>::value;
+// Size of a cacheline for software prefetching
+static constexpr size_t kCachelineSize = 64;
+
// Generic operations for the above type categories.
template <typename T> T load(CPtr src) {
@@ -167,18 +170,18 @@ template <typename T> struct Memset {
template <size_t prefetch_distance, size_t prefetch_degree>
LIBC_INLINE static void loop_and_tail_prefetch(Ptr dst, uint8_t value,
size_t count) {
- Memset<uint512_t>::block(dst, value);
- Memset<uint256_t>::block(dst + 64, value);
+ Memset<64>::block(dst, value);
+ Memset<32>::block(dst + 64, value);
size_t offset = 96;
- while (offset + prefetch_degree + kSize <= count) {
+ while (offset + prefetch_degree + SIZE <= count) {
for (size_t i = 0; i < prefetch_degree / kCachelineSize; ++i)
PrefetchW(dst + offset + prefetch_distance + kCachelineSize * i);
- for (size_t i = 0; i < prefetch_degree; i += kSize, offset += kSize)
+ for (size_t i = 0; i < prefetch_degree; i += SIZE, offset += SIZE)
block(dst + offset, value);
}
- while (offset + kSize < count) {
+ while (offset + SIZE < count) {
block(dst + offset, value);
- offset += kSize;
+ offset += SIZE;
}
tail(dst, value, count);
}
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index e82b600bf66ab96..fca48e9658a752d 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -17,9 +17,6 @@
namespace LIBC_NAMESPACE {
namespace x86 {
-
-static constexpr size_t kCachelineSize = 64;
-
LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetching =
LLVM_LIBC_IS_DEFINED(LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING);
@@ -30,10 +27,10 @@ static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); }
[[maybe_unused]] LIBC_INLINE static void
inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
- PrefetchW(dst + kCachelineSize);
+ PrefetchW(dst + generic::kCachelineSize);
if (count <= 128)
return generic::Memset<uint512_t>::head_tail(dst, value, count);
- PrefetchW(dst + kCachelineSize * 2);
+ PrefetchW(dst + generic::kCachelineSize * 2);
// Aligned loop
generic::Memset<uint256_t>::block(dst, value);
align_to_next_boundary<32>(dst, count);
>From 9fe0041c2bb8ba1d522538c79ac1ebae7d0632bb Mon Sep 17 00:00:00 2001
From: Your Name <milidoshi at google.com>
Date: Mon, 30 Oct 2023 17:09:15 +0000
Subject: [PATCH 05/16] Fix build errors
---
libc/src/string/memory_utils/op_generic.h | 15 +++---
.../memory_utils/x86_64/inline_memset.h | 50 +++++++++----------
2 files changed, 33 insertions(+), 32 deletions(-)
diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
index 4ba137c97ec9a9a..12eeb65a1edc52e 100644
--- a/libc/src/string/memory_utils/op_generic.h
+++ b/libc/src/string/memory_utils/op_generic.h
@@ -48,6 +48,13 @@ using generic_v256 = uint8_t __attribute__((__vector_size__(32)));
using generic_v512 = uint8_t __attribute__((__vector_size__(64)));
} // namespace LIBC_NAMESPACE
+namespace sw_prefetch {
+ // Size of a cacheline for software prefetching
+static constexpr size_t kCachelineSize = 64;
+ // prefetch for write
+static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); }
+}
+
namespace LIBC_NAMESPACE::generic {
// We accept three types of values as elements for generic operations:
@@ -87,9 +94,6 @@ template <class T, size_t N>
struct array_size<cpp::array<T, N>> : cpp::integral_constant<size_t, N> {};
template <typename T> constexpr size_t array_size_v = array_size<T>::value;
-// Size of a cacheline for software prefetching
-static constexpr size_t kCachelineSize = 64;
-
// Generic operations for the above type categories.
template <typename T> T load(CPtr src) {
@@ -167,12 +171,9 @@ template <typename T> struct Memset {
tail(dst, value, count);
}
- template <size_t prefetch_distance, size_t prefetch_degree>
+ template <size_t prefetch_distance, size_t prefetch_degree, size_t offset>
LIBC_INLINE static void loop_and_tail_prefetch(Ptr dst, uint8_t value,
size_t count) {
- Memset<64>::block(dst, value);
- Memset<32>::block(dst + 64, value);
- size_t offset = 96;
while (offset + prefetch_degree + SIZE <= count) {
for (size_t i = 0; i < prefetch_degree / kCachelineSize; ++i)
PrefetchW(dst + offset + prefetch_distance + kCachelineSize * i);
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index fca48e9658a752d..bc7a6162f77b9cd 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -17,33 +17,11 @@
namespace LIBC_NAMESPACE {
namespace x86 {
-LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetching =
+LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset =
LLVM_LIBC_IS_DEFINED(LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING);
} // namespace x86
-// prefetch for write
-static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); }
-
-[[maybe_unused]] LIBC_INLINE static void
-inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
- PrefetchW(dst + generic::kCachelineSize);
- if (count <= 128)
- return generic::Memset<uint512_t>::head_tail(dst, value, count);
- PrefetchW(dst + generic::kCachelineSize * 2);
- // Aligned loop
- generic::Memset<uint256_t>::block(dst, value);
- align_to_next_boundary<32>(dst, count);
- if (count <= 192) {
- return Memset<uint256_t>::loop_and_tail(dst, value, count);
- } else {
- return Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(dst, value,
- count);
- }
-}
-
-[[maybe_unused]] LIBC_INLINE static void
-inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
#if defined(__AVX512F__)
using uint128_t = generic_v128;
using uint256_t = generic_v256;
@@ -62,6 +40,28 @@ inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
using uint512_t = cpp::array<uint64_t, 8>;
#endif
+[[maybe_unused]] LIBC_INLINE static void
+inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
+ sw_prefetch::PrefetchW(dst + generic::kCachelineSize);
+ if (count <= 128)
+ return generic::Memset<uint512_t>::head_tail(dst, value, count);
+ sw_prefetch::PrefetchW(dst + generic::kCachelineSize * 2);
+ // Aligned loop
+ generic::Memset<uint256_t>::block(dst, value);
+ align_to_next_boundary<32>(dst, count);
+ if (count <= 192) {
+ return Memset<uint256_t>::loop_and_tail(dst, value, count);
+ } else {
+ // Warm up memset
+ generic::Memset<uint256_t>::block(dst, value);
+ generic::Memset<uint128_t>::block(dst + 64, value);
+ return generic::Memset<uint256_t>::loop_and_tail_prefetch<320, 128, 96>(dst, value,
+ count);
+ }
+}
+
+[[maybe_unused]] LIBC_INLINE static void
+inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
if (count == 0)
return;
if (count == 1)
@@ -78,7 +78,7 @@ inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
return generic::Memset<uint128_t>::head_tail(dst, value, count);
if (count <= 64)
return generic::Memset<uint256_t>::head_tail(dst, value, count);
- if constexpr (x86::kUseSoftwarePrefetching) {
+ if constexpr (x86::kUseSoftwarePrefetchingMemset) {
return inline_memset_x86_sw_prefetching(dst, value, count);
}
if (count <= 128)
@@ -87,7 +87,7 @@ inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
generic::Memset<uint256_t>::block(dst, value);
align_to_next_boundary<32>(dst, count);
else {
- return Memset<uint256_t>::loop_and_tail(dst, value, count);
+ return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
}
}
} // namespace LIBC_NAMESPACE
>From 52aad858a4a8652f95a3e1120e4dd7bd2f45d225 Mon Sep 17 00:00:00 2001
From: Your Name <milidoshi at google.com>
Date: Mon, 30 Oct 2023 17:39:04 +0000
Subject: [PATCH 06/16] Fix formatting
---
libc/src/string/memory_utils/op_generic.h | 6 +-
.../memory_utils/x86_64/inline_memset.h | 92 +++++++++----------
2 files changed, 49 insertions(+), 49 deletions(-)
diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
index 12eeb65a1edc52e..af6a814be1542a4 100644
--- a/libc/src/string/memory_utils/op_generic.h
+++ b/libc/src/string/memory_utils/op_generic.h
@@ -49,11 +49,11 @@ using generic_v512 = uint8_t __attribute__((__vector_size__(64)));
} // namespace LIBC_NAMESPACE
namespace sw_prefetch {
- // Size of a cacheline for software prefetching
+// Size of a cacheline for software prefetching
static constexpr size_t kCachelineSize = 64;
- // prefetch for write
+// prefetch for write
static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); }
-}
+} // namespace sw_prefetch
namespace LIBC_NAMESPACE::generic {
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index bc7a6162f77b9cd..9000aa03019d291 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -40,56 +40,56 @@ LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset =
using uint512_t = cpp::array<uint64_t, 8>;
#endif
-[[maybe_unused]] LIBC_INLINE static void
-inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
- sw_prefetch::PrefetchW(dst + generic::kCachelineSize);
- if (count <= 128)
- return generic::Memset<uint512_t>::head_tail(dst, value, count);
- sw_prefetch::PrefetchW(dst + generic::kCachelineSize * 2);
- // Aligned loop
- generic::Memset<uint256_t>::block(dst, value);
- align_to_next_boundary<32>(dst, count);
- if (count <= 192) {
- return Memset<uint256_t>::loop_and_tail(dst, value, count);
- } else {
- // Warm up memset
+ [[maybe_unused]] LIBC_INLINE static void
+ inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
+ sw_prefetch::PrefetchW(dst + generic::kCachelineSize);
+ if (count <= 128)
+ return generic::Memset<uint512_t>::head_tail(dst, value, count);
+ sw_prefetch::PrefetchW(dst + generic::kCachelineSize * 2);
+ // Aligned loop
generic::Memset<uint256_t>::block(dst, value);
- generic::Memset<uint128_t>::block(dst + 64, value);
- return generic::Memset<uint256_t>::loop_and_tail_prefetch<320, 128, 96>(dst, value,
- count);
+ align_to_next_boundary<32>(dst, count);
+ if (count <= 192) {
+ return Memset<uint256_t>::loop_and_tail(dst, value, count);
+ } else {
+ // Warm up memset
+ generic::Memset<uint256_t>::block(dst, value);
+ generic::Memset<uint128_t>::block(dst + 64, value);
+ return generic::Memset<uint256_t>::loop_and_tail_prefetch<320, 128, 96>(
+ dst, value, count);
+ }
}
-}
-[[maybe_unused]] LIBC_INLINE static void
-inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
- if (count == 0)
- return;
- if (count == 1)
- return generic::Memset<uint8_t>::block(dst, value);
- if (count == 2)
- return generic::Memset<uint16_t>::block(dst, value);
- if (count == 3)
- return generic::MemsetSequence<uint16_t, uint8_t>::block(dst, value);
- if (count <= 8)
- return generic::Memset<uint32_t>::head_tail(dst, value, count);
- if (count <= 16)
- return generic::Memset<uint64_t>::head_tail(dst, value, count);
- if (count <= 32)
- return generic::Memset<uint128_t>::head_tail(dst, value, count);
- if (count <= 64)
- return generic::Memset<uint256_t>::head_tail(dst, value, count);
- if constexpr (x86::kUseSoftwarePrefetchingMemset) {
- return inline_memset_x86_sw_prefetching(dst, value, count);
- }
- if (count <= 128)
- return generic::Memset<uint512_t>::head_tail(dst, value, count);
- // Aligned loop
- generic::Memset<uint256_t>::block(dst, value);
- align_to_next_boundary<32>(dst, count);
- else {
- return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
+ [[maybe_unused]] LIBC_INLINE static void
+ inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
+ if (count == 0)
+ return;
+ if (count == 1)
+ return generic::Memset<uint8_t>::block(dst, value);
+ if (count == 2)
+ return generic::Memset<uint16_t>::block(dst, value);
+ if (count == 3)
+ return generic::MemsetSequence<uint16_t, uint8_t>::block(dst, value);
+ if (count <= 8)
+ return generic::Memset<uint32_t>::head_tail(dst, value, count);
+ if (count <= 16)
+ return generic::Memset<uint64_t>::head_tail(dst, value, count);
+ if (count <= 32)
+ return generic::Memset<uint128_t>::head_tail(dst, value, count);
+ if (count <= 64)
+ return generic::Memset<uint256_t>::head_tail(dst, value, count);
+ if constexpr (x86::kUseSoftwarePrefetchingMemset) {
+ return inline_memset_x86_sw_prefetching(dst, value, count);
+ }
+ if (count <= 128)
+ return generic::Memset<uint512_t>::head_tail(dst, value, count);
+ // Aligned loop
+ generic::Memset<uint256_t>::block(dst, value);
+ align_to_next_boundary<32>(dst, count);
+ else {
+ return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
+ }
}
-}
} // namespace LIBC_NAMESPACE
#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_MEMSET_H
>From efbfcd19cecbd3e27b72523d06d8cff3a5bbbafa Mon Sep 17 00:00:00 2001
From: Your Name <milidoshi at google.com>
Date: Mon, 30 Oct 2023 19:12:19 +0000
Subject: [PATCH 07/16] Fix formatting
---
libc/src/string/memory_utils/op_generic.h | 24 +++++++++++--------
.../memory_utils/x86_64/inline_memset.h | 6 ++---
2 files changed, 17 insertions(+), 13 deletions(-)
diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
index af6a814be1542a4..ae221e0fa380655 100644
--- a/libc/src/string/memory_utils/op_generic.h
+++ b/libc/src/string/memory_utils/op_generic.h
@@ -48,12 +48,12 @@ using generic_v256 = uint8_t __attribute__((__vector_size__(32)));
using generic_v512 = uint8_t __attribute__((__vector_size__(64)));
} // namespace LIBC_NAMESPACE
-namespace sw_prefetch {
+namespace LIBC_NAMESPACE::sw_prefetch {
// Size of a cacheline for software prefetching
static constexpr size_t kCachelineSize = 64;
// prefetch for write
static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); }
-} // namespace sw_prefetch
+} // namespace LIBC_NAMESPACE::sw_prefetch
namespace LIBC_NAMESPACE::generic {
@@ -174,15 +174,19 @@ template <typename T> struct Memset {
template <size_t prefetch_distance, size_t prefetch_degree, size_t offset>
LIBC_INLINE static void loop_and_tail_prefetch(Ptr dst, uint8_t value,
size_t count) {
- while (offset + prefetch_degree + SIZE <= count) {
- for (size_t i = 0; i < prefetch_degree / kCachelineSize; ++i)
- PrefetchW(dst + offset + prefetch_distance + kCachelineSize * i);
- for (size_t i = 0; i < prefetch_degree; i += SIZE, offset += SIZE)
- block(dst + offset, value);
+ size_t prefetch_offset = offset;
+
+ while (prefetch_offset + prefetch_degree + SIZE <= count) {
+ for (size_t i = 0; i < prefetch_degree / sw_prefetch::kCachelineSize; ++i)
+ PrefetchW(dst + prefetch_offset + prefetch_distance +
+ sw_prefetch::kCachelineSize * i);
+ for (size_t i = 0; i < prefetch_degree;
+ i += SIZE, prefetch_offset += SIZE)
+ block(dst + prefetch_offset, value);
}
- while (offset + SIZE < count) {
- block(dst + offset, value);
- offset += SIZE;
+ while (prefetch_offset + SIZE < count) {
+ block(dst + prefetch_offset, value);
+ prefetch_offset += SIZE;
}
tail(dst, value, count);
}
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index 9000aa03019d291..fc00f86fc0fb34e 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -42,15 +42,15 @@ LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset =
[[maybe_unused]] LIBC_INLINE static void
inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
- sw_prefetch::PrefetchW(dst + generic::kCachelineSize);
+ sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize);
if (count <= 128)
return generic::Memset<uint512_t>::head_tail(dst, value, count);
- sw_prefetch::PrefetchW(dst + generic::kCachelineSize * 2);
+ sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize * 2);
// Aligned loop
generic::Memset<uint256_t>::block(dst, value);
align_to_next_boundary<32>(dst, count);
if (count <= 192) {
- return Memset<uint256_t>::loop_and_tail(dst, value, count);
+ return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
} else {
// Warm up memset
generic::Memset<uint256_t>::block(dst, value);
>From d97c8c0d17c860a892c029059469b70962e4a201 Mon Sep 17 00:00:00 2001
From: Your Name <milidoshi at google.com>
Date: Mon, 30 Oct 2023 20:21:49 +0000
Subject: [PATCH 08/16] Fix formatting
---
libc/src/string/memory_utils/op_generic.h | 4 ++--
libc/src/string/memory_utils/x86_64/inline_memset.h | 4 +---
2 files changed, 3 insertions(+), 5 deletions(-)
diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
index ae221e0fa380655..35f74b544bb3598 100644
--- a/libc/src/string/memory_utils/op_generic.h
+++ b/libc/src/string/memory_utils/op_generic.h
@@ -178,8 +178,8 @@ template <typename T> struct Memset {
while (prefetch_offset + prefetch_degree + SIZE <= count) {
for (size_t i = 0; i < prefetch_degree / sw_prefetch::kCachelineSize; ++i)
- PrefetchW(dst + prefetch_offset + prefetch_distance +
- sw_prefetch::kCachelineSize * i);
+ sw_prefetch::PrefetchW(dst + prefetch_offset + prefetch_distance +
+ sw_prefetch::kCachelineSize * i);
for (size_t i = 0; i < prefetch_degree;
i += SIZE, prefetch_offset += SIZE)
block(dst + prefetch_offset, value);
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index fc00f86fc0fb34e..50ba2fb2e37cfab 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -86,9 +86,7 @@ LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset =
// Aligned loop
generic::Memset<uint256_t>::block(dst, value);
align_to_next_boundary<32>(dst, count);
- else {
- return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
- }
+ return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
}
} // namespace LIBC_NAMESPACE
>From 34d572e81b561b4450022dc358f6e3a91632224f Mon Sep 17 00:00:00 2001
From: Your Name <milidoshi at google.com>
Date: Mon, 30 Oct 2023 20:21:49 +0000
Subject: [PATCH 09/16] Fix formatting
---
libc/src/string/memory_utils/op_generic.h | 18 +++++++++---------
.../string/memory_utils/x86_64/inline_memset.h | 5 +----
2 files changed, 10 insertions(+), 13 deletions(-)
diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
index 35f74b544bb3598..f36c3acafff5665 100644
--- a/libc/src/string/memory_utils/op_generic.h
+++ b/libc/src/string/memory_utils/op_generic.h
@@ -171,22 +171,22 @@ template <typename T> struct Memset {
tail(dst, value, count);
}
- template <size_t prefetch_distance, size_t prefetch_degree, size_t offset>
+ template <size_t prefetch_distance, size_t prefetch_degree>
LIBC_INLINE static void loop_and_tail_prefetch(Ptr dst, uint8_t value,
size_t count) {
- size_t prefetch_offset = offset;
+ size_t offset = 0;
- while (prefetch_offset + prefetch_degree + SIZE <= count) {
+ while (offset + prefetch_degree + SIZE <= count) {
for (size_t i = 0; i < prefetch_degree / sw_prefetch::kCachelineSize; ++i)
- sw_prefetch::PrefetchW(dst + prefetch_offset + prefetch_distance +
+ sw_prefetch::PrefetchW(dst + offset + prefetch_distance +
sw_prefetch::kCachelineSize * i);
for (size_t i = 0; i < prefetch_degree;
- i += SIZE, prefetch_offset += SIZE)
- block(dst + prefetch_offset, value);
+ i += SIZE, offset += SIZE)
+ block(dst + offset, value);
}
- while (prefetch_offset + SIZE < count) {
- block(dst + prefetch_offset, value);
- prefetch_offset += SIZE;
+ while (offset + SIZE < count) {
+ block(dst + offset, value);
+ offset += SIZE;
}
tail(dst, value, count);
}
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index 50ba2fb2e37cfab..4834968c0b99f38 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -52,10 +52,7 @@ LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset =
if (count <= 192) {
return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
} else {
- // Warm up memset
- generic::Memset<uint256_t>::block(dst, value);
- generic::Memset<uint128_t>::block(dst + 64, value);
- return generic::Memset<uint256_t>::loop_and_tail_prefetch<320, 128, 96>(
+ return generic::Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(
dst, value, count);
}
}
>From f363ce21eedc44821e5163ad9472396856a096c1 Mon Sep 17 00:00:00 2001
From: Your Name <milidoshi at google.com>
Date: Mon, 30 Oct 2023 20:21:49 +0000
Subject: [PATCH 10/16] Fix formatting
---
libc/src/string/memory_utils/op_generic.h | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
index f36c3acafff5665..4063de1d5f5832a 100644
--- a/libc/src/string/memory_utils/op_generic.h
+++ b/libc/src/string/memory_utils/op_generic.h
@@ -180,8 +180,7 @@ template <typename T> struct Memset {
for (size_t i = 0; i < prefetch_degree / sw_prefetch::kCachelineSize; ++i)
sw_prefetch::PrefetchW(dst + offset + prefetch_distance +
sw_prefetch::kCachelineSize * i);
- for (size_t i = 0; i < prefetch_degree;
- i += SIZE, offset += SIZE)
+ for (size_t i = 0; i < prefetch_degree; i += SIZE, offset += SIZE)
block(dst + offset, value);
}
while (offset + SIZE < count) {
>From 1110c4e6237dcae26351d7678ee91de0e7fe3791 Mon Sep 17 00:00:00 2001
From: doshimili <milidoshi at google.com>
Date: Tue, 31 Oct 2023 15:42:32 -0400
Subject: [PATCH 11/16] Sw prefetch in memset (#2)
* Add software prefetching to memset
* Add software prefetching to memset
* Fix formatting
* Fix build errors
* Fix build errors
* Fix formatting
* Fix formatting
* Fix formatting
* Fix formatting
* Fix formatting
---
libc/src/string/CMakeLists.txt | 1 +
libc/src/string/memory_utils/op_generic.h | 26 +++++++
.../memory_utils/x86_64/inline_memset.h | 75 ++++++++++++-------
.../llvm-project-overlay/libc/BUILD.bazel | 1 +
4 files changed, 78 insertions(+), 25 deletions(-)
diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt
index 67675b682081c67..aa69bff7a8cfada 100644
--- a/libc/src/string/CMakeLists.txt
+++ b/libc/src/string/CMakeLists.txt
@@ -656,6 +656,7 @@ if(${LIBC_TARGET_ARCHITECTURE_IS_X86})
add_memset(memset_x86_64_opt_sse4 COMPILE_OPTIONS -march=nehalem REQUIRE SSE4_2)
add_memset(memset_x86_64_opt_avx2 COMPILE_OPTIONS -march=haswell REQUIRE AVX2)
add_memset(memset_x86_64_opt_avx512 COMPILE_OPTIONS -march=skylake-avx512 REQUIRE AVX512F)
+ add_memset(memset_x86_64_opt_sw_prefetch COMPILE_OPTIONS -DLIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING)
add_memset(memset_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE})
add_memset(memset)
elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64})
diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
index fd71ca30e24b936..4063de1d5f5832a 100644
--- a/libc/src/string/memory_utils/op_generic.h
+++ b/libc/src/string/memory_utils/op_generic.h
@@ -48,6 +48,13 @@ using generic_v256 = uint8_t __attribute__((__vector_size__(32)));
using generic_v512 = uint8_t __attribute__((__vector_size__(64)));
} // namespace LIBC_NAMESPACE
+namespace LIBC_NAMESPACE::sw_prefetch {
+// Size of a cacheline for software prefetching
+static constexpr size_t kCachelineSize = 64;
+// prefetch for write
+static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); }
+} // namespace LIBC_NAMESPACE::sw_prefetch
+
namespace LIBC_NAMESPACE::generic {
// We accept three types of values as elements for generic operations:
@@ -163,6 +170,25 @@ template <typename T> struct Memset {
} while (offset < count - SIZE);
tail(dst, value, count);
}
+
+ template <size_t prefetch_distance, size_t prefetch_degree>
+ LIBC_INLINE static void loop_and_tail_prefetch(Ptr dst, uint8_t value,
+ size_t count) {
+ size_t offset = 0;
+
+ while (offset + prefetch_degree + SIZE <= count) {
+ for (size_t i = 0; i < prefetch_degree / sw_prefetch::kCachelineSize; ++i)
+ sw_prefetch::PrefetchW(dst + offset + prefetch_distance +
+ sw_prefetch::kCachelineSize * i);
+ for (size_t i = 0; i < prefetch_degree; i += SIZE, offset += SIZE)
+ block(dst + offset, value);
+ }
+ while (offset + SIZE < count) {
+ block(dst + offset, value);
+ offset += SIZE;
+ }
+ tail(dst, value, count);
+ }
};
template <typename T, typename... TS> struct MemsetSequence {
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index 6436594856b0eaf..4834968c0b99f38 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -16,9 +16,12 @@
#include <stddef.h> // size_t
namespace LIBC_NAMESPACE {
+namespace x86 {
+LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset =
+ LLVM_LIBC_IS_DEFINED(LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING);
+
+} // namespace x86
-[[maybe_unused]] LIBC_INLINE static void
-inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
#if defined(__AVX512F__)
using uint128_t = generic_v128;
using uint256_t = generic_v256;
@@ -37,29 +40,51 @@ inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
using uint512_t = cpp::array<uint64_t, 8>;
#endif
- if (count == 0)
- return;
- if (count == 1)
- return generic::Memset<uint8_t>::block(dst, value);
- if (count == 2)
- return generic::Memset<uint16_t>::block(dst, value);
- if (count == 3)
- return generic::MemsetSequence<uint16_t, uint8_t>::block(dst, value);
- if (count <= 8)
- return generic::Memset<uint32_t>::head_tail(dst, value, count);
- if (count <= 16)
- return generic::Memset<uint64_t>::head_tail(dst, value, count);
- if (count <= 32)
- return generic::Memset<uint128_t>::head_tail(dst, value, count);
- if (count <= 64)
- return generic::Memset<uint256_t>::head_tail(dst, value, count);
- if (count <= 128)
- return generic::Memset<uint512_t>::head_tail(dst, value, count);
- // Aligned loop
- generic::Memset<uint256_t>::block(dst, value);
- align_to_next_boundary<32>(dst, count);
- return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
-}
+ [[maybe_unused]] LIBC_INLINE static void
+ inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
+ sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize);
+ if (count <= 128)
+ return generic::Memset<uint512_t>::head_tail(dst, value, count);
+ sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize * 2);
+ // Aligned loop
+ generic::Memset<uint256_t>::block(dst, value);
+ align_to_next_boundary<32>(dst, count);
+ if (count <= 192) {
+ return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
+ } else {
+ return generic::Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(
+ dst, value, count);
+ }
+ }
+
+ [[maybe_unused]] LIBC_INLINE static void
+ inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
+ if (count == 0)
+ return;
+ if (count == 1)
+ return generic::Memset<uint8_t>::block(dst, value);
+ if (count == 2)
+ return generic::Memset<uint16_t>::block(dst, value);
+ if (count == 3)
+ return generic::MemsetSequence<uint16_t, uint8_t>::block(dst, value);
+ if (count <= 8)
+ return generic::Memset<uint32_t>::head_tail(dst, value, count);
+ if (count <= 16)
+ return generic::Memset<uint64_t>::head_tail(dst, value, count);
+ if (count <= 32)
+ return generic::Memset<uint128_t>::head_tail(dst, value, count);
+ if (count <= 64)
+ return generic::Memset<uint256_t>::head_tail(dst, value, count);
+ if constexpr (x86::kUseSoftwarePrefetchingMemset) {
+ return inline_memset_x86_sw_prefetching(dst, value, count);
+ }
+ if (count <= 128)
+ return generic::Memset<uint512_t>::head_tail(dst, value, count);
+ // Aligned loop
+ generic::Memset<uint256_t>::block(dst, value);
+ align_to_next_boundary<32>(dst, count);
+ return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
+ }
} // namespace LIBC_NAMESPACE
#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_MEMSET_H
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 3ae68193dccd2b2..dea21fd77182605 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -33,6 +33,7 @@ PRINTF_COPTS = [
MEMORY_COPTS = [
# "LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE=0",
# "LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING",
+ # "LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING",
]
# A flag to pick which `mpfr` to use for math tests.
>From 6c96e79b76f2a09f908af1fc323b7986e871ceec Mon Sep 17 00:00:00 2001
From: Your Name <milidoshi at google.com>
Date: Tue, 31 Oct 2023 20:35:35 +0000
Subject: [PATCH 12/16] Add warmup to memset
---
libc/src/string/memory_utils/op_generic.h | 3 +--
libc/src/string/memory_utils/x86_64/inline_memset.h | 4 ++++
2 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
index 4063de1d5f5832a..2844501a7459044 100644
--- a/libc/src/string/memory_utils/op_generic.h
+++ b/libc/src/string/memory_utils/op_generic.h
@@ -174,8 +174,7 @@ template <typename T> struct Memset {
template <size_t prefetch_distance, size_t prefetch_degree>
LIBC_INLINE static void loop_and_tail_prefetch(Ptr dst, uint8_t value,
size_t count) {
- size_t offset = 0;
-
+ size_t offset = 96;
while (offset + prefetch_degree + SIZE <= count) {
for (size_t i = 0; i < prefetch_degree / sw_prefetch::kCachelineSize; ++i)
sw_prefetch::PrefetchW(dst + offset + prefetch_distance +
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index 4834968c0b99f38..98f559bca875a3a 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -42,9 +42,11 @@ LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset =
[[maybe_unused]] LIBC_INLINE static void
inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
+ // Prefetch one cacheline
sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize);
if (count <= 128)
return generic::Memset<uint512_t>::head_tail(dst, value, count);
+ // Prefetch the next cacheline
sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize * 2);
// Aligned loop
generic::Memset<uint256_t>::block(dst, value);
@@ -52,6 +54,8 @@ LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset =
if (count <= 192) {
return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
} else {
+ generic::Memset<uint512_t>::block(dst, value);
+ generic::Memset<uint256_t>::block(dst + sizeof(uint512_t), value);
return generic::Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(
dst, value, count);
}
>From 01be692503dc4e913b00b5d074b2cbb4c63347f6 Mon Sep 17 00:00:00 2001
From: doshimili <milidoshi at google.com>
Date: Tue, 31 Oct 2023 17:05:57 -0400
Subject: [PATCH 13/16] Add software prefetch instructions to memset
* Add software prefetching to memset
* Add software prefetching to memset
* Fix formatting
* Fix build errors
* Fix build errors
* Fix formatting
* Fix formatting
* Fix formatting
* Fix formatting
* Fix formatting
* Add warmup to memset
---
libc/src/string/memory_utils/op_generic.h | 3 +--
libc/src/string/memory_utils/x86_64/inline_memset.h | 4 ++++
2 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
index 4063de1d5f5832a..2844501a7459044 100644
--- a/libc/src/string/memory_utils/op_generic.h
+++ b/libc/src/string/memory_utils/op_generic.h
@@ -174,8 +174,7 @@ template <typename T> struct Memset {
template <size_t prefetch_distance, size_t prefetch_degree>
LIBC_INLINE static void loop_and_tail_prefetch(Ptr dst, uint8_t value,
size_t count) {
- size_t offset = 0;
-
+ size_t offset = 96;
while (offset + prefetch_degree + SIZE <= count) {
for (size_t i = 0; i < prefetch_degree / sw_prefetch::kCachelineSize; ++i)
sw_prefetch::PrefetchW(dst + offset + prefetch_distance +
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index 4834968c0b99f38..98f559bca875a3a 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -42,9 +42,11 @@ LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset =
[[maybe_unused]] LIBC_INLINE static void
inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
+ // Prefetch one cacheline
sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize);
if (count <= 128)
return generic::Memset<uint512_t>::head_tail(dst, value, count);
+ // Prefetch the next cacheline
sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize * 2);
// Aligned loop
generic::Memset<uint256_t>::block(dst, value);
@@ -52,6 +54,8 @@ LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset =
if (count <= 192) {
return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
} else {
+ generic::Memset<uint512_t>::block(dst, value);
+ generic::Memset<uint256_t>::block(dst + sizeof(uint512_t), value);
return generic::Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(
dst, value, count);
}
>From 2f3f80163438cd663eed98b63fd0b704a38315b8 Mon Sep 17 00:00:00 2001
From: Your Name <milidoshi at google.com>
Date: Tue, 31 Oct 2023 21:10:17 +0000
Subject: [PATCH 14/16] SW Prefetching in Memset
---
libc/src/string/memory_utils/x86_64/inline_memset.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index 98f559bca875a3a..e4eadf614adc6bf 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -46,7 +46,7 @@ LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset =
sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize);
if (count <= 128)
return generic::Memset<uint512_t>::head_tail(dst, value, count);
- // Prefetch the next cacheline
+ // Prefetch the second cacheline
sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize * 2);
// Aligned loop
generic::Memset<uint256_t>::block(dst, value);
>From 24467d08dd39a286629e6fb4bcc3c8d0fede2a41 Mon Sep 17 00:00:00 2001
From: Your Name <milidoshi at google.com>
Date: Tue, 7 Nov 2023 16:25:15 +0000
Subject: [PATCH 15/16] Move implementation to
src/string/memory_utils/x86_64/inline_memset.h and other minor changes
---
libc/src/string/memory_utils/op_generic.h | 43 ++----
libc/src/string/memory_utils/utils.h | 6 +
.../memory_utils/x86_64/inline_memcpy.h | 5 -
.../memory_utils/x86_64/inline_memset.h | 131 ++++++++++--------
4 files changed, 92 insertions(+), 93 deletions(-)
diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
index 2844501a7459044..2ee1a650ba71879 100644
--- a/libc/src/string/memory_utils/op_generic.h
+++ b/libc/src/string/memory_utils/op_generic.h
@@ -48,13 +48,6 @@ using generic_v256 = uint8_t __attribute__((__vector_size__(32)));
using generic_v512 = uint8_t __attribute__((__vector_size__(64)));
} // namespace LIBC_NAMESPACE
-namespace LIBC_NAMESPACE::sw_prefetch {
-// Size of a cacheline for software prefetching
-static constexpr size_t kCachelineSize = 64;
-// prefetch for write
-static inline void PrefetchW(CPtr dst) { __builtin_prefetch(dst, 1, 3); }
-} // namespace LIBC_NAMESPACE::sw_prefetch
-
namespace LIBC_NAMESPACE::generic {
// We accept three types of values as elements for generic operations:
@@ -141,19 +134,23 @@ template <typename T> struct Memset {
static_assert(is_element_type_v<T>);
static constexpr size_t SIZE = sizeof(T);
- LIBC_INLINE static void block(Ptr dst, uint8_t value) {
+ LIBC_INLINE static void block_offset(Ptr dst, uint8_t value, size_t offset) {
if constexpr (is_scalar_v<T> || is_vector_v<T>) {
- store<T>(dst, splat<T>(value));
+ store<T>(dst + offset, splat<T>(value));
} else if constexpr (is_array_v<T>) {
using value_type = typename T::value_type;
const auto Splat = splat<value_type>(value);
for (size_t I = 0; I < array_size_v<T>; ++I)
- store<value_type>(dst + (I * sizeof(value_type)), Splat);
+ store<value_type>(dst + offset + (I * sizeof(value_type)), Splat);
}
}
+ LIBC_INLINE static void block(Ptr dst, uint8_t value) {
+ block_offset(dst, value, 0);
+ }
+
LIBC_INLINE static void tail(Ptr dst, uint8_t value, size_t count) {
- block(dst + count - SIZE, value);
+ block_offset(dst, value, count - SIZE);
}
LIBC_INLINE static void head_tail(Ptr dst, uint8_t value, size_t count) {
@@ -161,33 +158,19 @@ template <typename T> struct Memset {
tail(dst, value, count);
}
- LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
+ LIBC_INLINE static void loop_and_tail_offset(Ptr dst, uint8_t value, size_t count, size_t offset) {
static_assert(SIZE > 1, "a loop of size 1 does not need tail");
- size_t offset = 0;
do {
- block(dst + offset, value);
+ block_offset(dst, value, offset);
offset += SIZE;
} while (offset < count - SIZE);
tail(dst, value, count);
}
- template <size_t prefetch_distance, size_t prefetch_degree>
- LIBC_INLINE static void loop_and_tail_prefetch(Ptr dst, uint8_t value,
- size_t count) {
- size_t offset = 96;
- while (offset + prefetch_degree + SIZE <= count) {
- for (size_t i = 0; i < prefetch_degree / sw_prefetch::kCachelineSize; ++i)
- sw_prefetch::PrefetchW(dst + offset + prefetch_distance +
- sw_prefetch::kCachelineSize * i);
- for (size_t i = 0; i < prefetch_degree; i += SIZE, offset += SIZE)
- block(dst + offset, value);
- }
- while (offset + SIZE < count) {
- block(dst + offset, value);
- offset += SIZE;
+ LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
+ return loop_and_tail_offset(dst, value, count, 0);
}
- tail(dst, value, count);
- }
+
};
template <typename T, typename... TS> struct MemsetSequence {
diff --git a/libc/src/string/memory_utils/utils.h b/libc/src/string/memory_utils/utils.h
index 85677e51fad0e09..62b3b7a0d728bd5 100644
--- a/libc/src/string/memory_utils/utils.h
+++ b/libc/src/string/memory_utils/utils.h
@@ -374,6 +374,12 @@ template <size_t SIZE> struct AlignHelper {
uintptr_t offset_;
};
+LIBC_INLINE void prefetch_for_write(CPtr dst) { __builtin_prefetch(dst, 1, 3); }
+
+LIBC_INLINE void prefetch_to_local_cache(CPtr dst) {
+ __builtin_prefetch(dst, 0, 3);
+}
+
} // namespace LIBC_NAMESPACE
#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_UTILS_H
diff --git a/libc/src/string/memory_utils/x86_64/inline_memcpy.h b/libc/src/string/memory_utils/x86_64/inline_memcpy.h
index f43230ffd8ad125..f851bcec09650d3 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memcpy.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memcpy.h
@@ -47,11 +47,6 @@ LIBC_INLINE_VAR constexpr size_t kRepMovsbThreshold =
} // namespace x86
-// TODO: Move to a shared header when appropriate.
-[[maybe_unused]] LIBC_INLINE void prefetch_to_local_cache(const void *addr) {
- __builtin_prefetch(addr, 0, 3);
-}
-
[[maybe_unused]] LIBC_INLINE void
inline_memcpy_x86_sse2_ge64(Ptr __restrict dst, CPtr __restrict src,
size_t count) {
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index e4eadf614adc6bf..2f132b45789b5c9 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -12,83 +12,98 @@
#include "src/string/memory_utils/op_generic.h"
#include "src/string/memory_utils/op_x86.h"
#include "src/string/memory_utils/utils.h" // Ptr, CPtr
+#include "third_party/llvm/llvm-project/libc/src/string/memory_utils/inline_memcpy.h"
#include <stddef.h> // size_t
namespace LIBC_NAMESPACE {
namespace x86 {
+// Size of one cache line for software prefetching
+LIBC_INLINE_VAR constexpr size_t kOneCachelineSize = 64;
+LIBC_INLINE_VAR constexpr size_t kTwoCachelinesSize = kOneCachelineSize * 2;
+LIBC_INLINE_VAR constexpr size_t kFiveCachelinesSize = kOneCachelineSize * 5;
+
LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset =
LLVM_LIBC_IS_DEFINED(LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING);
} // namespace x86
#if defined(__AVX512F__)
- using uint128_t = generic_v128;
- using uint256_t = generic_v256;
- using uint512_t = generic_v512;
+using uint128_t = generic_v128;
+using uint256_t = generic_v256;
+using uint512_t = generic_v512;
#elif defined(__AVX__)
- using uint128_t = generic_v128;
- using uint256_t = generic_v256;
- using uint512_t = cpp::array<generic_v256, 2>;
+using uint128_t = generic_v128;
+using uint256_t = generic_v256;
+using uint512_t = cpp::array<generic_v256, 2>;
#elif defined(__SSE2__)
- using uint128_t = generic_v128;
- using uint256_t = cpp::array<generic_v128, 2>;
- using uint512_t = cpp::array<generic_v128, 4>;
+using uint128_t = generic_v128;
+using uint256_t = cpp::array<generic_v128, 2>;
+using uint512_t = cpp::array<generic_v128, 4>;
#else
- using uint128_t = cpp::array<uint64_t, 2>;
- using uint256_t = cpp::array<uint64_t, 4>;
- using uint512_t = cpp::array<uint64_t, 8>;
+using uint128_t = cpp::array<uint64_t, 2>;
+using uint256_t = cpp::array<uint64_t, 4>;
+using uint512_t = cpp::array<uint64_t, 8>;
#endif
- [[maybe_unused]] LIBC_INLINE static void
- inline_memset_x86_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
- // Prefetch one cacheline
- sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize);
- if (count <= 128)
- return generic::Memset<uint512_t>::head_tail(dst, value, count);
- // Prefetch the second cacheline
- sw_prefetch::PrefetchW(dst + sw_prefetch::kCachelineSize * 2);
- // Aligned loop
- generic::Memset<uint256_t>::block(dst, value);
- align_to_next_boundary<32>(dst, count);
- if (count <= 192) {
- return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
- } else {
- generic::Memset<uint512_t>::block(dst, value);
- generic::Memset<uint256_t>::block(dst + sizeof(uint512_t), value);
- return generic::Memset<uint256_t>::loop_and_tail_prefetch<320, 128>(
- dst, value, count);
+[[maybe_unused]] LIBC_INLINE static void
+inline_memset_x86_gt64_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
+ size_t prefetch_distance = x86::kFiveCachelinesSize;
+ size_t prefetch_degree = x86::kTwoCachelinesSize;
+ size_t SIZE = sizeof(uint256_t);
+ // Prefetch the cache line located one cache line past dst
+ prefetch_for_write(dst + x86::kOneCachelineSize);
+ if (count <= 128)
+ return generic::Memset<uint512_t>::head_tail(dst, value, count);
+ // Prefetch the cache line located two cache lines past dst
+ prefetch_for_write(dst + x86::kTwoCachelinesSize);
+ // Aligned loop
+ generic::Memset<uint256_t>::block(dst, value);
+ align_to_next_boundary<32>(dst, count);
+ if (count <= 192) {
+ return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
+ } else {
+ generic::Memset<uint512_t>::block(dst, value); // stores bytes [0, 64)
+ generic::Memset<uint256_t>::block_offset(dst, value, sizeof(uint512_t)); // stores bytes [64, 96); an offset of SIZE (32) would leave them unset
+ size_t offset = 96; // 64 + 32 bytes were already stored by the two calls above
+ while (offset + prefetch_degree + SIZE <= count) {
+ for (size_t i = 0; i < prefetch_degree / x86::kOneCachelineSize; ++i)
+ prefetch_for_write(dst + offset + prefetch_distance +
+ x86::kOneCachelineSize * i);
+ for (size_t i = 0; i < prefetch_degree; i += SIZE, offset += SIZE)
+ generic::Memset<uint256_t>::block_offset(dst, value, offset);
}
+ generic::Memset<uint256_t>::loop_and_tail_offset(dst, value, count, offset);
}
+}
- [[maybe_unused]] LIBC_INLINE static void
- inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
- if (count == 0)
- return;
- if (count == 1)
- return generic::Memset<uint8_t>::block(dst, value);
- if (count == 2)
- return generic::Memset<uint16_t>::block(dst, value);
- if (count == 3)
- return generic::MemsetSequence<uint16_t, uint8_t>::block(dst, value);
- if (count <= 8)
- return generic::Memset<uint32_t>::head_tail(dst, value, count);
- if (count <= 16)
- return generic::Memset<uint64_t>::head_tail(dst, value, count);
- if (count <= 32)
- return generic::Memset<uint128_t>::head_tail(dst, value, count);
- if (count <= 64)
- return generic::Memset<uint256_t>::head_tail(dst, value, count);
- if constexpr (x86::kUseSoftwarePrefetchingMemset) {
- return inline_memset_x86_sw_prefetching(dst, value, count);
- }
- if (count <= 128)
- return generic::Memset<uint512_t>::head_tail(dst, value, count);
- // Aligned loop
- generic::Memset<uint256_t>::block(dst, value);
- align_to_next_boundary<32>(dst, count);
- return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
- }
+[[maybe_unused]] LIBC_INLINE static void
+inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
+ if (count == 0)
+ return;
+ if (count == 1)
+ return generic::Memset<uint8_t>::block(dst, value);
+ if (count == 2)
+ return generic::Memset<uint16_t>::block(dst, value);
+ if (count == 3)
+ return generic::MemsetSequence<uint16_t, uint8_t>::block(dst, value);
+ if (count <= 8)
+ return generic::Memset<uint32_t>::head_tail(dst, value, count);
+ if (count <= 16)
+ return generic::Memset<uint64_t>::head_tail(dst, value, count);
+ if (count <= 32)
+ return generic::Memset<uint128_t>::head_tail(dst, value, count);
+ if (count <= 64)
+ return generic::Memset<uint256_t>::head_tail(dst, value, count);
+ if constexpr (x86::kUseSoftwarePrefetchingMemset)
+ return inline_memset_x86_gt64_sw_prefetching(dst, value, count);
+ if (count <= 128)
+ return generic::Memset<uint512_t>::head_tail(dst, value, count);
+ // Aligned loop
+ generic::Memset<uint256_t>::block(dst, value);
+ align_to_next_boundary<32>(dst, count);
+ return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
+}
} // namespace LIBC_NAMESPACE
#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_X86_64_INLINE_MEMSET_H
>From e86bcb7440c6a157907169dcc6fe25f0b322ef89 Mon Sep 17 00:00:00 2001
From: Your Name <milidoshi at google.com>
Date: Tue, 7 Nov 2023 16:27:51 +0000
Subject: [PATCH 16/16] Fix formatting
---
libc/src/string/memory_utils/op_generic.h | 10 +++++-----
libc/src/string/memory_utils/x86_64/inline_memset.h | 2 +-
2 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
index 2ee1a650ba71879..b508aca6e846bb4 100644
--- a/libc/src/string/memory_utils/op_generic.h
+++ b/libc/src/string/memory_utils/op_generic.h
@@ -158,7 +158,8 @@ template <typename T> struct Memset {
tail(dst, value, count);
}
- LIBC_INLINE static void loop_and_tail_offset(Ptr dst, uint8_t value, size_t count, size_t offset) {
+ LIBC_INLINE static void loop_and_tail_offset(Ptr dst, uint8_t value,
+ size_t count, size_t offset) {
static_assert(SIZE > 1, "a loop of size 1 does not need tail");
do {
block_offset(dst, value, offset);
@@ -167,10 +168,9 @@ template <typename T> struct Memset {
tail(dst, value, count);
}
- LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
- return loop_and_tail_offset(dst, value, count, 0);
- }
-
+ LIBC_INLINE static void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
+ return loop_and_tail_offset(dst, value, count, 0);
+ }
};
template <typename T, typename... TS> struct MemsetSequence {
diff --git a/libc/src/string/memory_utils/x86_64/inline_memset.h b/libc/src/string/memory_utils/x86_64/inline_memset.h
index 2f132b45789b5c9..b745b8a6b7b1bcb 100644
--- a/libc/src/string/memory_utils/x86_64/inline_memset.h
+++ b/libc/src/string/memory_utils/x86_64/inline_memset.h
@@ -69,7 +69,7 @@ inline_memset_x86_gt64_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
while (offset + prefetch_degree + SIZE <= count) {
for (size_t i = 0; i < prefetch_degree / x86::kOneCachelineSize; ++i)
prefetch_for_write(dst + offset + prefetch_distance +
- x86::kOneCachelineSize * i);
+ x86::kOneCachelineSize * i);
for (size_t i = 0; i < prefetch_degree; i += SIZE, offset += SIZE)
generic::Memset<uint256_t>::block_offset(dst, value, offset);
}
More information about the flang-commits
mailing list