[llvm] [libc] [libc] Adding a version of memset with software prefetching (PR #70857)

Wed Nov 8 06:55:23 PST 2023

================
@@ -16,27 +16,68 @@
 #include <stddef.h> // size_t
 
 namespace LIBC_NAMESPACE {
+namespace x86 {
+// Size of one cache line for software prefetching
+LIBC_INLINE_VAR constexpr size_t kOneCachelineSize = 64;
+LIBC_INLINE_VAR constexpr size_t kTwoCachelinesSize = kOneCachelineSize * 2;
+LIBC_INLINE_VAR constexpr size_t kFiveCachelinesSize = kOneCachelineSize * 5;
+
+LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetchingMemset =
+    LLVM_LIBC_IS_DEFINED(LIBC_COPT_MEMSET_X86_USE_SOFTWARE_PREFETCHING);
+
+} // namespace x86
 
-[[maybe_unused]] LIBC_INLINE static void
-inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
 #if defined(__AVX512F__)
-  using uint128_t = generic_v128;
-  using uint256_t = generic_v256;
-  using uint512_t = generic_v512;
+using uint128_t = generic_v128;
+using uint256_t = generic_v256;
+using uint512_t = generic_v512;
 #elif defined(__AVX__)
-  using uint128_t = generic_v128;
-  using uint256_t = generic_v256;
-  using uint512_t = cpp::array<generic_v256, 2>;
+using uint128_t = generic_v128;
+using uint256_t = generic_v256;
+using uint512_t = cpp::array<generic_v256, 2>;
 #elif defined(__SSE2__)
-  using uint128_t = generic_v128;
-  using uint256_t = cpp::array<generic_v128, 2>;
-  using uint512_t = cpp::array<generic_v128, 4>;
+using uint128_t = generic_v128;
+using uint256_t = cpp::array<generic_v128, 2>;
+using uint512_t = cpp::array<generic_v128, 4>;
 #else
-  using uint128_t = cpp::array<uint64_t, 2>;
-  using uint256_t = cpp::array<uint64_t, 4>;
-  using uint512_t = cpp::array<uint64_t, 8>;
+using uint128_t = cpp::array<uint64_t, 2>;
+using uint256_t = cpp::array<uint64_t, 4>;
+using uint512_t = cpp::array<uint64_t, 8>;
 #endif
 
+[[maybe_unused]] LIBC_INLINE static void
+inline_memset_x86_gt64_sw_prefetching(Ptr dst, uint8_t value, size_t count) {
+  size_t prefetch_distance = x86::kFiveCachelinesSize;
+  size_t prefetch_degree = x86::kTwoCachelinesSize;
+  size_t SIZE = sizeof(uint256_t);
+  // Prefetch one cache line
+  prefetch_for_write(dst + x86::kOneCachelineSize);
+  if (count <= 128)
+    return generic::Memset<uint512_t>::head_tail(dst, value, count);
+  // Prefetch the second cache line
+  prefetch_for_write(dst + x86::kTwoCachelinesSize);
+  // Aligned loop
+  generic::Memset<uint256_t>::block(dst, value);
+  align_to_next_boundary<32>(dst, count);
+  if (count <= 192) {
+    return generic::Memset<uint256_t>::loop_and_tail(dst, value, count);
+  } else {
+    generic::Memset<uint512_t>::block(dst, value);
+    generic::Memset<uint256_t>::block_offset(dst, value, sizeof(uint512_t));
----------------
gchatelet wrote:

These two lines could be written as `generic::MemsetSequence<uint512_t, uint256_t>::block(dst, value);`.

This (maybe?) lowers the utility of introducing `block_offset` in `Memset`, but I have no strong opinion here.

https://github.com/llvm/llvm-project/pull/70857