[libc-commits] [libc] b6d3ae3 - Revert D136595 "[libc] Switch to new implementation of mem* functions"

Guillaume Chatelet via libc-commits libc-commits at lists.llvm.org
Thu Oct 27 01:38:59 PDT 2022


Author: Guillaume Chatelet
Date: 2022-10-27T08:38:46Z
New Revision: b6d3ae3d3de0ce8015e422592aa467b99d4b0b6d

URL: https://github.com/llvm/llvm-project/commit/b6d3ae3d3de0ce8015e422592aa467b99d4b0b6d
DIFF: https://github.com/llvm/llvm-project/commit/b6d3ae3d3de0ce8015e422592aa467b99d4b0b6d.diff

LOG: Revert D136595 "[libc] Switch to new implementation of mem* functions"

This patch seems to introduce bugs on aarch64.
Reverting while we investigate the root cause.

This reverts commit 02841488138160f9064f334a833d4bf3e80385c6.

Added: 
    

Modified: 
    libc/src/stdio/printf_core/string_writer.cpp
    libc/src/string/bcmp.cpp
    libc/src/string/memcmp.cpp
    libc/src/string/memcpy.cpp
    libc/src/string/memmove.cpp
    libc/src/string/memory_utils/bcmp_implementations.h
    libc/src/string/memory_utils/bzero_implementations.h
    libc/src/string/memory_utils/memcmp_implementations.h
    libc/src/string/memory_utils/memcpy_implementations.h
    libc/src/string/memory_utils/memset_implementations.h
    libc/src/string/memory_utils/op_x86.h
    libc/src/string/mempcpy.cpp
    libc/src/string/memset.cpp

Removed: 
    


################################################################################
diff --git a/libc/src/stdio/printf_core/string_writer.cpp b/libc/src/stdio/printf_core/string_writer.cpp
index 472573d4a8137..a80df32d40a02 100644
--- a/libc/src/stdio/printf_core/string_writer.cpp
+++ b/libc/src/stdio/printf_core/string_writer.cpp
@@ -33,7 +33,7 @@ void StringWriter::write(char new_char, size_t len) {
     len = available_capacity;
 
   if (len > 0) {
-    inline_memset(cur_buffer, static_cast<uint8_t>(new_char), len);
+    inline_memset(cur_buffer, new_char, len);
     cur_buffer += len;
     available_capacity -= len;
   }

diff --git a/libc/src/string/bcmp.cpp b/libc/src/string/bcmp.cpp
index 21991303b1468..963a7f5bce17c 100644
--- a/libc/src/string/bcmp.cpp
+++ b/libc/src/string/bcmp.cpp
@@ -14,7 +14,8 @@ namespace __llvm_libc {
 
 LLVM_LIBC_FUNCTION(int, bcmp,
                    (const void *lhs, const void *rhs, size_t count)) {
-  return inline_bcmp(lhs, rhs, count);
+  return inline_bcmp(static_cast<const char *>(lhs),
+                     static_cast<const char *>(rhs), count);
 }
 
 } // namespace __llvm_libc

diff --git a/libc/src/string/memcmp.cpp b/libc/src/string/memcmp.cpp
index 7cf6782dd0d5c..292525e17dad0 100644
--- a/libc/src/string/memcmp.cpp
+++ b/libc/src/string/memcmp.cpp
@@ -15,7 +15,8 @@ namespace __llvm_libc {
 
 LLVM_LIBC_FUNCTION(int, memcmp,
                    (const void *lhs, const void *rhs, size_t count)) {
-  return inline_memcmp(lhs, rhs, count);
+  return inline_memcmp(static_cast<const char *>(lhs),
+                       static_cast<const char *>(rhs), count);
 }
 
 } // namespace __llvm_libc

diff --git a/libc/src/string/memcpy.cpp b/libc/src/string/memcpy.cpp
index 850400540037f..ff990f48a20bc 100644
--- a/libc/src/string/memcpy.cpp
+++ b/libc/src/string/memcpy.cpp
@@ -15,7 +15,8 @@ namespace __llvm_libc {
 LLVM_LIBC_FUNCTION(void *, memcpy,
                    (void *__restrict dst, const void *__restrict src,
                     size_t size)) {
-  inline_memcpy(dst, src, size);
+  inline_memcpy(reinterpret_cast<char *>(dst),
+                reinterpret_cast<const char *>(src), size);
   return dst;
 }
 

diff --git a/libc/src/string/memmove.cpp b/libc/src/string/memmove.cpp
index a42ced3fc36bc..f24257893b20c 100644
--- a/libc/src/string/memmove.cpp
+++ b/libc/src/string/memmove.cpp
@@ -9,110 +9,42 @@
 #include "src/string/memmove.h"
 
 #include "src/__support/common.h"
-#include "src/string/memory_utils/op_aarch64.h"
-#include "src/string/memory_utils/op_builtin.h"
-#include "src/string/memory_utils/op_generic.h"
-#include "src/string/memory_utils/op_x86.h"
+#include "src/__support/integer_operations.h"
+#include "src/string/memory_utils/elements.h"
 #include <stddef.h> // size_t, ptrdiff_t
 
-#include <stdio.h>
-
 namespace __llvm_libc {
 
-[[maybe_unused]] static inline void
-inline_memmove_embedded_tiny(Ptr dst, CPtr src, size_t count) {
-  if ((count == 0) || (dst == src))
-    return;
-  if (dst < src) {
-#pragma nounroll
-    for (size_t offset = 0; offset < count; ++offset)
-      builtin::Memcpy<1>::block(dst + offset, src + offset);
-  } else {
-#pragma nounroll
-    for (ptrdiff_t offset = count - 1; offset >= 0; --offset)
-      builtin::Memcpy<1>::block(dst + offset, src + offset);
-  }
-}
-
-template <size_t MaxSize>
-[[maybe_unused]] static inline void inline_memmove_generic(Ptr dst, CPtr src,
-                                                           size_t count) {
+static inline void inline_memmove(char *dst, const char *src, size_t count) {
+  using namespace __llvm_libc::scalar;
   if (count == 0)
     return;
   if (count == 1)
-    return generic::Memmove<1, MaxSize>::block(dst, src);
+    return move<_1>(dst, src);
   if (count <= 4)
-    return generic::Memmove<2, MaxSize>::head_tail(dst, src, count);
+    return move<HeadTail<_2>>(dst, src, count);
   if (count <= 8)
-    return generic::Memmove<4, MaxSize>::head_tail(dst, src, count);
+    return move<HeadTail<_4>>(dst, src, count);
   if (count <= 16)
-    return generic::Memmove<8, MaxSize>::head_tail(dst, src, count);
+    return move<HeadTail<_8>>(dst, src, count);
   if (count <= 32)
-    return generic::Memmove<16, MaxSize>::head_tail(dst, src, count);
+    return move<HeadTail<_16>>(dst, src, count);
   if (count <= 64)
-    return generic::Memmove<32, MaxSize>::head_tail(dst, src, count);
+    return move<HeadTail<_32>>(dst, src, count);
   if (count <= 128)
-    return generic::Memmove<64, MaxSize>::head_tail(dst, src, count);
-  if (dst < src) {
-    generic::Memmove<32, MaxSize>::template align_forward<Arg::Src>(dst, src,
-                                                                    count);
-    return generic::Memmove<64, MaxSize>::loop_and_tail_forward(dst, src,
-                                                                count);
-  } else {
-    generic::Memmove<32, MaxSize>::template align_backward<Arg::Src>(dst, src,
-                                                                     count);
-    return generic::Memmove<64, MaxSize>::loop_and_tail_backward(dst, src,
-                                                                 count);
-  }
-}
+    return move<HeadTail<_64>>(dst, src, count);
 
-static inline void inline_memmove(Ptr dst, CPtr src, size_t count) {
-#if defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64)
-#if defined(LLVM_LIBC_ARCH_X86)
-  static constexpr size_t kMaxSize = x86::kAvx512F ? 64
-                                     : x86::kAvx   ? 32
-                                     : x86::kSse2  ? 16
-                                                   : 8;
-#elif defined(LLVM_LIBC_ARCH_AARCH64)
-  static constexpr size_t kMaxSize = aarch64::kNeon ? 16 : 8;
-#endif
-  // return inline_memmove_generic<kMaxSize>(dst, src, count);
-  if (count == 0)
-    return;
-  if (count == 1)
-    return generic::Memmove<1, kMaxSize>::block(dst, src);
-  if (count <= 4)
-    return generic::Memmove<2, kMaxSize>::head_tail(dst, src, count);
-  if (count <= 8)
-    return generic::Memmove<4, kMaxSize>::head_tail(dst, src, count);
-  if (count <= 16)
-    return generic::Memmove<8, kMaxSize>::head_tail(dst, src, count);
-  if (count <= 32)
-    return generic::Memmove<16, kMaxSize>::head_tail(dst, src, count);
-  if (count <= 64)
-    return generic::Memmove<32, kMaxSize>::head_tail(dst, src, count);
-  if (count <= 128)
-    return generic::Memmove<64, kMaxSize>::head_tail(dst, src, count);
-  if (dst < src) {
-    generic::Memmove<32, kMaxSize>::align_forward<Arg::Src>(dst, src, count);
-    return generic::Memmove<64, kMaxSize>::loop_and_tail_forward(dst, src,
-                                                                 count);
-  } else {
-    generic::Memmove<32, kMaxSize>::align_backward<Arg::Src>(dst, src, count);
-    return generic::Memmove<64, kMaxSize>::loop_and_tail_backward(dst, src,
-                                                                  count);
-  }
-#elif defined(LLVM_LIBC_ARCH_ARM)
-  return inline_memmove_embedded_tiny(dst, src, count);
-#else
-#error "Unsupported platform"
-#endif
+  using AlignedMoveLoop = Align<_16, Arg::Src>::Then<Loop<_64>>;
+  if (dst < src)
+    return move<AlignedMoveLoop>(dst, src, count);
+  else if (dst > src)
+    return move_backward<AlignedMoveLoop>(dst, src, count);
 }
 
 LLVM_LIBC_FUNCTION(void *, memmove,
                    (void *dst, const void *src, size_t count)) {
-  inline_memmove(reinterpret_cast<Ptr>(dst), reinterpret_cast<CPtr>(src),
-                 count);
+  inline_memmove(reinterpret_cast<char *>(dst),
+                 reinterpret_cast<const char *>(src), count);
   return dst;
 }
 

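A note on the direction split restored above: with overlapping buffers, a
forward copy is only safe when dst precedes src, and a backward copy when dst
follows src. A minimal sketch of that invariant, illustrative only and not the
libc code:

    #include <stddef.h>

    // Byte-at-a-time overlap-safe move; the same invariant the dst < src
    // branches in inline_memmove above preserve with block operations.
    static void naive_memmove(char *dst, const char *src, size_t count) {
      if (dst == src || count == 0)
        return;
      if (dst < src) {
        // A forward copy never clobbers source bytes not yet read.
        for (size_t i = 0; i < count; ++i)
          dst[i] = src[i];
      } else {
        // A backward copy protects the tail of an overlapping source.
        for (size_t i = count; i != 0; --i)
          dst[i - 1] = src[i - 1];
      }
    }
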
diff --git a/libc/src/string/memory_utils/bcmp_implementations.h b/libc/src/string/memory_utils/bcmp_implementations.h
index 2e18ee81aaf6f..c26e38e51adf1 100644
--- a/libc/src/string/memory_utils/bcmp_implementations.h
+++ b/libc/src/string/memory_utils/bcmp_implementations.h
@@ -11,169 +11,49 @@
 
 #include "src/__support/architectures.h"
 #include "src/__support/common.h"
-#include "src/string/memory_utils/op_aarch64.h"
-#include "src/string/memory_utils/op_builtin.h"
-#include "src/string/memory_utils/op_generic.h"
-#include "src/string/memory_utils/op_x86.h"
+#include "src/string/memory_utils/elements.h"
 
 #include <stddef.h> // size_t
 
 namespace __llvm_libc {
 
-[[maybe_unused]] static inline BcmpReturnType
-inline_bcmp_embedded_tiny(CPtr p1, CPtr p2, size_t count) {
-#pragma nounroll
-  for (size_t offset = 0; offset < count; ++offset)
-    if (auto value = generic::Bcmp<1>::block(p1 + offset, p2 + offset))
-      return value;
-  return BcmpReturnType::ZERO();
+// Fixed-size difference between 'lhs' and 'rhs'.
+template <typename Element> bool differs(const char *lhs, const char *rhs) {
+  return !Element::equals(lhs, rhs);
 }
-
-#if defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64)
-[[maybe_unused]] static inline BcmpReturnType
-inline_bcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) {
-  if (count < 256)
-    return generic::Bcmp<16>::loop_and_tail(p1, p2, count);
-  if (auto value = generic::Bcmp<64>::block(p1, p2))
-    return value;
-  align_to_next_boundary<64, Arg::P1>(p1, p2, count);
-  return generic::Bcmp<64>::loop_and_tail(p1, p2, count);
+// Runtime-size difference between 'lhs' and 'rhs'.
+template <typename Element>
+bool differs(const char *lhs, const char *rhs, size_t size) {
+  return !Element::equals(lhs, rhs, size);
 }
-#endif // defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64)
 
+static inline int inline_bcmp(const char *lhs, const char *rhs, size_t count) {
 #if defined(LLVM_LIBC_ARCH_X86)
-[[maybe_unused]] static inline BcmpReturnType
-inline_bcmp_x86_sse2_gt16(CPtr p1, CPtr p2, size_t count) {
-  if (count <= 32)
-    return x86::sse2::Bcmp<16>::head_tail(p1, p2, count);
-  if (count < 256)
-    return x86::sse2::Bcmp<16>::loop_and_tail(p1, p2, count);
-  if (auto value = x86::sse2::Bcmp<16>::block(p1, p2))
-    return value;
-  align_to_next_boundary<16, Arg::P1>(p1, p2, count);
-  return x86::sse2::Bcmp<64>::loop_and_tail(p1, p2, count);
-}
-
-[[maybe_unused]] static inline BcmpReturnType
-inline_bcmp_x86_avx2_gt16(CPtr p1, CPtr p2, size_t count) {
-  if (count <= 32)
-    return x86::sse2::Bcmp<16>::head_tail(p1, p2, count);
-  if (count <= 64)
-    return x86::avx2::Bcmp<32>::head_tail(p1, p2, count);
-  if (count <= 128)
-    return x86::avx2::Bcmp<64>::head_tail(p1, p2, count);
-  if (unlikely(count >= 256)) {
-    if (auto value = x86::avx2::Bcmp<64>::block(p1, p2))
-      return value;
-    align_to_next_boundary<64, Arg::P1>(p1, p2, count);
-  }
-  return x86::avx2::Bcmp<64>::loop_and_tail(p1, p2, count);
-}
-
-[[maybe_unused]] static inline BcmpReturnType
-inline_bcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) {
-  if (count <= 32)
-    return x86::sse2::Bcmp<16>::head_tail(p1, p2, count);
-  if (count <= 64)
-    return x86::avx2::Bcmp<32>::head_tail(p1, p2, count);
-  if (count <= 128)
-    return x86::avx512bw::Bcmp<64>::head_tail(p1, p2, count);
-  if (unlikely(count >= 256)) {
-    if (auto value = x86::avx512bw::Bcmp<64>::block(p1, p2))
-      return value;
-    align_to_next_boundary<64, Arg::P1>(p1, p2, count);
-  }
-  return x86::avx512bw::Bcmp<64>::loop_and_tail(p1, p2, count);
-}
-
-[[maybe_unused]] static inline BcmpReturnType inline_bcmp_x86(CPtr p1, CPtr p2,
-                                                              size_t count) {
+  using namespace ::__llvm_libc::x86;
+#elif defined(LLVM_LIBC_ARCH_AARCH64)
+  using namespace ::__llvm_libc::aarch64;
+#else
+  using namespace ::__llvm_libc::scalar;
+#endif
   if (count == 0)
-    return BcmpReturnType::ZERO();
+    return 0;
   if (count == 1)
-    return generic::Bcmp<1>::block(p1, p2);
+    return differs<_1>(lhs, rhs);
   if (count == 2)
-    return generic::Bcmp<2>::block(p1, p2);
-  if (count <= 4)
-    return generic::Bcmp<2>::head_tail(p1, p2, count);
+    return differs<_2>(lhs, rhs);
+  if (count == 3)
+    return differs<_3>(lhs, rhs);
   if (count <= 8)
-    return generic::Bcmp<4>::head_tail(p1, p2, count);
+    return differs<HeadTail<_4>>(lhs, rhs, count);
   if (count <= 16)
-    return generic::Bcmp<8>::head_tail(p1, p2, count);
-  if constexpr (x86::kAvx512BW)
-    return inline_bcmp_x86_avx512bw_gt16(p1, p2, count);
-  else if constexpr (x86::kAvx2)
-    return inline_bcmp_x86_avx2_gt16(p1, p2, count);
-  else if constexpr (x86::kSse2)
-    return inline_bcmp_x86_sse2_gt16(p1, p2, count);
-  else
-    return inline_bcmp_generic_gt16(p1, p2, count);
-}
-#endif // defined(LLVM_LIBC_ARCH_X86)
-
-#if defined(LLVM_LIBC_ARCH_AARCH64)
-[[maybe_unused]] static inline BcmpReturnType
-inline_bcmp_aarch64(CPtr p1, CPtr p2, size_t count) {
-  if (likely(count <= 32)) {
-    if (unlikely(count >= 16)) {
-      return generic::Bcmp<16>::head_tail(p1, p2, count);
-    }
-    switch (count) {
-    case 0:
-      return BcmpReturnType::ZERO();
-    case 1:
-      return generic::Bcmp<1>::block(p1, p2);
-    case 2:
-      return generic::Bcmp<2>::block(p1, p2);
-    case 3:
-      return generic::Bcmp<2>::head_tail(p1, p2, count);
-    case 4:
-      return generic::Bcmp<4>::block(p1, p2);
-    case 5:
-    case 6:
-    case 7:
-      return generic::Bcmp<4>::head_tail(p1, p2, count);
-    case 8:
-      return generic::Bcmp<8>::block(p1, p2);
-    case 9:
-    case 10:
-    case 11:
-    case 12:
-    case 13:
-    case 14:
-    case 15:
-      return generic::Bcmp<8>::head_tail(p1, p2, count);
-    }
-  }
-
+    return differs<HeadTail<_8>>(lhs, rhs, count);
+  if (count <= 32)
+    return differs<HeadTail<_16>>(lhs, rhs, count);
   if (count <= 64)
-    return generic::Bcmp<32>::head_tail(p1, p2, count);
-
-  // Aligned loop if > 256, otherwise normal loop
-  if (count > 256) {
-    if (auto value = generic::Bcmp<32>::block(p1, p2))
-      return value;
-    align_to_next_boundary<16, Arg::P1>(p1, p2, count);
-  }
-  return generic::Bcmp<32>::loop_and_tail(p1, p2, count);
-}
-#endif // defined(LLVM_LIBC_ARCH_AARCH64)
-
-static inline BcmpReturnType inline_bcmp(CPtr p1, CPtr p2, size_t count) {
-#if defined(LLVM_LIBC_ARCH_X86)
-  return inline_bcmp_x86(p1, p2, count);
-#elif defined(LLVM_LIBC_ARCH_AARCH64)
-  return inline_bcmp_aarch64(p1, p2, count);
-#elif defined(LLVM_LIBC_ARCH_ARM)
-  return inline_bcmp_embedded_tiny(p1, p2, count);
-#else
-#error "Unsupported platform"
-#endif
-}
-
-static inline int inline_bcmp(const void *p1, const void *p2, size_t count) {
-  return static_cast<int>(inline_bcmp(reinterpret_cast<CPtr>(p1),
-                                      reinterpret_cast<CPtr>(p2), count));
+    return differs<HeadTail<_32>>(lhs, rhs, count);
+  if (count <= 128)
+    return differs<HeadTail<_64>>(lhs, rhs, count);
+  return differs<Align<_32>::Then<Loop<_32>>>(lhs, rhs, count);
 }
 
 } // namespace __llvm_libc

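The HeadTail<Element> strategy the revert returns to covers a whole range with
two fixed-size, possibly overlapping operations instead of a loop. A minimal
sketch for counts in [8, 16], using the usual memcpy-as-unaligned-load idiom
(illustrative only; the real building blocks live in elements.h):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    // Whether two 8-byte blocks differ; the memcpy calls are lowered to
    // single unaligned loads by the compiler.
    static bool differs8(const char *a, const char *b) {
      uint64_t x, y;
      memcpy(&x, a, sizeof(x));
      memcpy(&y, b, sizeof(y));
      return x != y;
    }

    // bcmp for 8 <= count <= 16: the head covers [0, 8), the tail covers
    // [count - 8, count), and the two windows overlap when count < 16.
    static int bcmp_head_tail(const char *a, const char *b, size_t count) {
      return differs8(a, b) || differs8(a + count - 8, b + count - 8);
    }
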
diff --git a/libc/src/string/memory_utils/bzero_implementations.h b/libc/src/string/memory_utils/bzero_implementations.h
index 550c910def885..168fdd7e531d2 100644
--- a/libc/src/string/memory_utils/bzero_implementations.h
+++ b/libc/src/string/memory_utils/bzero_implementations.h
@@ -15,14 +15,10 @@
 
 namespace __llvm_libc {
 
-inline static void inline_bzero(Ptr dst, size_t count) {
+inline static void inline_bzero(char *dst, size_t count) {
   inline_memset(dst, 0, count);
 }
 
-inline static void inline_bzero(void *dst, size_t count) {
-  inline_bzero(reinterpret_cast<Ptr>(dst), count);
-}
-
 } // namespace __llvm_libc
 
 #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_BZERO_IMPLEMENTATIONS_H

diff --git a/libc/src/string/memory_utils/memcmp_implementations.h b/libc/src/string/memory_utils/memcmp_implementations.h
index 642c7e9d129bf..f2079468f2be3 100644
--- a/libc/src/string/memory_utils/memcmp_implementations.h
+++ b/libc/src/string/memory_utils/memcmp_implementations.h
@@ -11,141 +11,95 @@
 
 #include "src/__support/architectures.h"
 #include "src/__support/common.h"
-#include "src/string/memory_utils/op_aarch64.h"
-#include "src/string/memory_utils/op_builtin.h"
-#include "src/string/memory_utils/op_generic.h"
-#include "src/string/memory_utils/op_x86.h"
-#include "src/string/memory_utils/utils.h"
+#include "src/string/memory_utils/elements.h"
 
 #include <stddef.h> // size_t
 
 namespace __llvm_libc {
-[[maybe_unused]] static inline MemcmpReturnType
-inline_memcmp_embedded_tiny(CPtr p1, CPtr p2, size_t count) {
-#pragma nounroll
-  for (size_t offset = 0; offset < count; ++offset)
-    if (auto value = generic::Memcmp<1>::block(p1 + offset, p2 + offset))
-      return value;
-  return MemcmpReturnType::ZERO();
-}
-
-#if defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64)
-[[maybe_unused]] static inline MemcmpReturnType
-inline_memcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) {
-  if (unlikely(count >= 384)) {
-    if (auto value = generic::Memcmp<16>::block(p1, p2))
-      return value;
-    align_to_next_boundary<16, Arg::P1>(p1, p2, count);
-  }
-  return generic::Memcmp<16>::loop_and_tail(p1, p2, count);
-}
-#endif // defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64)
 
+static inline int inline_memcmp(const char *lhs, const char *rhs,
+                                size_t count) {
 #if defined(LLVM_LIBC_ARCH_X86)
-[[maybe_unused]] static inline MemcmpReturnType
-inline_memcmp_x86_sse2_gt16(CPtr p1, CPtr p2, size_t count) {
-  if (unlikely(count >= 384)) {
-    if (auto value = x86::sse2::Memcmp<16>::block(p1, p2))
-      return value;
-    align_to_next_boundary<16, Arg::P1>(p1, p2, count);
-  }
-  return x86::sse2::Memcmp<16>::loop_and_tail(p1, p2, count);
-}
-
-[[maybe_unused]] static inline MemcmpReturnType
-inline_memcmp_x86_avx2_gt16(CPtr p1, CPtr p2, size_t count) {
-  if (count <= 32)
-    return x86::sse2::Memcmp<16>::head_tail(p1, p2, count);
-  if (count <= 64)
-    return x86::avx2::Memcmp<32>::head_tail(p1, p2, count);
-  if (count <= 128)
-    return x86::avx2::Memcmp<64>::head_tail(p1, p2, count);
-  if (unlikely(count >= 384)) {
-    if (auto value = x86::avx2::Memcmp<32>::block(p1, p2))
-      return value;
-    align_to_next_boundary<32, Arg::P1>(p1, p2, count);
-  }
-  return x86::avx2::Memcmp<32>::loop_and_tail(p1, p2, count);
-}
-
-[[maybe_unused]] static inline MemcmpReturnType
-inline_memcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) {
+  /////////////////////////////////////////////////////////////////////////////
+  // LLVM_LIBC_ARCH_X86
+  /////////////////////////////////////////////////////////////////////////////
+  using namespace __llvm_libc::x86;
+  if (count == 0)
+    return 0;
+  if (count == 1)
+    return three_way_compare<_1>(lhs, rhs);
+  if (count == 2)
+    return three_way_compare<_2>(lhs, rhs);
+  if (count == 3)
+    return three_way_compare<_3>(lhs, rhs);
+  if (count <= 8)
+    return three_way_compare<HeadTail<_4>>(lhs, rhs, count);
+  if (count <= 16)
+    return three_way_compare<HeadTail<_8>>(lhs, rhs, count);
   if (count <= 32)
-    return x86::sse2::Memcmp<16>::head_tail(p1, p2, count);
+    return three_way_compare<HeadTail<_16>>(lhs, rhs, count);
   if (count <= 64)
-    return x86::avx2::Memcmp<32>::head_tail(p1, p2, count);
+    return three_way_compare<HeadTail<_32>>(lhs, rhs, count);
   if (count <= 128)
-    return x86::avx512bw::Memcmp<64>::head_tail(p1, p2, count);
-  if (unlikely(count >= 384)) {
-    if (auto value = x86::avx512bw::Memcmp<64>::block(p1, p2))
-      return value;
-    align_to_next_boundary<64, Arg::P1>(p1, p2, count);
-  }
-  return x86::avx512bw::Memcmp<64>::loop_and_tail(p1, p2, count);
-}
-#endif // defined(LLVM_LIBC_ARCH_X86)
-
-#if defined(LLVM_LIBC_ARCH_AARCH64)
-[[maybe_unused]] static inline MemcmpReturnType
-inline_memcmp_aarch64_neon_gt16(CPtr p1, CPtr p2, size_t count) {
-  if (unlikely(count >= 128)) { // [128, ∞]
-    if (auto value = generic::Memcmp<16>::block(p1, p2))
-      return value;
-    align_to_next_boundary<16, Arg::P1>(p1, p2, count);
-    return generic::Memcmp<32>::loop_and_tail(p1, p2, count);
-  }
+    return three_way_compare<HeadTail<_64>>(lhs, rhs, count);
+  return three_way_compare<Align<_32>::Then<Loop<_32>>>(lhs, rhs, count);
+#elif defined(LLVM_LIBC_ARCH_AARCH64)
+  /////////////////////////////////////////////////////////////////////////////
+  // LLVM_LIBC_ARCH_AARCH64
+  /////////////////////////////////////////////////////////////////////////////
+  using namespace ::__llvm_libc::aarch64;
+  if (count == 0) // [0, 0]
+    return 0;
+  if (count == 1) // [1, 1]
+    return three_way_compare<_1>(lhs, rhs);
+  if (count == 2) // [2, 2]
+    return three_way_compare<_2>(lhs, rhs);
+  if (count == 3) // [3, 3]
+    return three_way_compare<_3>(lhs, rhs);
+  if (count < 8) // [4, 7]
+    return three_way_compare<HeadTail<_4>>(lhs, rhs, count);
+  if (count < 16) // [8, 15]
+    return three_way_compare<HeadTail<_8>>(lhs, rhs, count);
+  if (unlikely(count >= 128)) // [128, ∞]
+    return three_way_compare<Align<_16>::Then<Loop<_32>>>(lhs, rhs, count);
+  if (!equals<_16>(lhs, rhs)) // [16, 16]
+    return three_way_compare<_16>(lhs, rhs);
   if (count < 32) // [17, 31]
-    return generic::Memcmp<16>::tail(p1, p2, count);
-  if (generic::Bcmp<16>::block(p1 + 16, p2 + 16)) // [32, 32]
-    return generic::Memcmp<16>::block(p1 + 16, p2 + 16);
+    return three_way_compare<Tail<_16>>(lhs, rhs, count);
+  if (!equals<Skip<16>::Then<_16>>(lhs, rhs)) // [32, 32]
+    return three_way_compare<Skip<16>::Then<_16>>(lhs, rhs);
   if (count < 64) // [33, 63]
-    return generic::Memcmp<32>::tail(p1, p2, count);
+    return three_way_compare<Tail<_32>>(lhs, rhs, count);
   // [64, 127]
-  return generic::Memcmp<16>::loop_and_tail(p1 + 32, p2 + 32, count - 32);
-}
-#endif // defined(LLVM_LIBC_ARCH_AARCH64)
+  return three_way_compare<Skip<32>::Then<Loop<_16>>>(lhs, rhs, count);
+#else
+  /////////////////////////////////////////////////////////////////////////////
+  // Default
+  /////////////////////////////////////////////////////////////////////////////
+  using namespace ::__llvm_libc::scalar;
 
-static inline MemcmpReturnType inline_memcmp(CPtr p1, CPtr p2, size_t count) {
-#if defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64)
   if (count == 0)
-    return MemcmpReturnType::ZERO();
+    return 0;
   if (count == 1)
-    return generic::Memcmp<1>::block(p1, p2);
+    return three_way_compare<_1>(lhs, rhs);
   if (count == 2)
-    return generic::Memcmp<2>::block(p1, p2);
+    return three_way_compare<_2>(lhs, rhs);
   if (count == 3)
-    return generic::Memcmp<3>::block(p1, p2);
+    return three_way_compare<_3>(lhs, rhs);
   if (count <= 8)
-    return generic::Memcmp<4>::head_tail(p1, p2, count);
+    return three_way_compare<HeadTail<_4>>(lhs, rhs, count);
   if (count <= 16)
-    return generic::Memcmp<8>::head_tail(p1, p2, count);
-#if defined(LLVM_LIBC_ARCH_X86)
-  if constexpr (x86::kAvx512BW)
-    return inline_memcmp_x86_avx512bw_gt16(p1, p2, count);
-  else if constexpr (x86::kAvx2)
-    return inline_memcmp_x86_avx2_gt16(p1, p2, count);
-  else if constexpr (x86::kSse2)
-    return inline_memcmp_x86_sse2_gt16(p1, p2, count);
-  else
-    return inline_memcmp_generic_gt16(p1, p2, count);
-#elif defined(LLVM_LIBC_ARCH_AARCH64)
-  if constexpr (aarch64::kNeon)
-    return inline_memcmp_aarch64_neon_gt16(p1, p2, count);
-  else
-    return inline_memcmp_generic_gt16(p1, p2, count);
-#endif
-#elif defined(LLVM_LIBC_ARCH_ARM)
-  return inline_memcmp_embedded_tiny(p1, p2, count);
-#else
-#error "Unsupported platform"
+    return three_way_compare<HeadTail<_8>>(lhs, rhs, count);
+  if (count <= 32)
+    return three_way_compare<HeadTail<_16>>(lhs, rhs, count);
+  if (count <= 64)
+    return three_way_compare<HeadTail<_32>>(lhs, rhs, count);
+  if (count <= 128)
+    return three_way_compare<HeadTail<_64>>(lhs, rhs, count);
+  return three_way_compare<Align<_32>::Then<Loop<_32>>>(lhs, rhs, count);
 #endif
 }
 
-static inline int inline_memcmp(const void *p1, const void *p2, size_t count) {
-  return static_cast<int>(inline_memcmp(reinterpret_cast<CPtr>(p1),
-                                        reinterpret_cast<CPtr>(p2), count));
-}
-
 } // namespace __llvm_libc
 
 #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMCMP_IMPLEMENTATIONS_H

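On three_way_compare<_N>: a common scalar realization compares fixed-size
blocks as big-endian integers, since unsigned integer order then matches
memcmp's lexicographic byte order. A sketch for 4-byte blocks, assuming a
little-endian host and the GCC/Clang __builtin_bswap32 builtin (illustrative
only, not the libc code):

    #include <stdint.h>
    #include <string.h>

    // Three-way compare of two 4-byte blocks. Byte-swapping to big-endian
    // makes unsigned integer order agree with memcmp's byte order.
    static int three_way_compare4(const char *lhs, const char *rhs) {
      uint32_t a, b;
      memcpy(&a, lhs, sizeof(a));
      memcpy(&b, rhs, sizeof(b));
      a = __builtin_bswap32(a); // assumes a little-endian host
      b = __builtin_bswap32(b);
      return a == b ? 0 : (a < b ? -1 : 1);
    }
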
diff --git a/libc/src/string/memory_utils/memcpy_implementations.h b/libc/src/string/memory_utils/memcpy_implementations.h
index cb9a82856f45f..3385d40fbc56b 100644
--- a/libc/src/string/memory_utils/memcpy_implementations.h
+++ b/libc/src/string/memory_utils/memcpy_implementations.h
@@ -11,130 +11,145 @@
 
 #include "src/__support/architectures.h"
 #include "src/__support/common.h"
-#include "src/string/memory_utils/op_aarch64.h"
-#include "src/string/memory_utils/op_builtin.h"
-#include "src/string/memory_utils/op_generic.h"
-#include "src/string/memory_utils/op_x86.h"
+#include "src/string/memory_utils/elements.h"
 #include "src/string/memory_utils/utils.h"
 
 #include <stddef.h> // size_t
 
-namespace __llvm_libc {
+// Design rationale
+// ================
+//
+// Using a profiler to observe size distributions for calls into libc
+// functions, it was found that most operations act on a small number of bytes.
+// This makes it important to favor small sizes.
+//
+// The tests for `count` are in ascending order so the cost of branching is
+// proportional to the cost of copying.
+//
+// The function is written in C++ for several reasons:
+// - The compiler can __see__ the code, which is useful when performing
+//   Profile Guided Optimization, as the optimized code can take advantage of
+//   branching probabilities.
+// - It also allows for easier customization and favors testing multiple
+//   implementation parameters.
+// - As compilers and processors get better, the generated code is improved
+//   with little change on the code side.
 
-[[maybe_unused]] static inline void
-inline_memcpy_embedded_tiny(Ptr __restrict dst, CPtr __restrict src,
-                            size_t count) {
-#pragma nounroll
-  for (size_t offset = 0; offset < count; ++offset)
-    builtin::Memcpy<1>::block(dst + offset, src + offset);
-}
+namespace __llvm_libc {
 
+static inline void inline_memcpy(char *__restrict dst,
+                                 const char *__restrict src, size_t count) {
+  using namespace __llvm_libc::builtin;
 #if defined(LLVM_LIBC_ARCH_X86)
-[[maybe_unused]] static inline void
-inline_memcpy_x86(Ptr __restrict dst, CPtr __restrict src, size_t count) {
+  /////////////////////////////////////////////////////////////////////////////
+  // LLVM_LIBC_ARCH_X86
+  /////////////////////////////////////////////////////////////////////////////
+
+  // Whether to use only rep;movsb.
+  constexpr bool USE_ONLY_REP_MOVSB =
+      LLVM_LIBC_IS_DEFINED(LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB);
+
+  // REP_MOVS_B_SIZE == -1 : Only CopyAligned is used.
+  // REP_MOVS_B_SIZE ==  0 : Only RepMovsb is used.
+  // else CopyAligned is used up to REP_MOVS_B_SIZE and then RepMovsb.
+  constexpr size_t REP_MOVS_B_SIZE =
+#if defined(LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE)
+      LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE;
+#else
+      -1;
+#endif // LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
+
+  // Whether target supports AVX instructions.
+  constexpr bool HAS_AVX = LLVM_LIBC_IS_DEFINED(__AVX__);
+
+#if defined(__AVX__)
+  using LoopBlockSize = _64;
+#else
+  using LoopBlockSize = _32;
+#endif
+
+  if (USE_ONLY_REP_MOVSB)
+    return copy<x86::Accelerator>(dst, src, count);
+
   if (count == 0)
     return;
   if (count == 1)
-    return builtin::Memcpy<1>::block(dst, src);
+    return copy<_1>(dst, src);
   if (count == 2)
-    return builtin::Memcpy<2>::block(dst, src);
+    return copy<_2>(dst, src);
   if (count == 3)
-    return builtin::Memcpy<3>::block(dst, src);
+    return copy<_3>(dst, src);
   if (count == 4)
-    return builtin::Memcpy<4>::block(dst, src);
+    return copy<_4>(dst, src);
   if (count < 8)
-    return builtin::Memcpy<4>::head_tail(dst, src, count);
+    return copy<HeadTail<_4>>(dst, src, count);
   if (count < 16)
-    return builtin::Memcpy<8>::head_tail(dst, src, count);
+    return copy<HeadTail<_8>>(dst, src, count);
   if (count < 32)
-    return builtin::Memcpy<16>::head_tail(dst, src, count);
+    return copy<HeadTail<_16>>(dst, src, count);
   if (count < 64)
-    return builtin::Memcpy<32>::head_tail(dst, src, count);
+    return copy<HeadTail<_32>>(dst, src, count);
   if (count < 128)
-    return builtin::Memcpy<64>::head_tail(dst, src, count);
-  if (x86::kAvx && count < 256)
-    return builtin::Memcpy<128>::head_tail(dst, src, count);
-  builtin::Memcpy<32>::block(dst, src);
-  align_to_next_boundary<32, Arg::Dst>(dst, src, count);
-  static constexpr size_t kBlockSize = x86::kAvx ? 64 : 32;
-  return builtin::Memcpy<kBlockSize>::loop_and_tail(dst, src, count);
-}
-
-[[maybe_unused]] static inline void
-inline_memcpy_x86_maybe_interpose_repmovsb(Ptr __restrict dst,
-                                           CPtr __restrict src, size_t count) {
-  // Whether to use rep;movsb exclusively, not at all, or only above a certain
-  // threshold.
-  // TODO: Use only a single preprocessor definition to simplify the code.
-#ifndef LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
-#define LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE -1
-#endif
-
-  static constexpr bool kUseOnlyRepMovsb =
-      LLVM_LIBC_IS_DEFINED(LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB);
-  static constexpr size_t kRepMovsbThreshold =
-      LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE;
-  if constexpr (kUseOnlyRepMovsb)
-    return x86::Memcpy::repmovsb(dst, src, count);
-  else if constexpr (kRepMovsbThreshold >= 0) {
-    if (unlikely(count >= kRepMovsbThreshold))
-      return x86::Memcpy::repmovsb(dst, src, count);
-    else
-      return inline_memcpy_x86(dst, src, count);
-  } else {
-    return inline_memcpy_x86(dst, src, count);
-  }
-}
-#endif // defined(LLVM_LIBC_ARCH_X86)
-
-#if defined(LLVM_LIBC_ARCH_AARCH64)
-[[maybe_unused]] static inline void
-inline_memcpy_aarch64(Ptr __restrict dst, CPtr __restrict src, size_t count) {
+    return copy<HeadTail<_64>>(dst, src, count);
+  if (HAS_AVX && count < 256)
+    return copy<HeadTail<_128>>(dst, src, count);
+  if (count <= REP_MOVS_B_SIZE)
+    return copy<Align<_32, Arg::Dst>::Then<Loop<LoopBlockSize>>>(dst, src,
+                                                                 count);
+  return copy<x86::Accelerator>(dst, src, count);
+#elif defined(LLVM_LIBC_ARCH_AARCH64)
+  /////////////////////////////////////////////////////////////////////////////
+  // LLVM_LIBC_ARCH_AARCH64
+  /////////////////////////////////////////////////////////////////////////////
   if (count == 0)
     return;
   if (count == 1)
-    return builtin::Memcpy<1>::block(dst, src);
+    return copy<_1>(dst, src);
   if (count == 2)
-    return builtin::Memcpy<2>::block(dst, src);
+    return copy<_2>(dst, src);
   if (count == 3)
-    return builtin::Memcpy<3>::block(dst, src);
+    return copy<_3>(dst, src);
   if (count == 4)
-    return builtin::Memcpy<4>::block(dst, src);
+    return copy<_4>(dst, src);
   if (count < 8)
-    return builtin::Memcpy<4>::head_tail(dst, src, count);
+    return copy<HeadTail<_4>>(dst, src, count);
   if (count < 16)
-    return builtin::Memcpy<8>::head_tail(dst, src, count);
+    return copy<HeadTail<_8>>(dst, src, count);
   if (count < 32)
-    return builtin::Memcpy<16>::head_tail(dst, src, count);
+    return copy<HeadTail<_16>>(dst, src, count);
   if (count < 64)
-    return builtin::Memcpy<32>::head_tail(dst, src, count);
+    return copy<HeadTail<_32>>(dst, src, count);
   if (count < 128)
-    return builtin::Memcpy<64>::head_tail(dst, src, count);
-  builtin::Memcpy<16>::block(dst, src);
-  align_to_next_boundary<16, Arg::Src>(dst, src, count);
-  return builtin::Memcpy<64>::loop_and_tail(dst, src, count);
-}
-#endif // defined(LLVM_LIBC_ARCH_AARCH64)
-
-static inline void inline_memcpy(Ptr __restrict dst, CPtr __restrict src,
-                                 size_t count) {
-  using namespace __llvm_libc::builtin;
-#if defined(LLVM_LIBC_ARCH_X86)
-  return inline_memcpy_x86_maybe_interpose_repmovsb(dst, src, count);
-#elif defined(LLVM_LIBC_ARCH_AARCH64)
-  return inline_memcpy_aarch64(dst, src, count);
-#elif defined(LLVM_LIBC_ARCH_ARM)
-  return inline_memcpy_embedded_tiny(dst, src, count);
+    return copy<HeadTail<_64>>(dst, src, count);
+  return copy<Align<_16, Arg::Src>::Then<Loop<_64>>>(dst, src, count);
 #else
-#error "Unsupported platform"
+  /////////////////////////////////////////////////////////////////////////////
+  // Default
+  /////////////////////////////////////////////////////////////////////////////
+  if (count == 0)
+    return;
+  if (count == 1)
+    return copy<_1>(dst, src);
+  if (count == 2)
+    return copy<_2>(dst, src);
+  if (count == 3)
+    return copy<_3>(dst, src);
+  if (count == 4)
+    return copy<_4>(dst, src);
+  if (count < 8)
+    return copy<HeadTail<_4>>(dst, src, count);
+  if (count < 16)
+    return copy<HeadTail<_8>>(dst, src, count);
+  if (count < 32)
+    return copy<HeadTail<_16>>(dst, src, count);
+  if (count < 64)
+    return copy<HeadTail<_32>>(dst, src, count);
+  if (count < 128)
+    return copy<HeadTail<_64>>(dst, src, count);
+  return copy<Align<_32, Arg::Src>::Then<Loop<_32>>>(dst, src, count);
 #endif
 }
 
-static inline void inline_memcpy(void *__restrict dst,
-                                 const void *__restrict src, size_t count) {
-  inline_memcpy(reinterpret_cast<Ptr>(dst), reinterpret_cast<CPtr>(src), count);
-}
-
 } // namespace __llvm_libc
 
 #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY_IMPLEMENTATIONS_H

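The Align<_32, Arg::Dst>::Then<Loop<LoopBlockSize>> composition above reads
as: copy one possibly unaligned 32-byte head block, advance dst to the next
32-byte boundary, run an aligned loop, and finish with one possibly
overlapping tail block. A sketch under the assumption count >= 32
(illustrative only, not the libc code):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static void copy32(char *dst, const char *src) { memcpy(dst, src, 32); }

    // Align-then-loop memcpy for count >= 32. Redundant writes at the head
    // and tail are cheaper than extra conditionals.
    static void copy_align_then_loop(char *dst, const char *src,
                                     size_t count) {
      copy32(dst, src); // head block, possibly unaligned
      char *const dst_tail = dst + count - 32;
      const char *const src_tail = src + count - 32;
      const size_t offset = 32 - (reinterpret_cast<uintptr_t>(dst) % 32);
      dst += offset; // dst is now 32-byte aligned
      src += offset;
      while (dst < dst_tail) { // aligned main loop
        copy32(dst, src);
        dst += 32;
        src += 32;
      }
      copy32(dst_tail, src_tail); // tail block, may overlap the last write
    }
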
diff --git a/libc/src/string/memory_utils/memset_implementations.h b/libc/src/string/memory_utils/memset_implementations.h
index 75ecf164a2b0f..d58ed3b703306 100644
--- a/libc/src/string/memory_utils/memset_implementations.h
+++ b/libc/src/string/memory_utils/memset_implementations.h
@@ -10,109 +10,127 @@
 #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_IMPLEMENTATIONS_H
 
 #include "src/__support/architectures.h"
-#include "src/string/memory_utils/op_aarch64.h"
-#include "src/string/memory_utils/op_builtin.h"
-#include "src/string/memory_utils/op_generic.h"
-#include "src/string/memory_utils/op_x86.h"
+#include "src/string/memory_utils/elements.h"
 #include "src/string/memory_utils/utils.h"
 
 #include <stddef.h> // size_t
 
 namespace __llvm_libc {
 
-[[maybe_unused]] inline static void
-inline_memset_embedded_tiny(Ptr dst, uint8_t value, size_t count) {
-#pragma nounroll
-  for (size_t offset = 0; offset < count; ++offset)
-    generic::Memset<1, 1>::block(dst + offset, value);
-}
-
+// A general purpose implementation assuming cheap unaligned writes for sizes:
+// 1, 2, 4, 8, 16, 32 and 64 Bytes. Note that some architectures can't store
+// 32 or 64 Bytes at a time; the compiler will expand these stores as needed.
+//
+// This implementation is subject to change as we benchmark more processors. We
+// may also want to customize it for processors with specialized instructions
+// that perform better (e.g. `rep stosb`).
+//
+// A note on the apparent discrepancy in the use of 32 vs 64 Byte writes.
+// We want to balance two things here:
+//  - The number of redundant writes (when using `SetBlockOverlap`),
+//  - The number of conditionals for sizes <=128 (~90% of memset calls are for
+//    such sizes).
+//
+// For the range 64-128:
+//  - SetBlockOverlap<64> uses no conditionals but always writes 128 Bytes;
+//    this is wasteful near 65 but efficient toward 128.
+//  - SetAlignedBlocks<32> would consume between 3 and 4 conditionals and
+//    write 96 or 128 Bytes.
+//  - Another approach could be a hybrid: copy<64>+Overlap<32> for 65-96 and
+//    copy<96>+Overlap<32> for 97-128.
+//
+// Benchmarks showed that redundant writes were cheap (for Intel X86) but
+// conditionals were expensive, even on processors that do not support writing
+// 64B at a time (pre-AVX512F). We also want to favor short functions that
+// allow more hot code to fit in the iL1 cache.
+//
+// Above 128 we have to use conditionals since we don't know the upper bound in
+// advance. SetAlignedBlocks<64> may waste up to 63 Bytes, SetAlignedBlocks<32>
+// may waste up to 31 Bytes. Benchmarks showed that SetAlignedBlocks<64> was not
+// superior for sizes that mattered.
+inline static void inline_memset(char *dst, unsigned char value, size_t count) {
 #if defined(LLVM_LIBC_ARCH_X86)
-template <size_t MaxSize>
-[[maybe_unused]] inline static void inline_memset_x86(Ptr dst, uint8_t value,
-                                                      size_t count) {
+  /////////////////////////////////////////////////////////////////////////////
+  // LLVM_LIBC_ARCH_X86
+  /////////////////////////////////////////////////////////////////////////////
+  using namespace __llvm_libc::x86;
   if (count == 0)
     return;
   if (count == 1)
-    return generic::Memset<1, MaxSize>::block(dst, value);
+    return splat_set<_1>(dst, value);
   if (count == 2)
-    return generic::Memset<2, MaxSize>::block(dst, value);
+    return splat_set<_2>(dst, value);
   if (count == 3)
-    return generic::Memset<3, MaxSize>::block(dst, value);
+    return splat_set<_3>(dst, value);
   if (count <= 8)
-    return generic::Memset<4, MaxSize>::head_tail(dst, value, count);
+    return splat_set<HeadTail<_4>>(dst, value, count);
   if (count <= 16)
-    return generic::Memset<8, MaxSize>::head_tail(dst, value, count);
+    return splat_set<HeadTail<_8>>(dst, value, count);
   if (count <= 32)
-    return generic::Memset<16, MaxSize>::head_tail(dst, value, count);
+    return splat_set<HeadTail<_16>>(dst, value, count);
   if (count <= 64)
-    return generic::Memset<32, MaxSize>::head_tail(dst, value, count);
+    return splat_set<HeadTail<_32>>(dst, value, count);
   if (count <= 128)
-    return generic::Memset<64, MaxSize>::head_tail(dst, value, count);
-  // Aligned loop
-  generic::Memset<32, MaxSize>::block(dst, value);
-  align_to_next_boundary<32>(dst, count);
-  return generic::Memset<32, MaxSize>::loop_and_tail(dst, value, count);
-}
-#endif // defined(LLVM_LIBC_ARCH_X86)
-
-#if defined(LLVM_LIBC_ARCH_AARCH64)
-template <size_t MaxSize>
-[[maybe_unused]] inline static void
-inline_memset_aarch64(Ptr dst, uint8_t value, size_t count) {
+    return splat_set<HeadTail<_64>>(dst, value, count);
+  return splat_set<Align<_32, Arg::Dst>::Then<Loop<_32>>>(dst, value, count);
+#elif defined(LLVM_LIBC_ARCH_AARCH64)
+  /////////////////////////////////////////////////////////////////////////////
+  // LLVM_LIBC_ARCH_AARCH64
+  /////////////////////////////////////////////////////////////////////////////
+  using namespace __llvm_libc::aarch64_memset;
   if (count == 0)
     return;
   if (count <= 3) {
-    generic::Memset<1, MaxSize>::block(dst, value);
+    splat_set<_1>(dst, value);
     if (count > 1)
-      generic::Memset<2, MaxSize>::tail(dst, value, count);
+      splat_set<Tail<_2>>(dst, value, count);
     return;
   }
   if (count <= 8)
-    return generic::Memset<4, MaxSize>::head_tail(dst, value, count);
+    return splat_set<HeadTail<_4>>(dst, value, count);
   if (count <= 16)
-    return generic::Memset<8, MaxSize>::head_tail(dst, value, count);
+    return splat_set<HeadTail<_8>>(dst, value, count);
   if (count <= 32)
-    return generic::Memset<16, MaxSize>::head_tail(dst, value, count);
+    return splat_set<HeadTail<_16>>(dst, value, count);
   if (count <= (32 + 64)) {
-    generic::Memset<32, MaxSize>::block(dst, value);
+    splat_set<_32>(dst, value);
     if (count <= 64)
-      return generic::Memset<32, MaxSize>::tail(dst, value, count);
-    generic::Memset<32, MaxSize>::block(dst + 32, value);
-    generic::Memset<32, MaxSize>::tail(dst, value, count);
+      return splat_set<Tail<_32>>(dst, value, count);
+    splat_set<Skip<32>::Then<_32>>(dst, value);
+    splat_set<Tail<_32>>(dst, value, count);
     return;
   }
-  if (count >= 448 && value == 0 && aarch64::neon::hasZva()) {
-    generic::Memset<64, MaxSize>::block(dst, 0);
-    align_to_next_boundary<64>(dst, count);
-    return aarch64::neon::BzeroCacheLine<64>::loop_and_tail(dst, 0, count);
-  } else {
-    generic::Memset<16, MaxSize>::block(dst, value);
-    align_to_next_boundary<16>(dst, count);
-    return generic::Memset<64, MaxSize>::loop_and_tail(dst, value, count);
-  }
-}
-#endif // defined(LLVM_LIBC_ARCH_AARCH64)
-
-inline static void inline_memset(Ptr dst, uint8_t value, size_t count) {
-#if defined(LLVM_LIBC_ARCH_X86)
-  static constexpr size_t kMaxSize = x86::kAvx512F ? 64
-                                     : x86::kAvx   ? 32
-                                     : x86::kSse2  ? 16
-                                                   : 8;
-  return inline_memset_x86<kMaxSize>(dst, value, count);
-#elif defined(LLVM_LIBC_ARCH_AARCH64)
-  static constexpr size_t kMaxSize = aarch64::kNeon ? 16 : 8;
-  return inline_memset_aarch64<kMaxSize>(dst, value, count);
-#elif defined(LLVM_LIBC_ARCH_ARM)
-  return inline_memset_embedded_tiny(dst, value, count);
+  if (count >= 448 && value == 0 && hasZva())
+    return splat_set<Align<_64, Arg::P1>::Then<Loop<Zva64, _64>>>(dst, 0,
+                                                                  count);
+  else
+    return splat_set<Align<_16, Arg::P1>::Then<Loop<_64>>>(dst, value, count);
 #else
-#error "Unsupported platform"
-#endif
-}
+  /////////////////////////////////////////////////////////////////////////////
+  // Default
+  /////////////////////////////////////////////////////////////////////////////
+  using namespace ::__llvm_libc::scalar;
 
-inline static void inline_memset(void *dst, uint8_t value, size_t count) {
-  inline_memset(reinterpret_cast<Ptr>(dst), value, count);
+  if (count == 0)
+    return;
+  if (count == 1)
+    return splat_set<_1>(dst, value);
+  if (count == 2)
+    return splat_set<_2>(dst, value);
+  if (count == 3)
+    return splat_set<_3>(dst, value);
+  if (count <= 8)
+    return splat_set<HeadTail<_4>>(dst, value, count);
+  if (count <= 16)
+    return splat_set<HeadTail<_8>>(dst, value, count);
+  if (count <= 32)
+    return splat_set<HeadTail<_16>>(dst, value, count);
+  if (count <= 64)
+    return splat_set<HeadTail<_32>>(dst, value, count);
+  if (count <= 128)
+    return splat_set<HeadTail<_64>>(dst, value, count);
+  return splat_set<Align<_32, Arg::Dst>::Then<Loop<_32>>>(dst, value, count);
+#endif
 }
 
 } // namespace __llvm_libc

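splat_set<Element> above broadcasts the byte into a block and stores it. A
scalar sketch of an 8-byte splat plus the HeadTail<_8> pattern for counts in
[8, 16] (illustrative only, not the libc code):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    // Multiplying by 0x0101010101010101 replicates the byte into every lane
    // of the 64-bit word; a single store then sets 8 bytes at once.
    static void splat_set8(char *dst, unsigned char value) {
      const uint64_t splat = UINT64_C(0x0101010101010101) * value;
      memcpy(dst, &splat, sizeof(splat));
    }

    // memset for 8 <= count <= 16 via two possibly-overlapping stores.
    static void memset_head_tail(char *dst, unsigned char value,
                                 size_t count) {
      splat_set8(dst, value);
      splat_set8(dst + count - 8, value);
    }
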
diff --git a/libc/src/string/memory_utils/op_x86.h b/libc/src/string/memory_utils/op_x86.h
index 8e6432233ca31..a4b59a12b0b76 100644
--- a/libc/src/string/memory_utils/op_x86.h
+++ b/libc/src/string/memory_utils/op_x86.h
@@ -42,7 +42,7 @@ static inline constexpr bool kAvx512BW = LLVM_LIBC_IS_DEFINED(__AVX512BW__);
 ///////////////////////////////////////////////////////////////////////////////
 // Memcpy repmovsb implementation
 struct Memcpy {
-  static void repmovsb(void *dst, const void *src, size_t count) {
+  static void repmovsb(char *dst, const char *src, size_t count) {
     asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
   }
 };

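On the inline assembly above: the "+D", "+S" and "+c" constraints pin dst, src
and count to RDI, RSI and RCX, the registers that rep movsb consumes and
updates in place, and the "memory" clobber keeps the compiler from caching or
reordering accesses across the copy. A hypothetical call site (x86-64 only,
assuming the surrounding __llvm_libc::x86 namespace):

    // Copies 13 bytes, including the terminating '\0', into buf.
    char buf[16];
    __llvm_libc::x86::Memcpy::repmovsb(buf, "hello, world", 13);
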
diff --git a/libc/src/string/mempcpy.cpp b/libc/src/string/mempcpy.cpp
index dd539eb3a2d87..f26bd64bee42a 100644
--- a/libc/src/string/mempcpy.cpp
+++ b/libc/src/string/mempcpy.cpp
@@ -15,10 +15,11 @@
 namespace __llvm_libc {
 
 LLVM_LIBC_FUNCTION(void *, mempcpy,
-                   (void *__restrict dst, const void *__restrict src,
+                   (void *__restrict dest, const void *__restrict src,
                     size_t count)) {
-  inline_memcpy(dst, src, count);
-  return reinterpret_cast<char *>(dst) + count;
+  char *result = reinterpret_cast<char *>(dest);
+  inline_memcpy(result, reinterpret_cast<const char *>(src), count);
+  return result + count;
 }
 
 } // namespace __llvm_libc

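Since mempcpy returns one past the last byte written, appends compose without
recomputing the write position. A hypothetical usage sketch (mempcpy is a GNU
extension, so this assumes a platform that provides it):

    #include <string.h> // mempcpy needs _GNU_SOURCE on glibc

    // Build "hello, world" by chaining appends; p always points one past
    // the bytes written so far.
    static void build(char out[32]) {
      char *p = out;
      p = static_cast<char *>(mempcpy(p, "hello, ", 7));
      p = static_cast<char *>(mempcpy(p, "world", 5));
      *p = '\0';
    }
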
diff --git a/libc/src/string/memset.cpp b/libc/src/string/memset.cpp
index b80cfce87fcac..549c0742dec75 100644
--- a/libc/src/string/memset.cpp
+++ b/libc/src/string/memset.cpp
@@ -13,7 +13,8 @@
 namespace __llvm_libc {
 
 LLVM_LIBC_FUNCTION(void *, memset, (void *dst, int value, size_t count)) {
-  inline_memset(dst, static_cast<uint8_t>(value), count);
+  inline_memset(reinterpret_cast<char *>(dst),
+                static_cast<unsigned char>(value), count);
   return dst;
 }
 
