[libc-commits] [libc] 7fff39d - [libc] Add a set of elementary operations

Guillaume Chatelet via libc-commits libc-commits at lists.llvm.org
Wed Jun 16 05:11:57 PDT 2021


Author: Guillaume Chatelet
Date: 2021-06-16T12:11:45Z
New Revision: 7fff39d9b0469b75e589f9d47055a33422376da1

URL: https://github.com/llvm/llvm-project/commit/7fff39d9b0469b75e589f9d47055a33422376da1
DIFF: https://github.com/llvm/llvm-project/commit/7fff39d9b0469b75e589f9d47055a33422376da1.diff

LOG: [libc] Add a set of elementary operations

Resubmission of D100646 now making sure that we handle cases were `__builtin_memcpy_inline` is not available.

Original commit message:
Each of these elementary operations can be assembled to support higher order constructs (Overlapping access, Loop, Aligned Loop).
The patch does not compile yet as it depends on other ones (D100571, D100631) but it allows to get the conversation started.

A self-contained version of this code is available at https://godbolt.org/z/e1x6xdaxM

Added: 
    libc/src/string/memory_utils/elements.h
    libc/src/string/memory_utils/elements_x86.h
    libc/test/src/string/memory_utils/elements_test.cpp
    libc/test/src/string/memory_utils/memory_access_test.cpp

Modified: 
    libc/src/string/CMakeLists.txt
    libc/src/string/aarch64/memcpy.cpp
    libc/src/string/memcpy.cpp
    libc/src/string/memory_utils/CMakeLists.txt
    libc/src/string/memory_utils/memset_utils.h
    libc/src/string/x86_64/memcpy.cpp
    libc/test/src/string/memory_utils/CMakeLists.txt

Removed: 
    libc/src/string/memory_utils/memcpy_utils.h
    libc/test/src/string/memory_utils/memcpy_utils_test.cpp


################################################################################
diff  --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt
index f7a0406127143..4dd8ee0bf4a2c 100644
--- a/libc/src/string/CMakeLists.txt
+++ b/libc/src/string/CMakeLists.txt
@@ -194,7 +194,7 @@ function(add_implementation name impl_name)
     SRCS ${ADD_IMPL_SRCS}
     HDRS ${ADD_IMPL_HDRS}
     DEPENDS ${ADD_IMPL_DEPENDS}
-    COMPILE_OPTIONS ${ADD_IMPL_COMPILE_OPTIONS}
+    COMPILE_OPTIONS ${ADD_IMPL_COMPILE_OPTIONS} "SHELL:-mllvm -combiner-global-alias-analysis"
   )
   get_fq_target_name(${impl_name} fq_target_name)
   set_target_properties(${fq_target_name} PROPERTIES REQUIRE_CPU_FEATURES "${ADD_IMPL_REQUIRE}")

diff  --git a/libc/src/string/aarch64/memcpy.cpp b/libc/src/string/aarch64/memcpy.cpp
index 78988ec37afe6..1a1fbbc026a73 100644
--- a/libc/src/string/aarch64/memcpy.cpp
+++ b/libc/src/string/aarch64/memcpy.cpp
@@ -8,10 +8,19 @@
 
 #include "src/string/memcpy.h"
 #include "src/__support/common.h"
-#include "src/string/memory_utils/memcpy_utils.h"
+#include "src/string/memory_utils/elements.h"
 
 namespace __llvm_libc {
 
+using _1 = scalar::UINT8;
+using _2 = scalar::UINT16;
+using _3 = Chained<scalar::UINT16, scalar::UINT8>;
+using _4 = scalar::UINT32;
+using _8 = scalar::UINT64;
+using _16 = Repeated<scalar::UINT64, 2>;
+using _32 = Repeated<scalar::UINT64, 4>;
+using _64 = Repeated<scalar::UINT64, 8>;
+
 // Design rationale
 // ================
 //
@@ -37,24 +46,24 @@ static void memcpy_aarch64(char *__restrict dst, const char *__restrict src,
   if (count == 0)
     return;
   if (count == 1)
-    return CopyBlock<1>(dst, src);
+    return Copy<_1>(dst, src);
   if (count == 2)
-    return CopyBlock<2>(dst, src);
+    return Copy<_2>(dst, src);
   if (count == 3)
-    return CopyBlock<3>(dst, src);
+    return Copy<_3>(dst, src);
   if (count == 4)
-    return CopyBlock<4>(dst, src);
+    return Copy<_4>(dst, src);
   if (count < 8)
-    return CopyBlockOverlap<4>(dst, src, count);
+    return Copy<HeadTail<_4>>(dst, src, count);
   if (count < 16)
-    return CopyBlockOverlap<8>(dst, src, count);
+    return Copy<HeadTail<_8>>(dst, src, count);
   if (count < 32)
-    return CopyBlockOverlap<16>(dst, src, count);
+    return Copy<HeadTail<_16>>(dst, src, count);
   if (count < 64)
-    return CopyBlockOverlap<32>(dst, src, count);
+    return Copy<HeadTail<_32>>(dst, src, count);
   if (count < 128)
-    return CopyBlockOverlap<64>(dst, src, count);
-  return CopySrcAlignedBlocks<64, 16>(dst, src, count);
+    return Copy<HeadTail<_64>>(dst, src, count);
+  return Copy<Align<_16, Arg::Src>::Then<Loop<_64>>>(dst, src, count);
 }
 
 LLVM_LIBC_FUNCTION(void *, memcpy,

diff  --git a/libc/src/string/memcpy.cpp b/libc/src/string/memcpy.cpp
index e050d7f4ffa2d..5e70e00db1b91 100644
--- a/libc/src/string/memcpy.cpp
+++ b/libc/src/string/memcpy.cpp
@@ -8,7 +8,7 @@
 
 #include "src/string/memcpy.h"
 #include "src/__support/common.h"
-#include "src/string/memory_utils/memcpy_utils.h"
+#include "src/string/memory_utils/elements.h"
 
 namespace __llvm_libc {
 
@@ -32,27 +32,30 @@ namespace __llvm_libc {
 //   with little change on the code side.
 static void memcpy_impl(char *__restrict dst, const char *__restrict src,
                         size_t count) {
+  // Use scalar strategies (_1, _2, _3 ...)
+  using namespace __llvm_libc::scalar;
+
   if (count == 0)
     return;
   if (count == 1)
-    return CopyBlock<1>(dst, src);
+    return Copy<_1>(dst, src);
   if (count == 2)
-    return CopyBlock<2>(dst, src);
+    return Copy<_2>(dst, src);
   if (count == 3)
-    return CopyBlock<3>(dst, src);
+    return Copy<_3>(dst, src);
   if (count == 4)
-    return CopyBlock<4>(dst, src);
+    return Copy<_4>(dst, src);
   if (count < 8)
-    return CopyBlockOverlap<4>(dst, src, count);
+    return Copy<HeadTail<_4>>(dst, src, count);
   if (count < 16)
-    return CopyBlockOverlap<8>(dst, src, count);
+    return Copy<HeadTail<_8>>(dst, src, count);
   if (count < 32)
-    return CopyBlockOverlap<16>(dst, src, count);
+    return Copy<HeadTail<_16>>(dst, src, count);
   if (count < 64)
-    return CopyBlockOverlap<32>(dst, src, count);
+    return Copy<HeadTail<_32>>(dst, src, count);
   if (count < 128)
-    return CopyBlockOverlap<64>(dst, src, count);
-  return CopySrcAlignedBlocks<32>(dst, src, count);
+    return Copy<HeadTail<_64>>(dst, src, count);
+  return Copy<Align<_32, Arg::Src>::Then<Loop<_32>>>(dst, src, count);
 }
 
 LLVM_LIBC_FUNCTION(void *, memcpy,

diff  --git a/libc/src/string/memory_utils/CMakeLists.txt b/libc/src/string/memory_utils/CMakeLists.txt
index 327031ae67f8d..4a550e544dac8 100644
--- a/libc/src/string/memory_utils/CMakeLists.txt
+++ b/libc/src/string/memory_utils/CMakeLists.txt
@@ -2,6 +2,5 @@ add_header_library(
   memory_utils
   HDRS
     utils.h
-    memcpy_utils.h
-    memset_utils.h
+    elements.h
 )

diff  --git a/libc/src/string/memory_utils/elements.h b/libc/src/string/memory_utils/elements.h
new file mode 100644
index 0000000000000..2442da760217e
--- /dev/null
+++ b/libc/src/string/memory_utils/elements.h
@@ -0,0 +1,499 @@
+//===-- Elementary operations to compose memory primitives ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_H
+#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_H
+
+#include <stddef.h> // size_t
+#include <stdint.h> // uint8_t, uint16_t, uint32_t, uint64_t
+
+#include "src/__support/endian.h"
+#include "src/string/memory_utils/utils.h"
+
+namespace __llvm_libc {
+
+// Elementary Operations
+// --------------------------------
+// We define abstract elementary operations acting on fixed chunks of memory.
+// These are low level building blocks that are meant to be assembled to compose
+// higher order abstractions. Each function is defined twice: once with
+// fixed-size operations, and once with runtime-size operations.
+
+// Fixed-size copies from 'src' to 'dst'.
+template <typename Element>
+void Copy(char *__restrict dst, const char *__restrict src) {
+  Element::Copy(dst, src);
+}
+// Runtime-size copies from 'src' to 'dst'.
+template <typename Element>
+void Copy(char *__restrict dst, const char *__restrict src, size_t size) {
+  Element::Copy(dst, src, size);
+}
+
+// Fixed-size equality between 'lhs' and 'rhs'.
+template <typename Element> bool Equals(const char *lhs, const char *rhs) {
+  return Element::Equals(lhs, rhs);
+}
+// Runtime-size equality between 'lhs' and 'rhs'.
+template <typename Element>
+bool Equals(const char *lhs, const char *rhs, size_t size) {
+  return Element::Equals(lhs, rhs, size);
+}
+
+// Fixed-size three-way comparison between 'lhs' and 'rhs'.
+template <typename Element>
+int ThreeWayCompare(const char *lhs, const char *rhs) {
+  return Element::ThreeWayCompare(lhs, rhs);
+}
+// Runtime-size three-way comparison between 'lhs' and 'rhs'.
+template <typename Element>
+int ThreeWayCompare(const char *lhs, const char *rhs, size_t size) {
+  return Element::ThreeWayCompare(lhs, rhs, size);
+}
+
+// Fixed-size initialization.
+template <typename Element>
+void SplatSet(char *dst, const unsigned char value) {
+  Element::SplatSet(dst, value);
+}
+// Runtime-size initialization.
+template <typename Element>
+void SplatSet(char *dst, const unsigned char value, size_t size) {
+  Element::SplatSet(dst, value, size);
+}
+
+// Fixed-size Higher-Order Operations
+// ----------------------------------
+// - Repeated<Type, ElementCount>: Repeat the operation several times in a row.
+// - Chained<Types...>: Chain the operation of several types.
+
+// Repeat the operation several times in a row.
+template <typename Element, size_t ElementCount> struct Repeated {
+  static constexpr size_t kSize = ElementCount * Element::kSize;
+
+  static void Copy(char *__restrict dst, const char *__restrict src) {
+    for (size_t i = 0; i < ElementCount; ++i) {
+      const size_t offset = i * Element::kSize;
+      Element::Copy(dst + offset, src + offset);
+    }
+  }
+
+  static bool Equals(const char *lhs, const char *rhs) {
+    for (size_t i = 0; i < ElementCount; ++i) {
+      const size_t offset = i * Element::kSize;
+      if (!Element::Equals(lhs + offset, rhs + offset))
+        return false;
+    }
+    return true;
+  }
+
+  static int ThreeWayCompare(const char *lhs, const char *rhs) {
+    for (size_t i = 0; i < ElementCount; ++i) {
+      const size_t offset = i * Element::kSize;
+      // We make the assumption that 'Equals' si cheaper than 'ThreeWayCompare'.
+      if (Element::Equals(lhs + offset, rhs + offset))
+        continue;
+      return Element::ThreeWayCompare(lhs + offset, rhs + offset);
+    }
+    return 0;
+  }
+
+  static void SplatSet(char *dst, const unsigned char value) {
+    for (size_t i = 0; i < ElementCount; ++i) {
+      const size_t offset = i * Element::kSize;
+      Element::SplatSet(dst + offset, value);
+    }
+  }
+};
+
+// Chain the operation of several types.
+// For instance, to handle a 3 bytes operation, one can use:
+// Chained<UINT16, UINT8>::Operation();
+template <typename... Types> struct Chained;
+
+template <typename Head, typename... Tail> struct Chained<Head, Tail...> {
+  static constexpr size_t kSize = Head::kSize + Chained<Tail...>::kSize;
+
+  static void Copy(char *__restrict dst, const char *__restrict src) {
+    Chained<Tail...>::Copy(dst + Head::kSize, src + Head::kSize);
+    __llvm_libc::Copy<Head>(dst, src);
+  }
+
+  static bool Equals(const char *lhs, const char *rhs) {
+    if (!__llvm_libc::Equals<Head>(lhs, rhs))
+      return false;
+    return Chained<Tail...>::Equals(lhs + Head::kSize, rhs + Head::kSize);
+  }
+
+  static int ThreeWayCompare(const char *lhs, const char *rhs) {
+    if (__llvm_libc::Equals<Head>(lhs, rhs))
+      return Chained<Tail...>::ThreeWayCompare(lhs + Head::kSize,
+                                               rhs + Head::kSize);
+    return __llvm_libc::ThreeWayCompare<Head>(lhs, rhs);
+  }
+
+  static void SplatSet(char *dst, const unsigned char value) {
+    Chained<Tail...>::SplatSet(dst + Head::kSize, value);
+    __llvm_libc::SplatSet<Head>(dst, value);
+  }
+};
+
+template <> struct Chained<> {
+  static constexpr size_t kSize = 0;
+  static void Copy(char *__restrict dst, const char *__restrict src) {}
+  static bool Equals(const char *lhs, const char *rhs) { return true; }
+  static int ThreeWayCompare(const char *lhs, const char *rhs) { return 0; }
+  static void SplatSet(char *dst, const unsigned char value) {}
+};
+
+// Runtime-size Higher-Order Operations
+// ------------------------------------
+// - Tail<T>: Perform the operation on the last 'T::kSize' bytes of the buffer.
+// - HeadTail<T>: Perform the operation on the first and last 'T::kSize' bytes
+//   of the buffer.
+// - Loop<T>: Perform a loop of fixed-sized operations.
+
+// Perform the operation on the last 'T::kSize' bytes of the buffer.
+//
+// e.g. with
+// [1234567812345678123]
+// [__XXXXXXXXXXXXXX___]
+// [________XXXXXXXX___]
+//
+// Precondition: `size >= T::kSize`.
+template <typename T> struct Tail {
+  static void Copy(char *__restrict dst, const char *__restrict src,
+                   size_t size) {
+    return T::Copy(dst + offset(size), src + offset(size));
+  }
+
+  static bool Equals(const char *lhs, const char *rhs, size_t size) {
+    return T::Equals(lhs + offset(size), rhs + offset(size));
+  }
+
+  static int ThreeWayCompare(const char *lhs, const char *rhs, size_t size) {
+    return T::ThreeWayCompare(lhs + offset(size), rhs + offset(size));
+  }
+
+  static void SplatSet(char *dst, const unsigned char value, size_t size) {
+    return T::SplatSet(dst + offset(size), value);
+  }
+
+  static size_t offset(size_t size) { return size - T::kSize; }
+};
+
+// Perform the operation on the first and last 'T::kSize' bytes of the buffer.
+// This is useful for overlapping operations.
+//
+// e.g. with
+// [1234567812345678123]
+// [__XXXXXXXXXXXXXX___]
+// [__XXXXXXXX_________]
+// [________XXXXXXXX___]
+//
+// Precondition: `size >= T::kSize && size <= 2 x T::kSize`.
+template <typename T> struct HeadTail {
+  static void Copy(char *__restrict dst, const char *__restrict src,
+                   size_t size) {
+    T::Copy(dst, src);
+    Tail<T>::Copy(dst, src, size);
+  }
+
+  static bool Equals(const char *lhs, const char *rhs, size_t size) {
+    if (!T::Equals(lhs, rhs))
+      return false;
+    return Tail<T>::Equals(lhs, rhs, size);
+  }
+
+  static int ThreeWayCompare(const char *lhs, const char *rhs, size_t size) {
+    if (const int result = T::ThreeWayCompare(lhs, rhs))
+      return result;
+    return Tail<T>::ThreeWayCompare(lhs, rhs, size);
+  }
+
+  static void SplatSet(char *dst, const unsigned char value, size_t size) {
+    T::SplatSet(dst, value);
+    Tail<T>::SplatSet(dst, value, size);
+  }
+};
+
+// Simple loop ending with a Tail operation.
+//
+// e.g. with
+// [12345678123456781234567812345678]
+// [__XXXXXXXXXXXXXXXXXXXXXXXXXXXX___]
+// [__XXXXXXXX_______________________]
+// [__________XXXXXXXX_______________]
+// [__________________XXXXXXXX_______]
+// [______________________XXXXXXXX___]
+//
+// Precondition:
+// - size >= T::kSize
+template <typename T> struct Loop {
+  static void Copy(char *__restrict dst, const char *__restrict src,
+                   size_t size) {
+    for (size_t offset = 0; offset < size - T::kSize; offset += T::kSize)
+      T::Copy(dst + offset, src + offset);
+    Tail<T>::Copy(dst, src, size);
+  }
+
+  static bool Equals(const char *lhs, const char *rhs, size_t size) {
+    for (size_t offset = 0; offset < size - T::kSize; offset += T::kSize)
+      if (!T::Equals(lhs + offset, rhs + offset))
+        return false;
+    return Tail<T>::Equals(lhs, rhs, size);
+  }
+
+  static int ThreeWayCompare(const char *lhs, const char *rhs, size_t size) {
+    for (size_t offset = 0; offset < size - T::kSize; offset += T::kSize)
+      if (const int result = T::ThreeWayCompare(lhs + offset, rhs + offset))
+        return result;
+    return Tail<T>::ThreeWayCompare(lhs, rhs, size);
+  }
+
+  static void SplatSet(char *dst, const unsigned char value, size_t size) {
+    for (size_t offset = 0; offset < size - T::kSize; offset += T::kSize)
+      T::SplatSet(dst + offset, value);
+    Tail<T>::SplatSet(dst, value, size);
+  }
+};
+
+enum class Arg { _1, _2, Dst = _1, Src = _2, Lhs = _1, Rhs = _2 };
+
+namespace internal {
+
+// Provides a specialized Bump function that adjusts pointers and size so first
+// argument (resp. second argument) gets aligned to Alignment.
+// We make sure the compiler knows about the adjusted pointer alignment.
+template <Arg arg, size_t Alignment> struct AlignHelper {};
+
+template <size_t Alignment> struct AlignHelper<Arg::_1, Alignment> {
+  template <typename T1, typename T2>
+  static void Bump(T1 *__restrict &p1ref, T2 *__restrict &p2ref, size_t &size) {
+    const intptr_t offset = offset_to_next_aligned<Alignment>(p1ref);
+    p1ref += offset;
+    p2ref += offset;
+    size -= offset;
+    p1ref = assume_aligned<Alignment>(p1ref);
+  }
+};
+
+template <size_t Alignment> struct AlignHelper<Arg::_2, Alignment> {
+  template <typename T1, typename T2>
+  static void Bump(T1 *__restrict &p1ref, T2 *__restrict &p2ref, size_t &size) {
+    const intptr_t offset = offset_to_next_aligned<Alignment>(p2ref);
+    p1ref += offset;
+    p2ref += offset;
+    size -= offset;
+    p2ref = assume_aligned<Alignment>(p2ref);
+  }
+};
+
+} // namespace internal
+
+// An alignment operation that:
+// - executes the 'AlignmentT' operation
+// - bumps 'dst' or 'src' (resp. 'lhs' or 'rhs') pointers so that the selected
+//   pointer gets aligned, size is decreased accordingly.
+// - calls the 'NextT' operation.
+//
+// e.g. A 16-byte Destination Aligned 32-byte Loop Copy can be written as:
+// Copy<Align<_16, Arg::Dst>::Then<Loop<_32>>>(dst, src, count);
+template <typename AlignmentT, Arg AlignOn> struct Align {
+private:
+  static constexpr size_t Alignment = AlignmentT::kSize;
+  static_assert(Alignment > 1, "Alignment must be more than 1");
+  static_assert(is_power2(Alignment), "Alignment must be a power of 2");
+
+public:
+  template <typename NextT> struct Then {
+    static void Copy(char *__restrict dst, const char *__restrict src,
+                     size_t size) {
+      AlignmentT::Copy(dst, src);
+      internal::AlignHelper<AlignOn, Alignment>::Bump(dst, src, size);
+      NextT::Copy(dst, src, size);
+    }
+
+    static bool Equals(const char *lhs, const char *rhs, size_t size) {
+      if (!AlignmentT::Equals(lhs, rhs))
+        return false;
+      internal::AlignHelper<AlignOn, Alignment>::Bump(lhs, rhs, size);
+      return NextT::Equals(lhs, rhs, size);
+    }
+
+    static int ThreeWayCompare(const char *lhs, const char *rhs, size_t size) {
+      if (const int result = AlignmentT::ThreeWayCompare(lhs, rhs))
+        return result;
+      internal::AlignHelper<AlignOn, Alignment>::Bump(lhs, rhs, size);
+      return NextT::ThreeWayCompare(lhs, rhs, size);
+    }
+
+    static void SplatSet(char *dst, const unsigned char value, size_t size) {
+      AlignmentT::SplatSet(dst, value);
+      char *dummy = nullptr;
+      internal::AlignHelper<Arg::_1, Alignment>::Bump(dst, dummy, size);
+      NextT::SplatSet(dst, value, size);
+    }
+  };
+};
+
+// Fixed-size Builtin Operations
+// -----------------------------
+// Note: Do not use 'builtin' right now as it requires the implementation of the
+// `_inline` versions of all the builtins. Theoretically, Clang can still turn
+// them into calls to the C library leading to reentrancy problems.
+namespace builtin {
+
+#ifndef __has_builtin
+#define __has_builtin(x) 0 // Compatibility with non-clang compilers.
+#endif
+
+template <size_t Size> struct Builtin {
+  static constexpr size_t kSize = Size;
+
+  static void Copy(char *__restrict dst, const char *__restrict src) {
+#if LLVM_LIBC_HAVE_MEMORY_SANITIZER || LLVM_LIBC_HAVE_ADDRESS_SANITIZER
+    ForLoopCopy(dst, src);
+#elif __has_builtin(__builtin_memcpy_inline)
+    // __builtin_memcpy_inline guarantees to never call external functions.
+    // Unfortunately it is not widely available.
+    __builtin_memcpy_inline(dst, src, kSize);
+#elif __has_builtin(__builtin_memcpy)
+    __builtin_memcpy(dst, src, kSize);
+#else
+    ForLoopCopy(dst, src);
+#endif
+  }
+
+  static bool Equals(const char *lhs, const char *rhs) {
+    return __builtin_memcmp(lhs, rhs, kSize) == 0;
+  }
+
+  static int ThreeWayCompare(const char *lhs, const char *rhs) {
+    return __builtin_memcmp(lhs, rhs, kSize);
+  }
+
+  static void SplatSet(char *dst, const unsigned char value) {
+    __builtin_memset(dst, value, kSize);
+  }
+
+private:
+  // Copies `kSize` bytes from `src` to `dst` using a for loop.
+  // This code requires the use of `-fno-buitin-memcpy` to prevent the compiler
+  // from turning the for-loop back into `__builtin_memcpy`.
+  static void ForLoopCopy(char *__restrict dst, const char *__restrict src) {
+    for (size_t i = 0; i < kSize; ++i)
+      dst[i] = src[i];
+  }
+};
+
+using _1 = Builtin<1>;
+using _2 = Builtin<2>;
+using _3 = Builtin<3>;
+using _4 = Builtin<4>;
+using _8 = Builtin<8>;
+using _16 = Builtin<16>;
+using _32 = Builtin<32>;
+using _64 = Builtin<64>;
+using _128 = Builtin<128>;
+
+} // namespace builtin
+
+// Fixed-size Scalar Operations
+// ----------------------------
+namespace scalar {
+
+// The Scalar type makes use of simple sized integers.
+template <typename T> struct Scalar {
+  static constexpr size_t kSize = sizeof(T);
+
+  static void Copy(char *__restrict dst, const char *__restrict src) {
+    Store(dst, Load(src));
+  }
+
+  static bool Equals(const char *lhs, const char *rhs) {
+    return Load(lhs) == Load(rhs);
+  }
+
+  static int ThreeWayCompare(const char *lhs, const char *rhs) {
+    return ScalarThreeWayCompare(Load(lhs), Load(rhs));
+  }
+
+  static void SplatSet(char *dst, const unsigned char value) {
+    Store(dst, GetSplattedValue(value));
+  }
+
+private:
+  static T Load(const char *ptr) {
+    T value;
+    builtin::Builtin<kSize>::Copy(reinterpret_cast<char *>(&value), ptr);
+    return value;
+  }
+  static void Store(char *ptr, T value) {
+    builtin::Builtin<kSize>::Copy(ptr, reinterpret_cast<const char *>(&value));
+  }
+  static T GetSplattedValue(const unsigned char value) {
+    return T(~0) / T(0xFF) * T(value);
+  }
+  static int ScalarThreeWayCompare(T a, T b);
+};
+
+template <>
+inline int Scalar<uint8_t>::ScalarThreeWayCompare(uint8_t a, uint8_t b) {
+  const int16_t la = Endian::ToBigEndian(a);
+  const int16_t lb = Endian::ToBigEndian(b);
+  return la - lb;
+}
+template <>
+inline int Scalar<uint16_t>::ScalarThreeWayCompare(uint16_t a, uint16_t b) {
+  const int32_t la = Endian::ToBigEndian(a);
+  const int32_t lb = Endian::ToBigEndian(b);
+  return la - lb;
+}
+template <>
+inline int Scalar<uint32_t>::ScalarThreeWayCompare(uint32_t a, uint32_t b) {
+  const int64_t la = Endian::ToBigEndian(a);
+  const int64_t lb = Endian::ToBigEndian(b);
+  if (la < lb)
+    return -1;
+  if (la > lb)
+    return 1;
+  return 0;
+}
+template <>
+inline int Scalar<uint64_t>::ScalarThreeWayCompare(uint64_t a, uint64_t b) {
+  const __int128_t la = Endian::ToBigEndian(a);
+  const __int128_t lb = Endian::ToBigEndian(b);
+  if (la < lb)
+    return -1;
+  if (la > lb)
+    return 1;
+  return 0;
+}
+
+using UINT8 = Scalar<uint8_t>;   // 1 Byte
+using UINT16 = Scalar<uint16_t>; // 2 Bytes
+using UINT32 = Scalar<uint32_t>; // 4 Bytes
+using UINT64 = Scalar<uint64_t>; // 8 Bytes
+
+using _1 = UINT8;
+using _2 = UINT16;
+using _3 = Chained<UINT16, UINT8>;
+using _4 = UINT32;
+using _8 = UINT64;
+using _16 = Repeated<_8, 2>;
+using _32 = Repeated<_8, 4>;
+using _64 = Repeated<_8, 8>;
+using _128 = Repeated<_8, 16>;
+
+} // namespace scalar
+} // namespace __llvm_libc
+
+#include <src/string/memory_utils/elements_x86.h>
+
+#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_H

diff  --git a/libc/src/string/memory_utils/elements_x86.h b/libc/src/string/memory_utils/elements_x86.h
new file mode 100644
index 0000000000000..36f40be4f6809
--- /dev/null
+++ b/libc/src/string/memory_utils/elements_x86.h
@@ -0,0 +1,166 @@
+//===-- Elementary operations for x86 -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_X86_H
+#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_X86_H
+
+#include <stddef.h> // size_t
+#include <stdint.h> // uint8_t, uint16_t, uint32_t, uint64_t
+
+#ifdef __SSE2__
+#include <immintrin.h>
+#endif //  __SSE2__
+
+#include "src/string/memory_utils/elements.h" // __llvm_libc::scalar
+
+// Fixed-size Vector Operations
+// ----------------------------
+
+namespace __llvm_libc {
+namespace x86 {
+
+#ifdef __SSE2__
+template <typename Base> struct Vector : public Base {
+  static void Copy(char *dst, const char *src) {
+    Base::Store(dst, Base::Load(src));
+  }
+
+  static bool Equals(const char *a, const char *b) {
+    return Base::NotEqualMask(Base::Load(a), Base::Load(b)) == 0;
+  }
+
+  static int ThreeWayCompare(const char *a, const char *b) {
+    const auto mask = Base::NotEqualMask(Base::Load(a), Base::Load(b));
+    if (!mask)
+      return 0;
+    return CharDiff(a, b, mask);
+  }
+
+  static void SplatSet(char *dst, const unsigned char value) {
+    Base::Store(dst, Base::GetSplattedValue(value));
+  }
+
+  static int CharDiff(const char *a, const char *b, uint64_t mask) {
+    const size_t 
diff _index = __builtin_ctzl(mask);
+    const int ca = (unsigned char)a[
diff _index];
+    const int cb = (unsigned char)b[
diff _index];
+    return ca - cb;
+  }
+};
+
+struct M128 {
+  static constexpr size_t kSize = 16;
+  using T = char __attribute__((__vector_size__(kSize)));
+  static uint16_t mask(T value) {
+    // NOLINTNEXTLINE(llvmlibc-callee-namespace)
+    return _mm_movemask_epi8(value);
+  }
+  static uint16_t NotEqualMask(T a, T b) { return mask(a != b); }
+  static T Load(const char *ptr) {
+    // NOLINTNEXTLINE(llvmlibc-callee-namespace)
+    return _mm_loadu_si128(reinterpret_cast<__m128i_u const *>(ptr));
+  }
+  static void Store(char *ptr, T value) {
+    // NOLINTNEXTLINE(llvmlibc-callee-namespace)
+    return _mm_storeu_si128(reinterpret_cast<__m128i_u *>(ptr), value);
+  }
+  static T GetSplattedValue(const char v) {
+    const T splatted = {v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v};
+    return splatted;
+  }
+};
+
+using Vector128 = Vector<M128>; // 16 Bytes
+
+#ifdef __AVX2__
+struct M256 {
+  static constexpr size_t kSize = 32;
+  using T = char __attribute__((__vector_size__(kSize)));
+  static uint32_t mask(T value) {
+    // NOLINTNEXTLINE(llvmlibc-callee-namespace)
+    return _mm256_movemask_epi8(value);
+  }
+  static uint32_t NotEqualMask(T a, T b) { return mask(a != b); }
+  static T Load(const char *ptr) {
+    // NOLINTNEXTLINE(llvmlibc-callee-namespace)
+    return _mm256_loadu_si256(reinterpret_cast<__m256i const *>(ptr));
+  }
+  static void Store(char *ptr, T value) {
+    // NOLINTNEXTLINE(llvmlibc-callee-namespace)
+    return _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), value);
+  }
+  static T GetSplattedValue(const char v) {
+    const T splatted = {v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v,
+                        v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v};
+    return splatted;
+  }
+};
+
+using Vector256 = Vector<M256>; // 32 Bytes
+
+#if defined(__AVX512F__) and defined(__AVX512BW__)
+struct M512 {
+  static constexpr size_t kSize = 64;
+  using T = char __attribute__((__vector_size__(kSize)));
+  static uint64_t NotEqualMask(T a, T b) {
+    // NOLINTNEXTLINE(llvmlibc-callee-namespace)
+    return _mm512_cmpneq_epi8_mask(a, b);
+  }
+  static T Load(const char *ptr) {
+    // NOLINTNEXTLINE(llvmlibc-callee-namespace)
+    return _mm512_loadu_epi8(ptr);
+  }
+  static void Store(char *ptr, T value) {
+    // NOLINTNEXTLINE(llvmlibc-callee-namespace)
+    return _mm512_storeu_epi8(ptr, value);
+  }
+  static T GetSplattedValue(const char v) {
+    const T splatted = {v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v,
+                        v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v,
+                        v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v,
+                        v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v};
+    return splatted;
+  }
+};
+using Vector512 = Vector<M512>;
+
+#endif // defined(__AVX512F__) and defined(__AVX512BW__)
+#endif // __AVX2__
+#endif // __SSE2__
+
+using _1 = __llvm_libc::scalar::_1;
+using _2 = __llvm_libc::scalar::_2;
+using _3 = __llvm_libc::scalar::_3;
+using _4 = __llvm_libc::scalar::_4;
+using _8 = __llvm_libc::scalar::_8;
+#if defined(__AVX512F__) && defined(__AVX512BW__)
+using _16 = __llvm_libc::x86::Vector128;
+using _32 = __llvm_libc::x86::Vector256;
+using _64 = __llvm_libc::x86::Vector512;
+using _128 = __llvm_libc::Repeated<_64, 2>;
+#elif defined(__AVX2__)
+using _16 = __llvm_libc::x86::Vector128;
+using _32 = __llvm_libc::x86::Vector256;
+using _64 = __llvm_libc::Repeated<_32, 2>;
+using _128 = __llvm_libc::Repeated<_32, 4>;
+#elif defined(__SSE2__)
+using _16 = __llvm_libc::x86::Vector128;
+using _32 = __llvm_libc::Repeated<_16, 2>;
+using _64 = __llvm_libc::Repeated<_16, 4>;
+using _128 = __llvm_libc::Repeated<_16, 8>;
+#else
+using _16 = __llvm_libc::Repeated<_8, 2>;
+using _32 = __llvm_libc::Repeated<_8, 4>;
+using _64 = __llvm_libc::Repeated<_8, 8>;
+using _128 = __llvm_libc::Repeated<_8, 16>;
+#endif
+
+} // namespace x86
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_X86_H

diff  --git a/libc/src/string/memory_utils/memcpy_utils.h b/libc/src/string/memory_utils/memcpy_utils.h
deleted file mode 100644
index 23836bbde3427..0000000000000
--- a/libc/src/string/memory_utils/memcpy_utils.h
+++ /dev/null
@@ -1,140 +0,0 @@
-//===-- Memcpy utils --------------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY_UTILS_H
-#define LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY_UTILS_H
-
-#include "src/__support/sanitizer.h"
-#include "src/string/memory_utils/utils.h"
-#include <stddef.h> // size_t
-
-// __builtin_memcpy_inline guarantees to never call external functions.
-// Unfortunately it is not widely available.
-#ifdef __clang__
-#if __has_builtin(__builtin_memcpy_inline)
-#define USE_BUILTIN_MEMCPY_INLINE
-#endif
-#elif defined(__GNUC__)
-#define USE_BUILTIN_MEMCPY
-#endif
-
-namespace __llvm_libc {
-
-// This is useful for testing.
-#if defined(LLVM_LIBC_MEMCPY_MONITOR)
-extern "C" void LLVM_LIBC_MEMCPY_MONITOR(char *__restrict,
-                                         const char *__restrict, size_t);
-#endif
-
-// Copies `kBlockSize` bytes from `src` to `dst` using a for loop.
-// This code requires the use of `-fno-buitin-memcpy` to prevent the compiler
-// from turning the for-loop back into `__builtin_memcpy`.
-template <size_t kBlockSize>
-static void ForLoopCopy(char *__restrict dst, const char *__restrict src) {
-  for (size_t i = 0; i < kBlockSize; ++i)
-    dst[i] = src[i];
-}
-
-// Copies `kBlockSize` bytes from `src` to `dst`.
-template <size_t kBlockSize>
-static void CopyBlock(char *__restrict dst, const char *__restrict src) {
-#if defined(LLVM_LIBC_MEMCPY_MONITOR)
-  LLVM_LIBC_MEMCPY_MONITOR(dst, src, kBlockSize);
-#elif LLVM_LIBC_HAVE_MEMORY_SANITIZER || LLVM_LIBC_HAVE_ADDRESS_SANITIZER
-  ForLoopCopy<kBlockSize>(dst, src);
-#elif defined(USE_BUILTIN_MEMCPY_INLINE)
-  __builtin_memcpy_inline(dst, src, kBlockSize);
-#elif defined(USE_BUILTIN_MEMCPY)
-  __builtin_memcpy(dst, src, kBlockSize);
-#else
-  ForLoopCopy<kBlockSize>(dst, src);
-#endif
-}
-
-// Copies `kBlockSize` bytes from `src + count - kBlockSize` to
-// `dst + count - kBlockSize`.
-// Precondition: `count >= kBlockSize`.
-template <size_t kBlockSize>
-static void CopyLastBlock(char *__restrict dst, const char *__restrict src,
-                          size_t count) {
-  const size_t offset = count - kBlockSize;
-  CopyBlock<kBlockSize>(dst + offset, src + offset);
-}
-
-// Copies `kBlockSize` bytes twice with an overlap between the two.
-//
-// [1234567812345678123]
-// [__XXXXXXXXXXXXXX___]
-// [__XXXXXXXX_________]
-// [________XXXXXXXX___]
-//
-// Precondition: `count >= kBlockSize && count <= kBlockSize`.
-template <size_t kBlockSize>
-static void CopyBlockOverlap(char *__restrict dst, const char *__restrict src,
-                             size_t count) {
-  CopyBlock<kBlockSize>(dst, src);
-  CopyLastBlock<kBlockSize>(dst, src, count);
-}
-
-// Copies `count` bytes by blocks of `kBlockSize` bytes.
-// Copies at the start and end of the buffer are unaligned.
-// Copies in the middle of the buffer are aligned to `kAlignment`.
-//
-// e.g. with
-// [12345678123456781234567812345678]
-// [__XXXXXXXXXXXXXXXXXXXXXXXXXXXX___]
-// [__XXXX___________________________]
-// [_____XXXXXXXX____________________]
-// [_____________XXXXXXXX____________]
-// [_____________________XXXXXXXX____]
-// [______________________XXXXXXXX___]
-//
-// Precondition: `kAlignment <= kBlockSize`
-//               `count > 2 * kBlockSize` for efficiency.
-//               `count >= kAlignment` for correctness.
-template <size_t kBlockSize, size_t kAlignment = kBlockSize>
-static void CopySrcAlignedBlocks(char *__restrict dst,
-                                 const char *__restrict src, size_t count) {
-  static_assert(is_power2(kAlignment), "kAlignment must be a power of two");
-  static_assert(is_power2(kBlockSize), "kBlockSize must be a power of two");
-  static_assert(kAlignment <= kBlockSize,
-                "kAlignment must be less or equal to block size");
-  CopyBlock<kAlignment>(dst, src); // Copy first block
-
-  // Copy aligned blocks
-  const size_t ofla = offset_from_last_aligned<kAlignment>(src);
-  const size_t limit = count + ofla - kBlockSize;
-  for (size_t offset = kAlignment; offset < limit; offset += kBlockSize)
-    CopyBlock<kBlockSize>(dst - ofla + offset,
-                          assume_aligned<kAlignment>(src - ofla + offset));
-
-  CopyLastBlock<kBlockSize>(dst, src, count); // Copy last block
-}
-
-template <size_t kBlockSize, size_t kAlignment = kBlockSize>
-static void CopyDstAlignedBlocks(char *__restrict dst,
-                                 const char *__restrict src, size_t count) {
-  static_assert(is_power2(kAlignment), "kAlignment must be a power of two");
-  static_assert(is_power2(kBlockSize), "kBlockSize must be a power of two");
-  static_assert(kAlignment <= kBlockSize,
-                "kAlignment must be less or equal to block size");
-  CopyBlock<kAlignment>(dst, src); // Copy first block
-
-  // Copy aligned blocks
-  const size_t ofla = offset_from_last_aligned<kAlignment>(dst);
-  const size_t limit = count + ofla - kBlockSize;
-  for (size_t offset = kAlignment; offset < limit; offset += kBlockSize)
-    CopyBlock<kBlockSize>(assume_aligned<kAlignment>(dst - ofla + offset),
-                          src - ofla + offset);
-
-  CopyLastBlock<kBlockSize>(dst, src, count); // Copy last block
-}
-
-} // namespace __llvm_libc
-
-#endif //  LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY_UTILS_H

diff  --git a/libc/src/string/memory_utils/memset_utils.h b/libc/src/string/memory_utils/memset_utils.h
index 7024a6c71868b..4826670444f80 100644
--- a/libc/src/string/memory_utils/memset_utils.h
+++ b/libc/src/string/memory_utils/memset_utils.h
@@ -6,70 +6,16 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_UTILS_H
-#define LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_UTILS_H
+#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_UTILS_H
+#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_UTILS_H
 
+#include "src/string/memory_utils/elements.h"
 #include "src/string/memory_utils/utils.h"
 
 #include <stddef.h> // size_t
 
 namespace __llvm_libc {
 
-// Sets `kBlockSize` bytes starting from `src` to `value`.
-template <size_t kBlockSize> static void SetBlock(char *dst, unsigned value) {
-  // Theoretically the compiler is allowed to call memset here and end up with a
-  // recursive call, practically it doesn't happen, however this should be
-  // replaced with a __builtin_memset_inline once it's available in clang.
-  __builtin_memset(dst, value, kBlockSize);
-}
-
-// Sets `kBlockSize` bytes from `src + count - kBlockSize` to `value`.
-// Precondition: `count >= kBlockSize`.
-template <size_t kBlockSize>
-static void SetLastBlock(char *dst, unsigned value, size_t count) {
-  SetBlock<kBlockSize>(dst + count - kBlockSize, value);
-}
-
-// Sets `kBlockSize` bytes twice with an overlap between the two.
-//
-// [1234567812345678123]
-// [__XXXXXXXXXXXXXX___]
-// [__XXXXXXXX_________]
-// [________XXXXXXXX___]
-//
-// Precondition: `count >= kBlockSize && count <= kBlockSize`.
-template <size_t kBlockSize>
-static void SetBlockOverlap(char *dst, unsigned value, size_t count) {
-  SetBlock<kBlockSize>(dst, value);
-  SetLastBlock<kBlockSize>(dst, value, count);
-}
-
-// Sets `count` bytes by blocks of `kBlockSize` bytes.
-// Sets at the start and end of the buffer are unaligned.
-// Sets in the middle of the buffer are aligned to `kBlockSize`.
-//
-// e.g. with
-// [12345678123456781234567812345678]
-// [__XXXXXXXXXXXXXXXXXXXXXXXXXXX___]
-// [__XXXXXXXX______________________]
-// [________XXXXXXXX________________]
-// [________________XXXXXXXX________]
-// [_____________________XXXXXXXX___]
-//
-// Precondition: `count > 2 * kBlockSize` for efficiency.
-//               `count >= kBlockSize` for correctness.
-template <size_t kBlockSize>
-static void SetAlignedBlocks(char *dst, unsigned value, size_t count) {
-  SetBlock<kBlockSize>(dst, value); // Set first block
-
-  // Set aligned blocks
-  size_t offset = kBlockSize - offset_from_last_aligned<kBlockSize>(dst);
-  for (; offset + kBlockSize < count; offset += kBlockSize)
-    SetBlock<kBlockSize>(dst + offset, value);
-
-  SetLastBlock<kBlockSize>(dst, value, count); // Set last block
-}
-
 // A general purpose implementation assuming cheap unaligned writes for sizes:
 // 1, 2, 4, 8, 16, 32 and 64 Bytes. Note that some architecture can't store 32
 // or 64 Bytes at a time, the compiler will expand them as needed.
@@ -106,26 +52,27 @@ inline static void GeneralPurposeMemset(char *dst, unsigned char value,
   if (count == 0)
     return;
   if (count == 1)
-    return SetBlock<1>(dst, value);
+    return SplatSet<scalar::_1>(dst, value);
   if (count == 2)
-    return SetBlock<2>(dst, value);
+    return SplatSet<scalar::_2>(dst, value);
   if (count == 3)
-    return SetBlock<3>(dst, value);
+    return SplatSet<scalar::_3>(dst, value);
   if (count == 4)
-    return SetBlock<4>(dst, value);
+    return SplatSet<scalar::_4>(dst, value);
   if (count <= 8)
-    return SetBlockOverlap<4>(dst, value, count);
+    return SplatSet<HeadTail<scalar::_4>>(dst, value, count);
   if (count <= 16)
-    return SetBlockOverlap<8>(dst, value, count);
+    return SplatSet<HeadTail<scalar::_8>>(dst, value, count);
   if (count <= 32)
-    return SetBlockOverlap<16>(dst, value, count);
+    return SplatSet<HeadTail<scalar::_16>>(dst, value, count);
   if (count <= 64)
-    return SetBlockOverlap<32>(dst, value, count);
+    return SplatSet<HeadTail<scalar::_32>>(dst, value, count);
   if (count <= 128)
-    return SetBlockOverlap<64>(dst, value, count);
-  return SetAlignedBlocks<32>(dst, value, count);
+    return SplatSet<HeadTail<scalar::_64>>(dst, value, count);
+  return SplatSet<Align<scalar::_32, Arg::Dst>::Then<Loop<scalar::_32>>>(
+      dst, value, count);
 }
 
 } // namespace __llvm_libc
 
-#endif //  LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_UTILS_H
+#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_UTILS_H

diff  --git a/libc/src/string/x86_64/memcpy.cpp b/libc/src/string/x86_64/memcpy.cpp
index bbd8fe9ebf7a5..7f6e5b64b3a74 100644
--- a/libc/src/string/x86_64/memcpy.cpp
+++ b/libc/src/string/x86_64/memcpy.cpp
@@ -8,7 +8,7 @@
 
 #include "src/string/memcpy.h"
 #include "src/__support/common.h"
-#include "src/string/memory_utils/memcpy_utils.h"
+#include "src/string/memory_utils/elements.h"
 
 namespace __llvm_libc {
 
@@ -29,8 +29,11 @@ constexpr size_t kRepMovsBSize =
 // Whether target supports AVX instructions.
 constexpr bool kHasAvx = LLVM_LIBC_IS_DEFINED(__AVX__);
 
-// The chunk size used for the loop copy strategy.
-constexpr size_t kLoopCopyBlockSize = kHasAvx ? 64 : 32;
+#ifdef __AVX__
+using LoopBlockSize = __llvm_libc::x86::_64;
+#else
+using LoopBlockSize = __llvm_libc::x86::_32;
+#endif
 
 static void CopyRepMovsb(char *__restrict dst, const char *__restrict src,
                          size_t count) {
@@ -61,33 +64,37 @@ static void CopyRepMovsb(char *__restrict dst, const char *__restrict src,
 //   with little change on the code side.
 static void memcpy_x86(char *__restrict dst, const char *__restrict src,
                        size_t count) {
+  // Use x86 strategies (_1, _2, _3 ...)
+  using namespace __llvm_libc::x86;
+
   if (kUseOnlyRepMovsb)
     return CopyRepMovsb(dst, src, count);
 
   if (count == 0)
     return;
   if (count == 1)
-    return CopyBlock<1>(dst, src);
+    return Copy<_1>(dst, src);
   if (count == 2)
-    return CopyBlock<2>(dst, src);
+    return Copy<_2>(dst, src);
   if (count == 3)
-    return CopyBlock<3>(dst, src);
+    return Copy<_3>(dst, src);
   if (count == 4)
-    return CopyBlock<4>(dst, src);
+    return Copy<_4>(dst, src);
   if (count < 8)
-    return CopyBlockOverlap<4>(dst, src, count);
+    return Copy<HeadTail<_4>>(dst, src, count);
   if (count < 16)
-    return CopyBlockOverlap<8>(dst, src, count);
+    return Copy<HeadTail<_8>>(dst, src, count);
   if (count < 32)
-    return CopyBlockOverlap<16>(dst, src, count);
+    return Copy<HeadTail<_16>>(dst, src, count);
   if (count < 64)
-    return CopyBlockOverlap<32>(dst, src, count);
+    return Copy<HeadTail<_32>>(dst, src, count);
   if (count < 128)
-    return CopyBlockOverlap<64>(dst, src, count);
+    return Copy<HeadTail<_64>>(dst, src, count);
   if (kHasAvx && count < 256)
-    return CopyBlockOverlap<128>(dst, src, count);
+    return Copy<HeadTail<_128>>(dst, src, count);
   if (count <= kRepMovsBSize)
-    return CopyDstAlignedBlocks<kLoopCopyBlockSize, 32>(dst, src, count);
+    return Copy<Align<_32, Arg::Dst>::Then<Loop<LoopBlockSize>>>(dst, src,
+                                                                 count);
   return CopyRepMovsb(dst, src, count);
 }
 

diff  --git a/libc/test/src/string/memory_utils/CMakeLists.txt b/libc/test/src/string/memory_utils/CMakeLists.txt
index 068a9ec18035c..a1e2c90234063 100644
--- a/libc/test/src/string/memory_utils/CMakeLists.txt
+++ b/libc/test/src/string/memory_utils/CMakeLists.txt
@@ -3,15 +3,13 @@ add_libc_unittest(
   SUITE
     libc_string_unittests
   SRCS
+    elements_test.cpp
+    memory_access_test.cpp
     utils_test.cpp
-    memcpy_utils_test.cpp
   DEPENDS
     libc.src.string.memory_utils.memory_utils
     libc.utils.CPP.standalone_cpp
-)
-
-target_compile_definitions(
-  libc.test.src.string.memory_utils.utils_test
-  PRIVATE
-  LLVM_LIBC_MEMCPY_MONITOR=memcpy_monitor
-)
+  COMPILE_OPTIONS
+    ${LIBC_COMPILE_OPTIONS_NATIVE}
+    -ffreestanding
+)
\ No newline at end of file

diff  --git a/libc/test/src/string/memory_utils/elements_test.cpp b/libc/test/src/string/memory_utils/elements_test.cpp
new file mode 100644
index 0000000000000..120cd6ff8761e
--- /dev/null
+++ b/libc/test/src/string/memory_utils/elements_test.cpp
@@ -0,0 +1,103 @@
+//===-- Unittests for memory_utils ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/string/memory_utils/elements.h"
+#include "utils/CPP/Array.h"
+#include "utils/UnitTest/Test.h"
+
+namespace __llvm_libc {
+
+// Registering Types
+using FixedSizeTypes = testing::TypeList<
+#ifdef __SSE2__
+    x86::Vector128, //
+#endif              // __SSE2__
+#ifdef __AVX2__
+    x86::Vector256, //
+#endif              // __AVX2__
+#if defined(__AVX512F__) and defined(__AVX512BW__)
+    x86::Vector512, //
+#endif              // defined(__AVX512F__) and defined(__AVX512BW__)
+    scalar::UINT8,  //
+    scalar::UINT16, //
+    scalar::UINT32, //
+    scalar::UINT64, //
+    Repeated<scalar::UINT64, 2>,                            //
+    Repeated<scalar::UINT64, 4>,                            //
+    Repeated<scalar::UINT64, 8>,                            //
+    Repeated<scalar::UINT64, 16>,                           //
+    Repeated<scalar::UINT64, 32>,                           //
+    Chained<scalar::UINT16, scalar::UINT8>,                 //
+    Chained<scalar::UINT32, scalar::UINT16, scalar::UINT8>, //
+    builtin::_1,                                            //
+    builtin::_2,                                            //
+    builtin::_3,                                            //
+    builtin::_4,                                            //
+    builtin::_8                                             //
+    >;
+
+char GetRandomChar() {
+  static constexpr const uint64_t a = 1103515245;
+  static constexpr const uint64_t c = 12345;
+  static constexpr const uint64_t m = 1ULL << 31;
+  static uint64_t seed = 123456789;
+  seed = (a * seed + c) % m;
+  return seed;
+}
+
+template <typename Element> using Buffer = cpp::Array<char, Element::kSize>;
+template <typename Element> Buffer<Element> GetRandomBuffer() {
+  Buffer<Element> buffer;
+  for (auto &current : buffer)
+    current = GetRandomChar();
+  return buffer;
+}
+
+TYPED_TEST(LlvmLibcMemoryElements, Copy, FixedSizeTypes) {
+  Buffer<ParamType> Dst;
+  const auto buffer = GetRandomBuffer<ParamType>();
+  Copy<ParamType>(Dst.data(), buffer.data());
+  for (size_t i = 0; i < ParamType::kSize; ++i)
+    EXPECT_EQ(Dst[i], buffer[i]);
+}
+
+TYPED_TEST(LlvmLibcMemoryElements, Equals, FixedSizeTypes) {
+  const auto buffer = GetRandomBuffer<ParamType>();
+  EXPECT_TRUE(Equals<ParamType>(buffer.data(), buffer.data()));
+}
+
+TYPED_TEST(LlvmLibcMemoryElements, ThreeWayCompare, FixedSizeTypes) {
+  Buffer<ParamType> initial;
+  for (auto &c : initial)
+    c = 5;
+
+  // Testing equality
+  EXPECT_EQ(ThreeWayCompare<ParamType>(initial.data(), initial.data()), 0);
+
+  // Testing all mismatching positions
+  for (size_t i = 0; i < ParamType::kSize; ++i) {
+    auto copy = initial;
+    ++copy[i]; // Copy is now lexicographycally greated than initial
+    const auto *less = initial.data();
+    const auto *greater = copy.data();
+    EXPECT_LT(ThreeWayCompare<ParamType>(less, greater), 0);
+    EXPECT_GT(ThreeWayCompare<ParamType>(greater, less), 0);
+  }
+}
+
+TYPED_TEST(LlvmLibcMemoryElements, Splat, FixedSizeTypes) {
+  Buffer<ParamType> Dst;
+  const cpp::Array<char, 3> values = {char(0x00), char(0x7F), char(0xFF)};
+  for (char value : values) {
+    SplatSet<ParamType>(Dst.data(), value);
+    for (size_t i = 0; i < ParamType::kSize; ++i)
+      EXPECT_EQ(Dst[i], value);
+  }
+}
+
+} // namespace __llvm_libc

diff  --git a/libc/test/src/string/memory_utils/memcpy_utils_test.cpp b/libc/test/src/string/memory_utils/memcpy_utils_test.cpp
deleted file mode 100644
index 37529ae3d8ada..0000000000000
--- a/libc/test/src/string/memory_utils/memcpy_utils_test.cpp
+++ /dev/null
@@ -1,336 +0,0 @@
-//===-- Unittests for memory_utils ----------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "src/string/memory_utils/memcpy_utils.h"
-#include "utils/CPP/Array.h"
-#include "utils/UnitTest/Test.h"
-
-#include <assert.h>
-#include <stdint.h> // uintptr_t
-
-#ifndef LLVM_LIBC_MEMCPY_MONITOR
-#error LLVM_LIBC_MEMCPY_MONITOR must be defined for this test.
-#endif
-
-namespace __llvm_libc {
-
-struct Buffer {
-  static constexpr size_t kMaxBuffer = 1024;
-  char buffer[kMaxBuffer + 1];
-  size_t last = 0;
-
-  void Clear() {
-    last = 0;
-    for (size_t i = 0; i < kMaxBuffer; ++i)
-      buffer[i] = '0';
-    buffer[kMaxBuffer] = '\0';
-  }
-
-  void Increment(const void *ptr) {
-    const auto offset = reinterpret_cast<uintptr_t>(ptr);
-    assert(offset < kMaxBuffer);
-    ++buffer[offset];
-    if (offset > last)
-      last = offset;
-  }
-
-  char *Finish() {
-    assert(last < kMaxBuffer);
-    buffer[last + 1] = '\0';
-    return buffer;
-  }
-};
-
-struct Trace {
-  Buffer read;
-  Buffer write;
-
-  void Add(char *__restrict dst, const char *__restrict src, size_t count) {
-    for (size_t i = 0; i < count; ++i)
-      read.Increment(src + i);
-    for (size_t i = 0; i < count; ++i)
-      write.Increment(dst + i);
-  }
-
-  void Clear() {
-    read.Clear();
-    write.Clear();
-  }
-
-  char *Read() { return read.Finish(); }
-  char *Write() { return write.Finish(); }
-};
-
-static Trace &GetTrace() {
-  static thread_local Trace events;
-  return events;
-}
-
-extern "C" void LLVM_LIBC_MEMCPY_MONITOR(char *__restrict dst,
-                                         const char *__restrict src,
-                                         size_t count) {
-  GetTrace().Add(dst, src, count);
-}
-
-char *I(uintptr_t offset) { return reinterpret_cast<char *>(offset); }
-
-TEST(LlvmLibcMemcpyUtilsTest, CopyTrivial) {
-  auto &trace = GetTrace();
-
-  trace.Clear();
-  CopyBlock<1>(I(0), I(0));
-  EXPECT_STREQ(trace.Write(), "1");
-  EXPECT_STREQ(trace.Read(), "1");
-
-  trace.Clear();
-  CopyBlock<2>(I(0), I(0));
-  EXPECT_STREQ(trace.Write(), "11");
-  EXPECT_STREQ(trace.Read(), "11");
-
-  trace.Clear();
-  CopyBlock<4>(I(0), I(0));
-  EXPECT_STREQ(trace.Write(), "1111");
-  EXPECT_STREQ(trace.Read(), "1111");
-
-  trace.Clear();
-  CopyBlock<8>(I(0), I(0));
-  EXPECT_STREQ(trace.Write(), "11111111");
-  EXPECT_STREQ(trace.Read(), "11111111");
-
-  trace.Clear();
-  CopyBlock<16>(I(0), I(0));
-  EXPECT_STREQ(trace.Write(), "1111111111111111");
-  EXPECT_STREQ(trace.Read(), "1111111111111111");
-
-  trace.Clear();
-  CopyBlock<32>(I(0), I(0));
-  EXPECT_STREQ(trace.Write(), "11111111111111111111111111111111");
-  EXPECT_STREQ(trace.Read(), "11111111111111111111111111111111");
-
-  trace.Clear();
-  CopyBlock<64>(I(0), I(0));
-  EXPECT_STREQ(
-      trace.Write(),
-      "1111111111111111111111111111111111111111111111111111111111111111");
-  EXPECT_STREQ(
-      trace.Read(),
-      "1111111111111111111111111111111111111111111111111111111111111111");
-}
-
-TEST(LlvmLibcMemcpyUtilsTest, CopyOffset) {
-  auto &trace = GetTrace();
-
-  trace.Clear();
-  CopyBlock<1>(I(3), I(1));
-  EXPECT_STREQ(trace.Write(), "0001");
-  EXPECT_STREQ(trace.Read(), "01");
-
-  trace.Clear();
-  CopyBlock<1>(I(2), I(1));
-  EXPECT_STREQ(trace.Write(), "001");
-  EXPECT_STREQ(trace.Read(), "01");
-}
-
-TEST(LlvmLibcMemcpyUtilsTest, CopyBlockOverlap) {
-  auto &trace = GetTrace();
-
-  trace.Clear();
-  CopyBlockOverlap<2>(I(0), I(0), 2);
-  EXPECT_STREQ(trace.Write(), "22");
-  EXPECT_STREQ(trace.Read(), "22");
-
-  trace.Clear();
-  CopyBlockOverlap<2>(I(0), I(0), 3);
-  EXPECT_STREQ(trace.Write(), "121");
-  EXPECT_STREQ(trace.Read(), "121");
-
-  trace.Clear();
-  CopyBlockOverlap<2>(I(0), I(0), 4);
-  EXPECT_STREQ(trace.Write(), "1111");
-  EXPECT_STREQ(trace.Read(), "1111");
-
-  trace.Clear();
-  CopyBlockOverlap<4>(I(2), I(1), 7);
-  EXPECT_STREQ(trace.Write(), "001112111");
-  EXPECT_STREQ(trace.Read(), "01112111");
-}
-
-TEST(LlvmLibcMemcpyUtilsTest, CopySrcAlignedBlocks) {
-  auto &trace = GetTrace();
-  // Source is aligned and multiple of alignment.
-  //   "1111"
-  trace.Clear();
-  CopySrcAlignedBlocks<4>(I(0), I(0), 4);
-  EXPECT_STREQ(trace.Write(), "2222");
-  EXPECT_STREQ(trace.Read(), "2222");
-
-  // Source is aligned and multiple of alignment.
-  //   "11110000"
-  // + "00001111"
-  // = "11111111"
-  trace.Clear();
-  CopySrcAlignedBlocks<4>(I(0), I(0), 8);
-  EXPECT_STREQ(trace.Write(), "11111111");
-  EXPECT_STREQ(trace.Read(), "11111111");
-
-  // Source is aligned already overlap at end.
-  //   "1111000000000"
-  // + "0000111100000"
-  // + "0000000011110"
-  // + "0000000001111"
-  // = "1111111112221"
-  trace.Clear();
-  CopySrcAlignedBlocks<4>(I(0), I(0), 13);
-  EXPECT_STREQ(trace.Write(), "1111111112221");
-  EXPECT_STREQ(trace.Read(), "1111111112221");
-
-  // Misaligned source.
-  //   "01111000000000"
-  // + "00001111000000"
-  // + "00000000111100"
-  // + "00000000001111"
-  // = "01112111112211"
-  trace.Clear();
-  CopySrcAlignedBlocks<4>(I(0), I(1), 13);
-  EXPECT_STREQ(trace.Write(), "1112111112211");
-  EXPECT_STREQ(trace.Read(), "01112111112211");
-
-  // Misaligned source aligned at end.
-  //   "011110000000"
-  // + "000011110000"
-  // + "000000001111"
-  // = "011121111111"
-  trace.Clear();
-  CopySrcAlignedBlocks<4>(I(0), I(1), 11);
-  EXPECT_STREQ(trace.Write(), "11121111111");
-  EXPECT_STREQ(trace.Read(), "011121111111");
-}
-
-TEST(LlvmLibcMemcpyUtilsTest, CopyDstAlignedBlocks) {
-  auto &trace = GetTrace();
-  // Destination is aligned and multiple of alignment.
-  //   "1111"
-  trace.Clear();
-  CopyDstAlignedBlocks<4>(I(0), I(0), 4);
-  EXPECT_STREQ(trace.Write(), "2222");
-  EXPECT_STREQ(trace.Read(), "2222");
-
-  // Destination is aligned and multiple of alignment.
-  //   "11110000"
-  // + "00001111"
-  // = "11111111"
-  trace.Clear();
-  CopyDstAlignedBlocks<4>(I(0), I(0), 8);
-  EXPECT_STREQ(trace.Write(), "11111111");
-  EXPECT_STREQ(trace.Read(), "11111111");
-
-  // Destination is aligned already overlap at end.
-  //   "1111000000000"
-  // + "0000111100000"
-  // + "0000000011110"
-  // + "0000000001111"
-  // = "1111111112221"
-  trace.Clear();
-  CopyDstAlignedBlocks<4>(I(0), I(0), 13);
-  EXPECT_STREQ(trace.Write(), "1111111112221");
-  EXPECT_STREQ(trace.Read(), "1111111112221");
-
-  // Misaligned destination.
-  //   "01111000000000"
-  // + "00001111000000"
-  // + "00000000111100"
-  // + "00000000001111"
-  // = "01112111112211"
-  trace.Clear();
-  CopyDstAlignedBlocks<4>(I(1), I(0), 13);
-  EXPECT_STREQ(trace.Write(), "01112111112211");
-  EXPECT_STREQ(trace.Read(), "1112111112211");
-
-  // Misaligned destination aligned at end.
-  //   "011110000000"
-  // + "000011110000"
-  // + "000000001111"
-  // = "011121111111"
-  trace.Clear();
-  CopyDstAlignedBlocks<4>(I(1), I(0), 11);
-  EXPECT_STREQ(trace.Write(), "011121111111");
-  EXPECT_STREQ(trace.Read(), "11121111111");
-}
-
-TEST(LlvmLibcMemcpyUtilsTest, CopyAlignedBlocksWithAlignment) {
-  auto &trace = GetTrace();
-  // Source is aligned and multiple of alignment.
-  //   "11111111"
-  trace.Clear();
-  CopySrcAlignedBlocks<8, 4>(I(0), I(0), 8);
-  EXPECT_STREQ(trace.Write(), "22221111");
-  EXPECT_STREQ(trace.Read(), "22221111");
-
-  // Destination is aligned and multiple of alignment.
-  //   "11111111"
-  trace.Clear();
-  CopyDstAlignedBlocks<8, 4>(I(0), I(0), 8);
-  EXPECT_STREQ(trace.Write(), "22221111");
-  EXPECT_STREQ(trace.Read(), "22221111");
-
-  // Source is aligned and multiple of alignment.
-  //   "111111111"
-  trace.Clear();
-  CopySrcAlignedBlocks<8, 4>(I(0), I(0), 9);
-  EXPECT_STREQ(trace.Write(), "122211111");
-  EXPECT_STREQ(trace.Read(), "122211111");
-
-  // Destination is aligned and multiple of alignment.
-  //   "111111111"
-  trace.Clear();
-  CopyDstAlignedBlocks<8, 4>(I(0), I(0), 9);
-  EXPECT_STREQ(trace.Write(), "122211111");
-  EXPECT_STREQ(trace.Read(), "122211111");
-}
-
-TEST(LlvmLibcMemcpyUtilsTest, CopyAlignedBlocksMaxReloads) {
-  auto &trace = GetTrace();
-  for (size_t alignment = 0; alignment < 32; ++alignment) {
-    for (size_t count = 64; count < 768; ++count) {
-      trace.Clear();
-      // We should never reload more than twice when copying from count = 2x32.
-      CopySrcAlignedBlocks<32>(I(alignment), I(0), count);
-      const char *const written = trace.Write();
-      // First bytes are untouched.
-      for (size_t i = 0; i < alignment; ++i)
-        EXPECT_EQ(written[i], '0');
-      // Next bytes are loaded once or twice but no more.
-      for (size_t i = alignment; i < count; ++i) {
-        EXPECT_GE(written[i], '1');
-        EXPECT_LE(written[i], '2');
-      }
-    }
-  }
-}
-
-TEST(LlvmLibcMemcpyUtilsTest, CopyAlignedBlocksWithAlignmentMaxReloads) {
-  auto &trace = GetTrace();
-  for (size_t alignment = 0; alignment < 32; ++alignment) {
-    for (size_t count = 64; count < 768; ++count) {
-      trace.Clear();
-      // We should never reload more than twice when copying from count = 2x32.
-      CopySrcAlignedBlocks<32, 16>(I(alignment), I(0), count);
-      const char *const written = trace.Write();
-      // First bytes are untouched.
-      for (size_t i = 0; i < alignment; ++i)
-        EXPECT_EQ(written[i], '0');
-      // Next bytes are loaded once or twice but no more.
-      for (size_t i = alignment; i < count; ++i) {
-        EXPECT_GE(written[i], '1');
-        EXPECT_LE(written[i], '2');
-      }
-    }
-  }
-}
-
-} // namespace __llvm_libc

diff  --git a/libc/test/src/string/memory_utils/memory_access_test.cpp b/libc/test/src/string/memory_utils/memory_access_test.cpp
new file mode 100644
index 0000000000000..4a0bff27cd5f9
--- /dev/null
+++ b/libc/test/src/string/memory_utils/memory_access_test.cpp
@@ -0,0 +1,228 @@
+//===-- Unittests for memory_utils ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#define LLVM_LIBC_UNITTEST_OBSERVE 1
+
+#include "src/string/memory_utils/elements.h"
+#include "utils/CPP/Array.h"
+#include "utils/CPP/ArrayRef.h"
+#include "utils/UnitTest/Test.h"
+
+#include <stdio.h>
+#include <string.h>
+
+namespace __llvm_libc {
+
+static constexpr const size_t kMaxBuffer = 32;
+
+struct BufferAccess : cpp::Array<char, kMaxBuffer + 1> {
+  BufferAccess() { Reset(); }
+  void Reset() {
+    for (auto &value : *this)
+      value = '0';
+    this->operator[](kMaxBuffer) = '\0';
+  }
+  void Touch(ptr
diff _t offset, size_t size) {
+    if (offset < 0)
+      return;
+    for (size_t i = 0; i < size; ++i)
+      ++(*this)[offset + i];
+  }
+  operator const char *() const { return this->data(); }
+};
+
+struct Buffer {
+  ptr
diff _t Offset(const char *ptr) const {
+    const bool contained = ptr >= data.begin() && ptr < data.end();
+    return contained ? ptr - data.begin() : -1;
+  }
+  void Reset() {
+    reads.Reset();
+    writes.Reset();
+  }
+  cpp::Array<char, kMaxBuffer> data;
+  BufferAccess __attribute__((aligned(64))) reads;
+  BufferAccess __attribute__((aligned(64))) writes;
+};
+
+struct MemoryAccessObserver {
+  void ObserveRead(const char *ptr, size_t size) {
+    Buffer1.reads.Touch(Buffer1.Offset(ptr), size);
+    Buffer2.reads.Touch(Buffer2.Offset(ptr), size);
+  }
+
+  void ObserveWrite(const char *ptr, size_t size) {
+    Buffer1.writes.Touch(Buffer1.Offset(ptr), size);
+    Buffer2.writes.Touch(Buffer2.Offset(ptr), size);
+  }
+
+  void Reset() {
+    Buffer1.Reset();
+    Buffer2.Reset();
+  }
+
+  Buffer Buffer1;
+  Buffer Buffer2;
+};
+
+MemoryAccessObserver Observer;
+
+template <size_t Size> struct TestingElement {
+  static constexpr size_t kSize = Size;
+
+  static void Copy(char *__restrict dst, const char *__restrict src) {
+    Observer.ObserveRead(src, kSize);
+    Observer.ObserveWrite(dst, kSize);
+  }
+
+  static bool Equals(const char *lhs, const char *rhs) {
+    Observer.ObserveRead(lhs, kSize);
+    Observer.ObserveRead(rhs, kSize);
+    return true;
+  }
+
+  static int ThreeWayCompare(const char *lhs, const char *rhs) {
+    Observer.ObserveRead(lhs, kSize);
+    Observer.ObserveRead(rhs, kSize);
+    return 0;
+  }
+
+  static void SplatSet(char *dst, const unsigned char value) {
+    Observer.ObserveWrite(dst, kSize);
+  }
+};
+
+using Types = testing::TypeList<
+    TestingElement<1>,                                               // 1 Byte
+    TestingElement<2>,                                               // 2 Bytes
+    TestingElement<4>,                                               // 4 Bytes
+    Repeated<TestingElement<2>, 3>,                                  // 6 Bytes
+    Chained<TestingElement<4>, TestingElement<2>, TestingElement<1>> // 7 Bytes
+    >;
+
+struct LlvmLibcTestAccessBase : public testing::Test {
+
+  template <typename HigherOrder, size_t Size, size_t Offset = 0>
+  void checkOperations(const BufferAccess &expected) {
+    static const BufferAccess untouched;
+
+    Observer.Reset();
+    HigherOrder::Copy(dst_ptr() + Offset, src_ptr() + Offset, Size);
+    ASSERT_STREQ(src().writes, untouched);
+    ASSERT_STREQ(dst().reads, untouched);
+    ASSERT_STREQ(src().reads, expected);
+    ASSERT_STREQ(dst().writes, expected);
+    Observer.Reset();
+    HigherOrder::Equals(lhs_ptr() + Offset, rhs_ptr() + Offset, Size);
+    ASSERT_STREQ(lhs().writes, untouched);
+    ASSERT_STREQ(rhs().writes, untouched);
+    ASSERT_STREQ(lhs().reads, expected);
+    ASSERT_STREQ(rhs().reads, expected);
+    Observer.Reset();
+    HigherOrder::ThreeWayCompare(lhs_ptr() + Offset, rhs_ptr() + Offset, Size);
+    ASSERT_STREQ(lhs().writes, untouched);
+    ASSERT_STREQ(rhs().writes, untouched);
+    ASSERT_STREQ(lhs().reads, expected);
+    ASSERT_STREQ(rhs().reads, expected);
+    Observer.Reset();
+    HigherOrder::SplatSet(dst_ptr() + Offset, 5, Size);
+    ASSERT_STREQ(src().reads, untouched);
+    ASSERT_STREQ(src().writes, untouched);
+    ASSERT_STREQ(dst().reads, untouched);
+    ASSERT_STREQ(dst().writes, expected);
+  }
+
+  void checkMaxAccess(const BufferAccess &expected, int max) {
+    for (size_t i = 0; i < kMaxBuffer; ++i) {
+      int value = (int)expected[i] - '0';
+      ASSERT_GE(value, 0);
+      ASSERT_LE(value, max);
+    }
+  }
+
+private:
+  const Buffer &lhs() const { return Observer.Buffer1; }
+  const Buffer &rhs() const { return Observer.Buffer2; }
+  const Buffer &src() const { return Observer.Buffer2; }
+  const Buffer &dst() const { return Observer.Buffer1; }
+  Buffer &dst() { return Observer.Buffer1; }
+
+  char *dst_ptr() { return dst().data.begin(); }
+  const char *src_ptr() { return src().data.begin(); }
+  const char *lhs_ptr() { return lhs().data.begin(); }
+  const char *rhs_ptr() { return rhs().data.begin(); }
+};
+
+template <typename ParamType>
+struct LlvmLibcTestAccessTail : public LlvmLibcTestAccessBase {
+
+  void TearDown() override {
+    static constexpr size_t Size = 10;
+
+    BufferAccess expected;
+    expected.Touch(Size - ParamType::kSize, ParamType::kSize);
+
+    checkMaxAccess(expected, 1);
+    checkOperations<Tail<ParamType>, Size>(expected);
+  }
+};
+TYPED_TEST_F(LlvmLibcTestAccessTail, Operations, Types) {}
+
+template <typename ParamType>
+struct LlvmLibcTestAccessHeadTail : public LlvmLibcTestAccessBase {
+  void TearDown() override {
+    static constexpr size_t Size = 10;
+
+    BufferAccess expected;
+    expected.Touch(0, ParamType::kSize);
+    expected.Touch(Size - ParamType::kSize, ParamType::kSize);
+
+    checkMaxAccess(expected, 2);
+    checkOperations<HeadTail<ParamType>, Size>(expected);
+  }
+};
+TYPED_TEST_F(LlvmLibcTestAccessHeadTail, Operations, Types) {}
+
+template <typename ParamType>
+struct LlvmLibcTestAccessLoop : public LlvmLibcTestAccessBase {
+  void TearDown() override {
+    static constexpr size_t Size = 20;
+
+    BufferAccess expected;
+    for (size_t i = 0; i < Size - ParamType::kSize; i += ParamType::kSize)
+      expected.Touch(i, ParamType::kSize);
+    expected.Touch(Size - ParamType::kSize, ParamType::kSize);
+
+    checkMaxAccess(expected, 2);
+    checkOperations<Loop<ParamType>, Size>(expected);
+  }
+};
+TYPED_TEST_F(LlvmLibcTestAccessLoop, Operations, Types) {}
+
+template <typename ParamType>
+struct LlvmLibcTestAccessAlignedAccess : public LlvmLibcTestAccessBase {
+  void TearDown() override {
+    static constexpr size_t Size = 10;
+    static constexpr size_t Offset = 2;
+    using AlignmentT = TestingElement<4>;
+
+    BufferAccess expected;
+    expected.Touch(Offset, AlignmentT::kSize);
+    expected.Touch(AlignmentT::kSize, ParamType::kSize);
+    expected.Touch(Offset + Size - ParamType::kSize, ParamType::kSize);
+
+    checkMaxAccess(expected, 3);
+    checkOperations<Align<AlignmentT, Arg::_1>::Then<HeadTail<ParamType>>, Size,
+                    Offset>(expected);
+    checkOperations<Align<AlignmentT, Arg::_2>::Then<HeadTail<ParamType>>, Size,
+                    Offset>(expected);
+  }
+};
+TYPED_TEST_F(LlvmLibcTestAccessAlignedAccess, Operations, Types) {}
+
+} // namespace __llvm_libc


        


More information about the libc-commits mailing list