[libc-commits] [libc] c11032a - Revert "[libc] Add a set of elementary operations"

Guillaume Chatelet via libc-commits libc-commits at lists.llvm.org
Tue Jun 15 01:02:15 PDT 2021


Author: Guillaume Chatelet
Date: 2021-06-15T08:01:59Z
New Revision: c11032ad9a6c4523d0d9d591ebce82c87a9fb3a0

URL: https://github.com/llvm/llvm-project/commit/c11032ad9a6c4523d0d9d591ebce82c87a9fb3a0
DIFF: https://github.com/llvm/llvm-project/commit/c11032ad9a6c4523d0d9d591ebce82c87a9fb3a0.diff

LOG: Revert "[libc] Add a set of elementary operations"

This reverts commit 454d92ac3b3b13f5c8b3f57e03b2d93f0cf60738.

Added: 
    libc/src/string/memory_utils/memcpy_utils.h
    libc/test/src/string/memory_utils/memcpy_utils_test.cpp

Modified: 
    libc/src/string/CMakeLists.txt
    libc/src/string/aarch64/memcpy.cpp
    libc/src/string/memcpy.cpp
    libc/src/string/memory_utils/CMakeLists.txt
    libc/src/string/memory_utils/memset_utils.h
    libc/src/string/x86_64/memcpy.cpp
    libc/test/src/string/memory_utils/CMakeLists.txt

Removed: 
    libc/src/string/memory_utils/elements.h
    libc/src/string/memory_utils/elements_x86.h
    libc/test/src/string/memory_utils/elements_test.cpp
    libc/test/src/string/memory_utils/memory_access_test.cpp


################################################################################
diff  --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt
index 4dd8ee0bf4a2c..f7a0406127143 100644
--- a/libc/src/string/CMakeLists.txt
+++ b/libc/src/string/CMakeLists.txt
@@ -194,7 +194,7 @@ function(add_implementation name impl_name)
     SRCS ${ADD_IMPL_SRCS}
     HDRS ${ADD_IMPL_HDRS}
     DEPENDS ${ADD_IMPL_DEPENDS}
-    COMPILE_OPTIONS ${ADD_IMPL_COMPILE_OPTIONS} "SHELL:-mllvm -combiner-global-alias-analysis"
+    COMPILE_OPTIONS ${ADD_IMPL_COMPILE_OPTIONS}
   )
   get_fq_target_name(${impl_name} fq_target_name)
   set_target_properties(${fq_target_name} PROPERTIES REQUIRE_CPU_FEATURES "${ADD_IMPL_REQUIRE}")

diff  --git a/libc/src/string/aarch64/memcpy.cpp b/libc/src/string/aarch64/memcpy.cpp
index 1a1fbbc026a73..78988ec37afe6 100644
--- a/libc/src/string/aarch64/memcpy.cpp
+++ b/libc/src/string/aarch64/memcpy.cpp
@@ -8,19 +8,10 @@
 
 #include "src/string/memcpy.h"
 #include "src/__support/common.h"
-#include "src/string/memory_utils/elements.h"
+#include "src/string/memory_utils/memcpy_utils.h"
 
 namespace __llvm_libc {
 
-using _1 = scalar::UINT8;
-using _2 = scalar::UINT16;
-using _3 = Chained<scalar::UINT16, scalar::UINT8>;
-using _4 = scalar::UINT32;
-using _8 = scalar::UINT64;
-using _16 = Repeated<scalar::UINT64, 2>;
-using _32 = Repeated<scalar::UINT64, 4>;
-using _64 = Repeated<scalar::UINT64, 8>;
-
 // Design rationale
 // ================
 //
@@ -46,24 +37,24 @@ static void memcpy_aarch64(char *__restrict dst, const char *__restrict src,
   if (count == 0)
     return;
   if (count == 1)
-    return Copy<_1>(dst, src);
+    return CopyBlock<1>(dst, src);
   if (count == 2)
-    return Copy<_2>(dst, src);
+    return CopyBlock<2>(dst, src);
   if (count == 3)
-    return Copy<_3>(dst, src);
+    return CopyBlock<3>(dst, src);
   if (count == 4)
-    return Copy<_4>(dst, src);
+    return CopyBlock<4>(dst, src);
   if (count < 8)
-    return Copy<HeadTail<_4>>(dst, src, count);
+    return CopyBlockOverlap<4>(dst, src, count);
   if (count < 16)
-    return Copy<HeadTail<_8>>(dst, src, count);
+    return CopyBlockOverlap<8>(dst, src, count);
   if (count < 32)
-    return Copy<HeadTail<_16>>(dst, src, count);
+    return CopyBlockOverlap<16>(dst, src, count);
   if (count < 64)
-    return Copy<HeadTail<_32>>(dst, src, count);
+    return CopyBlockOverlap<32>(dst, src, count);
   if (count < 128)
-    return Copy<HeadTail<_64>>(dst, src, count);
-  return Copy<Align<_16, Arg::Src>::Then<Loop<_64>>>(dst, src, count);
+    return CopyBlockOverlap<64>(dst, src, count);
+  return CopySrcAlignedBlocks<64, 16>(dst, src, count);
 }
 
 LLVM_LIBC_FUNCTION(void *, memcpy,

diff  --git a/libc/src/string/memcpy.cpp b/libc/src/string/memcpy.cpp
index 5e70e00db1b91..e050d7f4ffa2d 100644
--- a/libc/src/string/memcpy.cpp
+++ b/libc/src/string/memcpy.cpp
@@ -8,7 +8,7 @@
 
 #include "src/string/memcpy.h"
 #include "src/__support/common.h"
-#include "src/string/memory_utils/elements.h"
+#include "src/string/memory_utils/memcpy_utils.h"
 
 namespace __llvm_libc {
 
@@ -32,30 +32,27 @@ namespace __llvm_libc {
 //   with little change on the code side.
 static void memcpy_impl(char *__restrict dst, const char *__restrict src,
                         size_t count) {
-  // Use scalar strategies (_1, _2, _3 ...)
-  using namespace __llvm_libc::scalar;
-
   if (count == 0)
     return;
   if (count == 1)
-    return Copy<_1>(dst, src);
+    return CopyBlock<1>(dst, src);
   if (count == 2)
-    return Copy<_2>(dst, src);
+    return CopyBlock<2>(dst, src);
   if (count == 3)
-    return Copy<_3>(dst, src);
+    return CopyBlock<3>(dst, src);
   if (count == 4)
-    return Copy<_4>(dst, src);
+    return CopyBlock<4>(dst, src);
   if (count < 8)
-    return Copy<HeadTail<_4>>(dst, src, count);
+    return CopyBlockOverlap<4>(dst, src, count);
   if (count < 16)
-    return Copy<HeadTail<_8>>(dst, src, count);
+    return CopyBlockOverlap<8>(dst, src, count);
   if (count < 32)
-    return Copy<HeadTail<_16>>(dst, src, count);
+    return CopyBlockOverlap<16>(dst, src, count);
   if (count < 64)
-    return Copy<HeadTail<_32>>(dst, src, count);
+    return CopyBlockOverlap<32>(dst, src, count);
   if (count < 128)
-    return Copy<HeadTail<_64>>(dst, src, count);
-  return Copy<Align<_32, Arg::Src>::Then<Loop<_32>>>(dst, src, count);
+    return CopyBlockOverlap<64>(dst, src, count);
+  return CopySrcAlignedBlocks<32>(dst, src, count);
 }
 
 LLVM_LIBC_FUNCTION(void *, memcpy,

diff  --git a/libc/src/string/memory_utils/CMakeLists.txt b/libc/src/string/memory_utils/CMakeLists.txt
index 4a550e544dac8..327031ae67f8d 100644
--- a/libc/src/string/memory_utils/CMakeLists.txt
+++ b/libc/src/string/memory_utils/CMakeLists.txt
@@ -2,5 +2,6 @@ add_header_library(
   memory_utils
   HDRS
     utils.h
-    elements.h
+    memcpy_utils.h
+    memset_utils.h
 )

diff  --git a/libc/src/string/memory_utils/elements.h b/libc/src/string/memory_utils/elements.h
deleted file mode 100644
index 3225317e6aa42..0000000000000
--- a/libc/src/string/memory_utils/elements.h
+++ /dev/null
@@ -1,507 +0,0 @@
-//===-- Elementary operations to compose memory primitives ----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_H
-#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_H
-
-#include <stddef.h> // size_t
-#include <stdint.h> // uint8_t, uint16_t, uint32_t, uint64_t
-
-#include "src/__support/endian.h"
-#include "src/string/memory_utils/utils.h"
-
-namespace __llvm_libc {
-
-// Elementary Operations
-// --------------------------------
-// We define abstract elementary operations acting on fixed chunks of memory.
-// These are low level building blocks that are meant to be assembled to compose
-// higher order abstractions. Each function is defined twice: once with
-// fixed-size operations, and once with runtime-size operations.
-
-// Fixed-size copies from 'src' to 'dst'.
-template <typename Element>
-void Copy(char *__restrict dst, const char *__restrict src) {
-  Element::Copy(dst, src);
-}
-// Runtime-size copies from 'src' to 'dst'.
-template <typename Element>
-void Copy(char *__restrict dst, const char *__restrict src, size_t size) {
-  Element::Copy(dst, src, size);
-}
-
-// Fixed-size equality between 'lhs' and 'rhs'.
-template <typename Element> bool Equals(const char *lhs, const char *rhs) {
-  return Element::Equals(lhs, rhs);
-}
-// Runtime-size equality between 'lhs' and 'rhs'.
-template <typename Element>
-bool Equals(const char *lhs, const char *rhs, size_t size) {
-  return Element::Equals(lhs, rhs, size);
-}
-
-// Fixed-size three-way comparison between 'lhs' and 'rhs'.
-template <typename Element>
-int ThreeWayCompare(const char *lhs, const char *rhs) {
-  return Element::ThreeWayCompare(lhs, rhs);
-}
-// Runtime-size three-way comparison between 'lhs' and 'rhs'.
-template <typename Element>
-int ThreeWayCompare(const char *lhs, const char *rhs, size_t size) {
-  return Element::ThreeWayCompare(lhs, rhs, size);
-}
-
-// Fixed-size initialization.
-template <typename Element>
-void SplatSet(char *dst, const unsigned char value) {
-  Element::SplatSet(dst, value);
-}
-// Runtime-size initialization.
-template <typename Element>
-void SplatSet(char *dst, const unsigned char value, size_t size) {
-  Element::SplatSet(dst, value, size);
-}
-
-// Fixed-size Higher-Order Operations
-// ----------------------------------
-// - Repeated<Type, ElementCount>: Repeat the operation several times in a row.
-// - Chained<Types...>: Chain the operation of several types.
-
-// Repeat the operation several times in a row.
-template <typename Element, size_t ElementCount> struct Repeated {
-  static constexpr size_t kSize = ElementCount * Element::kSize;
-
-  static void Copy(char *__restrict dst, const char *__restrict src) {
-    for (size_t i = 0; i < ElementCount; ++i) {
-      const size_t offset = i * Element::kSize;
-      Element::Copy(dst + offset, src + offset);
-    }
-  }
-
-  static bool Equals(const char *lhs, const char *rhs) {
-    for (size_t i = 0; i < ElementCount; ++i) {
-      const size_t offset = i * Element::kSize;
-      if (!Element::Equals(lhs + offset, rhs + offset))
-        return false;
-    }
-    return true;
-  }
-
-  static int ThreeWayCompare(const char *lhs, const char *rhs) {
-    for (size_t i = 0; i < ElementCount; ++i) {
-      const size_t offset = i * Element::kSize;
-      // We make the assumption that 'Equals' si cheaper than 'ThreeWayCompare'.
-      if (Element::Equals(lhs + offset, rhs + offset))
-        continue;
-      return Element::ThreeWayCompare(lhs + offset, rhs + offset);
-    }
-    return 0;
-  }
-
-  static void SplatSet(char *dst, const unsigned char value) {
-    for (size_t i = 0; i < ElementCount; ++i) {
-      const size_t offset = i * Element::kSize;
-      Element::SplatSet(dst + offset, value);
-    }
-  }
-};
-
-// Chain the operation of several types.
-// For instance, to handle a 3 bytes operation, one can use:
-// Chained<UINT16, UINT8>::Operation();
-template <typename... Types> struct Chained;
-
-template <typename Head, typename... Tail> struct Chained<Head, Tail...> {
-  static constexpr size_t kSize = Head::kSize + Chained<Tail...>::kSize;
-
-  static void Copy(char *__restrict dst, const char *__restrict src) {
-    Chained<Tail...>::Copy(dst + Head::kSize, src + Head::kSize);
-    __llvm_libc::Copy<Head>(dst, src);
-  }
-
-  static bool Equals(const char *lhs, const char *rhs) {
-    if (!__llvm_libc::Equals<Head>(lhs, rhs))
-      return false;
-    return Chained<Tail...>::Equals(lhs + Head::kSize, rhs + Head::kSize);
-  }
-
-  static int ThreeWayCompare(const char *lhs, const char *rhs) {
-    if (__llvm_libc::Equals<Head>(lhs, rhs))
-      return Chained<Tail...>::ThreeWayCompare(lhs + Head::kSize,
-                                               rhs + Head::kSize);
-    return __llvm_libc::ThreeWayCompare<Head>(lhs, rhs);
-  }
-
-  static void SplatSet(char *dst, const unsigned char value) {
-    Chained<Tail...>::SplatSet(dst + Head::kSize, value);
-    __llvm_libc::SplatSet<Head>(dst, value);
-  }
-};
-
-template <> struct Chained<> {
-  static constexpr size_t kSize = 0;
-  static void Copy(char *__restrict dst, const char *__restrict src) {}
-  static bool Equals(const char *lhs, const char *rhs) { return true; }
-  static int ThreeWayCompare(const char *lhs, const char *rhs) { return 0; }
-  static void SplatSet(char *dst, const unsigned char value) {}
-};
-
-// Runtime-size Higher-Order Operations
-// ------------------------------------
-// - Tail<T>: Perform the operation on the last 'T::kSize' bytes of the buffer.
-// - HeadTail<T>: Perform the operation on the first and last 'T::kSize' bytes
-//   of the buffer.
-// - Loop<T>: Perform a loop of fixed-sized operations.
-
-// Perform the operation on the last 'T::kSize' bytes of the buffer.
-//
-// e.g. with
-// [1234567812345678123]
-// [__XXXXXXXXXXXXXX___]
-// [________XXXXXXXX___]
-//
-// Precondition: `size >= T::kSize`.
-template <typename T> struct Tail {
-  static void Copy(char *__restrict dst, const char *__restrict src,
-                   size_t size) {
-    return T::Copy(dst + offset(size), src + offset(size));
-  }
-
-  static bool Equals(const char *lhs, const char *rhs, size_t size) {
-    return T::Equals(lhs + offset(size), rhs + offset(size));
-  }
-
-  static int ThreeWayCompare(const char *lhs, const char *rhs, size_t size) {
-    return T::ThreeWayCompare(lhs + offset(size), rhs + offset(size));
-  }
-
-  static void SplatSet(char *dst, const unsigned char value, size_t size) {
-    return T::SplatSet(dst + offset(size), value);
-  }
-
-  static size_t offset(size_t size) { return size - T::kSize; }
-};
-
-// Perform the operation on the first and last 'T::kSize' bytes of the buffer.
-// This is useful for overlapping operations.
-//
-// e.g. with
-// [1234567812345678123]
-// [__XXXXXXXXXXXXXX___]
-// [__XXXXXXXX_________]
-// [________XXXXXXXX___]
-//
-// Precondition: `size >= T::kSize && size <= 2 x T::kSize`.
-template <typename T> struct HeadTail {
-  static void Copy(char *__restrict dst, const char *__restrict src,
-                   size_t size) {
-    T::Copy(dst, src);
-    Tail<T>::Copy(dst, src, size);
-  }
-
-  static bool Equals(const char *lhs, const char *rhs, size_t size) {
-    if (!T::Equals(lhs, rhs))
-      return false;
-    return Tail<T>::Equals(lhs, rhs, size);
-  }
-
-  static int ThreeWayCompare(const char *lhs, const char *rhs, size_t size) {
-    if (const int result = T::ThreeWayCompare(lhs, rhs))
-      return result;
-    return Tail<T>::ThreeWayCompare(lhs, rhs, size);
-  }
-
-  static void SplatSet(char *dst, const unsigned char value, size_t size) {
-    T::SplatSet(dst, value);
-    Tail<T>::SplatSet(dst, value, size);
-  }
-};
-
-// Simple loop ending with a Tail operation.
-//
-// e.g. with
-// [12345678123456781234567812345678]
-// [__XXXXXXXXXXXXXXXXXXXXXXXXXXXX___]
-// [__XXXXXXXX_______________________]
-// [__________XXXXXXXX_______________]
-// [__________________XXXXXXXX_______]
-// [______________________XXXXXXXX___]
-//
-// Precondition:
-// - size >= T::kSize
-template <typename T> struct Loop {
-  static void Copy(char *__restrict dst, const char *__restrict src,
-                   size_t size) {
-    for (size_t offset = 0; offset < size - T::kSize; offset += T::kSize)
-      T::Copy(dst + offset, src + offset);
-    Tail<T>::Copy(dst, src, size);
-  }
-
-  static bool Equals(const char *lhs, const char *rhs, size_t size) {
-    for (size_t offset = 0; offset < size - T::kSize; offset += T::kSize)
-      if (!T::Equals(lhs + offset, rhs + offset))
-        return false;
-    return Tail<T>::Equals(lhs, rhs, size);
-  }
-
-  static int ThreeWayCompare(const char *lhs, const char *rhs, size_t size) {
-    for (size_t offset = 0; offset < size - T::kSize; offset += T::kSize)
-      if (const int result = T::ThreeWayCompare(lhs + offset, rhs + offset))
-        return result;
-    return Tail<T>::ThreeWayCompare(lhs, rhs, size);
-  }
-
-  static void SplatSet(char *dst, const unsigned char value, size_t size) {
-    for (size_t offset = 0; offset < size - T::kSize; offset += T::kSize)
-      T::SplatSet(dst + offset, value);
-    Tail<T>::SplatSet(dst, value, size);
-  }
-};
-
-enum class Arg { _1, _2, Dst = _1, Src = _2, Lhs = _1, Rhs = _2 };
-
-namespace internal {
-
-// Provides a specialized Bump function that adjusts pointers and size so first
-// argument (resp. second argument) gets aligned to Alignment.
-// We make sure the compiler knows about the adjusted pointer alignment.
-template <Arg arg, size_t Alignment> struct AlignHelper {};
-
-template <size_t Alignment> struct AlignHelper<Arg::_1, Alignment> {
-  template <typename T1, typename T2>
-  static void Bump(T1 *__restrict &p1ref, T2 *__restrict &p2ref, size_t &size) {
-    const intptr_t offset = offset_to_next_aligned<Alignment>(p1ref);
-    p1ref += offset;
-    p2ref += offset;
-    size -= offset;
-    p1ref = assume_aligned<Alignment>(p1ref);
-  }
-};
-
-template <size_t Alignment> struct AlignHelper<Arg::_2, Alignment> {
-  template <typename T1, typename T2>
-  static void Bump(T1 *__restrict &p1ref, T2 *__restrict &p2ref, size_t &size) {
-    const intptr_t offset = offset_to_next_aligned<Alignment>(p2ref);
-    p1ref += offset;
-    p2ref += offset;
-    size -= offset;
-    p2ref = assume_aligned<Alignment>(p2ref);
-  }
-};
-
-} // namespace internal
-
-// An alignment operation that:
-// - executes the 'AlignmentT' operation
-// - bumps 'dst' or 'src' (resp. 'lhs' or 'rhs') pointers so that the selected
-//   pointer gets aligned, size is decreased accordingly.
-// - calls the 'NextT' operation.
-//
-// e.g. A 16-byte Destination Aligned 32-byte Loop Copy can be written as:
-// Copy<Align<_16, Arg::Dst>::Then<Loop<_32>>>(dst, src, count);
-template <typename AlignmentT, Arg AlignOn> struct Align {
-private:
-  static constexpr size_t Alignment = AlignmentT::kSize;
-  static_assert(Alignment > 1, "Alignment must be more than 1");
-  static_assert(is_power2(Alignment), "Alignment must be a power of 2");
-
-public:
-  template <typename NextT> struct Then {
-    static void Copy(char *__restrict dst, const char *__restrict src,
-                     size_t size) {
-      AlignmentT::Copy(dst, src);
-      internal::AlignHelper<AlignOn, Alignment>::Bump(dst, src, size);
-      NextT::Copy(dst, src, size);
-    }
-
-    static bool Equals(const char *lhs, const char *rhs, size_t size) {
-      if (!AlignmentT::Equals(lhs, rhs))
-        return false;
-      internal::AlignHelper<AlignOn, Alignment>::Bump(lhs, rhs, size);
-      return NextT::Equals(lhs, rhs, size);
-    }
-
-    static int ThreeWayCompare(const char *lhs, const char *rhs, size_t size) {
-      if (const int result = AlignmentT::ThreeWayCompare(lhs, rhs))
-        return result;
-      internal::AlignHelper<AlignOn, Alignment>::Bump(lhs, rhs, size);
-      return NextT::ThreeWayCompare(lhs, rhs, size);
-    }
-
-    static void SplatSet(char *dst, const unsigned char value, size_t size) {
-      AlignmentT::SplatSet(dst, value);
-      char *dummy = nullptr;
-      internal::AlignHelper<Arg::_1, Alignment>::Bump(dst, dummy, size);
-      NextT::SplatSet(dst, value, size);
-    }
-  };
-};
-
-// Fixed-size Builtin Operations
-// -----------------------------
-// Note: Do not use 'builtin' right now as it requires the implementation of the
-// `_inline` versions of all the builtins. Theoretically, Clang can still turn
-// them into calls to the C library leading to reentrancy problems.
-namespace builtin {
-
-// __builtin_memcpy_inline guarantees to never call external functions.
-// Unfortunately it is not widely available.
-#ifdef __clang__
-#if __has_builtin(__builtin_memcpy_inline)
-#define USE_BUILTIN_MEMCPY_INLINE
-#endif
-#elif defined(__GNUC__)
-#define USE_BUILTIN_MEMCPY
-#endif
-
-template <size_t Size> struct Builtin {
-  static constexpr size_t kSize = Size;
-
-  static void Copy(char *__restrict dst, const char *__restrict src) {
-#if LLVM_LIBC_HAVE_MEMORY_SANITIZER || LLVM_LIBC_HAVE_ADDRESS_SANITIZER
-    ForLoopCopy<kBlockSize>(dst, src);
-#elif defined(USE_BUILTIN_MEMCPY_INLINE)
-    __builtin_memcpy_inline(dst, src, kSize);
-#elif defined(USE_BUILTIN_MEMCPY)
-    __builtin_memcpy(dst, src, kSize);
-#else
-    ForLoopCopy<kBlockSize>(dst, src);
-#endif
-  }
-
-  static bool Equals(const char *lhs, const char *rhs) {
-    return __builtin_memcmp(lhs, rhs, kSize) == 0;
-  }
-
-  static int ThreeWayCompare(const char *lhs, const char *rhs) {
-    return __builtin_memcmp(lhs, rhs, kSize);
-  }
-
-  static void SplatSet(char *dst, const unsigned char value) {
-    __builtin_memset(dst, value, kSize);
-  }
-
-private:
-  // Copies `kBlockSize` bytes from `src` to `dst` using a for loop.
-  // This code requires the use of `-fno-buitin-memcpy` to prevent the compiler
-  // from turning the for-loop back into `__builtin_memcpy`.
-  template <size_t kBlockSize>
-  static void ForLoopCopy(char *__restrict dst, const char *__restrict src) {
-    for (size_t i = 0; i < kBlockSize; ++i)
-      dst[i] = src[i];
-  }
-};
-
-#undef USE_BUILTIN_MEMCPY_INLINE
-#undef USE_BUILTIN_MEMCPY
-
-using _1 = Builtin<1>;
-using _2 = Builtin<2>;
-using _3 = Builtin<3>;
-using _4 = Builtin<4>;
-using _8 = Builtin<8>;
-using _16 = Builtin<16>;
-using _32 = Builtin<32>;
-using _64 = Builtin<64>;
-using _128 = Builtin<128>;
-
-} // namespace builtin
-
-// Fixed-size Scalar Operations
-// ----------------------------
-namespace scalar {
-
-// The Scalar type makes use of simple sized integers.
-template <typename T> struct Scalar {
-  static constexpr size_t kSize = sizeof(T);
-
-  static void Copy(char *__restrict dst, const char *__restrict src) {
-    Store(dst, Load(src));
-  }
-
-  static bool Equals(const char *lhs, const char *rhs) {
-    return Load(lhs) == Load(rhs);
-  }
-
-  static int ThreeWayCompare(const char *lhs, const char *rhs) {
-    return ScalarThreeWayCompare(Load(lhs), Load(rhs));
-  }
-
-  static void SplatSet(char *dst, const unsigned char value) {
-    Store(dst, GetSplattedValue(value));
-  }
-
-private:
-  static T Load(const char *ptr) {
-    T value;
-    __builtin_memcpy_inline(&value, ptr, kSize);
-    return value;
-  }
-  static void Store(char *ptr, T value) {
-    __builtin_memcpy_inline(ptr, &value, kSize);
-  }
-  static T GetSplattedValue(const unsigned char value) {
-    return T(~0) / T(0xFF) * T(value);
-  }
-  static int ScalarThreeWayCompare(T a, T b);
-};
-
-template <>
-inline int Scalar<uint8_t>::ScalarThreeWayCompare(uint8_t a, uint8_t b) {
-  const int16_t la = Endian::ToBigEndian(a);
-  const int16_t lb = Endian::ToBigEndian(b);
-  return la - lb;
-}
-template <>
-inline int Scalar<uint16_t>::ScalarThreeWayCompare(uint16_t a, uint16_t b) {
-  const int32_t la = Endian::ToBigEndian(a);
-  const int32_t lb = Endian::ToBigEndian(b);
-  return la - lb;
-}
-template <>
-inline int Scalar<uint32_t>::ScalarThreeWayCompare(uint32_t a, uint32_t b) {
-  const int64_t la = Endian::ToBigEndian(a);
-  const int64_t lb = Endian::ToBigEndian(b);
-  if (la < lb)
-    return -1;
-  if (la > lb)
-    return 1;
-  return 0;
-}
-template <>
-inline int Scalar<uint64_t>::ScalarThreeWayCompare(uint64_t a, uint64_t b) {
-  const __int128_t la = Endian::ToBigEndian(a);
-  const __int128_t lb = Endian::ToBigEndian(b);
-  if (la < lb)
-    return -1;
-  if (la > lb)
-    return 1;
-  return 0;
-}
-
-using UINT8 = Scalar<uint8_t>;   // 1 Byte
-using UINT16 = Scalar<uint16_t>; // 2 Bytes
-using UINT32 = Scalar<uint32_t>; // 4 Bytes
-using UINT64 = Scalar<uint64_t>; // 8 Bytes
-
-using _1 = UINT8;
-using _2 = UINT16;
-using _3 = Chained<UINT16, UINT8>;
-using _4 = UINT32;
-using _8 = UINT64;
-using _16 = Repeated<_8, 2>;
-using _32 = Repeated<_8, 4>;
-using _64 = Repeated<_8, 8>;
-using _128 = Repeated<_8, 16>;
-
-} // namespace scalar
-} // namespace __llvm_libc
-
-#include <src/string/memory_utils/elements_x86.h>
-
-#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_H

diff  --git a/libc/src/string/memory_utils/elements_x86.h b/libc/src/string/memory_utils/elements_x86.h
deleted file mode 100644
index 07c178d1a4834..0000000000000
--- a/libc/src/string/memory_utils/elements_x86.h
+++ /dev/null
@@ -1,151 +0,0 @@
-//===-- Elementary operations for x86 -------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_X86_H
-#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_X86_H
-
-#include <stddef.h> // size_t
-#include <stdint.h> // uint8_t, uint16_t, uint32_t, uint64_t
-
-#ifdef __SSE2__
-#include <immintrin.h>
-#endif //  __SSE2__
-
-#include "src/string/memory_utils/elements.h" // __llvm_libc::scalar
-
-// Fixed-size Vector Operations
-// ----------------------------
-
-namespace __llvm_libc {
-namespace x86 {
-
-#ifdef __SSE2__
-template <typename Base> struct Vector : public Base {
-  static void Copy(char *dst, const char *src) {
-    Base::Store(dst, Base::Load(src));
-  }
-
-  static bool Equals(const char *a, const char *b) {
-    return Base::NotEqualMask(Base::Load(a), Base::Load(b)) == 0;
-  }
-
-  static int ThreeWayCompare(const char *a, const char *b) {
-    const auto mask = Base::NotEqualMask(Base::Load(a), Base::Load(b));
-    if (!mask)
-      return 0;
-    return CharDiff(a, b, mask);
-  }
-
-  static void SplatSet(char *dst, const unsigned char value) {
-    Base::Store(dst, Base::GetSplattedValue(value));
-  }
-
-  static int CharDiff(const char *a, const char *b, uint64_t mask) {
-    const size_t 
diff _index = __builtin_ctzl(mask);
-    const int ca = (unsigned char)a[
diff _index];
-    const int cb = (unsigned char)b[
diff _index];
-    return ca - cb;
-  }
-};
-
-struct M128 {
-  static constexpr size_t kSize = 16;
-  using T = char __attribute__((__vector_size__(kSize)));
-  static uint16_t mask(T value) { return _mm_movemask_epi8(value); }
-  static uint16_t NotEqualMask(T a, T b) { return mask(a != b); }
-  static T Load(const char *ptr) {
-    return _mm_loadu_si128(reinterpret_cast<__m128i_u const *>(ptr));
-  }
-  static void Store(char *ptr, T value) {
-    return _mm_storeu_si128(reinterpret_cast<__m128i_u *>(ptr), value);
-  }
-  static T GetSplattedValue(const char v) {
-    const T splatted = {v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v};
-    return splatted;
-  }
-};
-
-using Vector128 = Vector<M128>; // 16 Bytes
-
-#ifdef __AVX2__
-struct M256 {
-  static constexpr size_t kSize = 32;
-  using T = char __attribute__((__vector_size__(kSize)));
-  static uint32_t mask(T value) { return _mm256_movemask_epi8(value); }
-  static uint32_t NotEqualMask(T a, T b) { return mask(a != b); }
-  static T Load(const char *ptr) {
-    return _mm256_loadu_si256(reinterpret_cast<__m256i const *>(ptr));
-  }
-  static void Store(char *ptr, T value) {
-    return _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), value);
-  }
-  static T GetSplattedValue(const char v) {
-    const T splatted = {v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v,
-                        v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v};
-    return splatted;
-  }
-};
-
-using Vector256 = Vector<M256>; // 32 Bytes
-
-#if defined(__AVX512F__) and defined(__AVX512BW__)
-struct M512 {
-  static constexpr size_t kSize = 64;
-  using T = char __attribute__((__vector_size__(kSize)));
-  static uint64_t NotEqualMask(T a, T b) {
-    return _mm512_cmpneq_epi8_mask(a, b);
-  }
-  static T Load(const char *ptr) { return _mm512_loadu_epi8(ptr); }
-  static void Store(char *ptr, T value) {
-    return _mm512_storeu_epi8(ptr, value);
-  }
-  static T GetSplattedValue(const char v) {
-    const T splatted = {v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v,
-                        v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v,
-                        v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v,
-                        v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v};
-    return splatted;
-  }
-};
-using Vector512 = Vector<M512>;
-
-#endif // defined(__AVX512F__) and defined(__AVX512BW__)
-#endif // __AVX2__
-#endif // __SSE2__
-
-using _1 = __llvm_libc::scalar::_1;
-using _2 = __llvm_libc::scalar::_2;
-using _3 = __llvm_libc::scalar::_3;
-using _4 = __llvm_libc::scalar::_4;
-using _8 = __llvm_libc::scalar::_8;
-#if defined(__AVX512F__) && defined(__AVX512BW__)
-using _16 = __llvm_libc::x86::Vector128;
-using _32 = __llvm_libc::x86::Vector256;
-using _64 = __llvm_libc::x86::Vector512;
-using _128 = __llvm_libc::Repeated<_64, 2>;
-#elif defined(__AVX2__)
-using _16 = __llvm_libc::x86::Vector128;
-using _32 = __llvm_libc::x86::Vector256;
-using _64 = __llvm_libc::Repeated<_32, 2>;
-using _128 = __llvm_libc::Repeated<_32, 4>;
-#elif defined(__SSE2__)
-using _16 = __llvm_libc::x86::Vector128;
-using _32 = __llvm_libc::Repeated<_16, 2>;
-using _64 = __llvm_libc::Repeated<_16, 4>;
-using _128 = __llvm_libc::Repeated<_16, 8>;
-#else
-using _16 = __llvm_libc::Repeated<_8, 2>;
-using _32 = __llvm_libc::Repeated<_8, 4>;
-using _64 = __llvm_libc::Repeated<_8, 8>;
-using _128 = __llvm_libc::Repeated<_8, 16>;
-#endif
-
-} // namespace x86
-} // namespace __llvm_libc
-
-#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_X86_H

diff  --git a/libc/src/string/memory_utils/memcpy_utils.h b/libc/src/string/memory_utils/memcpy_utils.h
new file mode 100644
index 0000000000000..23836bbde3427
--- /dev/null
+++ b/libc/src/string/memory_utils/memcpy_utils.h
@@ -0,0 +1,140 @@
+//===-- Memcpy utils --------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY_UTILS_H
+#define LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY_UTILS_H
+
+#include "src/__support/sanitizer.h"
+#include "src/string/memory_utils/utils.h"
+#include <stddef.h> // size_t
+
+// __builtin_memcpy_inline guarantees to never call external functions.
+// Unfortunately it is not widely available.
+#ifdef __clang__
+#if __has_builtin(__builtin_memcpy_inline)
+#define USE_BUILTIN_MEMCPY_INLINE
+#endif
+#elif defined(__GNUC__)
+#define USE_BUILTIN_MEMCPY
+#endif
+
+namespace __llvm_libc {
+
+// This is useful for testing.
+#if defined(LLVM_LIBC_MEMCPY_MONITOR)
+extern "C" void LLVM_LIBC_MEMCPY_MONITOR(char *__restrict,
+                                         const char *__restrict, size_t);
+#endif
+
+// Copies `kBlockSize` bytes from `src` to `dst` using a for loop.
+// This code requires the use of `-fno-buitin-memcpy` to prevent the compiler
+// from turning the for-loop back into `__builtin_memcpy`.
+template <size_t kBlockSize>
+static void ForLoopCopy(char *__restrict dst, const char *__restrict src) {
+  for (size_t i = 0; i < kBlockSize; ++i)
+    dst[i] = src[i];
+}
+
+// Copies `kBlockSize` bytes from `src` to `dst`.
+template <size_t kBlockSize>
+static void CopyBlock(char *__restrict dst, const char *__restrict src) {
+#if defined(LLVM_LIBC_MEMCPY_MONITOR)
+  LLVM_LIBC_MEMCPY_MONITOR(dst, src, kBlockSize);
+#elif LLVM_LIBC_HAVE_MEMORY_SANITIZER || LLVM_LIBC_HAVE_ADDRESS_SANITIZER
+  ForLoopCopy<kBlockSize>(dst, src);
+#elif defined(USE_BUILTIN_MEMCPY_INLINE)
+  __builtin_memcpy_inline(dst, src, kBlockSize);
+#elif defined(USE_BUILTIN_MEMCPY)
+  __builtin_memcpy(dst, src, kBlockSize);
+#else
+  ForLoopCopy<kBlockSize>(dst, src);
+#endif
+}
+
+// Copies `kBlockSize` bytes from `src + count - kBlockSize` to
+// `dst + count - kBlockSize`.
+// Precondition: `count >= kBlockSize`.
+template <size_t kBlockSize>
+static void CopyLastBlock(char *__restrict dst, const char *__restrict src,
+                          size_t count) {
+  const size_t offset = count - kBlockSize;
+  CopyBlock<kBlockSize>(dst + offset, src + offset);
+}
+
+// Copies `kBlockSize` bytes twice with an overlap between the two.
+//
+// [1234567812345678123]
+// [__XXXXXXXXXXXXXX___]
+// [__XXXXXXXX_________]
+// [________XXXXXXXX___]
+//
+// Precondition: `count >= kBlockSize && count <= kBlockSize`.
+template <size_t kBlockSize>
+static void CopyBlockOverlap(char *__restrict dst, const char *__restrict src,
+                             size_t count) {
+  CopyBlock<kBlockSize>(dst, src);
+  CopyLastBlock<kBlockSize>(dst, src, count);
+}
+
+// Copies `count` bytes by blocks of `kBlockSize` bytes.
+// Copies at the start and end of the buffer are unaligned.
+// Copies in the middle of the buffer are aligned to `kAlignment`.
+//
+// e.g. with
+// [12345678123456781234567812345678]
+// [__XXXXXXXXXXXXXXXXXXXXXXXXXXXX___]
+// [__XXXX___________________________]
+// [_____XXXXXXXX____________________]
+// [_____________XXXXXXXX____________]
+// [_____________________XXXXXXXX____]
+// [______________________XXXXXXXX___]
+//
+// Precondition: `kAlignment <= kBlockSize`
+//               `count > 2 * kBlockSize` for efficiency.
+//               `count >= kAlignment` for correctness.
+template <size_t kBlockSize, size_t kAlignment = kBlockSize>
+static void CopySrcAlignedBlocks(char *__restrict dst,
+                                 const char *__restrict src, size_t count) {
+  static_assert(is_power2(kAlignment), "kAlignment must be a power of two");
+  static_assert(is_power2(kBlockSize), "kBlockSize must be a power of two");
+  static_assert(kAlignment <= kBlockSize,
+                "kAlignment must be less or equal to block size");
+  CopyBlock<kAlignment>(dst, src); // Copy first block
+
+  // Copy aligned blocks
+  const size_t ofla = offset_from_last_aligned<kAlignment>(src);
+  const size_t limit = count + ofla - kBlockSize;
+  for (size_t offset = kAlignment; offset < limit; offset += kBlockSize)
+    CopyBlock<kBlockSize>(dst - ofla + offset,
+                          assume_aligned<kAlignment>(src - ofla + offset));
+
+  CopyLastBlock<kBlockSize>(dst, src, count); // Copy last block
+}
+
+template <size_t kBlockSize, size_t kAlignment = kBlockSize>
+static void CopyDstAlignedBlocks(char *__restrict dst,
+                                 const char *__restrict src, size_t count) {
+  static_assert(is_power2(kAlignment), "kAlignment must be a power of two");
+  static_assert(is_power2(kBlockSize), "kBlockSize must be a power of two");
+  static_assert(kAlignment <= kBlockSize,
+                "kAlignment must be less or equal to block size");
+  CopyBlock<kAlignment>(dst, src); // Copy first block
+
+  // Copy aligned blocks
+  const size_t ofla = offset_from_last_aligned<kAlignment>(dst);
+  const size_t limit = count + ofla - kBlockSize;
+  for (size_t offset = kAlignment; offset < limit; offset += kBlockSize)
+    CopyBlock<kBlockSize>(assume_aligned<kAlignment>(dst - ofla + offset),
+                          src - ofla + offset);
+
+  CopyLastBlock<kBlockSize>(dst, src, count); // Copy last block
+}
+
+} // namespace __llvm_libc
+
+#endif //  LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY_UTILS_H

diff  --git a/libc/src/string/memory_utils/memset_utils.h b/libc/src/string/memory_utils/memset_utils.h
index 4826670444f80..7024a6c71868b 100644
--- a/libc/src/string/memory_utils/memset_utils.h
+++ b/libc/src/string/memory_utils/memset_utils.h
@@ -6,16 +6,70 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_UTILS_H
-#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_UTILS_H
+#ifndef LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_UTILS_H
+#define LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_UTILS_H
 
-#include "src/string/memory_utils/elements.h"
 #include "src/string/memory_utils/utils.h"
 
 #include <stddef.h> // size_t
 
 namespace __llvm_libc {
 
+// Sets `kBlockSize` bytes starting from `src` to `value`.
+template <size_t kBlockSize> static void SetBlock(char *dst, unsigned value) {
+  // Theoretically the compiler is allowed to call memset here and end up with a
+  // recursive call, practically it doesn't happen, however this should be
+  // replaced with a __builtin_memset_inline once it's available in clang.
+  __builtin_memset(dst, value, kBlockSize);
+}
+
+// Sets `kBlockSize` bytes from `src + count - kBlockSize` to `value`.
+// Precondition: `count >= kBlockSize`.
+template <size_t kBlockSize>
+static void SetLastBlock(char *dst, unsigned value, size_t count) {
+  SetBlock<kBlockSize>(dst + count - kBlockSize, value);
+}
+
+// Sets `kBlockSize` bytes twice with an overlap between the two.
+//
+// [1234567812345678123]
+// [__XXXXXXXXXXXXXX___]
+// [__XXXXXXXX_________]
+// [________XXXXXXXX___]
+//
+// Precondition: `count >= kBlockSize && count <= kBlockSize`.
+template <size_t kBlockSize>
+static void SetBlockOverlap(char *dst, unsigned value, size_t count) {
+  SetBlock<kBlockSize>(dst, value);
+  SetLastBlock<kBlockSize>(dst, value, count);
+}
+
+// Sets `count` bytes by blocks of `kBlockSize` bytes.
+// Sets at the start and end of the buffer are unaligned.
+// Sets in the middle of the buffer are aligned to `kBlockSize`.
+//
+// e.g. with
+// [12345678123456781234567812345678]
+// [__XXXXXXXXXXXXXXXXXXXXXXXXXXX___]
+// [__XXXXXXXX______________________]
+// [________XXXXXXXX________________]
+// [________________XXXXXXXX________]
+// [_____________________XXXXXXXX___]
+//
+// Precondition: `count > 2 * kBlockSize` for efficiency.
+//               `count >= kBlockSize` for correctness.
+template <size_t kBlockSize>
+static void SetAlignedBlocks(char *dst, unsigned value, size_t count) {
+  SetBlock<kBlockSize>(dst, value); // Set first block
+
+  // Set aligned blocks
+  size_t offset = kBlockSize - offset_from_last_aligned<kBlockSize>(dst);
+  for (; offset + kBlockSize < count; offset += kBlockSize)
+    SetBlock<kBlockSize>(dst + offset, value);
+
+  SetLastBlock<kBlockSize>(dst, value, count); // Set last block
+}
+
 // A general purpose implementation assuming cheap unaligned writes for sizes:
 // 1, 2, 4, 8, 16, 32 and 64 Bytes. Note that some architecture can't store 32
 // or 64 Bytes at a time, the compiler will expand them as needed.
@@ -52,27 +106,26 @@ inline static void GeneralPurposeMemset(char *dst, unsigned char value,
   if (count == 0)
     return;
   if (count == 1)
-    return SplatSet<scalar::_1>(dst, value);
+    return SetBlock<1>(dst, value);
   if (count == 2)
-    return SplatSet<scalar::_2>(dst, value);
+    return SetBlock<2>(dst, value);
   if (count == 3)
-    return SplatSet<scalar::_3>(dst, value);
+    return SetBlock<3>(dst, value);
   if (count == 4)
-    return SplatSet<scalar::_4>(dst, value);
+    return SetBlock<4>(dst, value);
   if (count <= 8)
-    return SplatSet<HeadTail<scalar::_4>>(dst, value, count);
+    return SetBlockOverlap<4>(dst, value, count);
   if (count <= 16)
-    return SplatSet<HeadTail<scalar::_8>>(dst, value, count);
+    return SetBlockOverlap<8>(dst, value, count);
   if (count <= 32)
-    return SplatSet<HeadTail<scalar::_16>>(dst, value, count);
+    return SetBlockOverlap<16>(dst, value, count);
   if (count <= 64)
-    return SplatSet<HeadTail<scalar::_32>>(dst, value, count);
+    return SetBlockOverlap<32>(dst, value, count);
   if (count <= 128)
-    return SplatSet<HeadTail<scalar::_64>>(dst, value, count);
-  return SplatSet<Align<scalar::_32, Arg::Dst>::Then<Loop<scalar::_32>>>(
-      dst, value, count);
+    return SetBlockOverlap<64>(dst, value, count);
+  return SetAlignedBlocks<32>(dst, value, count);
 }
 
 } // namespace __llvm_libc
 
-#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_UTILS_H
+#endif //  LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_UTILS_H

diff  --git a/libc/src/string/x86_64/memcpy.cpp b/libc/src/string/x86_64/memcpy.cpp
index 7f6e5b64b3a74..bbd8fe9ebf7a5 100644
--- a/libc/src/string/x86_64/memcpy.cpp
+++ b/libc/src/string/x86_64/memcpy.cpp
@@ -8,7 +8,7 @@
 
 #include "src/string/memcpy.h"
 #include "src/__support/common.h"
-#include "src/string/memory_utils/elements.h"
+#include "src/string/memory_utils/memcpy_utils.h"
 
 namespace __llvm_libc {
 
@@ -29,11 +29,8 @@ constexpr size_t kRepMovsBSize =
 // Whether target supports AVX instructions.
 constexpr bool kHasAvx = LLVM_LIBC_IS_DEFINED(__AVX__);
 
-#ifdef __AVX__
-using LoopBlockSize = __llvm_libc::x86::_64;
-#else
-using LoopBlockSize = __llvm_libc::x86::_32;
-#endif
+// The chunk size used for the loop copy strategy.
+constexpr size_t kLoopCopyBlockSize = kHasAvx ? 64 : 32;
 
 static void CopyRepMovsb(char *__restrict dst, const char *__restrict src,
                          size_t count) {
@@ -64,37 +61,33 @@ static void CopyRepMovsb(char *__restrict dst, const char *__restrict src,
 //   with little change on the code side.
 static void memcpy_x86(char *__restrict dst, const char *__restrict src,
                        size_t count) {
-  // Use x86 strategies (_1, _2, _3 ...)
-  using namespace __llvm_libc::x86;
-
   if (kUseOnlyRepMovsb)
     return CopyRepMovsb(dst, src, count);
 
   if (count == 0)
     return;
   if (count == 1)
-    return Copy<_1>(dst, src);
+    return CopyBlock<1>(dst, src);
   if (count == 2)
-    return Copy<_2>(dst, src);
+    return CopyBlock<2>(dst, src);
   if (count == 3)
-    return Copy<_3>(dst, src);
+    return CopyBlock<3>(dst, src);
   if (count == 4)
-    return Copy<_4>(dst, src);
+    return CopyBlock<4>(dst, src);
   if (count < 8)
-    return Copy<HeadTail<_4>>(dst, src, count);
+    return CopyBlockOverlap<4>(dst, src, count);
   if (count < 16)
-    return Copy<HeadTail<_8>>(dst, src, count);
+    return CopyBlockOverlap<8>(dst, src, count);
   if (count < 32)
-    return Copy<HeadTail<_16>>(dst, src, count);
+    return CopyBlockOverlap<16>(dst, src, count);
   if (count < 64)
-    return Copy<HeadTail<_32>>(dst, src, count);
+    return CopyBlockOverlap<32>(dst, src, count);
   if (count < 128)
-    return Copy<HeadTail<_64>>(dst, src, count);
+    return CopyBlockOverlap<64>(dst, src, count);
   if (kHasAvx && count < 256)
-    return Copy<HeadTail<_128>>(dst, src, count);
+    return CopyBlockOverlap<128>(dst, src, count);
   if (count <= kRepMovsBSize)
-    return Copy<Align<_32, Arg::Dst>::Then<Loop<LoopBlockSize>>>(dst, src,
-                                                                 count);
+    return CopyDstAlignedBlocks<kLoopCopyBlockSize, 32>(dst, src, count);
   return CopyRepMovsb(dst, src, count);
 }
 

diff  --git a/libc/test/src/string/memory_utils/CMakeLists.txt b/libc/test/src/string/memory_utils/CMakeLists.txt
index e2709720b8182..068a9ec18035c 100644
--- a/libc/test/src/string/memory_utils/CMakeLists.txt
+++ b/libc/test/src/string/memory_utils/CMakeLists.txt
@@ -3,14 +3,11 @@ add_libc_unittest(
   SUITE
     libc_string_unittests
   SRCS
-    elements_test.cpp
-    memory_access_test.cpp
     utils_test.cpp
+    memcpy_utils_test.cpp
   DEPENDS
     libc.src.string.memory_utils.memory_utils
     libc.utils.CPP.standalone_cpp
-  COMPILE_OPTIONS
-    ${LIBC_COMPILE_OPTIONS_NATIVE}
 )
 
 target_compile_definitions(

diff  --git a/libc/test/src/string/memory_utils/elements_test.cpp b/libc/test/src/string/memory_utils/elements_test.cpp
deleted file mode 100644
index 120cd6ff8761e..0000000000000
--- a/libc/test/src/string/memory_utils/elements_test.cpp
+++ /dev/null
@@ -1,103 +0,0 @@
-//===-- Unittests for memory_utils ----------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "src/string/memory_utils/elements.h"
-#include "utils/CPP/Array.h"
-#include "utils/UnitTest/Test.h"
-
-namespace __llvm_libc {
-
-// Registering Types
-using FixedSizeTypes = testing::TypeList<
-#ifdef __SSE2__
-    x86::Vector128, //
-#endif              // __SSE2__
-#ifdef __AVX2__
-    x86::Vector256, //
-#endif              // __AVX2__
-#if defined(__AVX512F__) and defined(__AVX512BW__)
-    x86::Vector512, //
-#endif              // defined(__AVX512F__) and defined(__AVX512BW__)
-    scalar::UINT8,  //
-    scalar::UINT16, //
-    scalar::UINT32, //
-    scalar::UINT64, //
-    Repeated<scalar::UINT64, 2>,                            //
-    Repeated<scalar::UINT64, 4>,                            //
-    Repeated<scalar::UINT64, 8>,                            //
-    Repeated<scalar::UINT64, 16>,                           //
-    Repeated<scalar::UINT64, 32>,                           //
-    Chained<scalar::UINT16, scalar::UINT8>,                 //
-    Chained<scalar::UINT32, scalar::UINT16, scalar::UINT8>, //
-    builtin::_1,                                            //
-    builtin::_2,                                            //
-    builtin::_3,                                            //
-    builtin::_4,                                            //
-    builtin::_8                                             //
-    >;
-
-char GetRandomChar() {
-  static constexpr const uint64_t a = 1103515245;
-  static constexpr const uint64_t c = 12345;
-  static constexpr const uint64_t m = 1ULL << 31;
-  static uint64_t seed = 123456789;
-  seed = (a * seed + c) % m;
-  return seed;
-}
-
-template <typename Element> using Buffer = cpp::Array<char, Element::kSize>;
-template <typename Element> Buffer<Element> GetRandomBuffer() {
-  Buffer<Element> buffer;
-  for (auto &current : buffer)
-    current = GetRandomChar();
-  return buffer;
-}
-
-TYPED_TEST(LlvmLibcMemoryElements, Copy, FixedSizeTypes) {
-  Buffer<ParamType> Dst;
-  const auto buffer = GetRandomBuffer<ParamType>();
-  Copy<ParamType>(Dst.data(), buffer.data());
-  for (size_t i = 0; i < ParamType::kSize; ++i)
-    EXPECT_EQ(Dst[i], buffer[i]);
-}
-
-TYPED_TEST(LlvmLibcMemoryElements, Equals, FixedSizeTypes) {
-  const auto buffer = GetRandomBuffer<ParamType>();
-  EXPECT_TRUE(Equals<ParamType>(buffer.data(), buffer.data()));
-}
-
-TYPED_TEST(LlvmLibcMemoryElements, ThreeWayCompare, FixedSizeTypes) {
-  Buffer<ParamType> initial;
-  for (auto &c : initial)
-    c = 5;
-
-  // Testing equality
-  EXPECT_EQ(ThreeWayCompare<ParamType>(initial.data(), initial.data()), 0);
-
-  // Testing all mismatching positions
-  for (size_t i = 0; i < ParamType::kSize; ++i) {
-    auto copy = initial;
-    ++copy[i]; // Copy is now lexicographycally greated than initial
-    const auto *less = initial.data();
-    const auto *greater = copy.data();
-    EXPECT_LT(ThreeWayCompare<ParamType>(less, greater), 0);
-    EXPECT_GT(ThreeWayCompare<ParamType>(greater, less), 0);
-  }
-}
-
-TYPED_TEST(LlvmLibcMemoryElements, Splat, FixedSizeTypes) {
-  Buffer<ParamType> Dst;
-  const cpp::Array<char, 3> values = {char(0x00), char(0x7F), char(0xFF)};
-  for (char value : values) {
-    SplatSet<ParamType>(Dst.data(), value);
-    for (size_t i = 0; i < ParamType::kSize; ++i)
-      EXPECT_EQ(Dst[i], value);
-  }
-}
-
-} // namespace __llvm_libc

diff  --git a/libc/test/src/string/memory_utils/memcpy_utils_test.cpp b/libc/test/src/string/memory_utils/memcpy_utils_test.cpp
new file mode 100644
index 0000000000000..37529ae3d8ada
--- /dev/null
+++ b/libc/test/src/string/memory_utils/memcpy_utils_test.cpp
@@ -0,0 +1,336 @@
+//===-- Unittests for memory_utils ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/string/memory_utils/memcpy_utils.h"
+#include "utils/CPP/Array.h"
+#include "utils/UnitTest/Test.h"
+
+#include <assert.h>
+#include <stdint.h> // uintptr_t
+
+#ifndef LLVM_LIBC_MEMCPY_MONITOR
+#error LLVM_LIBC_MEMCPY_MONITOR must be defined for this test.
+#endif
+
+namespace __llvm_libc {
+
+struct Buffer {
+  static constexpr size_t kMaxBuffer = 1024;
+  char buffer[kMaxBuffer + 1];
+  size_t last = 0;
+
+  void Clear() {
+    last = 0;
+    for (size_t i = 0; i < kMaxBuffer; ++i)
+      buffer[i] = '0';
+    buffer[kMaxBuffer] = '\0';
+  }
+
+  void Increment(const void *ptr) {
+    const auto offset = reinterpret_cast<uintptr_t>(ptr);
+    assert(offset < kMaxBuffer);
+    ++buffer[offset];
+    if (offset > last)
+      last = offset;
+  }
+
+  char *Finish() {
+    assert(last < kMaxBuffer);
+    buffer[last + 1] = '\0';
+    return buffer;
+  }
+};
+
+struct Trace {
+  Buffer read;
+  Buffer write;
+
+  void Add(char *__restrict dst, const char *__restrict src, size_t count) {
+    for (size_t i = 0; i < count; ++i)
+      read.Increment(src + i);
+    for (size_t i = 0; i < count; ++i)
+      write.Increment(dst + i);
+  }
+
+  void Clear() {
+    read.Clear();
+    write.Clear();
+  }
+
+  char *Read() { return read.Finish(); }
+  char *Write() { return write.Finish(); }
+};
+
+static Trace &GetTrace() {
+  static thread_local Trace events;
+  return events;
+}
+
+extern "C" void LLVM_LIBC_MEMCPY_MONITOR(char *__restrict dst,
+                                         const char *__restrict src,
+                                         size_t count) {
+  GetTrace().Add(dst, src, count);
+}
+
+char *I(uintptr_t offset) { return reinterpret_cast<char *>(offset); }
+
+TEST(LlvmLibcMemcpyUtilsTest, CopyTrivial) {
+  auto &trace = GetTrace();
+
+  trace.Clear();
+  CopyBlock<1>(I(0), I(0));
+  EXPECT_STREQ(trace.Write(), "1");
+  EXPECT_STREQ(trace.Read(), "1");
+
+  trace.Clear();
+  CopyBlock<2>(I(0), I(0));
+  EXPECT_STREQ(trace.Write(), "11");
+  EXPECT_STREQ(trace.Read(), "11");
+
+  trace.Clear();
+  CopyBlock<4>(I(0), I(0));
+  EXPECT_STREQ(trace.Write(), "1111");
+  EXPECT_STREQ(trace.Read(), "1111");
+
+  trace.Clear();
+  CopyBlock<8>(I(0), I(0));
+  EXPECT_STREQ(trace.Write(), "11111111");
+  EXPECT_STREQ(trace.Read(), "11111111");
+
+  trace.Clear();
+  CopyBlock<16>(I(0), I(0));
+  EXPECT_STREQ(trace.Write(), "1111111111111111");
+  EXPECT_STREQ(trace.Read(), "1111111111111111");
+
+  trace.Clear();
+  CopyBlock<32>(I(0), I(0));
+  EXPECT_STREQ(trace.Write(), "11111111111111111111111111111111");
+  EXPECT_STREQ(trace.Read(), "11111111111111111111111111111111");
+
+  trace.Clear();
+  CopyBlock<64>(I(0), I(0));
+  EXPECT_STREQ(
+      trace.Write(),
+      "1111111111111111111111111111111111111111111111111111111111111111");
+  EXPECT_STREQ(
+      trace.Read(),
+      "1111111111111111111111111111111111111111111111111111111111111111");
+}
+
+TEST(LlvmLibcMemcpyUtilsTest, CopyOffset) {
+  auto &trace = GetTrace();
+
+  trace.Clear();
+  CopyBlock<1>(I(3), I(1));
+  EXPECT_STREQ(trace.Write(), "0001");
+  EXPECT_STREQ(trace.Read(), "01");
+
+  trace.Clear();
+  CopyBlock<1>(I(2), I(1));
+  EXPECT_STREQ(trace.Write(), "001");
+  EXPECT_STREQ(trace.Read(), "01");
+}
+
+TEST(LlvmLibcMemcpyUtilsTest, CopyBlockOverlap) {
+  auto &trace = GetTrace();
+
+  trace.Clear();
+  CopyBlockOverlap<2>(I(0), I(0), 2);
+  EXPECT_STREQ(trace.Write(), "22");
+  EXPECT_STREQ(trace.Read(), "22");
+
+  trace.Clear();
+  CopyBlockOverlap<2>(I(0), I(0), 3);
+  EXPECT_STREQ(trace.Write(), "121");
+  EXPECT_STREQ(trace.Read(), "121");
+
+  trace.Clear();
+  CopyBlockOverlap<2>(I(0), I(0), 4);
+  EXPECT_STREQ(trace.Write(), "1111");
+  EXPECT_STREQ(trace.Read(), "1111");
+
+  trace.Clear();
+  CopyBlockOverlap<4>(I(2), I(1), 7);
+  EXPECT_STREQ(trace.Write(), "001112111");
+  EXPECT_STREQ(trace.Read(), "01112111");
+}
+
+TEST(LlvmLibcMemcpyUtilsTest, CopySrcAlignedBlocks) {
+  auto &trace = GetTrace();
+  // Source is aligned and multiple of alignment.
+  //   "1111"
+  trace.Clear();
+  CopySrcAlignedBlocks<4>(I(0), I(0), 4);
+  EXPECT_STREQ(trace.Write(), "2222");
+  EXPECT_STREQ(trace.Read(), "2222");
+
+  // Source is aligned and multiple of alignment.
+  //   "11110000"
+  // + "00001111"
+  // = "11111111"
+  trace.Clear();
+  CopySrcAlignedBlocks<4>(I(0), I(0), 8);
+  EXPECT_STREQ(trace.Write(), "11111111");
+  EXPECT_STREQ(trace.Read(), "11111111");
+
+  // Source is aligned already overlap at end.
+  //   "1111000000000"
+  // + "0000111100000"
+  // + "0000000011110"
+  // + "0000000001111"
+  // = "1111111112221"
+  trace.Clear();
+  CopySrcAlignedBlocks<4>(I(0), I(0), 13);
+  EXPECT_STREQ(trace.Write(), "1111111112221");
+  EXPECT_STREQ(trace.Read(), "1111111112221");
+
+  // Misaligned source.
+  //   "01111000000000"
+  // + "00001111000000"
+  // + "00000000111100"
+  // + "00000000001111"
+  // = "01112111112211"
+  trace.Clear();
+  CopySrcAlignedBlocks<4>(I(0), I(1), 13);
+  EXPECT_STREQ(trace.Write(), "1112111112211");
+  EXPECT_STREQ(trace.Read(), "01112111112211");
+
+  // Misaligned source aligned at end.
+  //   "011110000000"
+  // + "000011110000"
+  // + "000000001111"
+  // = "011121111111"
+  trace.Clear();
+  CopySrcAlignedBlocks<4>(I(0), I(1), 11);
+  EXPECT_STREQ(trace.Write(), "11121111111");
+  EXPECT_STREQ(trace.Read(), "011121111111");
+}
+
+TEST(LlvmLibcMemcpyUtilsTest, CopyDstAlignedBlocks) {
+  auto &trace = GetTrace();
+  // Destination is aligned and multiple of alignment.
+  //   "1111"
+  trace.Clear();
+  CopyDstAlignedBlocks<4>(I(0), I(0), 4);
+  EXPECT_STREQ(trace.Write(), "2222");
+  EXPECT_STREQ(trace.Read(), "2222");
+
+  // Destination is aligned and multiple of alignment.
+  //   "11110000"
+  // + "00001111"
+  // = "11111111"
+  trace.Clear();
+  CopyDstAlignedBlocks<4>(I(0), I(0), 8);
+  EXPECT_STREQ(trace.Write(), "11111111");
+  EXPECT_STREQ(trace.Read(), "11111111");
+
+  // Destination is aligned already overlap at end.
+  //   "1111000000000"
+  // + "0000111100000"
+  // + "0000000011110"
+  // + "0000000001111"
+  // = "1111111112221"
+  trace.Clear();
+  CopyDstAlignedBlocks<4>(I(0), I(0), 13);
+  EXPECT_STREQ(trace.Write(), "1111111112221");
+  EXPECT_STREQ(trace.Read(), "1111111112221");
+
+  // Misaligned destination.
+  //   "01111000000000"
+  // + "00001111000000"
+  // + "00000000111100"
+  // + "00000000001111"
+  // = "01112111112211"
+  trace.Clear();
+  CopyDstAlignedBlocks<4>(I(1), I(0), 13);
+  EXPECT_STREQ(trace.Write(), "01112111112211");
+  EXPECT_STREQ(trace.Read(), "1112111112211");
+
+  // Misaligned destination aligned at end.
+  //   "011110000000"
+  // + "000011110000"
+  // + "000000001111"
+  // = "011121111111"
+  trace.Clear();
+  CopyDstAlignedBlocks<4>(I(1), I(0), 11);
+  EXPECT_STREQ(trace.Write(), "011121111111");
+  EXPECT_STREQ(trace.Read(), "11121111111");
+}
+
+TEST(LlvmLibcMemcpyUtilsTest, CopyAlignedBlocksWithAlignment) {
+  auto &trace = GetTrace();
+  // Source is aligned and multiple of alignment.
+  //   "11111111"
+  trace.Clear();
+  CopySrcAlignedBlocks<8, 4>(I(0), I(0), 8);
+  EXPECT_STREQ(trace.Write(), "22221111");
+  EXPECT_STREQ(trace.Read(), "22221111");
+
+  // Destination is aligned and multiple of alignment.
+  //   "11111111"
+  trace.Clear();
+  CopyDstAlignedBlocks<8, 4>(I(0), I(0), 8);
+  EXPECT_STREQ(trace.Write(), "22221111");
+  EXPECT_STREQ(trace.Read(), "22221111");
+
+  // Source is aligned and multiple of alignment.
+  //   "111111111"
+  trace.Clear();
+  CopySrcAlignedBlocks<8, 4>(I(0), I(0), 9);
+  EXPECT_STREQ(trace.Write(), "122211111");
+  EXPECT_STREQ(trace.Read(), "122211111");
+
+  // Destination is aligned and multiple of alignment.
+  //   "111111111"
+  trace.Clear();
+  CopyDstAlignedBlocks<8, 4>(I(0), I(0), 9);
+  EXPECT_STREQ(trace.Write(), "122211111");
+  EXPECT_STREQ(trace.Read(), "122211111");
+}
+
+TEST(LlvmLibcMemcpyUtilsTest, CopyAlignedBlocksMaxReloads) {
+  auto &trace = GetTrace();
+  for (size_t alignment = 0; alignment < 32; ++alignment) {
+    for (size_t count = 64; count < 768; ++count) {
+      trace.Clear();
+      // We should never reload more than twice when copying from count = 2x32.
+      CopySrcAlignedBlocks<32>(I(alignment), I(0), count);
+      const char *const written = trace.Write();
+      // First bytes are untouched.
+      for (size_t i = 0; i < alignment; ++i)
+        EXPECT_EQ(written[i], '0');
+      // Next bytes are loaded once or twice but no more.
+      for (size_t i = alignment; i < count; ++i) {
+        EXPECT_GE(written[i], '1');
+        EXPECT_LE(written[i], '2');
+      }
+    }
+  }
+}
+
+TEST(LlvmLibcMemcpyUtilsTest, CopyAlignedBlocksWithAlignmentMaxReloads) {
+  auto &trace = GetTrace();
+  for (size_t alignment = 0; alignment < 32; ++alignment) {
+    for (size_t count = 64; count < 768; ++count) {
+      trace.Clear();
+      // We should never reload more than twice when copying from count = 2x32.
+      CopySrcAlignedBlocks<32, 16>(I(alignment), I(0), count);
+      const char *const written = trace.Write();
+      // First bytes are untouched.
+      for (size_t i = 0; i < alignment; ++i)
+        EXPECT_EQ(written[i], '0');
+      // Next bytes are loaded once or twice but no more.
+      for (size_t i = alignment; i < count; ++i) {
+        EXPECT_GE(written[i], '1');
+        EXPECT_LE(written[i], '2');
+      }
+    }
+  }
+}
+
+} // namespace __llvm_libc

diff  --git a/libc/test/src/string/memory_utils/memory_access_test.cpp b/libc/test/src/string/memory_utils/memory_access_test.cpp
deleted file mode 100644
index 690a0a5360e90..0000000000000
--- a/libc/test/src/string/memory_utils/memory_access_test.cpp
+++ /dev/null
@@ -1,231 +0,0 @@
-//===-- Unittests for memory_utils ----------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#define LLVM_LIBC_UNITTEST_OBSERVE 1
-
-#include "src/string/memory_utils/elements.h"
-#include "utils/CPP/Array.h"
-#include "utils/CPP/ArrayRef.h"
-#include "utils/UnitTest/Test.h"
-
-#include <stdio.h>
-#include <string.h>
-
-namespace __llvm_libc {
-
-static constexpr const size_t kMaxBuffer = 32;
-
-struct BufferAccess : cpp::Array<char, kMaxBuffer + 1> {
-  BufferAccess() { Reset(); }
-  void Reset() {
-    for (auto &value : *this)
-      value = '0';
-    this->operator[](kMaxBuffer) = '\0';
-  }
-  void Touch(ptr
diff _t offset, size_t size) {
-    if (offset < 0)
-      return;
-    for (size_t i = 0; i < size; ++i)
-      ++(*this)[offset + i];
-  }
-  operator const char *() const { return this->data(); }
-};
-
-struct Buffer {
-  ptr
diff _t Offset(const char *ptr) const {
-    const bool contained = ptr >= data.begin() && ptr < data.end();
-    return contained ? ptr - data.begin() : -1;
-  }
-  void Reset() {
-    reads.Reset();
-    writes.Reset();
-  }
-  cpp::Array<char, kMaxBuffer> data;
-  BufferAccess __attribute__((aligned(64))) reads;
-  BufferAccess __attribute__((aligned(64))) writes;
-};
-
-struct MemoryAccessObserver {
-  void ObserveRead(const char *ptr, size_t size) {
-    Buffer1.reads.Touch(Buffer1.Offset(ptr), size);
-    Buffer2.reads.Touch(Buffer2.Offset(ptr), size);
-  }
-
-  void ObserveWrite(const char *ptr, size_t size) {
-    Buffer1.writes.Touch(Buffer1.Offset(ptr), size);
-    Buffer2.writes.Touch(Buffer2.Offset(ptr), size);
-  }
-
-  void Reset() {
-    Buffer1.Reset();
-    Buffer2.Reset();
-  }
-
-  Buffer Buffer1;
-  Buffer Buffer2;
-};
-
-MemoryAccessObserver Observer;
-
-template <size_t Size> struct TestingElement {
-  static constexpr size_t kSize = Size;
-
-  static void Copy(char *__restrict dst, const char *__restrict src) {
-    Observer.ObserveRead(src, kSize);
-    Observer.ObserveWrite(dst, kSize);
-  }
-
-  static bool Equals(const char *lhs, const char *rhs) {
-    Observer.ObserveRead(lhs, kSize);
-    Observer.ObserveRead(rhs, kSize);
-    return true;
-  }
-
-  static int ThreeWayCompare(const char *lhs, const char *rhs) {
-    Observer.ObserveRead(lhs, kSize);
-    Observer.ObserveRead(rhs, kSize);
-    return 0;
-  }
-
-  static void SplatSet(char *dst, const unsigned char value) {
-    Observer.ObserveWrite(dst, kSize);
-  }
-};
-
-using Types = testing::TypeList<
-    TestingElement<1>,                                               // 1 Byte
-    TestingElement<2>,                                               // 2 Bytes
-    TestingElement<4>,                                               // 4 Bytes
-    Repeated<TestingElement<2>, 3>,                                  // 6 Bytes
-    Chained<TestingElement<4>, TestingElement<2>, TestingElement<1>> // 7 Bytes
-    >;
-
-struct LlvmLibcTestAccessBase : public testing::Test {
-
-  template <typename HigherOrder, size_t Size, size_t Offset = 0>
-  void checkOperations(const BufferAccess &expected) {
-    static const BufferAccess untouched;
-
-    Observer.Reset();
-    HigherOrder::Copy(dst_ptr() + Offset, src_ptr() + Offset, Size);
-    ASSERT_STREQ(src().writes, untouched);
-    ASSERT_STREQ(dst().reads, untouched);
-    ASSERT_STREQ(src().reads, expected);
-    ASSERT_STREQ(dst().writes, expected);
-    Observer.Reset();
-    HigherOrder::Equals(lhs_ptr() + Offset, rhs_ptr() + Offset, Size);
-    ASSERT_STREQ(lhs().writes, untouched);
-    ASSERT_STREQ(rhs().writes, untouched);
-    ASSERT_STREQ(lhs().reads, expected);
-    ASSERT_STREQ(rhs().reads, expected);
-    Observer.Reset();
-    HigherOrder::ThreeWayCompare(lhs_ptr() + Offset, rhs_ptr() + Offset, Size);
-    ASSERT_STREQ(lhs().writes, untouched);
-    ASSERT_STREQ(rhs().writes, untouched);
-    ASSERT_STREQ(lhs().reads, expected);
-    ASSERT_STREQ(rhs().reads, expected);
-    Observer.Reset();
-    HigherOrder::SplatSet(dst_ptr() + Offset, 5, Size);
-    ASSERT_STREQ(src().reads, untouched);
-    ASSERT_STREQ(src().writes, untouched);
-    ASSERT_STREQ(dst().reads, untouched);
-    ASSERT_STREQ(dst().writes, expected);
-  }
-
-  void checkMaxAccess(const BufferAccess &expected, int max) {
-    for (size_t i = 0; i < kMaxBuffer; ++i) {
-      int value = (int)expected[i] - '0';
-      if (value < 0 || value > max) {
-        printf("expected no more than %d access, was '%s'\n", max,
-               (const char *)expected);
-        ASSERT_LE(value, max);
-      }
-    }
-  }
-
-private:
-  const Buffer &lhs() const { return Observer.Buffer1; }
-  const Buffer &rhs() const { return Observer.Buffer2; }
-  const Buffer &src() const { return Observer.Buffer2; }
-  const Buffer &dst() const { return Observer.Buffer1; }
-  Buffer &dst() { return Observer.Buffer1; }
-
-  char *dst_ptr() { return dst().data.begin(); }
-  const char *src_ptr() { return src().data.begin(); }
-  const char *lhs_ptr() { return lhs().data.begin(); }
-  const char *rhs_ptr() { return rhs().data.begin(); }
-};
-
-template <typename ParamType>
-struct LlvmLibcTestAccessTail : public LlvmLibcTestAccessBase {
-
-  void TearDown() override {
-    static constexpr size_t Size = 10;
-
-    BufferAccess expected;
-    expected.Touch(Size - ParamType::kSize, ParamType::kSize);
-
-    checkMaxAccess(expected, 1);
-    checkOperations<Tail<ParamType>, Size>(expected);
-  }
-};
-TYPED_TEST_F(LlvmLibcTestAccessTail, Operations, Types) {}
-
-template <typename ParamType>
-struct LlvmLibcTestAccessHeadTail : public LlvmLibcTestAccessBase {
-  void TearDown() override {
-    static constexpr size_t Size = 10;
-
-    BufferAccess expected;
-    expected.Touch(0, ParamType::kSize);
-    expected.Touch(Size - ParamType::kSize, ParamType::kSize);
-
-    checkMaxAccess(expected, 2);
-    checkOperations<HeadTail<ParamType>, Size>(expected);
-  }
-};
-TYPED_TEST_F(LlvmLibcTestAccessHeadTail, Operations, Types) {}
-
-template <typename ParamType>
-struct LlvmLibcTestAccessLoop : public LlvmLibcTestAccessBase {
-  void TearDown() override {
-    static constexpr size_t Size = 20;
-
-    BufferAccess expected;
-    for (size_t i = 0; i < Size - ParamType::kSize; i += ParamType::kSize)
-      expected.Touch(i, ParamType::kSize);
-    expected.Touch(Size - ParamType::kSize, ParamType::kSize);
-
-    checkMaxAccess(expected, 2);
-    checkOperations<Loop<ParamType>, Size>(expected);
-  }
-};
-TYPED_TEST_F(LlvmLibcTestAccessLoop, Operations, Types) {}
-
-template <typename ParamType>
-struct LlvmLibcTestAccessAlignedAccess : public LlvmLibcTestAccessBase {
-  void TearDown() override {
-    static constexpr size_t Size = 10;
-    static constexpr size_t Offset = 2;
-    using AlignmentT = TestingElement<4>;
-
-    BufferAccess expected;
-    expected.Touch(Offset, AlignmentT::kSize);
-    expected.Touch(AlignmentT::kSize, ParamType::kSize);
-    expected.Touch(Offset + Size - ParamType::kSize, ParamType::kSize);
-
-    checkMaxAccess(expected, 3);
-    checkOperations<Align<AlignmentT, Arg::_1>::Then<HeadTail<ParamType>>, Size,
-                    Offset>(expected);
-    checkOperations<Align<AlignmentT, Arg::_2>::Then<HeadTail<ParamType>>, Size,
-                    Offset>(expected);
-  }
-};
-TYPED_TEST_F(LlvmLibcTestAccessAlignedAccess, Operations, Types) {}
-
-} // namespace __llvm_libc


        


More information about the libc-commits mailing list