[libc-commits] [libc] 4fed4b0 - Revert "[libc] New version of the mem* framework"
Guillaume Chatelet via libc-commits
libc-commits at lists.llvm.org
Fri Oct 14 05:27:22 PDT 2022
Author: Guillaume Chatelet
Date: 2022-10-14T12:27:04Z
New Revision: 4fed4b094334e0978a5d4bbe36fae3eba4a18448
URL: https://github.com/llvm/llvm-project/commit/4fed4b094334e0978a5d4bbe36fae3eba4a18448
DIFF: https://github.com/llvm/llvm-project/commit/4fed4b094334e0978a5d4bbe36fae3eba4a18448.diff
LOG: Revert "[libc] New version of the mem* framework"
This reverts commit 98bf836f3127a346a81da5ae3e27246935298de4.
Added:
libc/src/string/memory_utils/elements.h
libc/src/string/memory_utils/elements_aarch64.h
libc/src/string/memory_utils/elements_x86.h
libc/test/src/string/memory_utils/elements_test.cpp
libc/test/src/string/memory_utils/memory_access_test.cpp
Modified:
libc/src/stdio/printf_core/string_writer.cpp
libc/src/string/bcmp.cpp
libc/src/string/memcmp.cpp
libc/src/string/memmove.cpp
libc/src/string/memory_utils/CMakeLists.txt
libc/src/string/memory_utils/bcmp_implementations.h
libc/src/string/memory_utils/memcmp_implementations.h
libc/src/string/memory_utils/memcpy_implementations.h
libc/src/string/memory_utils/memset_implementations.h
libc/src/string/memory_utils/utils.h
libc/src/string/memset.cpp
libc/test/src/string/bcmp_test.cpp
libc/test/src/string/memmove_test.cpp
libc/test/src/string/memory_utils/CMakeLists.txt
libc/test/src/string/memory_utils/utils_test.cpp
utils/bazel/llvm-project-overlay/libc/BUILD.bazel
Removed:
libc/src/string/memory_utils/README.md
libc/src/string/memory_utils/op_aarch64.h
libc/src/string/memory_utils/op_builtin.h
libc/src/string/memory_utils/op_generic.h
libc/src/string/memory_utils/op_x86.h
################################################################################
diff --git a/libc/src/stdio/printf_core/string_writer.cpp b/libc/src/stdio/printf_core/string_writer.cpp
index 472573d4a8137..a80df32d40a02 100644
--- a/libc/src/stdio/printf_core/string_writer.cpp
+++ b/libc/src/stdio/printf_core/string_writer.cpp
@@ -33,7 +33,7 @@ void StringWriter::write(char new_char, size_t len) {
len = available_capacity;
if (len > 0) {
- inline_memset(cur_buffer, static_cast<uint8_t>(new_char), len);
+ inline_memset(cur_buffer, new_char, len);
cur_buffer += len;
available_capacity -= len;
}
diff --git a/libc/src/string/bcmp.cpp b/libc/src/string/bcmp.cpp
index fb007802a8d1d..963a7f5bce17c 100644
--- a/libc/src/string/bcmp.cpp
+++ b/libc/src/string/bcmp.cpp
@@ -14,8 +14,8 @@ namespace __llvm_libc {
LLVM_LIBC_FUNCTION(int, bcmp,
(const void *lhs, const void *rhs, size_t count)) {
- return static_cast<int>(inline_bcmp(static_cast<const char *>(lhs),
- static_cast<const char *>(rhs), count));
+ return inline_bcmp(static_cast<const char *>(lhs),
+ static_cast<const char *>(rhs), count);
}
} // namespace __llvm_libc
diff --git a/libc/src/string/memcmp.cpp b/libc/src/string/memcmp.cpp
index 357b57d1dbeaa..292525e17dad0 100644
--- a/libc/src/string/memcmp.cpp
+++ b/libc/src/string/memcmp.cpp
@@ -15,8 +15,8 @@ namespace __llvm_libc {
LLVM_LIBC_FUNCTION(int, memcmp,
(const void *lhs, const void *rhs, size_t count)) {
- return static_cast<int>(inline_memcmp(static_cast<const char *>(lhs),
- static_cast<const char *>(rhs), count));
+ return inline_memcmp(static_cast<const char *>(lhs),
+ static_cast<const char *>(rhs), count);
}
} // namespace __llvm_libc
diff --git a/libc/src/string/memmove.cpp b/libc/src/string/memmove.cpp
index 67f7c84868519..f24257893b20c 100644
--- a/libc/src/string/memmove.cpp
+++ b/libc/src/string/memmove.cpp
@@ -9,103 +9,36 @@
#include "src/string/memmove.h"
#include "src/__support/common.h"
-#include "src/string/memory_utils/op_aarch64.h"
-#include "src/string/memory_utils/op_builtin.h"
-#include "src/string/memory_utils/op_generic.h"
-#include "src/string/memory_utils/op_x86.h"
+#include "src/__support/integer_operations.h"
+#include "src/string/memory_utils/elements.h"
#include <stddef.h> // size_t, ptrdiff_t
-#include <stdio.h>
-
namespace __llvm_libc {
-static inline void inline_memmove_embedded_tiny(Ptr dst, CPtr src,
- size_t count) {
- if ((count == 0) || (dst == src))
- return;
- if (dst < src) {
-#pragma nounroll
- for (size_t offset = 0; offset < count; ++offset)
- builtin::Memcpy<1>::block(dst + offset, src + offset);
- } else {
-#pragma nounroll
- for (ptrdiff_t offset = count; offset >= 0; --offset)
- builtin::Memcpy<1>::block(dst + offset, src + offset);
- }
-}
-
-template <size_t MaxSize>
-static inline void inline_memmove_generic(Ptr dst, CPtr src, size_t count) {
+static inline void inline_memmove(char *dst, const char *src, size_t count) {
+ using namespace __llvm_libc::scalar;
if (count == 0)
return;
if (count == 1)
- return generic::Memmove<1, MaxSize>::block(dst, src);
+ return move<_1>(dst, src);
if (count <= 4)
- return generic::Memmove<2, MaxSize>::head_tail(dst, src, count);
+ return move<HeadTail<_2>>(dst, src, count);
if (count <= 8)
- return generic::Memmove<4, MaxSize>::head_tail(dst, src, count);
+ return move<HeadTail<_4>>(dst, src, count);
if (count <= 16)
- return generic::Memmove<8, MaxSize>::head_tail(dst, src, count);
+ return move<HeadTail<_8>>(dst, src, count);
if (count <= 32)
- return generic::Memmove<16, MaxSize>::head_tail(dst, src, count);
+ return move<HeadTail<_16>>(dst, src, count);
if (count <= 64)
- return generic::Memmove<32, MaxSize>::head_tail(dst, src, count);
+ return move<HeadTail<_32>>(dst, src, count);
if (count <= 128)
- return generic::Memmove<64, MaxSize>::head_tail(dst, src, count);
- if (dst < src) {
- generic::Memmove<32, MaxSize>::template align_forward<Arg::Src>(dst, src,
- count);
- return generic::Memmove<64, MaxSize>::loop_and_tail_forward(dst, src,
- count);
- } else {
- generic::Memmove<32, MaxSize>::template align_backward<Arg::Src>(dst, src,
- count);
- return generic::Memmove<64, MaxSize>::loop_and_tail_backward(dst, src,
- count);
- }
-}
+ return move<HeadTail<_64>>(dst, src, count);
-static inline void inline_memmove(Ptr dst, CPtr src, size_t count) {
-#if defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64)
-#if defined(LLVM_LIBC_ARCH_X86)
- static constexpr size_t kMaxSize = x86::kAvx512F ? 64
- : x86::kAvx ? 32
- : x86::kSse2 ? 16
- : 8;
-#elif defined(LLVM_LIBC_ARCH_AARCH64)
- static constexpr size_t kMaxSize = aarch64::kNeon ? 16 : 8;
-#endif
- // return inline_memmove_generic<kMaxSize>(dst, src, count);
- if (count == 0)
- return;
- if (count == 1)
- return generic::Memmove<1, kMaxSize>::block(dst, src);
- if (count <= 4)
- return generic::Memmove<2, kMaxSize>::head_tail(dst, src, count);
- if (count <= 8)
- return generic::Memmove<4, kMaxSize>::head_tail(dst, src, count);
- if (count <= 16)
- return generic::Memmove<8, kMaxSize>::head_tail(dst, src, count);
- if (count <= 32)
- return generic::Memmove<16, kMaxSize>::head_tail(dst, src, count);
- if (count <= 64)
- return generic::Memmove<32, kMaxSize>::head_tail(dst, src, count);
- if (count <= 128)
- return generic::Memmove<64, kMaxSize>::head_tail(dst, src, count);
- if (dst < src) {
- generic::Memmove<32, kMaxSize>::align_forward<Arg::Src>(dst, src, count);
- return generic::Memmove<64, kMaxSize>::loop_and_tail_forward(dst, src,
- count);
- } else {
- generic::Memmove<32, kMaxSize>::align_backward<Arg::Src>(dst, src, count);
- return generic::Memmove<64, kMaxSize>::loop_and_tail_backward(dst, src,
- count);
- }
-#elif defined(LLVM_LIBC_ARCH_ARM)
- return inline_memmove_embedded_tiny(dst, src, count);
-#else
-#error "Unsupported platform"
-#endif
+ using AlignedMoveLoop = Align<_16, Arg::Src>::Then<Loop<_64>>;
+ if (dst < src)
+ return move<AlignedMoveLoop>(dst, src, count);
+ else if (dst > src)
+ return move_backward<AlignedMoveLoop>(dst, src, count);
}
LLVM_LIBC_FUNCTION(void *, memmove,
diff --git a/libc/src/string/memory_utils/CMakeLists.txt b/libc/src/string/memory_utils/CMakeLists.txt
index 630b0d4432426..d735fcfe54174 100644
--- a/libc/src/string/memory_utils/CMakeLists.txt
+++ b/libc/src/string/memory_utils/CMakeLists.txt
@@ -2,17 +2,13 @@
add_header_library(
memory_utils
HDRS
+ utils.h
+ elements.h
bcmp_implementations.h
bzero_implementations.h
memcmp_implementations.h
memcpy_implementations.h
memset_implementations.h
- op_aarch64.h
- op_higher_order.h
- op_builtin.h
- op_generic.h
- op_x86.h
- utils.h
DEPS
libc.src.__support.CPP.bit
)
diff --git a/libc/src/string/memory_utils/README.md b/libc/src/string/memory_utils/README.md
deleted file mode 100644
index 83a2906675f69..0000000000000
--- a/libc/src/string/memory_utils/README.md
+++ /dev/null
@@ -1,97 +0,0 @@
-# The mem* framework
-
-The framework handles the following mem* functions:
- - `memcpy`
- - `memmove`
- - `memset`
- - `bzero`
- - `bcmp`
- - `memcmp`
-
-## Building blocks
-
-These functions can be built out of a set of lower-level operations:
- - **`block`** : operates on a block of `SIZE` bytes.
- - **`tail`** : operates on the last `SIZE` bytes of the buffer (e.g., `[dst + count - SIZE, dst + count]`)
- - **`head_tail`** : operates on the first and last `SIZE` bytes. This is the same as calling `block` and `tail`.
- - **`loop_and_tail`** : calls `block` in a loop to consume as much as possible of the `count` bytes and handle the remaining bytes with a `tail` operation.
-
-As an illustration, let's take the example of a trivial `memset` implementation:
-
- ```C++
- extern "C" void memset(const char* dst, int value, size_t count) {
- if (count == 0) return;
- if (count == 1) return Memset<1>::block(dst, value);
- if (count == 2) return Memset<2>::block(dst, value);
- if (count == 3) return Memset<3>::block(dst, value);
- if (count <= 8) return Memset<4>::head_tail(dst, value, count); // Note that 0 to 4 bytes are written twice.
- if (count <= 16) return Memset<8>::head_tail(dst, value, count); // Same here.
- return Memset<16>::loop_and_tail(dst, value, count);
-}
- ```
-
-Now let's have a look into the `Memset` structure:
-
-```C++
-template <size_t Size>
-struct Memset {
- static constexpr size_t SIZE = Size;
-
- static inline void block(Ptr dst, uint8_t value) {
- // Implement me
- }
-
- static inline void tail(Ptr dst, uint8_t value, size_t count) {
- block(dst + count - SIZE, value);
- }
-
- static inline void head_tail(Ptr dst, uint8_t value, size_t count) {
- block(dst, value);
- tail(dst, value, count);
- }
-
- static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
- size_t offset = 0;
- do {
- block(dst + offset, value);
- offset += SIZE;
- } while (offset < count - SIZE);
- tail(dst, value, count);
- }
-};
-```
-
-As you can see, the `tail`, `head_tail` and `loop_and_tail` are higher order functions that build on each other. Only `block` really needs to be implemented.
-In earlier designs we implemented these higher order functions as function templates, but stating the implementations explicitly turned out to be more readable.
-**This design is useful because it provides customization points**. For instance, for `bcmp` on `aarch64` we can provide a better implementation of `head_tail` using vector reduction intrinsics.
-
-## Scoped specializations
-
-We can have several specializations of the `Memset` structure. Depending on the target requirements we can use one or several scopes for the same implementation.
-
-In the following example we use the `generic` implementation for the small sizes but use the `x86` implementation for the loop.
-```C++
- extern "C" void memset(const char* dst, int value, size_t count) {
- if (count == 0) return;
- if (count == 1) return generic::Memset<1>::block(dst, value);
- if (count == 2) return generic::Memset<2>::block(dst, value);
- if (count == 3) return generic::Memset<3>::block(dst, value);
- if (count <= 8) return generic::Memset<4>::head_tail(dst, value, count);
- if (count <= 16) return generic::Memset<8>::head_tail(dst, value, count);
- return x86::Memset<16>::loop_and_tail(dst, value, count);
-}
-```
-
-### The `builtin` scope
-
-Ultimately we would like the compiler to provide the code for the `block` function. For this we rely on dedicated builtins available in Clang (e.g., [`__builtin_memset_inline`](https://clang.llvm.org/docs/LanguageExtensions.html#guaranteed-inlined-memset))
-
-### The `generic` scope
-
-In this scope we define pure C++ implementations using native integral types and clang vector extensions.
-
-### The arch specific scopes
-
-Then come the implementations that use architecture- or microarchitecture-specific features (e.g., `rep;movsb` for `x86` or `dc zva` for `aarch64`).
-
-The purpose here is to rely on builtins as much as possible and fall back to `asm volatile` as a last resort.
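The `block` customization point above is deliberately left as `// Implement me`. As a rough illustration only (not part of this commit or the tree), a `generic`-style `block` for `Memset` can be as simple as a byte loop that the compiler is free to vectorize for a compile-time `SIZE`:

```C++
#include <stddef.h> // size_t
#include <stdint.h> // uint8_t

// Hypothetical sketch of the "Implement me" block above.
template <size_t Size> struct TrivialMemset {
  static constexpr size_t SIZE = Size;

  // Writes SIZE bytes of `value` starting at `dst`.
  static inline void block(char *dst, uint8_t value) {
    for (size_t i = 0; i < SIZE; ++i)
      dst[i] = static_cast<char>(value);
  }
};
```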
diff --git a/libc/src/string/memory_utils/bcmp_implementations.h b/libc/src/string/memory_utils/bcmp_implementations.h
index 209ed6d537be8..c26e38e51adf1 100644
--- a/libc/src/string/memory_utils/bcmp_implementations.h
+++ b/libc/src/string/memory_utils/bcmp_implementations.h
@@ -11,163 +11,49 @@
#include "src/__support/architectures.h"
#include "src/__support/common.h"
-#include "src/string/memory_utils/op_aarch64.h"
-#include "src/string/memory_utils/op_builtin.h"
-#include "src/string/memory_utils/op_generic.h"
-#include "src/string/memory_utils/op_x86.h"
+#include "src/string/memory_utils/elements.h"
#include <stddef.h> // size_t
namespace __llvm_libc {
-static inline BcmpReturnType inline_bcmp_embedded_tiny(CPtr p1, CPtr p2,
- size_t count) {
-#pragma nounroll
- for (size_t offset = 0; offset < count; ++offset)
- if (auto value = generic::Bcmp<1>::block(p1, p2))
- return value;
- return BcmpReturnType::ZERO();
+// Fixed-size difference between 'lhs' and 'rhs'.
+template <typename Element> bool differs(const char *lhs, const char *rhs) {
+ return !Element::equals(lhs, rhs);
}
-
-#if defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64)
-static inline BcmpReturnType inline_bcmp_generic_gt16(CPtr p1, CPtr p2,
- size_t count) {
- if (count < 256)
- return generic::Bcmp<16>::loop_and_tail(p1, p2, count);
- if (auto value = generic::Bcmp<64>::block(p1, p2))
- return value;
- align_to_next_boundary<64, Arg::P1>(p1, p2, count);
- return generic::Bcmp<64>::loop_and_tail(p1, p2, count);
+// Runtime-size difference between 'lhs' and 'rhs'.
+template <typename Element>
+bool differs(const char *lhs, const char *rhs, size_t size) {
+ return !Element::equals(lhs, rhs, size);
}
-#endif // defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64)
+static inline int inline_bcmp(const char *lhs, const char *rhs, size_t count) {
#if defined(LLVM_LIBC_ARCH_X86)
-static inline BcmpReturnType inline_bcmp_x86_sse2_gt16(CPtr p1, CPtr p2,
- size_t count) {
- if (count <= 32)
- return x86::sse2::Bcmp<16>::head_tail(p1, p2, count);
- if (count < 256)
- return x86::sse2::Bcmp<16>::loop_and_tail(p1, p2, count);
- if (auto value = x86::sse2::Bcmp<16>::block(p1, p2))
- return value;
- align_to_next_boundary<16, Arg::P1>(p1, p2, count);
- return x86::sse2::Bcmp<64>::loop_and_tail(p1, p2, count);
-}
-
-static inline BcmpReturnType inline_bcmp_x86_avx2_gt16(CPtr p1, CPtr p2,
- size_t count) {
- if (count <= 32)
- return x86::sse2::Bcmp<16>::head_tail(p1, p2, count);
- if (count <= 64)
- return x86::avx2::Bcmp<32>::head_tail(p1, p2, count);
- if (count <= 128)
- return x86::avx2::Bcmp<64>::head_tail(p1, p2, count);
- if (unlikely(count >= 256)) {
- if (auto value = x86::avx2::Bcmp<64>::block(p1, p2))
- return value;
- align_to_next_boundary<64, Arg::P1>(p1, p2, count);
- }
- return x86::avx2::Bcmp<64>::loop_and_tail(p1, p2, count);
-}
-
-static inline BcmpReturnType inline_bcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2,
- size_t count) {
- if (count <= 32)
- return x86::sse2::Bcmp<16>::head_tail(p1, p2, count);
- if (count <= 64)
- return x86::avx2::Bcmp<32>::head_tail(p1, p2, count);
- if (count <= 128)
- return x86::avx512bw::Bcmp<64>::head_tail(p1, p2, count);
- if (unlikely(count >= 256)) {
- if (auto value = x86::avx512bw::Bcmp<64>::block(p1, p2))
- return value;
- align_to_next_boundary<64, Arg::P1>(p1, p2, count);
- }
- return x86::avx512bw::Bcmp<64>::loop_and_tail(p1, p2, count);
-}
-
-static inline BcmpReturnType inline_bcmp_x86(CPtr p1, CPtr p2, size_t count) {
+ using namespace ::__llvm_libc::x86;
+#elif defined(LLVM_LIBC_ARCH_AARCH64)
+ using namespace ::__llvm_libc::aarch64;
+#else
+ using namespace ::__llvm_libc::scalar;
+#endif
if (count == 0)
- return BcmpReturnType::ZERO();
+ return 0;
if (count == 1)
- return generic::Bcmp<1>::block(p1, p2);
+ return differs<_1>(lhs, rhs);
if (count == 2)
- return generic::Bcmp<2>::block(p1, p2);
- if (count <= 4)
- return generic::Bcmp<2>::head_tail(p1, p2, count);
+ return differs<_2>(lhs, rhs);
+ if (count == 3)
+ return differs<_3>(lhs, rhs);
if (count <= 8)
- return generic::Bcmp<4>::head_tail(p1, p2, count);
+ return differs<HeadTail<_4>>(lhs, rhs, count);
if (count <= 16)
- return generic::Bcmp<8>::head_tail(p1, p2, count);
- if constexpr (x86::kAvx512BW)
- return inline_bcmp_x86_avx512bw_gt16(p1, p2, count);
- else if constexpr (x86::kAvx2)
- return inline_bcmp_x86_avx2_gt16(p1, p2, count);
- else if constexpr (x86::kSse2)
- return inline_bcmp_x86_sse2_gt16(p1, p2, count);
- else
- return inline_bcmp_generic_gt16(p1, p2, count);
-}
-#endif // defined(LLVM_LIBC_ARCH_X86)
-
-#if defined(LLVM_LIBC_ARCH_AARCH64)
-static inline BcmpReturnType inline_bcmp_aarch64(CPtr p1, CPtr p2,
- size_t count) {
- if (likely(count <= 32)) {
- if (unlikely(count >= 16)) {
- return generic::Bcmp<16>::head_tail(p1, p2, count);
- }
- switch (count) {
- case 0:
- return BcmpReturnType::ZERO();
- case 1:
- return generic::Bcmp<1>::block(p1, p2);
- case 2:
- return generic::Bcmp<2>::block(p1, p2);
- case 3:
- return generic::Bcmp<2>::head_tail(p1, p2, count);
- case 4:
- return generic::Bcmp<4>::block(p1, p2);
- case 5:
- case 6:
- case 7:
- return generic::Bcmp<4>::head_tail(p1, p2, count);
- case 8:
- return generic::Bcmp<8>::block(p1, p2);
- case 9:
- case 10:
- case 11:
- case 12:
- case 13:
- case 14:
- case 15:
- return generic::Bcmp<8>::head_tail(p1, p2, count);
- }
- }
-
+ return differs<HeadTail<_8>>(lhs, rhs, count);
+ if (count <= 32)
+ return differs<HeadTail<_16>>(lhs, rhs, count);
if (count <= 64)
- return generic::Bcmp<32>::head_tail(p1, p2, count);
-
- // Aligned loop if > 256, otherwise normal loop
- if (count > 256) {
- if (auto value = generic::Bcmp<32>::block(p1, p2))
- return value;
- align_to_next_boundary<16, Arg::P1>(p1, p2, count);
- }
- return generic::Bcmp<32>::loop_and_tail(p1, p2, count);
-}
-#endif // defined(LLVM_LIBC_ARCH_AARCH64)
-
-static inline BcmpReturnType inline_bcmp(CPtr p1, CPtr p2, size_t count) {
-#if defined(LLVM_LIBC_ARCH_X86)
- return inline_bcmp_x86(p1, p2, count);
-#elif defined(LLVM_LIBC_ARCH_AARCH64)
- return inline_bcmp_aarch64(p1, p2, count);
-#elif defined(LLVM_LIBC_ARCH_ARM)
- return inline_bcmp_embedded_tiny(p1, p2, count);
-#else
-#error "Unsupported platform"
-#endif
+ return differs<HeadTail<_32>>(lhs, rhs, count);
+ if (count <= 128)
+ return differs<HeadTail<_64>>(lhs, rhs, count);
+ return differs<Align<_32>::Then<Loop<_32>>>(lhs, rhs, count);
}
} // namespace __llvm_libc
diff --git a/libc/src/string/memory_utils/elements.h b/libc/src/string/memory_utils/elements.h
new file mode 100644
index 0000000000000..f5a38308d5273
--- /dev/null
+++ b/libc/src/string/memory_utils/elements.h
@@ -0,0 +1,774 @@
+//===-- Elementary operations to compose memory primitives ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_H
+#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_H
+
+#include <stddef.h> // size_t
+#include <stdint.h> // uint8_t, uint16_t, uint32_t, uint64_t
+
+#include "src/__support/endian.h"
+#include "src/string/memory_utils/utils.h"
+
+namespace __llvm_libc {
+
+// Elementary Operations
+// --------------------------------
+// We define abstract elementary operations acting on fixed chunks of memory.
+// These are low level building blocks that are meant to be assembled to compose
+// higher order abstractions. Each function is defined twice: once with
+// fixed-size operations, and once with runtime-size operations.
+
+// Fixed-size copy from 'src' to 'dst'.
+template <typename Element>
+void copy(char *__restrict dst, const char *__restrict src) {
+ Element::copy(dst, src);
+}
+// Runtime-size copy from 'src' to 'dst'.
+template <typename Element>
+void copy(char *__restrict dst, const char *__restrict src, size_t size) {
+ Element::copy(dst, src, size);
+}
+
+// Fixed-size move from 'src' to 'dst'.
+template <typename Element> void move(char *dst, const char *src) {
+ Element::move(dst, src);
+}
+// Runtime-size move from 'src' to 'dst'.
+template <typename Element> void move(char *dst, const char *src, size_t size) {
+ Element::move(dst, src, size);
+}
+// Runtime-size move from 'src' to 'dst'.
+template <typename Element>
+void move_backward(char *dst, const char *src, size_t size) {
+ Element::move_backward(dst, src, size);
+}
+
+// Fixed-size equality between 'lhs' and 'rhs'.
+template <typename Element> bool equals(const char *lhs, const char *rhs) {
+ return Element::equals(lhs, rhs);
+}
+// Runtime-size equality between 'lhs' and 'rhs'.
+template <typename Element>
+bool equals(const char *lhs, const char *rhs, size_t size) {
+ return Element::equals(lhs, rhs, size);
+}
+
+// Fixed-size three-way comparison between 'lhs' and 'rhs'.
+template <typename Element>
+int three_way_compare(const char *lhs, const char *rhs) {
+ return Element::three_way_compare(lhs, rhs);
+}
+// Runtime-size three-way comparison between 'lhs' and 'rhs'.
+template <typename Element>
+int three_way_compare(const char *lhs, const char *rhs, size_t size) {
+ return Element::three_way_compare(lhs, rhs, size);
+}
+
+// Fixed-size initialization.
+template <typename Element>
+void splat_set(char *dst, const unsigned char value) {
+ Element::splat_set(dst, value);
+}
+// Runtime-size initialization.
+template <typename Element>
+void splat_set(char *dst, const unsigned char value, size_t size) {
+ Element::splat_set(dst, value, size);
+}
+
+// Stack placeholder for Move operations.
+template <typename Element> struct Storage { char bytes[Element::SIZE]; };
+
+// Fixed-size Higher-Order Operations
+// ----------------------------------
+// - Repeated<Type, ElementCount>: Repeat the operation several times in a row.
+// - Chained<Types...>: Chain the operation of several types.
+
+// Repeat the operation several times in a row.
+template <typename Element, size_t ElementCount> struct Repeated {
+ static constexpr size_t SIZE = ElementCount * Element::SIZE;
+
+ static void copy(char *__restrict dst, const char *__restrict src) {
+ for (size_t i = 0; i < ElementCount; ++i) {
+ const size_t offset = i * Element::SIZE;
+ Element::copy(dst + offset, src + offset);
+ }
+ }
+
+ static void move(char *dst, const char *src) {
+ const auto value = load(src);
+ store(dst, value);
+ }
+
+ static bool equals(const char *lhs, const char *rhs) {
+ for (size_t i = 0; i < ElementCount; ++i) {
+ const size_t offset = i * Element::SIZE;
+ if (!Element::equals(lhs + offset, rhs + offset))
+ return false;
+ }
+ return true;
+ }
+
+ static int three_way_compare(const char *lhs, const char *rhs) {
+ for (size_t i = 0; i < ElementCount; ++i) {
+ const size_t offset = i * Element::SIZE;
+ // We make the assumption that 'equals' is cheaper than
+ // 'three_way_compare'.
+ if (Element::equals(lhs + offset, rhs + offset))
+ continue;
+ return Element::three_way_compare(lhs + offset, rhs + offset);
+ }
+ return 0;
+ }
+
+ static void splat_set(char *dst, const unsigned char value) {
+ for (size_t i = 0; i < ElementCount; ++i) {
+ const size_t offset = i * Element::SIZE;
+ Element::splat_set(dst + offset, value);
+ }
+ }
+
+ static Storage<Repeated> load(const char *ptr) {
+ Storage<Repeated> value;
+ copy(reinterpret_cast<char *>(&value), ptr);
+ return value;
+ }
+
+ static void store(char *ptr, Storage<Repeated> value) {
+ copy(ptr, reinterpret_cast<const char *>(&value));
+ }
+};
+
+template <typename Element> struct Repeated<Element, 0> {
+ static void move(char *, const char *) {}
+};
+
+// Chain the operation of several types.
+// For instance, to handle a 3 bytes operation, one can use:
+// Chained<UINT16, UINT8>::Operation();
+template <typename... Types> struct Chained;
+
+template <typename Head, typename... Tail> struct Chained<Head, Tail...> {
+ static constexpr size_t SIZE = Head::SIZE + Chained<Tail...>::SIZE;
+
+ static void copy(char *__restrict dst, const char *__restrict src) {
+ Chained<Tail...>::copy(dst + Head::SIZE, src + Head::SIZE);
+ __llvm_libc::copy<Head>(dst, src);
+ }
+
+ static void move(char *dst, const char *src) {
+ const auto value = Head::load(src);
+ Chained<Tail...>::move(dst + Head::SIZE, src + Head::SIZE);
+ Head::store(dst, value);
+ }
+
+ static bool equals(const char *lhs, const char *rhs) {
+ if (!__llvm_libc::equals<Head>(lhs, rhs))
+ return false;
+ return Chained<Tail...>::equals(lhs + Head::SIZE, rhs + Head::SIZE);
+ }
+
+ static int three_way_compare(const char *lhs, const char *rhs) {
+ if (__llvm_libc::equals<Head>(lhs, rhs))
+ return Chained<Tail...>::three_way_compare(lhs + Head::SIZE,
+ rhs + Head::SIZE);
+ return __llvm_libc::three_way_compare<Head>(lhs, rhs);
+ }
+
+ static void splat_set(char *dst, const unsigned char value) {
+ Chained<Tail...>::splat_set(dst + Head::SIZE, value);
+ __llvm_libc::splat_set<Head>(dst, value);
+ }
+};
+
+template <> struct Chained<> {
+ static constexpr size_t SIZE = 0;
+ static void copy(char *__restrict, const char *__restrict) {}
+ static void move(char *, const char *) {}
+ static bool equals(const char *, const char *) { return true; }
+ static int three_way_compare(const char *, const char *) { return 0; }
+ static void splat_set(char *, const unsigned char) {}
+};
+
+// Overlap ElementA and ElementB so they span Size bytes.
+template <size_t Size, typename ElementA, typename ElementB = ElementA>
+struct Overlap {
+ static constexpr size_t SIZE = Size;
+ static_assert(ElementB::SIZE <= ElementA::SIZE, "ElementB too big");
+ static_assert(ElementA::SIZE <= Size, "ElementA too big");
+ static_assert((ElementA::SIZE + ElementB::SIZE) >= Size,
+ "Elements too small to overlap");
+ static constexpr size_t OFFSET = SIZE - ElementB::SIZE;
+
+ static void copy(char *__restrict dst, const char *__restrict src) {
+ ElementA::copy(dst, src);
+ ElementB::copy(dst + OFFSET, src + OFFSET);
+ }
+
+ static void move(char *dst, const char *src) {
+ const auto value_a = ElementA::load(src);
+ const auto value_b = ElementB::load(src + OFFSET);
+ ElementB::store(dst + OFFSET, value_b);
+ ElementA::store(dst, value_a);
+ }
+
+ static bool equals(const char *lhs, const char *rhs) {
+ if (!ElementA::equals(lhs, rhs))
+ return false;
+ if (!ElementB::equals(lhs + OFFSET, rhs + OFFSET))
+ return false;
+ return true;
+ }
+
+ static int three_way_compare(const char *lhs, const char *rhs) {
+ if (!ElementA::equals(lhs, rhs))
+ return ElementA::three_way_compare(lhs, rhs);
+ if (!ElementB::equals(lhs + OFFSET, rhs + OFFSET))
+ return ElementB::three_way_compare(lhs + OFFSET, rhs + OFFSET);
+ return 0;
+ }
+
+ static void splat_set(char *dst, const unsigned char value) {
+ ElementA::splat_set(dst, value);
+ ElementB::splat_set(dst + OFFSET, value);
+ }
+};
+
+// Runtime-size Higher-Order Operations
+// ------------------------------------
+// - Tail<T>: Perform the operation on the last 'T::SIZE' bytes of the buffer.
+// - HeadTail<T>: Perform the operation on the first and last 'T::SIZE' bytes
+// of the buffer.
+// - Loop<T>: Perform a loop of fixed-sized operations.
+
+// Perform the operation on the last 'T::SIZE' bytes of the buffer.
+//
+// e.g. with
+// [1234567812345678123]
+// [__XXXXXXXXXXXXXX___]
+// [________XXXXXXXX___]
+//
+// Precondition: `size >= T::SIZE`.
+template <typename T> struct Tail {
+ static void copy(char *__restrict dst, const char *__restrict src,
+ size_t size) {
+ return T::copy(dst + offset(size), src + offset(size));
+ }
+
+ static bool equals(const char *lhs, const char *rhs, size_t size) {
+ return T::equals(lhs + offset(size), rhs + offset(size));
+ }
+
+ static int three_way_compare(const char *lhs, const char *rhs, size_t size) {
+ return T::three_way_compare(lhs + offset(size), rhs + offset(size));
+ }
+
+ static void splat_set(char *dst, const unsigned char value, size_t size) {
+ return T::splat_set(dst + offset(size), value);
+ }
+
+ static size_t offset(size_t size) { return size - T::SIZE; }
+};
+
+// Perform the operation on the first and last 'T::SIZE' bytes of the buffer.
+// This is useful for overlapping operations.
+//
+// e.g. with
+// [1234567812345678123]
+// [__XXXXXXXXXXXXXX___]
+// [__XXXXXXXX_________]
+// [________XXXXXXXX___]
+//
+// Precondition: `size >= T::SIZE && size <= 2 x T::SIZE`.
+template <typename T> struct HeadTail {
+ static void copy(char *__restrict dst, const char *__restrict src,
+ size_t size) {
+ T::copy(dst, src);
+ Tail<T>::copy(dst, src, size);
+ }
+
+ static void move(char *dst, const char *src, size_t size) {
+ const size_t offset = Tail<T>::offset(size);
+ const auto head_value = T::load(src);
+ const auto tail_value = T::load(src + offset);
+ T::store(dst + offset, tail_value);
+ T::store(dst, head_value);
+ }
+
+ static bool equals(const char *lhs, const char *rhs, size_t size) {
+ if (!T::equals(lhs, rhs))
+ return false;
+ return Tail<T>::equals(lhs, rhs, size);
+ }
+
+ static int three_way_compare(const char *lhs, const char *rhs, size_t size) {
+ if (!T::equals(lhs, rhs))
+ return T::three_way_compare(lhs, rhs);
+ return Tail<T>::three_way_compare(lhs, rhs, size);
+ }
+
+ static void splat_set(char *dst, const unsigned char value, size_t size) {
+ T::splat_set(dst, value);
+ Tail<T>::splat_set(dst, value, size);
+ }
+};
+
+// Simple loop ending with a Tail operation.
+//
+// e.g. with
+// [12345678123456781234567812345678]
+// [__XXXXXXXXXXXXXXXXXXXXXXXXXXXX___]
+// [__XXXXXXXX_______________________]
+// [__________XXXXXXXX_______________]
+// [__________________XXXXXXXX_______]
+// [______________________XXXXXXXX___]
+//
+// Precondition:
+// - size >= T::SIZE
+template <typename T, typename TailT = T> struct Loop {
+ static_assert(T::SIZE == TailT::SIZE,
+ "Tail type must have the same size as T");
+
+ static void copy(char *__restrict dst, const char *__restrict src,
+ size_t size) {
+ size_t offset = 0;
+ do {
+ T::copy(dst + offset, src + offset);
+ offset += T::SIZE;
+ } while (offset < size - T::SIZE);
+ Tail<TailT>::copy(dst, src, size);
+ }
+
+ // Move forward suitable when dst < src. We load the tail bytes before
+ // handling the loop.
+ //
+ // e.g. Moving two bytes
+ // [ | | | | |]
+ // [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___]
+ // [_________________________LLLLLLLL___]
+ // [___LLLLLLLL_________________________]
+ // [_SSSSSSSS___________________________]
+ // [___________LLLLLLLL_________________]
+ // [_________SSSSSSSS___________________]
+ // [___________________LLLLLLLL_________]
+ // [_________________SSSSSSSS___________]
+ // [_______________________SSSSSSSS_____]
+ static void move(char *dst, const char *src, size_t size) {
+ const size_t tail_offset = Tail<T>::offset(size);
+ const auto tail_value = TailT::load(src + tail_offset);
+ size_t offset = 0;
+ do {
+ T::move(dst + offset, src + offset);
+ offset += T::SIZE;
+ } while (offset < size - T::SIZE);
+ TailT::store(dst + tail_offset, tail_value);
+ }
+
+ // Move backward suitable when dst > src. We load the head bytes before
+ // handling the loop.
+ //
+ // e.g. Moving two bytes
+ // [ | | | | |]
+ // [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___]
+ // [___LLLLLLLL_________________________]
+ // [_________________________LLLLLLLL___]
+ // [___________________________SSSSSSSS_]
+ // [_________________LLLLLLLL___________]
+ // [___________________SSSSSSSS_________]
+ // [_________LLLLLLLL___________________]
+ // [___________SSSSSSSS_________________]
+ // [_____SSSSSSSS_______________________]
+ static void move_backward(char *dst, const char *src, size_t size) {
+ const auto head_value = TailT::load(src);
+ ptrdiff_t offset = size - T::SIZE;
+ do {
+ T::move(dst + offset, src + offset);
+ offset -= T::SIZE;
+ } while (offset >= 0);
+ TailT::store(dst, head_value);
+ }
+
+ static bool equals(const char *lhs, const char *rhs, size_t size) {
+ size_t offset = 0;
+ do {
+ if (!T::equals(lhs + offset, rhs + offset))
+ return false;
+ offset += T::SIZE;
+ } while (offset < size - T::SIZE);
+ return Tail<TailT>::equals(lhs, rhs, size);
+ }
+
+ static int three_way_compare(const char *lhs, const char *rhs, size_t size) {
+ size_t offset = 0;
+ do {
+ if (!T::equals(lhs + offset, rhs + offset))
+ return T::three_way_compare(lhs + offset, rhs + offset);
+ offset += T::SIZE;
+ } while (offset < size - T::SIZE);
+ return Tail<TailT>::three_way_compare(lhs, rhs, size);
+ }
+
+ static void splat_set(char *dst, const unsigned char value, size_t size) {
+ size_t offset = 0;
+ do {
+ T::splat_set(dst + offset, value);
+ offset += T::SIZE;
+ } while (offset < size - T::SIZE);
+ Tail<TailT>::splat_set(dst, value, size);
+ }
+};
+
+namespace internal {
+
+template <Arg arg> struct ArgSelector {};
+
+template <> struct ArgSelector<Arg::_1> {
+ template <typename T1, typename T2>
+ static T1 *__restrict &Select(T1 *__restrict &p1ref, T2 *__restrict &) {
+ return p1ref;
+ }
+};
+
+template <> struct ArgSelector<Arg::_2> {
+ template <typename T1, typename T2>
+ static T2 *__restrict &Select(T1 *__restrict &, T2 *__restrict &p2ref) {
+ return p2ref;
+ }
+};
+
+// Provides a specialized bump function that adjusts pointers and size so first
+// argument (resp. second argument) gets aligned to Alignment.
+// We make sure the compiler knows about the adjusted pointer alignment.
+// The 'additional_bumps' parameter allows reaching the previous / next
+// aligned pointers.
+template <Arg arg, size_t Alignment> struct Align {
+ template <typename T1, typename T2>
+ static void bump(T1 *__restrict &p1ref, T2 *__restrict &p2ref, size_t &size,
+ int additional_bumps = 0) {
+ auto &aligned_ptr = ArgSelector<arg>::Select(p1ref, p2ref);
+ auto offset = offset_to_next_aligned<Alignment>(aligned_ptr);
+ adjust(offset + additional_bumps * Alignment, p1ref, p2ref, size);
+ aligned_ptr = assume_aligned<Alignment>(aligned_ptr);
+ }
+};
+
+} // namespace internal
+
+// An alignment operation that:
+// - executes the 'AlignmentT' operation
+// - bumps 'dst' or 'src' (resp. 'lhs' or 'rhs') pointers so that the selected
+// pointer gets aligned, size is decreased accordingly.
+// - calls the 'NextT' operation.
+//
+// e.g. A 16-byte Destination Aligned 32-byte Loop Copy can be written as:
+// copy<Align<_16, Arg::Dst>::Then<Loop<_32>>>(dst, src, count);
+template <typename AlignmentT, Arg AlignOn = Arg::_1> struct Align {
+private:
+ static constexpr size_t ALIGNMENT = AlignmentT::SIZE;
+ static_assert(ALIGNMENT > 1, "Alignment must be more than 1");
+ static_assert(is_power2(ALIGNMENT), "Alignment must be a power of 2");
+
+public:
+ template <typename NextT> struct Then {
+ static void copy(char *__restrict dst, const char *__restrict src,
+ size_t size) {
+ AlignmentT::copy(dst, src);
+ internal::Align<AlignOn, ALIGNMENT>::bump(dst, src, size);
+ NextT::copy(dst, src, size);
+ }
+
+ // Move forward suitable when dst < src. The alignment is performed with a
+ // HeadTail operation of size ∈ [Alignment, 2 x Alignment].
+ //
+ // e.g. Moving two bytes and making sure src is then aligned.
+ // [ | | | | ]
+ // [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_]
+ // [____LLLLLLLL_____________________]
+ // [___________LLLLLLLL______________]
+ // [_SSSSSSSS________________________]
+ // [________SSSSSSSS_________________]
+ //
+ // e.g. Moving two bytes and making sure dst is then aligned.
+ // [ | | | | ]
+ // [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_]
+ // [____LLLLLLLL_____________________]
+ // [______LLLLLLLL___________________]
+ // [_SSSSSSSS________________________]
+ // [___SSSSSSSS______________________]
+ static void move(char *dst, const char *src, size_t size) {
+ char *next_dst = dst;
+ const char *next_src = src;
+ size_t next_size = size;
+ internal::Align<AlignOn, ALIGNMENT>::bump(next_dst, next_src, next_size,
+ 1);
+ HeadTail<AlignmentT>::move(dst, src, size - next_size);
+ NextT::move(next_dst, next_src, next_size);
+ }
+
+ // Move backward suitable when dst > src. The alignment is performed with a
+ // HeadTail operation of size ∈ [Alignment, 2 x Alignment].
+ //
+ // e.g. Moving two bytes backward and making sure src is then aligned.
+ // [ | | | | ]
+ // [____XXXXXXXXXXXXXXXXXXXXXXXX_____]
+ // [ _________________LLLLLLLL_______]
+ // [ ___________________LLLLLLLL_____]
+ // [____________________SSSSSSSS_____]
+ // [______________________SSSSSSSS___]
+ //
+ // e.g. Moving two bytes and making sure dst is then aligned.
+ // [ | | | | ]
+ // [____XXXXXXXXXXXXXXXXXXXXXXXX_____]
+ // [ _______________LLLLLLLL_________]
+ // [ ___________________LLLLLLLL_____]
+ // [__________________SSSSSSSS_______]
+ // [______________________SSSSSSSS___]
+ static void move_backward(char *dst, const char *src, size_t size) {
+ char *headtail_dst = dst + size;
+ const char *headtail_src = src + size;
+ size_t headtail_size = 0;
+ internal::Align<AlignOn, ALIGNMENT>::bump(headtail_dst, headtail_src,
+ headtail_size, -2);
+ HeadTail<AlignmentT>::move(headtail_dst, headtail_src, headtail_size);
+ NextT::move_backward(dst, src, size - headtail_size);
+ }
+
+ static bool equals(const char *lhs, const char *rhs, size_t size) {
+ if (!AlignmentT::equals(lhs, rhs))
+ return false;
+ internal::Align<AlignOn, ALIGNMENT>::bump(lhs, rhs, size);
+ return NextT::equals(lhs, rhs, size);
+ }
+
+ static int three_way_compare(const char *lhs, const char *rhs,
+ size_t size) {
+ if (!AlignmentT::equals(lhs, rhs))
+ return AlignmentT::three_way_compare(lhs, rhs);
+ internal::Align<AlignOn, ALIGNMENT>::bump(lhs, rhs, size);
+ return NextT::three_way_compare(lhs, rhs, size);
+ }
+
+ static void splat_set(char *dst, const unsigned char value, size_t size) {
+ AlignmentT::splat_set(dst, value);
+ char *dummy = nullptr;
+ internal::Align<Arg::_1, ALIGNMENT>::bump(dst, dummy, size);
+ NextT::splat_set(dst, value, size);
+ }
+ };
+};
+
+// An operation that skips the specified number of bytes.
+template <ptrdiff_t Bytes> struct Skip {
+ template <typename NextT> struct Then {
+ static void copy(char *__restrict dst, const char *__restrict src,
+ size_t size) {
+ NextT::copy(dst + Bytes, src + Bytes, size - Bytes);
+ }
+
+ static void copy(char *__restrict dst, const char *__restrict src) {
+ NextT::copy(dst + Bytes, src + Bytes);
+ }
+
+ static bool equals(const char *lhs, const char *rhs, size_t size) {
+ return NextT::equals(lhs + Bytes, rhs + Bytes, size - Bytes);
+ }
+
+ static bool equals(const char *lhs, const char *rhs) {
+ return NextT::equals(lhs + Bytes, rhs + Bytes);
+ }
+
+ static int three_way_compare(const char *lhs, const char *rhs,
+ size_t size) {
+ return NextT::three_way_compare(lhs + Bytes, rhs + Bytes, size - Bytes);
+ }
+
+ static int three_way_compare(const char *lhs, const char *rhs) {
+ return NextT::three_way_compare(lhs + Bytes, rhs + Bytes);
+ }
+
+ static void splat_set(char *dst, const unsigned char value, size_t size) {
+ NextT::splat_set(dst + Bytes, value, size - Bytes);
+ }
+
+ static void splat_set(char *dst, const unsigned char value) {
+ NextT::splat_set(dst + Bytes, value);
+ }
+ };
+};
+
+// Fixed-size Builtin Operations
+// -----------------------------
+// Note: Do not use 'builtin' right now as it requires the implementation of the
+// `_inline` versions of all the builtins. Theoretically, Clang can still turn
+// them into calls to the C library leading to reentrancy problems.
+namespace builtin {
+
+#ifndef __has_builtin
+#define __has_builtin(x) 0 // Compatibility with non-clang compilers.
+#endif
+
+template <size_t Size> struct Builtin {
+ static constexpr size_t SIZE = Size;
+
+ static void copy(char *__restrict dst, const char *__restrict src) {
+#if LLVM_LIBC_HAVE_MEMORY_SANITIZER || LLVM_LIBC_HAVE_ADDRESS_SANITIZER
+ for_loop_copy(dst, src);
+#elif __has_builtin(__builtin_memcpy_inline)
+ // __builtin_memcpy_inline guarantees to never call external functions.
+ // Unfortunately it is not widely available.
+ __builtin_memcpy_inline(dst, src, SIZE);
+#else
+ for_loop_copy(dst, src);
+#endif
+ }
+
+ static void move(char *dst, const char *src) {
+#if LLVM_LIBC_HAVE_MEMORY_SANITIZER || LLVM_LIBC_HAVE_ADDRESS_SANITIZER
+ for_loop_move(dst, src);
+#elif __has_builtin(__builtin_memmove)
+ __builtin_memmove(dst, src, SIZE);
+#else
+ for_loop_move(dst, src);
+#endif
+ }
+
+#if __has_builtin(__builtin_memcmp_inline)
+#define LLVM_LIBC_MEMCMP __builtin_memcmp_inline
+#else
+#define LLVM_LIBC_MEMCMP __builtin_memcmp
+#endif
+
+ static bool equals(const char *lhs, const char *rhs) {
+ return LLVM_LIBC_MEMCMP(lhs, rhs, SIZE) == 0;
+ }
+
+ static int three_way_compare(const char *lhs, const char *rhs) {
+ return LLVM_LIBC_MEMCMP(lhs, rhs, SIZE);
+ }
+
+ static void splat_set(char *dst, const unsigned char value) {
+ __builtin_memset(dst, value, SIZE);
+ }
+
+private:
+ // Copies `SIZE` bytes from `src` to `dst` using a for loop.
+ // This code requires the use of `-fno-builtin-memcpy` to prevent the compiler
+ // from turning the for-loop back into `__builtin_memcpy`.
+ static void for_loop_copy(char *__restrict dst, const char *__restrict src) {
+ for (size_t i = 0; i < SIZE; ++i)
+ dst[i] = src[i];
+ }
+
+ static void for_loop_move(char *dst, const char *src) {
+ for (size_t i = 0; i < SIZE; ++i)
+ dst[i] = src[i];
+ }
+};
+
+using _1 = Builtin<1>;
+using _2 = Builtin<2>;
+using _3 = Builtin<3>;
+using _4 = Builtin<4>;
+using _8 = Builtin<8>;
+using _16 = Builtin<16>;
+using _32 = Builtin<32>;
+using _64 = Builtin<64>;
+using _128 = Builtin<128>;
+
+} // namespace builtin
+
+// Fixed-size Scalar Operations
+// ----------------------------
+namespace scalar {
+
+// The Scalar type makes use of simple sized integers.
+template <typename T> struct Scalar {
+ static constexpr size_t SIZE = sizeof(T);
+
+ static void copy(char *__restrict dst, const char *__restrict src) {
+ store(dst, load(src));
+ }
+
+ static void move(char *dst, const char *src) { store(dst, load(src)); }
+
+ static bool equals(const char *lhs, const char *rhs) {
+ return load(lhs) == load(rhs);
+ }
+
+ static int three_way_compare(const char *lhs, const char *rhs) {
+ return scalar_three_way_compare(load(lhs), load(rhs));
+ }
+
+ static void splat_set(char *dst, const unsigned char value) {
+ store(dst, get_splatted_value(value));
+ }
+
+ static int scalar_three_way_compare(T a, T b);
+
+ static T load(const char *ptr) {
+ T value;
+ builtin::Builtin<SIZE>::copy(reinterpret_cast<char *>(&value), ptr);
+ return value;
+ }
+ static void store(char *ptr, T value) {
+ builtin::Builtin<SIZE>::copy(ptr, reinterpret_cast<const char *>(&value));
+ }
+
+private:
+ static T get_splatted_value(const unsigned char value) {
+ return T(~0) / T(0xFF) * T(value);
+ }
+};
+
+template <>
+inline int Scalar<uint8_t>::scalar_three_way_compare(uint8_t a, uint8_t b) {
+ const int16_t la = Endian::to_big_endian(a);
+ const int16_t lb = Endian::to_big_endian(b);
+ return la - lb;
+}
+template <>
+inline int Scalar<uint16_t>::scalar_three_way_compare(uint16_t a, uint16_t b) {
+ const int32_t la = Endian::to_big_endian(a);
+ const int32_t lb = Endian::to_big_endian(b);
+ return la - lb;
+}
+template <>
+inline int Scalar<uint32_t>::scalar_three_way_compare(uint32_t a, uint32_t b) {
+ const uint32_t la = Endian::to_big_endian(a);
+ const uint32_t lb = Endian::to_big_endian(b);
+ return la > lb ? 1 : la < lb ? -1 : 0;
+}
+template <>
+inline int Scalar<uint64_t>::scalar_three_way_compare(uint64_t a, uint64_t b) {
+ const uint64_t la = Endian::to_big_endian(a);
+ const uint64_t lb = Endian::to_big_endian(b);
+ return la > lb ? 1 : la < lb ? -1 : 0;
+}
+
+using UINT8 = Scalar<uint8_t>; // 1 Byte
+using UINT16 = Scalar<uint16_t>; // 2 Bytes
+using UINT32 = Scalar<uint32_t>; // 4 Bytes
+using UINT64 = Scalar<uint64_t>; // 8 Bytes
+
+using _1 = UINT8;
+using _2 = UINT16;
+using _3 = Chained<UINT16, UINT8>;
+using _4 = UINT32;
+using _8 = UINT64;
+using _16 = Repeated<_8, 2>;
+using _32 = Repeated<_8, 4>;
+using _64 = Repeated<_8, 8>;
+using _128 = Repeated<_8, 16>;
+
+} // namespace scalar
+} // namespace __llvm_libc
+
+#include <src/string/memory_utils/elements_aarch64.h>
+#include <src/string/memory_utils/elements_x86.h>
+
+#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_H
diff --git a/libc/src/string/memory_utils/elements_aarch64.h b/libc/src/string/memory_utils/elements_aarch64.h
new file mode 100644
index 0000000000000..0529df70b87a9
--- /dev/null
+++ b/libc/src/string/memory_utils/elements_aarch64.h
@@ -0,0 +1,130 @@
+//===-- Elementary operations for aarch64 --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_AARCH64_H
+#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_AARCH64_H
+
+#include "src/__support/architectures.h"
+
+#if defined(LLVM_LIBC_ARCH_AARCH64)
+
+#include <src/string/memory_utils/elements.h>
+#include <stddef.h> // size_t
+#include <stdint.h> // uint8_t, uint16_t, uint32_t, uint64_t
+
+#ifdef __ARM_NEON
+#include <arm_neon.h>
+#endif
+
+namespace __llvm_libc {
+namespace aarch64_memset {
+#ifdef __ARM_NEON
+struct Splat8 {
+ static constexpr size_t SIZE = 8;
+ static void splat_set(char *dst, const unsigned char value) {
+ vst1_u8((uint8_t *)dst, vdup_n_u8(value));
+ }
+};
+
+struct Splat16 {
+ static constexpr size_t SIZE = 16;
+ static void splat_set(char *dst, const unsigned char value) {
+ vst1q_u8((uint8_t *)dst, vdupq_n_u8(value));
+ }
+};
+
+using _8 = Splat8;
+using _16 = Splat16;
+#else
+using _8 = __llvm_libc::scalar::_8;
+using _16 = Repeated<_8, 2>;
+#endif // __ARM_NEON
+
+using _1 = __llvm_libc::scalar::_1;
+using _2 = __llvm_libc::scalar::_2;
+using _3 = __llvm_libc::scalar::_3;
+using _4 = __llvm_libc::scalar::_4;
+using _32 = Chained<_16, _16>;
+using _64 = Chained<_32, _32>;
+
+struct Zva64 {
+ static constexpr size_t SIZE = 64;
+
+ static void splat_set(char *dst, const unsigned char) {
+#if __SIZEOF_POINTER__ == 4
+ asm("dc zva, %w[dst]" : : [dst] "r"(dst) : "memory");
+#else
+ asm("dc zva, %[dst]" : : [dst] "r"(dst) : "memory");
+#endif
+ }
+};
+
+inline static bool hasZva() {
+ uint64_t zva_val;
+ asm("mrs %[zva_val], dczid_el0" : [zva_val] "=r"(zva_val));
+ // DC ZVA is permitted if DZP, bit [4] is zero.
+ // BS, bits [3:0] is log2 of the block size in words.
+ // So the next line checks whether the instruction is permitted and block size
+ // is 16 words (i.e. 64 bytes).
+ return (zva_val & 0b11111) == 0b00100;
+}
+
+} // namespace aarch64_memset
+
+namespace aarch64 {
+
+using _1 = __llvm_libc::scalar::_1;
+using _2 = __llvm_libc::scalar::_2;
+using _3 = __llvm_libc::scalar::_3;
+using _4 = __llvm_libc::scalar::_4;
+using _8 = __llvm_libc::scalar::_8;
+using _16 = __llvm_libc::scalar::_16;
+
+#ifdef __ARM_NEON
+struct N32 {
+ static constexpr size_t SIZE = 32;
+ static bool equals(const char *lhs, const char *rhs) {
+ uint8x16_t l_0 = vld1q_u8((const uint8_t *)lhs);
+ uint8x16_t r_0 = vld1q_u8((const uint8_t *)rhs);
+ uint8x16_t l_1 = vld1q_u8((const uint8_t *)(lhs + 16));
+ uint8x16_t r_1 = vld1q_u8((const uint8_t *)(rhs + 16));
+ uint8x16_t temp = vpmaxq_u8(veorq_u8(l_0, r_0), veorq_u8(l_1, r_1));
+ uint64_t res =
+ vgetq_lane_u64(vreinterpretq_u64_u8(vpmaxq_u8(temp, temp)), 0);
+ return res == 0;
+ }
+ static int three_way_compare(const char *lhs, const char *rhs) {
+ uint8x16_t l_0 = vld1q_u8((const uint8_t *)lhs);
+ uint8x16_t r_0 = vld1q_u8((const uint8_t *)rhs);
+ uint8x16_t l_1 = vld1q_u8((const uint8_t *)(lhs + 16));
+ uint8x16_t r_1 = vld1q_u8((const uint8_t *)(rhs + 16));
+ uint8x16_t temp = vpmaxq_u8(veorq_u8(l_0, r_0), veorq_u8(l_1, r_1));
+ uint64_t res =
+ vgetq_lane_u64(vreinterpretq_u64_u8(vpmaxq_u8(temp, temp)), 0);
+ if (res == 0)
+ return 0;
+ size_t index = (__builtin_ctzl(res) >> 3) << 2;
+ uint32_t l = *((const uint32_t *)(lhs + index));
+ uint32_t r = *((const uint32_t *)(rhs + index));
+ return __llvm_libc::scalar::_4::scalar_three_way_compare(l, r);
+ }
+};
+
+using _32 = N32;
+using _64 = Repeated<_32, 2>;
+#else
+using _32 = __llvm_libc::scalar::_32;
+using _64 = __llvm_libc::scalar::_64;
+#endif // __ARM_NEON
+
+} // namespace aarch64
+} // namespace __llvm_libc
+
+#endif // defined(LLVM_LIBC_ARCH_AARCH64)
+
+#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_AARCH64_H
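To make the bit test in `hasZva` above concrete: a `dczid_el0` reading of `0b00100` has DZP (bit 4) clear, so `dc zva` is permitted, and BS = `0b0100` = 4, i.e. 2^4 = 16 four-byte words = 64 bytes per zeroed block, which is the size `Zva64` assumes. A small self-checking sketch (illustrative only, not from the tree):

```C++
#include <stdint.h> // uint64_t

constexpr uint64_t zva_val = 0b00100; // example dczid_el0 value accepted by hasZva()
static_assert((zva_val & (uint64_t(1) << 4)) == 0, "DZP clear: dc zva permitted");
static_assert((4u << (zva_val & 0xF)) == 64, "2^BS words of 4 bytes = 64-byte blocks");
```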
diff --git a/libc/src/string/memory_utils/elements_x86.h b/libc/src/string/memory_utils/elements_x86.h
new file mode 100644
index 0000000000000..7a2a8ccef4e34
--- /dev/null
+++ b/libc/src/string/memory_utils/elements_x86.h
@@ -0,0 +1,189 @@
+//===-- Elementary operations for x86 -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_X86_H
+#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_X86_H
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/architectures.h"
+
+#if defined(LLVM_LIBC_ARCH_X86)
+
+#include <stddef.h> // size_t
+#include <stdint.h> // uint8_t, uint16_t, uint32_t, uint64_t
+
+#ifdef __SSE2__
+#include <immintrin.h>
+#endif // __SSE2__
+
+#include "src/string/memory_utils/elements.h" // __llvm_libc::scalar
+
+// Fixed-size Vector Operations
+// ----------------------------
+
+namespace __llvm_libc {
+namespace x86 {
+
+#ifdef __SSE2__
+template <typename Base> struct Vector : public Base {
+ static void copy(char *__restrict dst, const char *__restrict src) {
+ Base::store(dst, Base::load(src));
+ }
+
+ static void move(char *dst, const char *src) {
+ Base::store(dst, Base::load(src));
+ }
+
+ static bool equals(const char *a, const char *b) {
+ return Base::not_equal_mask(Base::load(a), Base::load(b)) == 0;
+ }
+
+ static int three_way_compare(const char *a, const char *b) {
+ const auto mask = Base::not_equal_mask(Base::load(a), Base::load(b));
+ if (!mask)
+ return 0;
+ return char_diff(a, b, mask);
+ }
+
+ static void splat_set(char *dst, const unsigned char value) {
+ Base::store(dst, Base::get_splatted_value(value));
+ }
+
+ static int char_diff(const char *a, const char *b, uint64_t mask) {
+ const size_t diff_index = __builtin_ctzll(mask);
+ const int ca = (unsigned char)a[diff_index];
+ const int cb = (unsigned char)b[diff_index];
+ return ca - cb;
+ }
+};
+
+struct M128 {
+ static constexpr size_t SIZE = 16;
+ using T = char __attribute__((__vector_size__(SIZE)));
+ static uint16_t mask(T value) {
+ // NOLINTNEXTLINE(llvmlibc-callee-namespace)
+ return static_cast<uint16_t>(
+ _mm_movemask_epi8(cpp::bit_cast<__m128i>(value)));
+ }
+ static uint16_t not_equal_mask(T a, T b) { return mask(a != b); }
+ static T load(const char *ptr) {
+ // NOLINTNEXTLINE(llvmlibc-callee-namespace)
+ return cpp::bit_cast<T>(
+ _mm_loadu_si128(reinterpret_cast<__m128i_u const *>(ptr)));
+ }
+ static void store(char *ptr, T value) {
+ // NOLINTNEXTLINE(llvmlibc-callee-namespace)
+ return _mm_storeu_si128(reinterpret_cast<__m128i_u *>(ptr),
+ cpp::bit_cast<__m128i>(value));
+ }
+ static T get_splatted_value(const char v) {
+ const T splatted = {v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v};
+ return splatted;
+ }
+};
+
+using Vector128 = Vector<M128>; // 16 Bytes
+
+#ifdef __AVX2__
+struct M256 {
+ static constexpr size_t SIZE = 32;
+ using T = char __attribute__((__vector_size__(SIZE)));
+ static uint32_t mask(T value) {
+ // NOLINTNEXTLINE(llvmlibc-callee-namespace)
+ return _mm256_movemask_epi8(cpp::bit_cast<__m256i>(value));
+ }
+ static uint32_t not_equal_mask(T a, T b) { return mask(a != b); }
+ static T load(const char *ptr) {
+ // NOLINTNEXTLINE(llvmlibc-callee-namespace)
+ return cpp::bit_cast<T>(
+ _mm256_loadu_si256(reinterpret_cast<__m256i const *>(ptr)));
+ }
+ static void store(char *ptr, T value) {
+ // NOLINTNEXTLINE(llvmlibc-callee-namespace)
+ return _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr),
+ cpp::bit_cast<__m256i>(value));
+ }
+ static T get_splatted_value(const char v) {
+ const T splatted = {v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v,
+ v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v};
+ return splatted;
+ }
+};
+
+using Vector256 = Vector<M256>; // 32 Bytes
+
+#if defined(__AVX512F__) and defined(__AVX512BW__)
+struct M512 {
+ static constexpr size_t SIZE = 64;
+ using T = char __attribute__((__vector_size__(SIZE)));
+ static uint64_t not_equal_mask(T a, T b) {
+ // NOLINTNEXTLINE(llvmlibc-callee-namespace)
+ return _mm512_cmpneq_epi8_mask(cpp::bit_cast<__m512i>(a),
+ cpp::bit_cast<__m512i>(b));
+ }
+ static T load(const char *ptr) {
+ // NOLINTNEXTLINE(llvmlibc-callee-namespace)
+ return cpp::bit_cast<T>(_mm512_loadu_epi8(ptr));
+ }
+ static void store(char *ptr, T value) {
+ // NOLINTNEXTLINE(llvmlibc-callee-namespace)
+ return _mm512_storeu_epi8(ptr, cpp::bit_cast<__m512i>(value));
+ }
+ static T get_splatted_value(const char v) {
+ const T splatted = {v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v,
+ v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v,
+ v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v,
+ v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v};
+ return splatted;
+ }
+};
+using Vector512 = Vector<M512>;
+
+#endif // defined(__AVX512F__) and defined(__AVX512BW__)
+#endif // __AVX2__
+#endif // __SSE2__
+
+using _1 = __llvm_libc::scalar::_1;
+using _2 = __llvm_libc::scalar::_2;
+using _3 = __llvm_libc::scalar::_3;
+using _4 = __llvm_libc::scalar::_4;
+using _8 = __llvm_libc::scalar::_8;
+#if defined(__AVX512F__) && defined(__AVX512BW__)
+using _16 = __llvm_libc::x86::Vector128;
+using _32 = __llvm_libc::x86::Vector256;
+using _64 = __llvm_libc::x86::Vector512;
+using _128 = __llvm_libc::Repeated<_64, 2>;
+#elif defined(__AVX2__)
+using _16 = __llvm_libc::x86::Vector128;
+using _32 = __llvm_libc::x86::Vector256;
+using _64 = __llvm_libc::Repeated<_32, 2>;
+using _128 = __llvm_libc::Repeated<_32, 4>;
+#elif defined(__SSE2__)
+using _16 = __llvm_libc::x86::Vector128;
+using _32 = __llvm_libc::Repeated<_16, 2>;
+using _64 = __llvm_libc::Repeated<_16, 4>;
+using _128 = __llvm_libc::Repeated<_16, 8>;
+#else
+using _16 = __llvm_libc::Repeated<_8, 2>;
+using _32 = __llvm_libc::Repeated<_8, 4>;
+using _64 = __llvm_libc::Repeated<_8, 8>;
+using _128 = __llvm_libc::Repeated<_8, 16>;
+#endif
+
+struct Accelerator {
+ static void copy(char *dst, const char *src, size_t count) {
+ asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
+ }
+};
+
+} // namespace x86
+} // namespace __llvm_libc
+
+#endif // defined(LLVM_LIBC_ARCH_X86)
+
+#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_X86_H
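About the `char_diff` helper above: each set bit of `not_equal_mask` marks a differing byte (one bit per byte for `_mm_movemask_epi8` and `_mm512_cmpneq_epi8_mask`), so counting trailing zeros yields the index of the first mismatch, which is then compared as unsigned chars. A portable stand-in for `__builtin_ctzll` showing the same computation (illustrative only, assumes a non-zero mask):

```C++
#include <stdint.h> // uint64_t

// Same result as __builtin_ctzll(mask) in char_diff: index of the lowest set
// bit, i.e. the first differing byte. Precondition: mask != 0.
constexpr int first_mismatch(uint64_t mask) {
  int index = 0;
  while ((mask & 1) == 0) {
    mask >>= 1;
    ++index;
  }
  return index;
}

static_assert(first_mismatch(0x08) == 3, "byte 3 differs first");
static_assert(first_mismatch(0x01) == 0, "byte 0 differs first");
```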
diff --git a/libc/src/string/memory_utils/memcmp_implementations.h b/libc/src/string/memory_utils/memcmp_implementations.h
index 0d6f433298bf9..f2079468f2be3 100644
--- a/libc/src/string/memory_utils/memcmp_implementations.h
+++ b/libc/src/string/memory_utils/memcmp_implementations.h
@@ -11,133 +11,92 @@
#include "src/__support/architectures.h"
#include "src/__support/common.h"
-#include "src/string/memory_utils/op_aarch64.h"
-#include "src/string/memory_utils/op_builtin.h"
-#include "src/string/memory_utils/op_generic.h"
-#include "src/string/memory_utils/op_x86.h"
-#include "src/string/memory_utils/utils.h"
+#include "src/string/memory_utils/elements.h"
#include <stddef.h> // size_t
namespace __llvm_libc {
-static inline MemcmpReturnType inline_memcmp_embedded_tiny(CPtr p1, CPtr p2,
- size_t count) {
-#pragma nounroll
- for (size_t offset = 0; offset < count; ++offset)
- if (auto value = generic::Memcmp<1>::block(p1, p2))
- return value;
- return MemcmpReturnType::ZERO();
-}
-
-#if defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64)
-static inline MemcmpReturnType inline_memcmp_generic_gt16(CPtr p1, CPtr p2,
- size_t count) {
- if (unlikely(count >= 384)) {
- if (auto value = generic::Memcmp<16>::block(p1, p2))
- return value;
- align_to_next_boundary<16, Arg::P1>(p1, p2, count);
- }
- return generic::Memcmp<16>::loop_and_tail(p1, p2, count);
-}
-#endif // defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64)
+static inline int inline_memcmp(const char *lhs, const char *rhs,
+ size_t count) {
#if defined(LLVM_LIBC_ARCH_X86)
-static inline MemcmpReturnType inline_memcmp_x86_sse2_gt16(CPtr p1, CPtr p2,
- size_t count) {
- if (unlikely(count >= 384)) {
- if (auto value = x86::sse2::Memcmp<16>::block(p1, p2))
- return value;
- align_to_next_boundary<16, Arg::P1>(p1, p2, count);
- }
- return x86::sse2::Memcmp<16>::loop_and_tail(p1, p2, count);
-}
-
-static inline MemcmpReturnType inline_memcmp_x86_avx2_gt16(CPtr p1, CPtr p2,
- size_t count) {
- if (count <= 32)
- return x86::sse2::Memcmp<16>::head_tail(p1, p2, count);
- if (count <= 64)
- return x86::avx2::Memcmp<32>::head_tail(p1, p2, count);
- if (count <= 128)
- return x86::avx2::Memcmp<64>::head_tail(p1, p2, count);
- if (unlikely(count >= 384)) {
- if (auto value = x86::avx2::Memcmp<32>::block(p1, p2))
- return value;
- align_to_next_boundary<32, Arg::P1>(p1, p2, count);
- }
- return x86::avx2::Memcmp<32>::loop_and_tail(p1, p2, count);
-}
-
-static inline MemcmpReturnType inline_memcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2,
- size_t count) {
+ /////////////////////////////////////////////////////////////////////////////
+ // LLVM_LIBC_ARCH_X86
+ /////////////////////////////////////////////////////////////////////////////
+ using namespace __llvm_libc::x86;
+ if (count == 0)
+ return 0;
+ if (count == 1)
+ return three_way_compare<_1>(lhs, rhs);
+ if (count == 2)
+ return three_way_compare<_2>(lhs, rhs);
+ if (count == 3)
+ return three_way_compare<_3>(lhs, rhs);
+ if (count <= 8)
+ return three_way_compare<HeadTail<_4>>(lhs, rhs, count);
+ if (count <= 16)
+ return three_way_compare<HeadTail<_8>>(lhs, rhs, count);
if (count <= 32)
- return x86::sse2::Memcmp<16>::head_tail(p1, p2, count);
+ return three_way_compare<HeadTail<_16>>(lhs, rhs, count);
if (count <= 64)
- return x86::avx2::Memcmp<32>::head_tail(p1, p2, count);
+ return three_way_compare<HeadTail<_32>>(lhs, rhs, count);
if (count <= 128)
- return x86::avx512bw::Memcmp<64>::head_tail(p1, p2, count);
- if (unlikely(count >= 384)) {
- if (auto value = x86::avx512bw::Memcmp<64>::block(p1, p2))
- return value;
- align_to_next_boundary<64, Arg::P1>(p1, p2, count);
- }
- return x86::avx512bw::Memcmp<64>::loop_and_tail(p1, p2, count);
-}
-#endif // defined(LLVM_LIBC_ARCH_X86)
-
-#if defined(LLVM_LIBC_ARCH_AARCH64)
-static inline MemcmpReturnType inline_memcmp_aarch64_neon_gt16(CPtr p1, CPtr p2,
- size_t count) {
- if (unlikely(count >= 128)) { // [128, ∞]
- if (auto value = generic::Memcmp<16>::block(p1, p2))
- return value;
- align_to_next_boundary<16, Arg::P1>(p1, p2, count);
- return generic::Memcmp<32>::loop_and_tail(p1, p2, count);
- }
+ return three_way_compare<HeadTail<_64>>(lhs, rhs, count);
+ return three_way_compare<Align<_32>::Then<Loop<_32>>>(lhs, rhs, count);
+#elif defined(LLVM_LIBC_ARCH_AARCH64)
+ /////////////////////////////////////////////////////////////////////////////
+ // LLVM_LIBC_ARCH_AARCH64
+ /////////////////////////////////////////////////////////////////////////////
+ using namespace ::__llvm_libc::aarch64;
+ if (count == 0) // [0, 0]
+ return 0;
+ if (count == 1) // [1, 1]
+ return three_way_compare<_1>(lhs, rhs);
+ if (count == 2) // [2, 2]
+ return three_way_compare<_2>(lhs, rhs);
+ if (count == 3) // [3, 3]
+ return three_way_compare<_3>(lhs, rhs);
+ if (count < 8) // [4, 7]
+ return three_way_compare<HeadTail<_4>>(lhs, rhs, count);
+ if (count < 16) // [8, 15]
+ return three_way_compare<HeadTail<_8>>(lhs, rhs, count);
+ if (unlikely(count >= 128)) // [128, ∞]
+ return three_way_compare<Align<_16>::Then<Loop<_32>>>(lhs, rhs, count);
+ if (!equals<_16>(lhs, rhs)) // [16, 16]
+ return three_way_compare<_16>(lhs, rhs);
if (count < 32) // [17, 31]
- return generic::Memcmp<16>::tail(p1, p2, count);
- if (generic::Bcmp<16>::block(p1 + 16, p2 + 16)) // [32, 32]
- return generic::Memcmp<16>::block(p1 + 16, p2 + 16);
+ return three_way_compare<Tail<_16>>(lhs, rhs, count);
+ if (!equals<Skip<16>::Then<_16>>(lhs, rhs)) // [32, 32]
+ return three_way_compare<Skip<16>::Then<_16>>(lhs, rhs);
if (count < 64) // [33, 63]
- return generic::Memcmp<32>::tail(p1, p2, count);
+ return three_way_compare<Tail<_32>>(lhs, rhs, count);
// [64, 127]
- return generic::Memcmp<16>::loop_and_tail(p1 + 32, p2 + 32, count - 32);
-}
-#endif // defined(LLVM_LIBC_ARCH_AARCH64)
+ return three_way_compare<Skip<32>::Then<Loop<_16>>>(lhs, rhs, count);
+#else
+ /////////////////////////////////////////////////////////////////////////////
+ // Default
+ /////////////////////////////////////////////////////////////////////////////
+ using namespace ::__llvm_libc::scalar;
-static inline MemcmpReturnType inline_memcmp(CPtr p1, CPtr p2, size_t count) {
-#if defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64)
if (count == 0)
- return MemcmpReturnType::ZERO();
+ return 0;
if (count == 1)
- return generic::Memcmp<1>::block(p1, p2);
+ return three_way_compare<_1>(lhs, rhs);
if (count == 2)
- return generic::Memcmp<2>::block(p1, p2);
+ return three_way_compare<_2>(lhs, rhs);
if (count == 3)
- return generic::Memcmp<3>::block(p1, p2);
+ return three_way_compare<_3>(lhs, rhs);
if (count <= 8)
- return generic::Memcmp<4>::head_tail(p1, p2, count);
+ return three_way_compare<HeadTail<_4>>(lhs, rhs, count);
if (count <= 16)
- return generic::Memcmp<8>::head_tail(p1, p2, count);
-#if defined(LLVM_LIBC_ARCH_X86)
- if constexpr (x86::kAvx512BW)
- return inline_memcmp_x86_avx512bw_gt16(p1, p2, count);
- else if constexpr (x86::kAvx2)
- return inline_memcmp_x86_avx2_gt16(p1, p2, count);
- else if constexpr (x86::kSse2)
- return inline_memcmp_x86_sse2_gt16(p1, p2, count);
- else
- return inline_memcmp_generic_gt16(p1, p2, count);
-#elif defined(LLVM_LIBC_ARCH_AARCH64)
- if constexpr (aarch64::kNeon)
- return inline_memcmp_aarch64_neon_gt16(p1, p2, count);
- else
- return inline_memcmp_generic_gt16(p1, p2, count);
-#endif
-#elif defined(LLVM_LIBC_ARCH_ARM)
- return inline_memcmp_embedded_tiny(p1, p2, count);
-#else
-#error "Unsupported platform"
+ return three_way_compare<HeadTail<_8>>(lhs, rhs, count);
+ if (count <= 32)
+ return three_way_compare<HeadTail<_16>>(lhs, rhs, count);
+ if (count <= 64)
+ return three_way_compare<HeadTail<_32>>(lhs, rhs, count);
+ if (count <= 128)
+ return three_way_compare<HeadTail<_64>>(lhs, rhs, count);
+ return three_way_compare<Align<_32>::Then<Loop<_32>>>(lhs, rhs, count);
#endif
}
diff --git a/libc/src/string/memory_utils/memcpy_implementations.h b/libc/src/string/memory_utils/memcpy_implementations.h
index 2a2ce7561f17d..3385d40fbc56b 100644
--- a/libc/src/string/memory_utils/memcpy_implementations.h
+++ b/libc/src/string/memory_utils/memcpy_implementations.h
@@ -11,122 +11,142 @@
#include "src/__support/architectures.h"
#include "src/__support/common.h"
-#include "src/string/memory_utils/op_aarch64.h"
-#include "src/string/memory_utils/op_builtin.h"
-#include "src/string/memory_utils/op_generic.h"
-#include "src/string/memory_utils/op_x86.h"
+#include "src/string/memory_utils/elements.h"
#include "src/string/memory_utils/utils.h"
#include <stddef.h> // size_t
-namespace __llvm_libc {
+// Design rationale
+// ================
+//
+// Using a profiler to observe size distributions for calls into libc
+// functions, it was found that most operations act on a small number of bytes.
+// This makes it important to favor small sizes.
+//
+// The tests for `count` are in ascending order so the cost of branching is
+// proportional to the cost of copying.
+//
+// The function is written in C++ for several reasons:
+// - The compiler can __see__ the code, which is useful when performing Profile
+// Guided Optimization as the optimized code can take advantage of branching
+// probabilities.
+// - It also allows for easier customization and favors testing multiple
+// implementation parameters.
+// - As compilers and processors get better, the generated code is improved
+// with little change on the code side.
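A minimal sketch (not part of the diff; `head_tail_copy_sketch` is a made-up
name) of the HeadTail pattern that the dispatches below lean on to keep the
branch count low:

#include <stddef.h> // size_t

// Any count in [N, 2 * N] is covered by two N-byte copies; the second copy
// may overlap the first, so a single branch handles the whole size range.
template <size_t N>
static void head_tail_copy_sketch(char *dst, const char *src, size_t count) {
  __builtin_memcpy(dst, src, N);                         // head: first N bytes
  __builtin_memcpy(dst + count - N, src + count - N, N); // tail: last N bytes
}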
-static inline void inline_memcpy_embedded_tiny(char *__restrict dst,
- const char *__restrict src,
- size_t count) {
-#pragma nounroll
- for (size_t offset = 0; offset < count; ++offset)
- builtin::Memcpy<1>::block(dst + offset, src + offset);
-}
+namespace __llvm_libc {
+static inline void inline_memcpy(char *__restrict dst,
+ const char *__restrict src, size_t count) {
+ using namespace __llvm_libc::builtin;
#if defined(LLVM_LIBC_ARCH_X86)
-static inline void inline_memcpy_x86(char *__restrict dst,
- const char *__restrict src, size_t count) {
+ /////////////////////////////////////////////////////////////////////////////
+ // LLVM_LIBC_ARCH_X86
+ /////////////////////////////////////////////////////////////////////////////
+
+ // Whether to use only rep;movsb.
+ constexpr bool USE_ONLY_REP_MOVSB =
+ LLVM_LIBC_IS_DEFINED(LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB);
+
+ // kRepMovsBSize == -1 : Only CopyAligned is used.
+ // kRepMovsBSize == 0 : Only RepMovsb is used.
+ // else CopyAligned is used up to kRepMovsBSize and then RepMovsb.
+ constexpr size_t REP_MOVS_B_SIZE =
+#if defined(LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE)
+ LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE;
+#else
+ -1;
+#endif // LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
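An illustrative build configuration (an assumption, not taken from this
commit) showing how the macro above selects the hybrid strategy:

// Compiling with -DLLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE=1024, i.e.
#define LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE 1024
// yields REP_MOVS_B_SIZE == 1024: the aligned-loop path below handles counts
// up to 1024 bytes and `rep movsb` takes over beyond that. Leaving the macro
// undefined keeps the value at -1 (SIZE_MAX once converted to size_t), so
// `rep movsb` is only reached when LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB is
// defined.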
+
+ // Whether target supports AVX instructions.
+ constexpr bool HAS_AVX = LLVM_LIBC_IS_DEFINED(__AVX__);
+
+#if defined(__AVX__)
+ using LoopBlockSize = _64;
+#else
+ using LoopBlockSize = _32;
+#endif
+
+ if (USE_ONLY_REP_MOVSB)
+ return copy<x86::Accelerator>(dst, src, count);
+
if (count == 0)
return;
if (count == 1)
- return builtin::Memcpy<1>::block(dst, src);
+ return copy<_1>(dst, src);
if (count == 2)
- return builtin::Memcpy<2>::block(dst, src);
+ return copy<_2>(dst, src);
if (count == 3)
- return builtin::Memcpy<3>::block(dst, src);
+ return copy<_3>(dst, src);
if (count == 4)
- return builtin::Memcpy<4>::block(dst, src);
+ return copy<_4>(dst, src);
if (count < 8)
- return builtin::Memcpy<4>::head_tail(dst, src, count);
+ return copy<HeadTail<_4>>(dst, src, count);
if (count < 16)
- return builtin::Memcpy<8>::head_tail(dst, src, count);
+ return copy<HeadTail<_8>>(dst, src, count);
if (count < 32)
- return builtin::Memcpy<16>::head_tail(dst, src, count);
+ return copy<HeadTail<_16>>(dst, src, count);
if (count < 64)
- return builtin::Memcpy<32>::head_tail(dst, src, count);
+ return copy<HeadTail<_32>>(dst, src, count);
if (count < 128)
- return builtin::Memcpy<64>::head_tail(dst, src, count);
- if (x86::kAvx && count < 256)
- return builtin::Memcpy<128>::head_tail(dst, src, count);
- builtin::Memcpy<32>::block(dst, src);
- align_to_next_boundary<32, Arg::Dst>(dst, src, count);
- static constexpr size_t kBlockSize = x86::kAvx ? 64 : 32;
- return builtin::Memcpy<kBlockSize>::loop_and_tail(dst, src, count);
-}
-
-static inline void inline_memcpy_x86_maybe_interpose_repmovsb(
- char *__restrict dst, const char *__restrict src, size_t count) {
- // Whether to use rep;movsb exclusively, not at all, or only above a certain
- // threshold.
- // TODO: Use only a single preprocessor definition to simplify the code.
-#ifndef LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
-#define LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE -1
-#endif
-
- static constexpr bool kUseOnlyRepMovsb =
- LLVM_LIBC_IS_DEFINED(LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB);
- static constexpr size_t kRepMovsbThreshold =
- LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE;
- if constexpr (kUseOnlyRepMovsb)
- return x86::Memcpy::repmovsb(dst, src, count);
- else if constexpr (kRepMovsbThreshold >= 0) {
- if (unlikely(count >= kRepMovsbThreshold))
- return x86::Memcpy::repmovsb(dst, src, count);
- else
- return inline_memcpy_x86(dst, src, count);
- } else {
- return inline_memcpy_x86(dst, src, count);
- }
-}
-#endif // defined(LLVM_LIBC_ARCH_X86)
-
-#if defined(LLVM_LIBC_ARCH_AARCH64)
-static inline void inline_memcpy_aarch64(char *__restrict dst,
- const char *__restrict src,
- size_t count) {
+ return copy<HeadTail<_64>>(dst, src, count);
+ if (HAS_AVX && count < 256)
+ return copy<HeadTail<_128>>(dst, src, count);
+ if (count <= REP_MOVS_B_SIZE)
+ return copy<Align<_32, Arg::Dst>::Then<Loop<LoopBlockSize>>>(dst, src,
+ count);
+ return copy<x86::Accelerator>(dst, src, count);
+#elif defined(LLVM_LIBC_ARCH_AARCH64)
+ /////////////////////////////////////////////////////////////////////////////
+ // LLVM_LIBC_ARCH_AARCH64
+ /////////////////////////////////////////////////////////////////////////////
if (count == 0)
return;
if (count == 1)
- return builtin::Memcpy<1>::block(dst, src);
+ return copy<_1>(dst, src);
if (count == 2)
- return builtin::Memcpy<2>::block(dst, src);
+ return copy<_2>(dst, src);
if (count == 3)
- return builtin::Memcpy<3>::block(dst, src);
+ return copy<_3>(dst, src);
if (count == 4)
- return builtin::Memcpy<4>::block(dst, src);
+ return copy<_4>(dst, src);
if (count < 8)
- return builtin::Memcpy<4>::head_tail(dst, src, count);
+ return copy<HeadTail<_4>>(dst, src, count);
if (count < 16)
- return builtin::Memcpy<8>::head_tail(dst, src, count);
+ return copy<HeadTail<_8>>(dst, src, count);
if (count < 32)
- return builtin::Memcpy<16>::head_tail(dst, src, count);
+ return copy<HeadTail<_16>>(dst, src, count);
if (count < 64)
- return builtin::Memcpy<32>::head_tail(dst, src, count);
+ return copy<HeadTail<_32>>(dst, src, count);
if (count < 128)
- return builtin::Memcpy<64>::head_tail(dst, src, count);
- builtin::Memcpy<16>::block(dst, src);
- align_to_next_boundary<16, Arg::Src>(dst, src, count);
- return builtin::Memcpy<64>::loop_and_tail(dst, src, count);
-}
-#endif // defined(LLVM_LIBC_ARCH_AARCH64)
-
-static inline void inline_memcpy(char *__restrict dst,
- const char *__restrict src, size_t count) {
- using namespace __llvm_libc::builtin;
-#if defined(LLVM_LIBC_ARCH_X86)
- return inline_memcpy_x86_maybe_interpose_repmovsb(dst, src, count);
-#elif defined(LLVM_LIBC_ARCH_AARCH64)
- return inline_memcpy_aarch64(dst, src, count);
-#elif defined(LLVM_LIBC_ARCH_ARM)
- return inline_memcpy_embedded_tiny(dst, src, count);
+ return copy<HeadTail<_64>>(dst, src, count);
+ return copy<Align<_16, Arg::Src>::Then<Loop<_64>>>(dst, src, count);
#else
-#error "Unsupported platform"
+ /////////////////////////////////////////////////////////////////////////////
+ // Default
+ /////////////////////////////////////////////////////////////////////////////
+ if (count == 0)
+ return;
+ if (count == 1)
+ return copy<_1>(dst, src);
+ if (count == 2)
+ return copy<_2>(dst, src);
+ if (count == 3)
+ return copy<_3>(dst, src);
+ if (count == 4)
+ return copy<_4>(dst, src);
+ if (count < 8)
+ return copy<HeadTail<_4>>(dst, src, count);
+ if (count < 16)
+ return copy<HeadTail<_8>>(dst, src, count);
+ if (count < 32)
+ return copy<HeadTail<_16>>(dst, src, count);
+ if (count < 64)
+ return copy<HeadTail<_32>>(dst, src, count);
+ if (count < 128)
+ return copy<HeadTail<_64>>(dst, src, count);
+ return copy<Align<_32, Arg::Src>::Then<Loop<_32>>>(dst, src, count);
#endif
}
diff --git a/libc/src/string/memory_utils/memset_implementations.h b/libc/src/string/memory_utils/memset_implementations.h
index a456b58fe6e1b..f1611a32807cb 100644
--- a/libc/src/string/memory_utils/memset_implementations.h
+++ b/libc/src/string/memory_utils/memset_implementations.h
@@ -10,102 +10,126 @@
#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_IMPLEMENTATIONS_H
#include "src/__support/architectures.h"
-#include "src/string/memory_utils/op_aarch64.h"
-#include "src/string/memory_utils/op_builtin.h"
-#include "src/string/memory_utils/op_generic.h"
-#include "src/string/memory_utils/op_x86.h"
+#include "src/string/memory_utils/elements.h"
#include "src/string/memory_utils/utils.h"
#include <stddef.h> // size_t
namespace __llvm_libc {
-inline static void inline_memset_embedded_tiny(Ptr dst, uint8_t value,
- size_t count) {
-#pragma nounroll
- for (size_t offset = 0; offset < count; ++offset)
- generic::Memset<1, 1>::block(dst + offset, value);
-}
-
+// A general purpose implementation assuming cheap unaligned writes for sizes:
+// 1, 2, 4, 8, 16, 32 and 64 Bytes. Note that some architectures can't store 32
+// or 64 Bytes at a time; the compiler will expand such stores as needed.
+//
+// This implementation is subject to change as we benchmark more processors. We
+// may also want to customize it for processors with specialized instructions
+// that perform better (e.g. `rep stosb`).
+//
+// A note on the apparent discrepancy in the use of 32 vs 64 Bytes writes.
+// We want to balance two things here:
+// - The number of redundant writes (when using `SetBlockOverlap`),
+// - The number of conditionals for sizes <=128 (~90% of memset calls are for
+// such sizes).
+//
+// For the range 64-128:
+// - SetBlockOverlap<64> uses no conditionals but always writes 128 Bytes; this
+//   is wasteful near 65 but efficient toward 128.
+// - SetAlignedBlocks<32> would consume between 3 and 4 conditionals and write
+// 96 or 128 Bytes.
+// - Another approach could be a hybrid one: copy<64>+Overlap<32> for 65-96
+//   and copy<96>+Overlap<32> for 97-128.
+//
+// Benchmarks showed that redundant writes were cheap (for Intel X86) but
+// conditionals were expensive, even on processors that do not support writing
+// 64B at a time (pre-AVX512F). We also want to favor short functions that allow
+// at a time (pre-AVX512F). We also want to favor short functions that allow
+// more hot code to fit in the iL1 cache.
+//
+// Above 128 we have to use conditionals since we don't know the upper bound in
+// advance. SetAlignedBlocks<64> may waste up to 63 Bytes, SetAlignedBlocks<32>
+// may waste up to 31 Bytes. Benchmarks showed that SetAlignedBlocks<64> was not
+// superior for sizes that mattered.
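A sketch of the overlapping-store pattern the note above calls
SetBlockOverlap<64>, written here with __builtin_memset; illustrative only,
not part of this commit:

#include <stddef.h> // size_t

// For 64 <= count <= 128: one 64-byte head store plus one possibly overlapping
// 64-byte tail store. count == 65 writes 128 bytes (63 redundant) while
// count == 128 writes exactly 128 bytes -- "wasteful near 65, efficient toward
// 128" -- but with zero extra conditionals.
static void memset_overlap64_sketch(char *dst, unsigned char value,
                                    size_t count) {
  __builtin_memset(dst, value, 64);              // head block
  __builtin_memset(dst + count - 64, value, 64); // tail block, may overlap head
}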
+inline static void inline_memset(char *dst, unsigned char value, size_t count) {
#if defined(LLVM_LIBC_ARCH_X86)
-template <size_t MaxSize>
-inline static void inline_memset_x86(Ptr dst, uint8_t value, size_t count) {
+ /////////////////////////////////////////////////////////////////////////////
+ // LLVM_LIBC_ARCH_X86
+ /////////////////////////////////////////////////////////////////////////////
+ using namespace __llvm_libc::x86;
if (count == 0)
return;
if (count == 1)
- return generic::Memset<1, MaxSize>::block(dst, value);
+ return splat_set<_1>(dst, value);
if (count == 2)
- return generic::Memset<2, MaxSize>::block(dst, value);
+ return splat_set<_2>(dst, value);
if (count == 3)
- return generic::Memset<3, MaxSize>::block(dst, value);
+ return splat_set<_3>(dst, value);
if (count <= 8)
- return generic::Memset<4, MaxSize>::head_tail(dst, value, count);
+ return splat_set<HeadTail<_4>>(dst, value, count);
if (count <= 16)
- return generic::Memset<8, MaxSize>::head_tail(dst, value, count);
+ return splat_set<HeadTail<_8>>(dst, value, count);
if (count <= 32)
- return generic::Memset<16, MaxSize>::head_tail(dst, value, count);
+ return splat_set<HeadTail<_16>>(dst, value, count);
if (count <= 64)
- return generic::Memset<32, MaxSize>::head_tail(dst, value, count);
+ return splat_set<HeadTail<_32>>(dst, value, count);
if (count <= 128)
- return generic::Memset<64, MaxSize>::head_tail(dst, value, count);
- // Aligned loop
- generic::Memset<32, MaxSize>::block(dst, value);
- align_to_next_boundary<32>(dst, count);
- return generic::Memset<32, MaxSize>::loop_and_tail(dst, value, count);
-}
-#endif // defined(LLVM_LIBC_ARCH_X86)
-
-#if defined(LLVM_LIBC_ARCH_AARCH64)
-template <size_t MaxSize>
-inline static void inline_memset_aarch64(Ptr dst, uint8_t value, size_t count) {
+ return splat_set<HeadTail<_64>>(dst, value, count);
+ return splat_set<Align<_32, Arg::Dst>::Then<Loop<_32>>>(dst, value, count);
+#elif defined(LLVM_LIBC_ARCH_AARCH64)
+ /////////////////////////////////////////////////////////////////////////////
+ // LLVM_LIBC_ARCH_AARCH64
+ /////////////////////////////////////////////////////////////////////////////
+ using namespace __llvm_libc::aarch64_memset;
if (count == 0)
return;
if (count <= 3) {
- generic::Memset<1, MaxSize>::block(dst, value);
+ splat_set<_1>(dst, value);
if (count > 1)
- generic::Memset<2, MaxSize>::tail(dst, value, count);
+ splat_set<Tail<_2>>(dst, value, count);
return;
}
if (count <= 8)
- return generic::Memset<4, MaxSize>::head_tail(dst, value, count);
+ return splat_set<HeadTail<_4>>(dst, value, count);
if (count <= 16)
- return generic::Memset<8, MaxSize>::head_tail(dst, value, count);
+ return splat_set<HeadTail<_8>>(dst, value, count);
if (count <= 32)
- return generic::Memset<16, MaxSize>::head_tail(dst, value, count);
+ return splat_set<HeadTail<_16>>(dst, value, count);
if (count <= (32 + 64)) {
- generic::Memset<32, MaxSize>::block(dst, value);
+ splat_set<_32>(dst, value);
if (count <= 64)
- return generic::Memset<32, MaxSize>::tail(dst, value, count);
- generic::Memset<32, MaxSize>::block(dst + 32, value);
- generic::Memset<32, MaxSize>::tail(dst, value, count);
+ return splat_set<Tail<_32>>(dst, value, count);
+ splat_set<Skip<32>::Then<_32>>(dst, value);
+ splat_set<Tail<_32>>(dst, value, count);
return;
}
- if (count >= 448 && value == 0 && aarch64::neon::hasZva()) {
- generic::Memset<64, MaxSize>::block(dst, 0);
- align_to_next_boundary<64>(dst, count);
- return aarch64::neon::BzeroCacheLine<64>::loop_and_tail(dst, 0, count);
- } else {
- generic::Memset<16, MaxSize>::block(dst, value);
- align_to_next_boundary<16>(dst, count);
- return generic::Memset<64, MaxSize>::loop_and_tail(dst, value, count);
- }
-}
-#endif // defined(LLVM_LIBC_ARCH_AARCH64)
-
-inline static void inline_memset(Ptr dst, uint8_t value, size_t count) {
-#if defined(LLVM_LIBC_ARCH_X86)
- static constexpr size_t kMaxSize = x86::kAvx512F ? 64
- : x86::kAvx ? 32
- : x86::kSse2 ? 16
- : 8;
- return inline_memset_x86<kMaxSize>(dst, value, count);
-#elif defined(LLVM_LIBC_ARCH_AARCH64)
- static constexpr size_t kMaxSize = aarch64::kNeon ? 16 : 8;
- return inline_memset_aarch64<kMaxSize>(dst, value, count);
-#elif defined(LLVM_LIBC_ARCH_ARM)
- return inline_memset_embedded_tiny(dst, value, count);
+ if (count >= 448 && value == 0 && hasZva())
+ return splat_set<Align<_64, Arg::_1>::Then<Loop<Zva64, _64>>>(dst, 0,
+ count);
+ else
+ return splat_set<Align<_16, Arg::_1>::Then<Loop<_64>>>(dst, value, count);
#else
-#error "Unsupported platform"
+ /////////////////////////////////////////////////////////////////////////////
+ // Default
+ /////////////////////////////////////////////////////////////////////////////
+ using namespace ::__llvm_libc::scalar;
+
+ if (count == 0)
+ return;
+ if (count == 1)
+ return splat_set<_1>(dst, value);
+ if (count == 2)
+ return splat_set<_2>(dst, value);
+ if (count == 3)
+ return splat_set<_3>(dst, value);
+ if (count <= 8)
+ return splat_set<HeadTail<_4>>(dst, value, count);
+ if (count <= 16)
+ return splat_set<HeadTail<_8>>(dst, value, count);
+ if (count <= 32)
+ return splat_set<HeadTail<_16>>(dst, value, count);
+ if (count <= 64)
+ return splat_set<HeadTail<_32>>(dst, value, count);
+ if (count <= 128)
+ return splat_set<HeadTail<_64>>(dst, value, count);
+ return splat_set<Align<_32, Arg::Dst>::Then<Loop<_32>>>(dst, value, count);
#endif
}
diff --git a/libc/src/string/memory_utils/op_aarch64.h b/libc/src/string/memory_utils/op_aarch64.h
deleted file mode 100644
index ea6e1b0c112a0..0000000000000
--- a/libc/src/string/memory_utils/op_aarch64.h
+++ /dev/null
@@ -1,174 +0,0 @@
-//===-- aarch64 implementation of memory function building blocks ---------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file provides aarch64 specific building blocks to compose memory
-// functions.
-//
-//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H
-#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H
-
-#include "src/__support/architectures.h"
-
-#if defined(LLVM_LIBC_ARCH_AARCH64)
-
-#include "src/string/memory_utils/op_generic.h"
-
-#ifdef __ARM_NEON
-#include <arm_neon.h>
-#endif //__ARM_NEON
-
-namespace __llvm_libc::aarch64 {
-
-static inline constexpr bool kNeon = LLVM_LIBC_IS_DEFINED(__ARM_NEON);
-
-namespace neon {
-
-template <size_t Size> struct BzeroCacheLine {
- static constexpr size_t SIZE = Size;
-
- static inline void block(Ptr dst, uint8_t) {
- static_assert(Size == 64);
-#if __SIZEOF_POINTER__ == 4
- asm("dc zva, %w[dst]" : : [dst] "r"(dst) : "memory");
-#else
- asm("dc zva, %[dst]" : : [dst] "r"(dst) : "memory");
-#endif
- }
-
- static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
- static_assert(Size > 1);
- size_t offset = 0;
- do {
- block(dst + offset, value);
- offset += SIZE;
- } while (offset < count - SIZE);
- // Unaligned store, we can't use 'dc zva' here.
- static constexpr size_t kMaxSize = kNeon ? 16 : 8;
- generic::Memset<Size, kMaxSize>::tail(dst, value, count);
- }
-};
-
-inline static bool hasZva() {
- uint64_t zva_val;
- asm("mrs %[zva_val], dczid_el0" : [zva_val] "=r"(zva_val));
- // DC ZVA is permitted if DZP, bit [4] is zero.
- // BS, bits [3:0] is log2 of the block count in words.
- // So the next line checks whether the instruction is permitted and block
- // count is 16 words (i.e. 64 bytes).
- return (zva_val & 0b11111) == 0b00100;
-}
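A worked check of the DCZID_EL0 decoding above (an editorial illustration
assuming a standalone translation unit; not part of this commit):

#include <cstdint>

// A register value of 0b00100 has DZP (bit 4) clear -- DC ZVA is permitted --
// and BS == 4, i.e. 2^4 == 16 words == 64 bytes, the only block size accepted.
constexpr uint64_t kExampleDczid = 0b00100;
static_assert((kExampleDczid & 0b10000) == 0, "DC ZVA permitted (DZP == 0)");
static_assert((4u << (kExampleDczid & 0b1111)) == 64, "16 words == 64 bytes");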
-
-} // namespace neon
-
-///////////////////////////////////////////////////////////////////////////////
-// Memset
-
-///////////////////////////////////////////////////////////////////////////////
-// Bcmp
-template <size_t Size> struct Bcmp {
- static constexpr size_t SIZE = Size;
- static constexpr size_t BlockSize = 32;
-
- static const unsigned char *as_u8(CPtr ptr) {
- return reinterpret_cast<const unsigned char *>(ptr);
- }
-
- static inline BcmpReturnType block(CPtr p1, CPtr p2) {
- if constexpr (Size == BlockSize) {
- auto _p1 = as_u8(p1);
- auto _p2 = as_u8(p2);
- uint8x16_t a = vld1q_u8(_p1);
- uint8x16_t b = vld1q_u8(_p1 + 16);
- uint8x16_t n = vld1q_u8(_p2);
- uint8x16_t o = vld1q_u8(_p2 + 16);
- uint8x16_t an = veorq_u8(a, n);
- uint8x16_t bo = veorq_u8(b, o);
- // anbo = (a ^ n) | (b ^ o). At least one byte is nonzero if there is
-      // a difference between the two buffers. We reduce this value down to 4
- // bytes in two steps. First, calculate the saturated move value when
- // going from 2x64b to 2x32b. Second, compute the max of the 2x32b to get
- // a single 32 bit nonzero value if a mismatch occurred.
- uint8x16_t anbo = vorrq_u8(an, bo);
- uint32x2_t anbo_reduced = vqmovn_u64(anbo);
- return vmaxv_u32(anbo_reduced);
- } else if constexpr ((Size % BlockSize) == 0) {
- for (size_t offset = 0; offset < Size; offset += BlockSize)
- if (auto value = Bcmp<BlockSize>::block(p1 + offset, p2 + offset))
- return value;
- } else {
- deferred_static_assert("SIZE not implemented");
- }
- return BcmpReturnType::ZERO();
- }
-
- static inline BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
- return block(p1 + count - SIZE, p2 + count - SIZE);
- }
-
- static inline BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
- if constexpr (Size <= 8) {
- return generic::Bcmp<Size>::head_tail(p1, p2, count);
- } else if constexpr (Size == 16) {
- auto _p1 = as_u8(p1);
- auto _p2 = as_u8(p2);
- uint8x16_t a = vld1q_u8(_p1);
- uint8x16_t b = vld1q_u8(_p1 + count - 16);
- uint8x16_t n = vld1q_u8(_p2);
- uint8x16_t o = vld1q_u8(_p2 + count - 16);
- uint8x16_t an = veorq_s8(a, n);
- uint8x16_t bo = veorq_s8(b, o);
- // anbo = (a ^ n) | (b ^ o)
- uint8x16_t anbo = vorrq_s8(an, bo);
- uint32x2_t anbo_reduced = vqmovn_u64(anbo);
- return vmaxv_u32(anbo_reduced);
- } else if constexpr (Size == 32) {
- auto _p1 = as_u8(p1);
- auto _p2 = as_u8(p2);
- uint8x16_t a = vld1q_u8(_p1);
- uint8x16_t b = vld1q_u8(_p1 + 16);
- uint8x16_t c = vld1q_u8(_p1 + count - 16);
- uint8x16_t d = vld1q_u8(_p1 + count - 32);
- uint8x16_t n = vld1q_u8(_p2);
- uint8x16_t o = vld1q_u8(_p2 + 16);
- uint8x16_t p = vld1q_u8(_p2 + count - 16);
- uint8x16_t q = vld1q_u8(_p2 + count - 32);
- uint8x16_t an = veorq_s8(a, n);
- uint8x16_t bo = veorq_s8(b, o);
- uint8x16_t cp = veorq_s8(c, p);
- uint8x16_t dq = veorq_s8(d, q);
- uint8x16_t anbo = vorrq_s8(an, bo);
- uint8x16_t cpdq = vorrq_s8(cp, dq);
- // abnocpdq = ((a ^ n) | (b ^ o)) | ((c ^ p) | (d ^ q)). Reduce this to
- // a nonzero 32 bit value if a mismatch occurred.
- uint64x2_t abnocpdq = vreinterpretq_u64_u8(anbo | cpdq);
- uint32x2_t abnocpdq_reduced = vqmovn_u64(abnocpdq);
- return vmaxv_u32(abnocpdq_reduced);
- } else {
- deferred_static_assert("SIZE not implemented");
- }
- return BcmpReturnType::ZERO();
- }
-
- static inline BcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) {
- static_assert(Size > 1);
- size_t offset = 0;
- do {
- if (auto value = block(p1 + offset, p2 + offset))
- return value;
- offset += SIZE;
- } while (offset < count - SIZE);
- return tail(p1, p2, count);
- }
-};
-
-} // namespace __llvm_libc::aarch64
-
-#endif // LLVM_LIBC_ARCH_AARCH64
-
-#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H
diff --git a/libc/src/string/memory_utils/op_builtin.h b/libc/src/string/memory_utils/op_builtin.h
deleted file mode 100644
index 6b3e92ee00efe..0000000000000
--- a/libc/src/string/memory_utils/op_builtin.h
+++ /dev/null
@@ -1,148 +0,0 @@
-//===-- Implementation using the __builtin_XXX_inline ---------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file provides generic C++ building blocks to compose memory functions.
-// They rely on the compiler to generate the best possible code through the use
-// of the `__builtin_XXX_inline` builtins. These builtins are currently only
-// available in Clang.
-//
-//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_BUILTIN_H
-#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_BUILTIN_H
-
-#include "src/string/memory_utils/utils.h"
-
-namespace __llvm_libc::builtin {
-
-///////////////////////////////////////////////////////////////////////////////
-// Memcpy
-template <size_t Size> struct Memcpy {
- static constexpr size_t SIZE = Size;
- static inline void block(Ptr __restrict dst, CPtr __restrict src) {
-#ifdef LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE
- return __builtin_memcpy_inline(dst, src, SIZE);
-#else
- deferred_static_assert("Missing __builtin_memcpy_inline");
- (void)dst;
- (void)src;
-#endif
- }
-
- static inline void tail(Ptr __restrict dst, CPtr __restrict src,
- size_t count) {
- block(dst + count - SIZE, src + count - SIZE);
- }
-
- static inline void head_tail(Ptr __restrict dst, CPtr __restrict src,
- size_t count) {
- block(dst, src);
- tail(dst, src, count);
- }
-
- static inline void loop_and_tail(Ptr __restrict dst, CPtr __restrict src,
- size_t count) {
- static_assert(Size > 1);
- size_t offset = 0;
- do {
- block(dst + offset, src + offset);
- offset += SIZE;
- } while (offset < count - SIZE);
- tail(dst, src, count);
- }
-};
-
-///////////////////////////////////////////////////////////////////////////////
-// Memset
-template <size_t Size> struct Memset {
- using ME = Memset;
- static constexpr size_t SIZE = Size;
- static inline void block(Ptr dst, uint8_t value) {
-#ifdef LLVM_LIBC_HAS_BUILTIN_MEMSET_INLINE
- __builtin_memset_inline(dst, value, Size);
-#else
- deferred_static_assert("Missing __builtin_memset_inline");
- (void)dst;
- (void)value;
-#endif
- }
-
- static inline void tail(Ptr dst, uint8_t value, size_t count) {
- block(dst + count - SIZE, value);
- }
-
- static inline void head_tail(Ptr dst, uint8_t value, size_t count) {
- block(dst, value);
- tail(dst, value, count);
- }
-
- static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
- static_assert(Size > 1);
- size_t offset = 0;
- do {
- block(dst + offset, value);
- offset += SIZE;
- } while (offset < count - SIZE);
- tail(dst, value, count);
- }
-};
-
-///////////////////////////////////////////////////////////////////////////////
-// Bcmp
-template <size_t Size> struct Bcmp {
- using ME = Bcmp;
- static constexpr size_t SIZE = Size;
- static inline BcmpReturnType block(CPtr, CPtr) {
- deferred_static_assert("Missing __builtin_memcmp_inline");
- return BcmpReturnType::ZERO();
- }
-
- static inline BcmpReturnType tail(CPtr, CPtr, size_t) {
- deferred_static_assert("Not implemented");
- return BcmpReturnType::ZERO();
- }
-
- static inline BcmpReturnType head_tail(CPtr, CPtr, size_t) {
- deferred_static_assert("Not implemented");
- return BcmpReturnType::ZERO();
- }
-
- static inline BcmpReturnType loop_and_tail(CPtr, CPtr, size_t) {
- deferred_static_assert("Not implemented");
- return BcmpReturnType::ZERO();
- }
-};
-
-///////////////////////////////////////////////////////////////////////////////
-// Memcmp
-template <size_t Size> struct Memcmp {
- using ME = Memcmp;
- static constexpr size_t SIZE = Size;
- static inline MemcmpReturnType block(CPtr, CPtr) {
- deferred_static_assert("Missing __builtin_memcmp_inline");
- return MemcmpReturnType::ZERO();
- }
-
- static inline MemcmpReturnType tail(CPtr, CPtr, size_t) {
- deferred_static_assert("Not implemented");
- return MemcmpReturnType::ZERO();
- }
-
- static inline MemcmpReturnType head_tail(CPtr, CPtr, size_t) {
- deferred_static_assert("Not implemented");
- return MemcmpReturnType::ZERO();
- }
-
- static inline MemcmpReturnType loop_and_tail(CPtr, CPtr, size_t) {
- deferred_static_assert("Not implemented");
- return MemcmpReturnType::ZERO();
- }
-};
-
-} // namespace __llvm_libc::builtin
-
-#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_BUILTIN_H
diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
deleted file mode 100644
index 226d775dede88..0000000000000
--- a/libc/src/string/memory_utils/op_generic.h
+++ /dev/null
@@ -1,466 +0,0 @@
-//===-- Generic implementation of memory function building blocks ---------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file provides generic C++ building blocks.
-// Depending on the requested size, the block operation uses unsigned integral
-// types, vector types or an array of the type with the maximum size.
-//
-// The maximum size is passed as a template argument. For instance, on x86
-// platforms that only supports integral types the maximum size would be 8
-// (corresponding to uint64_t). On this platform if we request the size 32, this
-// would be treated as a cpp::array<uint64_t, 4>.
-//
-// On the other hand, if the platform is x86 with support for AVX the maximum
-// size is 32 and the operation can be handled with a single native operation.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H
-#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H
-
-#include "src/__support/CPP/array.h"
-#include "src/__support/CPP/type_traits.h"
-#include "src/__support/endian.h"
-#include "src/string/memory_utils/op_builtin.h"
-#include "src/string/memory_utils/utils.h"
-
-#include <stdint.h>
-
-namespace __llvm_libc::generic {
-
-// CTPair and CTMap below implement a compile time map.
-// This is useful to map from a Size to a type handling this size.
-//
-// Example usage:
-// using MyMap = CTMap<CTPair<1, uint8_t>,
-// CTPair<2, uint16_t>,
-// >;
-// ...
-// using UInt8T = MyMap::find_type<1>;
-template <size_t I, typename T> struct CTPair {
- using type = T;
- static CTPair get_pair(cpp::integral_constant<size_t, I>) { return {}; }
-};
-template <typename... Pairs> struct CTMap : public Pairs... {
- using Pairs::get_pair...;
- template <size_t I>
- using find_type =
- typename decltype(get_pair(cpp::integral_constant<size_t, I>{}))::type;
-};
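A standalone-style check of the example in the comment above (illustrative
only; it assumes the CTPair/CTMap definitions and the includes of this file
are visible):

using ExampleMap = CTMap<CTPair<1, uint8_t>, CTPair<2, uint16_t>>;
static_assert(sizeof(ExampleMap::find_type<2>) == 2, "size 2 maps to uint16_t");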
-
-// Helper to test if a type is void.
-template <typename T> inline constexpr bool is_void_v = cpp::is_same_v<T, void>;
-
-// Implements load, store and splat for unsigned integral types.
-template <typename T> struct ScalarType {
- using Type = T;
- static_assert(cpp::is_integral_v<Type> && !cpp::is_signed_v<Type>);
-
- static inline Type load(CPtr src) { return ::__llvm_libc::load<Type>(src); }
- static inline void store(Ptr dst, Type value) {
- ::__llvm_libc::store<Type>(dst, value);
- }
- static inline Type splat(uint8_t value) {
- return Type(~0) / Type(0xFF) * Type(value);
- }
-};
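A worked example of the splat formula above (editorial check, not part of this
commit): dividing the all-ones value by 0xFF yields the byte-replication
constant, and multiplying by the byte copies it into every lane.

#include <cstdint>

static_assert(~uint32_t(0) / 0xFF == 0x01010101u, "replication constant");
static_assert(uint32_t(0x01010101u * 0xABu) == 0xABABABABu, "0xAB splatted");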
-
-// Implements load, store and splat for vector types.
-template <size_t Size> struct VectorType {
- using Type = uint8_t __attribute__((__vector_size__(Size)));
- static inline Type load(CPtr src) { return ::__llvm_libc::load<Type>(src); }
- static inline void store(Ptr dst, Type value) {
- ::__llvm_libc::store<Type>(dst, value);
- }
- static inline Type splat(uint8_t value) {
- Type Out;
- // This for loop is optimized out for vector types.
- for (size_t i = 0; i < Size; ++i)
- Out[i] = static_cast<uint8_t>(value);
- return Out;
- }
-};
-
-// We currently don't support 8- or 16-bit platforms, it must be 32- or 64-bit.
-static_assert((UINTPTR_MAX == 4294967295U) ||
- (UINTPTR_MAX == 18446744073709551615UL));
-
-// Map from sizes to structures offering static load, store and splat methods.
-// Note: On platforms lacking vector support, we use the ArrayType below and
-// decompose the operation in smaller pieces.
-using NativeTypeMap =
- CTMap<CTPair<1, ScalarType<uint8_t>>, //
- CTPair<2, ScalarType<uint16_t>>, //
- CTPair<4, ScalarType<uint32_t>>, //
-#if defined(LLVM_LIBC_ARCH_X86_64) || defined(LLVM_LIBC_ARCH_AARCH64)
- CTPair<8, ScalarType<uint64_t>>, // Not available on 32bit
-#endif //
- CTPair<16, VectorType<16>>, //
- CTPair<32, VectorType<32>>, //
- CTPair<64, VectorType<64>>>;
-
-// Implements load, store and splat for sizes not natively supported by the
-// platform. SubType is either ScalarType or VectorType.
-template <typename SubType, size_t ArraySize> struct ArrayType {
- using Type = cpp::array<typename SubType::Type, ArraySize>;
- static constexpr size_t SizeOfElement = sizeof(typename SubType::Type);
- static inline Type load(CPtr src) {
- Type Value;
- for (size_t I = 0; I < ArraySize; ++I)
- Value[I] = SubType::load(src + (I * SizeOfElement));
- return Value;
- }
- static inline void store(Ptr dst, Type Value) {
- for (size_t I = 0; I < ArraySize; ++I)
- SubType::store(dst + (I * SizeOfElement), Value[I]);
- }
- static inline Type splat(uint8_t value) {
- Type Out;
- for (size_t I = 0; I < ArraySize; ++I)
- Out[I] = SubType::splat(value);
- return Out;
- }
-};
-
-// Checks whether we should use an ArrayType.
-template <size_t Size, size_t MaxSize> static constexpr bool useArrayType() {
- return (Size > MaxSize) && ((Size % MaxSize) == 0) &&
- !is_void_v<NativeTypeMap::find_type<MaxSize>>;
-}
-
-// Compute the type to handle an operation of Size bytes knowing that the
-// underlying platform only support native types up to MaxSize bytes.
-template <size_t Size, size_t MaxSize>
-using getTypeFor = cpp::conditional_t<
- useArrayType<Size, MaxSize>(),
- ArrayType<NativeTypeMap::find_type<MaxSize>, Size / MaxSize>,
- NativeTypeMap::find_type<Size>>;
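An illustrative check of getTypeFor (assuming the definitions above are
visible and a 64-bit target, so the 8-byte scalar entry exists): a 32-byte
operation on a platform whose native maximum is 8 bytes decomposes into four
uint64_t, matching the file header's description.

static_assert(cpp::is_same_v<getTypeFor<32, 8>,
                             ArrayType<ScalarType<uint64_t>, 4>>,
              "32-byte op with 8-byte native max becomes 4 x uint64_t");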
-
-///////////////////////////////////////////////////////////////////////////////
-// Memcpy
-// When building with clang we can delegate to the builtin implementation.
-///////////////////////////////////////////////////////////////////////////////
-
-template <size_t Size> using Memcpy = builtin::Memcpy<Size>;
-
-///////////////////////////////////////////////////////////////////////////////
-// Memset
-// The MaxSize template argument gives the maximum size handled natively by the
-// platform. For instance on x86 with AVX support this would be 32. If a size
-// greater than MaxSize is requested we break the operation down in smaller
-// pieces of size MaxSize.
-///////////////////////////////////////////////////////////////////////////////
-template <size_t Size, size_t MaxSize> struct Memset {
- static_assert(is_power2(MaxSize));
- static constexpr size_t SIZE = Size;
-
- static inline void block(Ptr dst, uint8_t value) {
- if constexpr (Size == 3) {
- Memset<1, MaxSize>::block(dst + 2, value);
- Memset<2, MaxSize>::block(dst, value);
- } else {
- using T = getTypeFor<Size, MaxSize>;
- if constexpr (is_void_v<T>) {
- deferred_static_assert("Unimplemented Size");
- } else {
- T::store(dst, T::splat(value));
- }
- }
- }
-
- static inline void tail(Ptr dst, uint8_t value, size_t count) {
- block(dst + count - SIZE, value);
- }
-
- static inline void head_tail(Ptr dst, uint8_t value, size_t count) {
- block(dst, value);
- tail(dst, value, count);
- }
-
- static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
- static_assert(SIZE > 1);
- size_t offset = 0;
- do {
- block(dst + offset, value);
- offset += SIZE;
- } while (offset < count - SIZE);
- tail(dst, value, count);
- }
-};
-
-///////////////////////////////////////////////////////////////////////////////
-// Bcmp
-///////////////////////////////////////////////////////////////////////////////
-template <size_t Size> struct Bcmp {
- static constexpr size_t SIZE = Size;
- static constexpr size_t MaxSize = 8;
-
- template <typename T> static inline uint32_t load_xor(CPtr p1, CPtr p2) {
- return load<T>(p1) ^ load<T>(p2);
- }
-
- template <typename T>
- static inline uint32_t load_not_equal(CPtr p1, CPtr p2) {
- return load<T>(p1) != load<T>(p2);
- }
-
- static inline BcmpReturnType block(CPtr p1, CPtr p2) {
- static constexpr size_t MaxSize = 8;
- if constexpr (Size == 1) {
- return load_xor<uint8_t>(p1, p2);
- } else if constexpr (Size == 2) {
- return load_xor<uint16_t>(p1, p2);
- } else if constexpr (Size == 4) {
- return load_xor<uint32_t>(p1, p2);
- } else if constexpr (Size == 8) {
- return load_not_equal<uint64_t>(p1, p2);
- } else if constexpr (useArrayType<Size, MaxSize>()) {
- for (size_t offset = 0; offset < Size; offset += MaxSize)
- if (auto value = Bcmp<MaxSize>::block(p1 + offset, p2 + offset))
- return value;
- } else {
- deferred_static_assert("Unimplemented Size");
- }
- return BcmpReturnType::ZERO();
- }
-
- static inline BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
- return block(p1 + count - SIZE, p2 + count - SIZE);
- }
-
- static inline BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
- return block(p1, p2) | tail(p1, p2, count);
- }
-
- static inline BcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) {
- static_assert(Size > 1);
- size_t offset = 0;
- do {
- if (auto value = block(p1 + offset, p2 + offset))
- return value;
- offset += SIZE;
- } while (offset < count - SIZE);
- return tail(p1, p2, count);
- }
-};
-
-///////////////////////////////////////////////////////////////////////////////
-// Memcmp
-///////////////////////////////////////////////////////////////////////////////
-template <size_t Size> struct Memcmp {
- static constexpr size_t SIZE = Size;
- static constexpr size_t MaxSize = 8;
-
- template <typename T> static inline T load_be(CPtr ptr) {
- return Endian::to_big_endian(load<T>(ptr));
- }
-
- template <typename T>
-  static inline MemcmpReturnType load_be_diff(CPtr p1, CPtr p2) {
- return load_be<T>(p1) - load_be<T>(p2);
- }
-
- template <typename T>
- static inline MemcmpReturnType load_be_cmp(CPtr p1, CPtr p2) {
- const auto la = load_be<T>(p1);
- const auto lb = load_be<T>(p2);
- return la > lb ? 1 : la < lb ? -1 : 0;
- }
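A note on why the loads above are big-endian (editorial, not from the patch):
memcmp order is byte-lexicographic, and a big-endian load puts the first byte
in the most significant position, so comparing the loaded integers matches
byte order. A small standalone check:

// "ab" < "b\0" byte-wise, and the corresponding big-endian 16-bit values
// 0x6162 < 0x6200 compare the same way.
static_assert((('a' << 8) | 'b') < (('b' << 8) | '\0'), "order preserved");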
-
- static inline MemcmpReturnType block(CPtr p1, CPtr p2) {
- if constexpr (Size == 1) {
-      return load_be_diff<uint8_t>(p1, p2);
- } else if constexpr (Size == 2) {
-      return load_be_diff<uint16_t>(p1, p2);
- } else if constexpr (Size == 4) {
- return load_be_cmp<uint32_t>(p1, p2);
- } else if constexpr (Size == 8) {
- return load_be_cmp<uint64_t>(p1, p2);
- } else if constexpr (useArrayType<Size, MaxSize>()) {
- for (size_t offset = 0; offset < Size; offset += MaxSize)
- if (Bcmp<MaxSize>::block(p1 + offset, p2 + offset))
- return Memcmp<MaxSize>::block(p1 + offset, p2 + offset);
- return MemcmpReturnType::ZERO();
- } else if constexpr (Size == 3) {
- if (auto value = Memcmp<2>::block(p1, p2))
- return value;
- return Memcmp<1>::block(p1 + 2, p2 + 2);
- } else {
- deferred_static_assert("Unimplemented Size");
- }
- }
-
- static inline MemcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
- return block(p1 + count - SIZE, p2 + count - SIZE);
- }
-
- static inline MemcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
- if (auto value = block(p1, p2))
- return value;
- return tail(p1, p2, count);
- }
-
- static inline MemcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) {
- static_assert(Size > 1);
- size_t offset = 0;
- do {
- if (auto value = block(p1 + offset, p2 + offset))
- return value;
- offset += SIZE;
- } while (offset < count - SIZE);
- return tail(p1, p2, count);
- }
-};
-
-///////////////////////////////////////////////////////////////////////////////
-// Memmove
-///////////////////////////////////////////////////////////////////////////////
-
-template <size_t Size, size_t MaxSize> struct Memmove {
- static_assert(is_power2(MaxSize));
- using T = getTypeFor<Size, MaxSize>;
- static constexpr size_t SIZE = Size;
-
- static inline void block(Ptr dst, CPtr src) {
- if constexpr (is_void_v<T>) {
- deferred_static_assert("Unimplemented Size");
- } else {
- T::store(dst, T::load(src));
- }
- }
-
- static inline void head_tail(Ptr dst, CPtr src, size_t count) {
- const size_t offset = count - Size;
- if constexpr (is_void_v<T>) {
- deferred_static_assert("Unimplemented Size");
- } else {
- // The load and store operations can be performed in any order as long as
- // they are not interleaved. More investigations are needed to determine
- // the best order.
- const auto head = T::load(src);
- const auto tail = T::load(src + offset);
- T::store(dst, head);
- T::store(dst + offset, tail);
- }
- }
-
- // Align forward suitable when dst < src. The alignment is performed with
- // an HeadTail operation of count ∈ [Alignment, 2 x Alignment].
- //
- // e.g. Moving two bytes forward, we make sure src is aligned.
- // [ | | | | ]
- // [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_]
- // [____LLLLLLLL_____________________]
- // [___________LLLLLLLA______________]
- // [_SSSSSSSS________________________]
- // [________SSSSSSSS_________________]
- //
- // e.g. Moving two bytes forward, we make sure dst is aligned.
- // [ | | | | ]
- // [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_]
- // [____LLLLLLLL_____________________]
- // [______LLLLLLLL___________________]
- // [_SSSSSSSS________________________]
- // [___SSSSSSSA______________________]
- template <Arg AlignOn>
- static inline void align_forward(Ptr &dst, CPtr &src, size_t &count) {
- Ptr prev_dst = dst;
- CPtr prev_src = src;
- size_t prev_count = count;
- align_to_next_boundary<Size, AlignOn>(dst, src, count);
- adjust(Size, dst, src, count);
- head_tail(prev_dst, prev_src, prev_count - count);
- }
-
- // Align backward suitable when dst > src. The alignment is performed with
- // an HeadTail operation of count ∈ [Alignment, 2 x Alignment].
- //
- // e.g. Moving two bytes backward, we make sure src is aligned.
- // [ | | | | ]
- // [____XXXXXXXXXXXXXXXXXXXXXXXX_____]
- // [ _________________ALLLLLLL_______]
- // [ ___________________LLLLLLLL_____]
- // [____________________SSSSSSSS_____]
- // [______________________SSSSSSSS___]
- //
- // e.g. Moving two bytes backward, we make sure dst is aligned.
- // [ | | | | ]
- // [____XXXXXXXXXXXXXXXXXXXXXXXX_____]
- // [ _______________LLLLLLLL_________]
- // [ ___________________LLLLLLLL_____]
- // [__________________ASSSSSSS_______]
- // [______________________SSSSSSSS___]
- template <Arg AlignOn>
- static inline void align_backward(Ptr &dst, CPtr &src, size_t &count) {
- Ptr headtail_dst = dst + count;
- CPtr headtail_src = src + count;
- size_t headtail_size = 0;
- align_to_next_boundary<Size, AlignOn>(headtail_dst, headtail_src,
- headtail_size);
- adjust(-2 * Size, headtail_dst, headtail_src, headtail_size);
- head_tail(headtail_dst, headtail_src, headtail_size);
- count -= headtail_size;
- }
-
- // Move forward suitable when dst < src. We load the tail bytes before
- // handling the loop.
- //
- // e.g. Moving two bytes
- // [ | | | | |]
- // [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___]
- // [_________________________LLLLLLLL___]
- // [___LLLLLLLL_________________________]
- // [_SSSSSSSS___________________________]
- // [___________LLLLLLLL_________________]
- // [_________SSSSSSSS___________________]
- // [___________________LLLLLLLL_________]
- // [_________________SSSSSSSS___________]
- // [_______________________SSSSSSSS_____]
- static inline void loop_and_tail_forward(Ptr dst, CPtr src, size_t count) {
- static_assert(Size > 1);
- const size_t tail_offset = count - Size;
- const auto tail_value = T::load(src + tail_offset);
- size_t offset = 0;
-#pragma nounroll
- do {
- block(dst + offset, src + offset);
- offset += Size;
- } while (offset < count - Size);
- T::store(dst + tail_offset, tail_value);
- }
-
- // Move backward suitable when dst > src. We load the head bytes before
- // handling the loop.
- //
- // e.g. Moving two bytes
- // [ | | | | |]
- // [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___]
- // [___LLLLLLLL_________________________]
- // [_________________________LLLLLLLL___]
- // [___________________________SSSSSSSS_]
- // [_________________LLLLLLLL___________]
- // [___________________SSSSSSSS_________]
- // [_________LLLLLLLL___________________]
- // [___________SSSSSSSS_________________]
- // [_____SSSSSSSS_______________________]
- static inline void loop_and_tail_backward(Ptr dst, CPtr src, size_t count) {
- static_assert(Size > 1);
- const auto head_value = T::load(src);
-    ptrdiff_t offset = count - Size;
-#pragma nounroll
- do {
- block(dst + offset, src + offset);
- offset -= Size;
- } while (offset >= 0);
- T::store(dst, head_value);
- }
-};
-
-} // namespace __llvm_libc::generic
-
-#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H
diff --git a/libc/src/string/memory_utils/op_x86.h b/libc/src/string/memory_utils/op_x86.h
deleted file mode 100644
index 96847b211b8a4..0000000000000
--- a/libc/src/string/memory_utils/op_x86.h
+++ /dev/null
@@ -1,219 +0,0 @@
-//===-- x86 implementation of memory function building blocks -------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file provides x86 specific building blocks to compose memory functions.
-//
-//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H
-#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H
-
-#include "src/__support/architectures.h"
-
-#if defined(LLVM_LIBC_ARCH_X86_64)
-
-#include "src/__support/common.h"
-#include "src/string/memory_utils/op_builtin.h"
-#include "src/string/memory_utils/op_generic.h"
-
-#ifdef __SSE2__
-#include <immintrin.h>
-#else
-// Define fake functions to prevent the compiler from failing on undefined
-// functions in case SSE2 is not present.
-#define _mm512_cmpneq_epi8_mask(A, B) 0
-#define _mm_movemask_epi8(A) 0
-#define _mm256_movemask_epi8(A) 0
-#endif // __SSE2__
-
-namespace __llvm_libc::x86 {
-
-// A set of constants to check compile time features.
-static inline constexpr bool kSse2 = LLVM_LIBC_IS_DEFINED(__SSE2__);
-static inline constexpr bool kAvx = LLVM_LIBC_IS_DEFINED(__AVX__);
-static inline constexpr bool kAvx2 = LLVM_LIBC_IS_DEFINED(__AVX2__);
-static inline constexpr bool kAvx512F = LLVM_LIBC_IS_DEFINED(__AVX512F__);
-static inline constexpr bool kAvx512BW = LLVM_LIBC_IS_DEFINED(__AVX512BW__);
-
-///////////////////////////////////////////////////////////////////////////////
-// Memcpy repmovsb implementation
-struct Memcpy {
- static void repmovsb(char *dst, const char *src, size_t count) {
- asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
- }
-};
-
-///////////////////////////////////////////////////////////////////////////////
-// Bcmp
-
-// Base implementation for the Bcmp specializations.
-// - BlockSize is either 16, 32 or 64 depending on the available compile time
-// features, it is used to switch between "single native operation" or a
-// "sequence of native operations".
-// - BlockBcmp is the function that implements the bcmp logic.
-template <size_t Size, size_t BlockSize, auto BlockBcmp> struct BcmpImpl {
- static inline BcmpReturnType block(CPtr p1, CPtr p2) {
- if constexpr (Size == BlockSize) {
- return BlockBcmp(p1, p2);
- } else if constexpr (Size % BlockSize == 0) {
- for (size_t offset = 0; offset < Size; offset += BlockSize)
- if (auto value = BlockBcmp(p1 + offset, p2 + offset))
- return value;
- } else {
- deferred_static_assert("SIZE not implemented");
- }
- return BcmpReturnType::ZERO();
- }
-
- static inline BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
- return block(p1 + count - Size, p2 + count - Size);
- }
-
- static inline BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
- return block(p1, p2) | tail(p1, p2, count);
- }
-
- static inline BcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) {
- static_assert(Size > 1);
- size_t offset = 0;
- do {
- if (auto value = block(p1 + offset, p2 + offset))
- return value;
- offset += Size;
- } while (offset < count - Size);
- return tail(p1, p2, count);
- }
-};
-
-namespace sse2 {
-static inline BcmpReturnType bcmp16(CPtr p1, CPtr p2) {
- using T = char __attribute__((__vector_size__(16)));
-  // A mask indicating which bytes differ after loading 16 bytes from p1 and p2.
- const int mask = _mm_movemask_epi8(load<T>(p1) != load<T>(p2));
- return static_cast<uint32_t>(mask);
-}
-template <size_t Size> using Bcmp = BcmpImpl<Size, 16, bcmp16>;
-} // namespace sse2
-
-namespace avx2 {
-static inline BcmpReturnType bcmp32(CPtr p1, CPtr p2) {
- using T = char __attribute__((__vector_size__(32)));
-  // A mask indicating which bytes differ after loading 32 bytes from p1 and p2.
- const int mask = _mm256_movemask_epi8(load<T>(p1) != load<T>(p2));
- // _mm256_movemask_epi8 returns an int but it is to be interpreted as a 32-bit
- // mask.
- return static_cast<uint32_t>(mask);
-}
-template <size_t Size> using Bcmp = BcmpImpl<Size, 32, bcmp32>;
-} // namespace avx2
-
-namespace avx512bw {
-static inline BcmpReturnType bcmp64(CPtr p1, CPtr p2) {
- using T = char __attribute__((__vector_size__(64)));
-  // A mask indicating which bytes differ after loading 64 bytes from p1 and p2.
- const uint64_t mask = _mm512_cmpneq_epi8_mask(load<T>(p1), load<T>(p2));
- const bool mask_is_set = mask != 0;
- return static_cast<uint32_t>(mask_is_set);
-}
-template <size_t Size> using Bcmp = BcmpImpl<Size, 64, bcmp64>;
-} // namespace avx512bw
-
-// Assuming that the mask is non zero, the index of the first mismatching byte
-// is the number of trailing zeros in the mask. Trailing zeros and not leading
-// zeros because the x86 architecture is little endian.
-static inline MemcmpReturnType char_diff_no_zero(CPtr p1, CPtr p2,
-                                                 uint64_t mask) {
-  const size_t diff_index = __builtin_ctzll(mask);
-  const int16_t ca = p1[diff_index];
-  const int16_t cb = p2[diff_index];
- return ca - cb;
-}
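A worked example of the trailing-zero logic above (editorial check, not part
of this commit): if only byte 3 of the loaded blocks differs, the comparison
mask is 0b1000, the trailing-zero count (not leading, since x86 is
little-endian) recovers index 3, and the function returns p1[3] - p2[3].

static_assert(__builtin_ctzll(0b1000ull) == 3, "first mismatch at index 3");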
-
-///////////////////////////////////////////////////////////////////////////////
-// Memcmp
-
-// Base implementation for the Memcmp specializations.
-// - BlockSize is either 16, 32 or 64 depending on the available compile time
-// features, it is used to switch between "single native operation" or a
-// "sequence of native operations".
-// - BlockMemcmp is the function that implements the memcmp logic.
-// - BlockBcmp is the function that implements the bcmp logic.
-template <size_t Size, size_t BlockSize, auto BlockMemcmp, auto BlockBcmp>
-struct MemcmpImpl {
- static inline MemcmpReturnType block(CPtr p1, CPtr p2) {
- if constexpr (Size == BlockSize) {
- return BlockMemcmp(p1, p2);
- } else if constexpr (Size % BlockSize == 0) {
- for (size_t offset = 0; offset < Size; offset += BlockSize)
- if (auto value = BlockBcmp(p1 + offset, p2 + offset))
- return BlockMemcmp(p1 + offset, p2 + offset);
- } else {
- deferred_static_assert("SIZE not implemented");
- }
- return MemcmpReturnType::ZERO();
- }
-
- static inline MemcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
- return block(p1 + count - Size, p2 + count - Size);
- }
-
- static inline MemcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
- if (auto value = block(p1, p2))
- return value;
- return tail(p1, p2, count);
- }
-
- static inline MemcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) {
- static_assert(Size > 1);
- size_t offset = 0;
- do {
- if (auto value = block(p1 + offset, p2 + offset))
- return value;
- offset += Size;
- } while (offset < count - Size);
- return tail(p1, p2, count);
- }
-};
-
-namespace sse2 {
-static inline MemcmpReturnType memcmp16(CPtr p1, CPtr p2) {
- using T = char __attribute__((__vector_size__(16)));
- // A mask indicating which bytes differ after loading 16 bytes from p1 and p2.
- if (int mask = _mm_movemask_epi8(load<T>(p1) != load<T>(p2)))
- return char_diff_no_zero(p1, p2, mask);
- return MemcmpReturnType::ZERO();
-}
-template <size_t Size> using Memcmp = MemcmpImpl<Size, 16, memcmp16, bcmp16>;
-} // namespace sse2
-
-namespace avx2 {
-static inline MemcmpReturnType memcmp32(CPtr p1, CPtr p2) {
- using T = char __attribute__((__vector_size__(32)));
- // A mask indicating which bytes differ after loading 32 bytes from p1 and p2.
- if (int mask = _mm256_movemask_epi8(load<T>(p1) != load<T>(p2)))
- return char_diff_no_zero(p1, p2, mask);
- return MemcmpReturnType::ZERO();
-}
-template <size_t Size> using Memcmp = MemcmpImpl<Size, 32, memcmp32, bcmp32>;
-} // namespace avx2
-
-namespace avx512bw {
-static inline MemcmpReturnType memcmp64(CPtr p1, CPtr p2) {
- using T = char __attribute__((__vector_size__(64)));
- // A mask indicating which bytes differ after loading 64 bytes from p1 and p2.
- if (uint64_t mask = _mm512_cmpneq_epi8_mask(load<T>(p1), load<T>(p2)))
- return char_diff_no_zero(p1, p2, mask);
- return MemcmpReturnType::ZERO();
-}
-template <size_t Size> using Memcmp = MemcmpImpl<Size, 64, memcmp64, bcmp64>;
-} // namespace avx512bw
-
-} // namespace __llvm_libc::x86
-
-#endif // LLVM_LIBC_ARCH_X86_64
-
-#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H
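For reference, a minimal scalar sketch of the trailing-zeros idea used by the removed char_diff_no_zero above (first_diff is a hypothetical helper name, the SIMD movemask is replaced by a plain loop over 8 bytes):

    #include <cstddef>
    #include <cstdint>

    // Builds a little-endian byte-difference mask (bit i set when byte i
    // differs), then uses trailing zeros to locate the first mismatch.
    static int first_diff(const unsigned char *p1, const unsigned char *p2) {
      uint64_t mask = 0;
      for (size_t i = 0; i < 8; ++i)
        if (p1[i] != p2[i])
          mask |= uint64_t(1) << i;
      if (mask == 0)
        return 0; // blocks are equal
      const size_t diff_index = __builtin_ctzll(mask);
      return int(p1[diff_index]) - int(p2[diff_index]);
    }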
diff --git a/libc/src/string/memory_utils/utils.h b/libc/src/string/memory_utils/utils.h
index 9d1321f99b83a..d915835e38d82 100644
--- a/libc/src/string/memory_utils/utils.h
+++ b/libc/src/string/memory_utils/utils.h
@@ -9,8 +9,19 @@
#ifndef LLVM_LIBC_SRC_MEMORY_UTILS_UTILS_H
#define LLVM_LIBC_SRC_MEMORY_UTILS_UTILS_H
-#include "src/__support/CPP/bit.h"
-#include "src/__support/CPP/type_traits.h"
+#include "src/__support/architectures.h"
+
+// Cache line sizes for ARM: These values are not strictly correct since
+// cache line sizes depend on implementations, not architectures. There
+// are even implementations with cache line sizes configurable at boot
+// time.
+#if defined(LLVM_LIBC_ARCH_AARCH64) || defined(LLVM_LIBC_ARCH_X86)
+#define LLVM_LIBC_CACHELINE_SIZE 64
+#elif defined(LLVM_LIBC_ARCH_ARM)
+#define LLVM_LIBC_CACHELINE_SIZE 32
+#else
+#error "Unsupported platform for memory functions."
+#endif
#include <stddef.h> // size_t
#include <stdint.h> // intptr_t / uintptr_t
@@ -51,46 +62,32 @@ static constexpr size_t ge_power2(size_t value) {
return is_power2_or_zero(value) ? value : 1ULL << (log2(value) + 1);
}
-// Returns the number of bytes to substract from ptr to get to the previous
-// multiple of alignment. If ptr is already aligned returns 0.
-template <size_t alignment> uintptr_t distance_to_align_down(const void *ptr) {
+template <size_t alignment> intptr_t offset_from_last_aligned(const void *ptr) {
static_assert(is_power2(alignment), "alignment must be a power of 2");
return reinterpret_cast<uintptr_t>(ptr) & (alignment - 1U);
}
-// Returns the number of bytes to add to ptr to get to the next multiple of
-// alignment. If ptr is already aligned returns 0.
-template <size_t alignment> uintptr_t distance_to_align_up(const void *ptr) {
+template <size_t alignment> intptr_t offset_to_next_aligned(const void *ptr) {
static_assert(is_power2(alignment), "alignment must be a power of 2");
// The logic is not straightforward and involves unsigned modulo arithmetic
// but the generated code is as fast as it can be.
return -reinterpret_cast<uintptr_t>(ptr) & (alignment - 1U);
}
-// Returns the number of bytes to add to ptr to get to the next multiple of
-// alignment. If ptr is already aligned returns alignment.
-template <size_t alignment>
-uintptr_t distance_to_next_aligned(const void *ptr) {
- return alignment - distance_to_align_down<alignment>(ptr);
+// Returns the offset from `ptr` to the next cache line.
+static inline intptr_t offset_to_next_cache_line(const void *ptr) {
+ return offset_to_next_aligned<LLVM_LIBC_CACHELINE_SIZE>(ptr);
}
-// Returns the same pointer but notifies the compiler that it is aligned.
template <size_t alignment, typename T> static T *assume_aligned(T *ptr) {
return reinterpret_cast<T *>(__builtin_assume_aligned(ptr, alignment));
}
-
#if defined __has_builtin
#if __has_builtin(__builtin_memcpy_inline)
#define LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE
#endif
#endif
-#if defined __has_builtin
-#if __has_builtin(__builtin_memset_inline)
-#define LLVM_LIBC_HAS_BUILTIN_MEMSET_INLINE
-#endif
-#endif
-
// Performs a constant count copy.
template <size_t Size>
static inline void memcpy_inline(void *__restrict dst,
@@ -106,56 +103,28 @@ static inline void memcpy_inline(void *__restrict dst,
using Ptr = char *; // Pointer to raw data.
using CPtr = const char *; // Const pointer to raw data.
-// This type makes sure that we don't accidentally promote an integral type to
-// another one. It is only constructible from the exact T type.
-template <typename T> struct StrictIntegralType {
- static_assert(cpp::is_integral_v<T>);
-
- // Can only be constructed from a T.
- template <typename U, cpp::enable_if_t<cpp::is_same_v<U, T>, bool> = 0>
- StrictIntegralType(U value) : value(value) {}
-
- // Allows using the type in an if statement.
- explicit operator bool() const { return value; }
-
- // If type is unsigned (bcmp) we allow bitwise OR operations.
- StrictIntegralType operator|(const StrictIntegralType &Rhs) const {
- static_assert(!cpp::is_signed_v<T>);
- return value | Rhs.value;
- }
-
- // For interation with the C API we allow explicit conversion back to the
- // `int` type.
- explicit operator int() const {
- // bit_cast makes sure that T and int have the same size.
- return cpp::bit_cast<int>(value);
- }
-
- // Helper to get the zero value.
- static inline constexpr StrictIntegralType ZERO() { return {T(0)}; }
-
-private:
- T value;
-};
-
-using MemcmpReturnType = StrictIntegralType<int32_t>;
-using BcmpReturnType = StrictIntegralType<uint32_t>;
-
-// Loads bytes from memory (possibly unaligned) and materializes them as
-// type.
+// Loads bytes from memory (possibly unaligned) and materializes them as type.
template <typename T> static inline T load(CPtr ptr) {
T Out;
memcpy_inline<sizeof(T)>(&Out, ptr);
return Out;
}
-// Stores a value of type T in memory (possibly unaligned).
+// Stores a value of type T in memory (possibly unaligned)
template <typename T> static inline void store(Ptr ptr, T value) {
memcpy_inline<sizeof(T)>(ptr, &value);
}
-// Advances the pointers p1 and p2 by offset bytes and decrease count by the
-// same amount.
+// For an operation like memset that operates on a pointer and a count, advances
+// the pointer by offset bytes and decrease count by the same amount.
+static inline void adjust(ptrdiff_t offset, Ptr &ptr, size_t &count) {
+ ptr += offset;
+ count -= offset;
+}
+
+// For an operation like memcpy or memcmp that operates on two pointers and a
+// count, advances the pointers by offset bytes and decrease count by the same
+// amount.
template <typename T1, typename T2>
static inline void adjust(ptrdiff_t offset, T1 *__restrict &p1,
T2 *__restrict &p2, size_t &count) {
@@ -164,37 +133,31 @@ static inline void adjust(ptrdiff_t offset, T1 *__restrict &p1,
count -= offset;
}
-// Advances p1 and p2 so p1 gets aligned to the next SIZE bytes boundary
-// and decrease count by the same amount.
+// For an operation like memset that operates on a pointer and a count, advances
+// the pointer so it is aligned to SIZE bytes and decrease count by the same
+// amount.
// We make sure the compiler knows about the adjusted pointer alignment.
-template <size_t SIZE, typename T1, typename T2>
-void align_p1_to_next_boundary(T1 *__restrict &p1, T2 *__restrict &p2,
- size_t &count) {
- adjust(distance_to_next_aligned<SIZE>(p1), p1, p2, count);
- p1 = assume_aligned<SIZE>(p1);
-}
-
-// Same as align_p1_to_next_boundary above but with a single pointer instead.
-template <size_t SIZE, typename T1>
-void align_to_next_boundary(T1 *&p1, size_t &count) {
- CPtr dummy;
- align_p1_to_next_boundary<SIZE>(p1, dummy, count);
+template <size_t SIZE> void align(Ptr &ptr, size_t &count) {
+ adjust(offset_to_next_aligned<SIZE>(ptr), ptr, count);
+ ptr = assume_aligned<SIZE>(ptr);
}
-// An enum class that discriminates between the first and second pointer.
-enum class Arg { P1, P2, Dst = P1, Src = P2 };
-
-// Same as align_p1_to_next_boundary but allows for aligning p2 instead of p1.
-// Precondition: &p1 != &p2
+// For an operation like memcpy or memcmp that operates on two pointers and a
+// count, advances the pointers so one of them gets aligned to SIZE bytes and
+// decrease count by the same amount.
+// We make sure the compiler knows about the adjusted pointer alignment.
+enum class Arg { _1, _2, Dst = _1, Src = _2, Lhs = _1, Rhs = _2 };
template <size_t SIZE, Arg AlignOn, typename T1, typename T2>
-void align_to_next_boundary(T1 *__restrict &p1, T2 *__restrict &p2,
- size_t &count) {
- if constexpr (AlignOn == Arg::P1)
- align_p1_to_next_boundary<SIZE>(p1, p2, count);
- else if constexpr (AlignOn == Arg::P2)
- align_p1_to_next_boundary<SIZE>(p2, p1, count); // swapping p1 and p2.
- else
- deferred_static_assert("AlignOn must be either Arg::P1 or Arg::P2");
+void align(T1 *__restrict &p1, T2 *__restrict &p2, size_t &count) {
+ if constexpr (AlignOn == Arg::_1) {
+ adjust(offset_to_next_aligned<SIZE>(p1), p1, p2, count);
+ p1 = assume_aligned<SIZE>(p1);
+ } else if constexpr (AlignOn == Arg::_2) {
+ adjust(offset_to_next_aligned<SIZE>(p2), p1, p2, count);
+ p2 = assume_aligned<SIZE>(p2);
+ } else {
+ deferred_static_assert("AlignOn must be either Arg::_1 or Arg::_2");
+ }
}
} // namespace __llvm_libc
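For reference, a small standalone sketch of the unsigned modulo arithmetic that the restored offset_to_next_aligned and offset_from_last_aligned helpers rely on (the main function and the printed table are illustrative only):

    #include <cstdint>
    #include <cstdio>

    int main() {
      const uintptr_t alignment = 16;
      for (uintptr_t addr : {0, 1, 15, 16, 17}) {
        // Bytes to add to reach the next multiple of `alignment`.
        uintptr_t up = (0 - addr) & (alignment - 1);
        // Bytes already past the previous multiple of `alignment`.
        uintptr_t down = addr & (alignment - 1);
        std::printf("addr=%2ju up=%2ju down=%2ju\n", (uintmax_t)addr,
                    (uintmax_t)up, (uintmax_t)down);
      }
      return 0;
    }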
diff --git a/libc/src/string/memset.cpp b/libc/src/string/memset.cpp
index 1b492b5730204..549c0742dec75 100644
--- a/libc/src/string/memset.cpp
+++ b/libc/src/string/memset.cpp
@@ -13,8 +13,8 @@
namespace __llvm_libc {
LLVM_LIBC_FUNCTION(void *, memset, (void *dst, int value, size_t count)) {
- inline_memset(reinterpret_cast<char *>(dst), static_cast<uint8_t>(value),
- count);
+ inline_memset(reinterpret_cast<char *>(dst),
+ static_cast<unsigned char>(value), count);
return dst;
}
diff --git a/libc/test/src/string/bcmp_test.cpp b/libc/test/src/string/bcmp_test.cpp
index 8f0fe5262f22d..19df7ad2637a2 100644
--- a/libc/test/src/string/bcmp_test.cpp
+++ b/libc/test/src/string/bcmp_test.cpp
@@ -12,25 +12,25 @@
TEST(LlvmLibcBcmpTest, CmpZeroByte) {
const char *lhs = "ab";
const char *rhs = "bc";
- ASSERT_EQ(__llvm_libc::bcmp(lhs, rhs, 0), 0);
+ EXPECT_EQ(__llvm_libc::bcmp(lhs, rhs, 0), 0);
}
TEST(LlvmLibcBcmpTest, LhsRhsAreTheSame) {
const char *lhs = "ab";
const char *rhs = "ab";
- ASSERT_EQ(__llvm_libc::bcmp(lhs, rhs, 2), 0);
+ EXPECT_EQ(__llvm_libc::bcmp(lhs, rhs, 2), 0);
}
TEST(LlvmLibcBcmpTest, LhsBeforeRhsLexically) {
const char *lhs = "ab";
const char *rhs = "ac";
- ASSERT_NE(__llvm_libc::bcmp(lhs, rhs, 2), 0);
+ EXPECT_NE(__llvm_libc::bcmp(lhs, rhs, 2), 0);
}
TEST(LlvmLibcBcmpTest, LhsAfterRhsLexically) {
const char *lhs = "ac";
const char *rhs = "ab";
- ASSERT_NE(__llvm_libc::bcmp(lhs, rhs, 2), 0);
+ EXPECT_NE(__llvm_libc::bcmp(lhs, rhs, 2), 0);
}
TEST(LlvmLibcBcmpTest, Sweep) {
@@ -46,13 +46,13 @@ TEST(LlvmLibcBcmpTest, Sweep) {
reset(lhs);
reset(rhs);
for (size_t i = 0; i < K_MAX_SIZE; ++i)
- ASSERT_EQ(__llvm_libc::bcmp(lhs, rhs, i), 0);
+ EXPECT_EQ(__llvm_libc::bcmp(lhs, rhs, i), 0);
reset(lhs);
reset(rhs);
for (size_t i = 0; i < K_MAX_SIZE; ++i) {
rhs[i] = 'b';
- ASSERT_NE(__llvm_libc::bcmp(lhs, rhs, K_MAX_SIZE), 0);
+ EXPECT_NE(__llvm_libc::bcmp(lhs, rhs, K_MAX_SIZE), 0);
rhs[i] = 'a';
}
}
diff --git a/libc/test/src/string/memmove_test.cpp b/libc/test/src/string/memmove_test.cpp
index 451ccdb0a89b9..26b4d9e9d675d 100644
--- a/libc/test/src/string/memmove_test.cpp
+++ b/libc/test/src/string/memmove_test.cpp
@@ -20,7 +20,7 @@ TEST(LlvmLibcMemmoveTest, MoveZeroByte) {
void *const Dst = Buffer;
void *const Ret = __llvm_libc::memmove(Dst, Buffer + 2, 0);
EXPECT_EQ(Ret, Dst);
- ASSERT_MEM_EQ(Buffer, Expected);
+ EXPECT_MEM_EQ(Buffer, Expected);
}
TEST(LlvmLibcMemmoveTest, DstAndSrcPointToSameAddress) {
@@ -29,7 +29,7 @@ TEST(LlvmLibcMemmoveTest, DstAndSrcPointToSameAddress) {
void *const Dst = Buffer;
void *const Ret = __llvm_libc::memmove(Dst, Buffer, 1);
EXPECT_EQ(Ret, Dst);
- ASSERT_MEM_EQ(Buffer, Expected);
+ EXPECT_MEM_EQ(Buffer, Expected);
}
TEST(LlvmLibcMemmoveTest, DstStartsBeforeSrc) {
@@ -40,7 +40,7 @@ TEST(LlvmLibcMemmoveTest, DstStartsBeforeSrc) {
void *const Dst = Buffer + 1;
void *const Ret = __llvm_libc::memmove(Dst, Buffer + 2, 2);
EXPECT_EQ(Ret, Dst);
- ASSERT_MEM_EQ(Buffer, Expected);
+ EXPECT_MEM_EQ(Buffer, Expected);
}
TEST(LlvmLibcMemmoveTest, DstStartsAfterSrc) {
@@ -49,7 +49,7 @@ TEST(LlvmLibcMemmoveTest, DstStartsAfterSrc) {
void *const Dst = Buffer + 2;
void *const Ret = __llvm_libc::memmove(Dst, Buffer + 1, 2);
EXPECT_EQ(Ret, Dst);
- ASSERT_MEM_EQ(Buffer, Expected);
+ EXPECT_MEM_EQ(Buffer, Expected);
}
// e.g. `Dst` follow `src`.
@@ -62,7 +62,7 @@ TEST(LlvmLibcMemmoveTest, SrcFollowDst) {
void *const Dst = Buffer + 1;
void *const Ret = __llvm_libc::memmove(Dst, Buffer + 2, 1);
EXPECT_EQ(Ret, Dst);
- ASSERT_MEM_EQ(Buffer, Expected);
+ EXPECT_MEM_EQ(Buffer, Expected);
}
TEST(LlvmLibcMemmoveTest, DstFollowSrc) {
@@ -71,7 +71,7 @@ TEST(LlvmLibcMemmoveTest, DstFollowSrc) {
void *const Dst = Buffer + 2;
void *const Ret = __llvm_libc::memmove(Dst, Buffer + 1, 1);
EXPECT_EQ(Ret, Dst);
- ASSERT_MEM_EQ(Buffer, Expected);
+ EXPECT_MEM_EQ(Buffer, Expected);
}
static constexpr int kMaxSize = 512;
@@ -106,7 +106,7 @@ TEST(LlvmLibcMemmoveTest, Thorough) {
void *const Ret =
__llvm_libc::memmove(Dst, Buffer.data() + SrcOffset, Size);
EXPECT_EQ(Ret, Dst);
- ASSERT_MEM_EQ(Buffer, Expected);
+ EXPECT_MEM_EQ(Buffer, Expected);
}
}
}
diff --git a/libc/test/src/string/memory_utils/CMakeLists.txt b/libc/test/src/string/memory_utils/CMakeLists.txt
index d54f8457e5148..8f926273de5d5 100644
--- a/libc/test/src/string/memory_utils/CMakeLists.txt
+++ b/libc/test/src/string/memory_utils/CMakeLists.txt
@@ -3,6 +3,8 @@ add_libc_unittest(
SUITE
libc_string_unittests
SRCS
+ elements_test.cpp
+ memory_access_test.cpp
utils_test.cpp
COMPILE_OPTIONS
${LIBC_COMPILE_OPTIONS_NATIVE}
diff --git a/libc/test/src/string/memory_utils/elements_test.cpp b/libc/test/src/string/memory_utils/elements_test.cpp
new file mode 100644
index 0000000000000..218700137c111
--- /dev/null
+++ b/libc/test/src/string/memory_utils/elements_test.cpp
@@ -0,0 +1,137 @@
+//===-- Unittests for memory_utils ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/array.h"
+#include "src/__support/CPP/span.h"
+#include "src/string/memory_utils/elements.h"
+#include "utils/UnitTest/Test.h"
+
+namespace __llvm_libc {
+
+// Registering Types
+using FixedSizeTypes = testing::TypeList<
+#if defined(__SSE2__)
+ x86::Vector128, //
+#endif // __SSE2__
+#if defined(__AVX2__)
+ x86::Vector256, //
+#endif // __AVX2__
+#if defined(__AVX512F__) and defined(__AVX512BW__)
+ x86::Vector512, //
+#endif // defined(__AVX512F__) and defined(__AVX512BW__)
+ scalar::UINT8, //
+ scalar::UINT16, //
+ scalar::UINT32, //
+ scalar::UINT64, //
+ Repeated<scalar::UINT64, 2>, //
+ Repeated<scalar::UINT64, 4>, //
+ Repeated<scalar::UINT64, 8>, //
+ Repeated<scalar::UINT64, 16>, //
+ Repeated<scalar::UINT64, 32>, //
+ Chained<scalar::UINT16, scalar::UINT8>, //
+ Chained<scalar::UINT32, scalar::UINT16, scalar::UINT8>, //
+ builtin::_1, //
+ builtin::_2, //
+ builtin::_3, //
+ builtin::_4, //
+ builtin::_8 //
+ >;
+
+char GetRandomChar() {
+ static constexpr const uint64_t a = 1103515245;
+ static constexpr const uint64_t c = 12345;
+ static constexpr const uint64_t m = 1ULL << 31;
+ static uint64_t seed = 123456789;
+ seed = (a * seed + c) % m;
+ return seed;
+}
+
+void Randomize(cpp::span<char> buffer) {
+ for (auto &current : buffer)
+ current = GetRandomChar();
+}
+
+template <typename Element> using Buffer = cpp::array<char, Element::SIZE>;
+
+template <typename Element> Buffer<Element> GetRandomBuffer() {
+ Buffer<Element> buffer;
+ Randomize(buffer);
+ return buffer;
+}
+
+TYPED_TEST(LlvmLibcMemoryElements, copy, FixedSizeTypes) {
+ Buffer<ParamType> Dst;
+ const auto buffer = GetRandomBuffer<ParamType>();
+ copy<ParamType>(Dst.data(), buffer.data());
+ for (size_t i = 0; i < ParamType::SIZE; ++i)
+ EXPECT_EQ(Dst[i], buffer[i]);
+}
+
+template <typename T> T copy(const T &Input) {
+ T Output;
+ for (size_t I = 0; I < Input.size(); ++I)
+ Output[I] = Input[I];
+ return Output;
+}
+
+TYPED_TEST(LlvmLibcMemoryElements, Move, FixedSizeTypes) {
+ constexpr size_t SIZE = ParamType::SIZE;
+ using LargeBuffer = cpp::array<char, SIZE * 2>;
+ LargeBuffer GroundTruth;
+ Randomize(GroundTruth);
+ // Forward, we move the SIZE first bytes from offset 0 to SIZE.
+ for (size_t Offset = 0; Offset < SIZE; ++Offset) {
+ LargeBuffer Buffer = copy(GroundTruth);
+ move<ParamType>(&Buffer[Offset], &Buffer[0]);
+ for (size_t I = 0; I < SIZE; ++I)
+ EXPECT_EQ(Buffer[I + Offset], GroundTruth[I]);
+ }
+ // Backward, we move the SIZE last bytes from offset 0 to SIZE.
+ for (size_t Offset = 0; Offset < SIZE; ++Offset) {
+ LargeBuffer Buffer = copy(GroundTruth);
+ move<ParamType>(&Buffer[Offset], &Buffer[SIZE]);
+ for (size_t I = 0; I < SIZE; ++I)
+ EXPECT_EQ(Buffer[I + Offset], GroundTruth[SIZE + I]);
+ }
+}
+
+TYPED_TEST(LlvmLibcMemoryElements, Equals, FixedSizeTypes) {
+ const auto buffer = GetRandomBuffer<ParamType>();
+ EXPECT_TRUE(equals<ParamType>(buffer.data(), buffer.data()));
+}
+
+TYPED_TEST(LlvmLibcMemoryElements, three_way_compare, FixedSizeTypes) {
+ Buffer<ParamType> initial;
+ for (auto &c : initial)
+ c = 5;
+
+ // Testing equality
+ EXPECT_EQ(three_way_compare<ParamType>(initial.data(), initial.data()), 0);
+
+ // Testing all mismatching positions
+ for (size_t i = 0; i < ParamType::SIZE; ++i) {
+ auto copy = initial;
+ ++copy[i]; // copy is now lexicographycally greated than initial
+ const auto *less = initial.data();
+ const auto *greater = copy.data();
+ EXPECT_LT(three_way_compare<ParamType>(less, greater), 0);
+ EXPECT_GT(three_way_compare<ParamType>(greater, less), 0);
+ }
+}
+
+TYPED_TEST(LlvmLibcMemoryElements, Splat, FixedSizeTypes) {
+ Buffer<ParamType> Dst;
+ const cpp::array<char, 3> values = {char(0x00), char(0x7F), char(0xFF)};
+ for (char value : values) {
+ splat_set<ParamType>(Dst.data(), value);
+ for (size_t i = 0; i < ParamType::SIZE; ++i)
+ EXPECT_EQ(Dst[i], value);
+ }
+}
+
+} // namespace __llvm_libc
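The restored GetRandomChar above is a small linear congruential generator; a standalone sketch with the same constants (the main function and the printed states are illustrative only) shows how the fixed seed yields a reproducible byte stream:

    #include <cstdint>
    #include <cstdio>

    int main() {
      const uint64_t a = 1103515245, c = 12345, m = 1ULL << 31;
      uint64_t seed = 123456789;
      for (int i = 0; i < 4; ++i) {
        seed = (a * seed + c) % m;
        // The test keeps only the truncated low byte of each state as a char.
        std::printf("state=%llu byte=0x%02x\n", (unsigned long long)seed,
                    (unsigned)(seed & 0xFF));
      }
      return 0;
    }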
diff --git a/libc/test/src/string/memory_utils/memory_access_test.cpp b/libc/test/src/string/memory_utils/memory_access_test.cpp
new file mode 100644
index 0000000000000..b81700f0eb255
--- /dev/null
+++ b/libc/test/src/string/memory_utils/memory_access_test.cpp
@@ -0,0 +1,228 @@
+//===-- Unittests for memory_utils ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#define LLVM_LIBC_UNITTEST_OBSERVE 1
+
+#include "src/__support/CPP/array.h"
+#include "src/string/memory_utils/elements.h"
+#include "utils/UnitTest/Test.h"
+
+#include <stdio.h>
+#include <string.h>
+
+namespace __llvm_libc {
+
+static constexpr const size_t kMaxBuffer = 32;
+
+struct BufferAccess : cpp::array<char, kMaxBuffer + 1> {
+ BufferAccess() { Reset(); }
+ void Reset() {
+ for (auto &value : *this)
+ value = '0';
+ this->operator[](kMaxBuffer) = '\0';
+ }
+ void Touch(ptrdiff_t offset, size_t size) {
+ if (offset < 0)
+ return;
+ for (size_t i = 0; i < size; ++i)
+ ++(*this)[offset + i];
+ }
+ operator const char *() const { return this->data(); }
+};
+
+struct Buffer {
+ ptrdiff_t Offset(const char *ptr) const {
+ const bool contained = ptr >= data.begin() && ptr < data.end();
+ return contained ? ptr - data.begin() : -1;
+ }
+ void Reset() {
+ reads.Reset();
+ writes.Reset();
+ }
+ cpp::array<char, kMaxBuffer> data;
+ BufferAccess __attribute__((aligned(64))) reads;
+ BufferAccess __attribute__((aligned(64))) writes;
+};
+
+struct MemoryAccessObserver {
+ void ObserveRead(const char *ptr, size_t size) {
+ Buffer1.reads.Touch(Buffer1.Offset(ptr), size);
+ Buffer2.reads.Touch(Buffer2.Offset(ptr), size);
+ }
+
+ void ObserveWrite(const char *ptr, size_t size) {
+ Buffer1.writes.Touch(Buffer1.Offset(ptr), size);
+ Buffer2.writes.Touch(Buffer2.Offset(ptr), size);
+ }
+
+ void Reset() {
+ Buffer1.Reset();
+ Buffer2.Reset();
+ }
+
+ Buffer Buffer1;
+ Buffer Buffer2;
+};
+
+MemoryAccessObserver Observer;
+
+template <size_t Size> struct TestingElement {
+ static constexpr size_t SIZE = Size;
+
+ static void copy(char *__restrict dst, const char *__restrict src) {
+ Observer.ObserveRead(src, SIZE);
+ Observer.ObserveWrite(dst, SIZE);
+ }
+
+ static bool equals(const char *lhs, const char *rhs) {
+ Observer.ObserveRead(lhs, SIZE);
+ Observer.ObserveRead(rhs, SIZE);
+ return true;
+ }
+
+ static int three_way_compare(const char *lhs, const char *rhs) {
+ Observer.ObserveRead(lhs, SIZE);
+ Observer.ObserveRead(rhs, SIZE);
+ return 0;
+ }
+
+ static void splat_set(char *dst, const unsigned char value) {
+ Observer.ObserveWrite(dst, SIZE);
+ }
+};
+
+using Types = testing::TypeList<
+ TestingElement<1>, // 1 Byte
+ TestingElement<2>, // 2 Bytes
+ TestingElement<4>, // 4 Bytes
+ Repeated<TestingElement<2>, 3>, // 6 Bytes
+ Chained<TestingElement<4>, TestingElement<2>, TestingElement<1>> // 7 Bytes
+ >;
+
+struct LlvmLibcTestAccessBase : public testing::Test {
+
+ template <typename HigherOrder, size_t Size, size_t Offset = 0>
+ void checkOperations(const BufferAccess &expected) {
+ static const BufferAccess untouched;
+
+ Observer.Reset();
+ HigherOrder::copy(dst_ptr() + Offset, src_ptr() + Offset, Size);
+ ASSERT_STREQ(src().writes, untouched);
+ ASSERT_STREQ(dst().reads, untouched);
+ ASSERT_STREQ(src().reads, expected);
+ ASSERT_STREQ(dst().writes, expected);
+ Observer.Reset();
+ HigherOrder::equals(lhs_ptr() + Offset, rhs_ptr() + Offset, Size);
+ ASSERT_STREQ(lhs().writes, untouched);
+ ASSERT_STREQ(rhs().writes, untouched);
+ ASSERT_STREQ(lhs().reads, expected);
+ ASSERT_STREQ(rhs().reads, expected);
+ Observer.Reset();
+ HigherOrder::three_way_compare(lhs_ptr() + Offset, rhs_ptr() + Offset,
+ Size);
+ ASSERT_STREQ(lhs().writes, untouched);
+ ASSERT_STREQ(rhs().writes, untouched);
+ ASSERT_STREQ(lhs().reads, expected);
+ ASSERT_STREQ(rhs().reads, expected);
+ Observer.Reset();
+ HigherOrder::splat_set(dst_ptr() + Offset, 5, Size);
+ ASSERT_STREQ(src().reads, untouched);
+ ASSERT_STREQ(src().writes, untouched);
+ ASSERT_STREQ(dst().reads, untouched);
+ ASSERT_STREQ(dst().writes, expected);
+ }
+
+ void checkMaxAccess(const BufferAccess &expected, int max) {
+ for (size_t i = 0; i < kMaxBuffer; ++i) {
+ int value = (int)expected[i] - '0';
+ ASSERT_GE(value, 0);
+ ASSERT_LE(value, max);
+ }
+ }
+
+private:
+ const Buffer &lhs() const { return Observer.Buffer1; }
+ const Buffer &rhs() const { return Observer.Buffer2; }
+ const Buffer &src() const { return Observer.Buffer2; }
+ const Buffer &dst() const { return Observer.Buffer1; }
+ Buffer &dst() { return Observer.Buffer1; }
+
+ char *dst_ptr() { return dst().data.begin(); }
+ const char *src_ptr() { return src().data.begin(); }
+ const char *lhs_ptr() { return lhs().data.begin(); }
+ const char *rhs_ptr() { return rhs().data.begin(); }
+};
+
+template <typename ParamType>
+struct LlvmLibcTestAccessTail : public LlvmLibcTestAccessBase {
+
+ void TearDown() override {
+ static constexpr size_t Size = 10;
+
+ BufferAccess expected;
+ expected.Touch(Size - ParamType::SIZE, ParamType::SIZE);
+
+ checkMaxAccess(expected, 1);
+ checkOperations<Tail<ParamType>, Size>(expected);
+ }
+};
+TYPED_TEST_F(LlvmLibcTestAccessTail, Operations, Types) {}
+
+template <typename ParamType>
+struct LlvmLibcTestAccessHeadTail : public LlvmLibcTestAccessBase {
+ void TearDown() override {
+ static constexpr size_t Size = 10;
+
+ BufferAccess expected;
+ expected.Touch(0, ParamType::SIZE);
+ expected.Touch(Size - ParamType::SIZE, ParamType::SIZE);
+
+ checkMaxAccess(expected, 2);
+ checkOperations<HeadTail<ParamType>, Size>(expected);
+ }
+};
+TYPED_TEST_F(LlvmLibcTestAccessHeadTail, Operations, Types) {}
+
+template <typename ParamType>
+struct LlvmLibcTestAccessLoop : public LlvmLibcTestAccessBase {
+ void TearDown() override {
+ static constexpr size_t Size = 20;
+
+ BufferAccess expected;
+ for (size_t i = 0; i < Size - ParamType::SIZE; i += ParamType::SIZE)
+ expected.Touch(i, ParamType::SIZE);
+ expected.Touch(Size - ParamType::SIZE, ParamType::SIZE);
+
+ checkMaxAccess(expected, 2);
+ checkOperations<Loop<ParamType>, Size>(expected);
+ }
+};
+TYPED_TEST_F(LlvmLibcTestAccessLoop, Operations, Types) {}
+
+template <typename ParamType>
+struct LlvmLibcTestAccessAlignedAccess : public LlvmLibcTestAccessBase {
+ void TearDown() override {
+ static constexpr size_t Size = 10;
+ static constexpr size_t Offset = 2;
+ using AlignmentT = TestingElement<4>;
+
+ BufferAccess expected;
+ expected.Touch(Offset, AlignmentT::SIZE);
+ expected.Touch(AlignmentT::SIZE, ParamType::SIZE);
+ expected.Touch(Offset + Size - ParamType::SIZE, ParamType::SIZE);
+
+ checkMaxAccess(expected, 3);
+ checkOperations<Align<AlignmentT, Arg::_1>::Then<HeadTail<ParamType>>, Size,
+ Offset>(expected);
+ checkOperations<Align<AlignmentT, Arg::_2>::Then<HeadTail<ParamType>>, Size,
+ Offset>(expected);
+ }
+};
+TYPED_TEST_F(LlvmLibcTestAccessAlignedAccess, Operations, Types) {}
+
+} // namespace __llvm_libc
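The BufferAccess shadow buffers restored above count accesses by keeping one printable digit per byte; a tiny standalone sketch of that counting idea (the touch lambda and the buffer size are illustrative only):

    #include <array>
    #include <cstddef>
    #include <cstdio>

    int main() {
      std::array<char, 9> shadow;
      shadow.fill('0');
      shadow[8] = '\0';
      auto touch = [&shadow](size_t offset, size_t size) {
        for (size_t i = 0; i < size; ++i)
          ++shadow[offset + i]; // one increment per observed access
      };
      touch(0, 4); // head of an 8-byte operation
      touch(4, 4); // tail of the same operation
      std::printf("%s\n", shadow.data()); // prints 11111111
      return 0;
    }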
diff --git a/libc/test/src/string/memory_utils/utils_test.cpp b/libc/test/src/string/memory_utils/utils_test.cpp
index 5c7920c4960b6..a20c0900b7234 100644
--- a/libc/test/src/string/memory_utils/utils_test.cpp
+++ b/libc/test/src/string/memory_utils/utils_test.cpp
@@ -72,41 +72,55 @@ TEST(LlvmLibcUtilsTest, GEPowerOf2) {
EXPECT_EQ(ge_power2(i), kExpectedValues[i]);
}
-using UINT = uintptr_t;
+using I = intptr_t;
// Converts an offset into a pointer.
const void *forge(size_t offset) {
return reinterpret_cast<const void *>(offset);
}
-TEST(LlvmLibcUtilsTest, DistanceToNextAligned) {
- EXPECT_EQ(distance_to_next_aligned<16>(forge(0)), UINT(16));
- EXPECT_EQ(distance_to_next_aligned<16>(forge(1)), UINT(15));
- EXPECT_EQ(distance_to_next_aligned<16>(forge(16)), UINT(16));
- EXPECT_EQ(distance_to_next_aligned<16>(forge(15)), UINT(1));
- EXPECT_EQ(distance_to_next_aligned<32>(forge(16)), UINT(16));
+TEST(LlvmLibcUtilsTest, OffsetToNextAligned) {
+ EXPECT_EQ(offset_to_next_aligned<16>(forge(0)), I(0));
+ EXPECT_EQ(offset_to_next_aligned<16>(forge(1)), I(15));
+ EXPECT_EQ(offset_to_next_aligned<16>(forge(16)), I(0));
+ EXPECT_EQ(offset_to_next_aligned<16>(forge(15)), I(1));
+ EXPECT_EQ(offset_to_next_aligned<32>(forge(16)), I(16));
}
-TEST(LlvmLibcUtilsTest, DistanceToAlignUp) {
- EXPECT_EQ(distance_to_align_up<16>(forge(0)), UINT(0));
- EXPECT_EQ(distance_to_align_up<16>(forge(1)), UINT(15));
- EXPECT_EQ(distance_to_align_up<16>(forge(16)), UINT(0));
- EXPECT_EQ(distance_to_align_up<16>(forge(15)), UINT(1));
- EXPECT_EQ(distance_to_align_up<32>(forge(16)), UINT(16));
+TEST(LlvmLibcUtilsTest, OffsetFromLastAligned) {
+ EXPECT_EQ(offset_from_last_aligned<16>(forge(0)), I(0));
+ EXPECT_EQ(offset_from_last_aligned<16>(forge(1)), I(1));
+ EXPECT_EQ(offset_from_last_aligned<16>(forge(16)), I(0));
+ EXPECT_EQ(offset_from_last_aligned<16>(forge(15)), I(15));
+ EXPECT_EQ(offset_from_last_aligned<32>(forge(16)), I(16));
}
-TEST(LlvmLibcUtilsTest, DistanceToAlignDown) {
- EXPECT_EQ(distance_to_align_down<16>(forge(0)), UINT(0));
- EXPECT_EQ(distance_to_align_down<16>(forge(1)), UINT(1));
- EXPECT_EQ(distance_to_align_down<16>(forge(16)), UINT(0));
- EXPECT_EQ(distance_to_align_down<16>(forge(15)), UINT(15));
- EXPECT_EQ(distance_to_align_down<32>(forge(16)), UINT(16));
+TEST(LlvmLibcUtilsTest, OffsetToNextCacheLine) {
+ EXPECT_GT(LLVM_LIBC_CACHELINE_SIZE, 0);
+ EXPECT_EQ(offset_to_next_cache_line(forge(0)), I(0));
+ EXPECT_EQ(offset_to_next_cache_line(forge(1)),
+ I(LLVM_LIBC_CACHELINE_SIZE - 1));
+ EXPECT_EQ(offset_to_next_cache_line(forge(LLVM_LIBC_CACHELINE_SIZE)), I(0));
+ EXPECT_EQ(offset_to_next_cache_line(forge(LLVM_LIBC_CACHELINE_SIZE - 1)),
+ I(1));
+}
+
+TEST(LlvmLibcUtilsTest, Adjust1) {
+ char a;
+ const size_t base_size = 10;
+ for (size_t I = -2; I < 2; ++I) {
+ auto *ptr = &a;
+ size_t size = base_size;
+ adjust(I, ptr, size);
+ EXPECT_EQ(intptr_t(ptr), intptr_t(&a + I));
+ EXPECT_EQ(size, base_size - I);
+ }
}
TEST(LlvmLibcUtilsTest, Adjust2) {
char a, b;
const size_t base_size = 10;
- for (ptrdiff_t I = -2; I < 2; ++I) {
+ for (size_t I = -2; I < 2; ++I) {
auto *p1 = &a;
auto *p2 = &b;
size_t size = base_size;
@@ -117,6 +131,19 @@ TEST(LlvmLibcUtilsTest, Adjust2) {
}
}
+TEST(LlvmLibcUtilsTest, Align1) {
+ char a;
+ const size_t base_size = 10;
+ {
+ auto *ptr = &a;
+ size_t size = base_size;
+ align<128>(ptr, size);
+ EXPECT_TRUE(uintptr_t(ptr) % 128 == 0);
+ EXPECT_GE(ptr, &a);
+ EXPECT_EQ(size_t(ptr - &a), base_size - size);
+ }
+}
+
TEST(LlvmLibcUtilsTest, Align2) {
char a, b;
const size_t base_size = 10;
@@ -124,10 +151,10 @@ TEST(LlvmLibcUtilsTest, Align2) {
auto *p1 = &a;
auto *p2 = &b;
size_t size = base_size;
- align_to_next_boundary<128, Arg::P1>(p1, p2, size);
+ align<128, Arg::_1>(p1, p2, size);
EXPECT_TRUE(uintptr_t(p1) % 128 == 0);
- EXPECT_GT(p1, &a);
- EXPECT_GT(p2, &b);
+ EXPECT_GE(p1, &a);
+ EXPECT_GE(p2, &b);
EXPECT_EQ(size_t(p1 - &a), base_size - size);
EXPECT_EQ(size_t(p2 - &b), base_size - size);
}
@@ -135,10 +162,10 @@ TEST(LlvmLibcUtilsTest, Align2) {
auto *p1 = &a;
auto *p2 = &b;
size_t size = base_size;
- align_to_next_boundary<128, Arg::P2>(p1, p2, size);
+ align<128, Arg::_2>(p1, p2, size);
EXPECT_TRUE(uintptr_t(p2) % 128 == 0);
- EXPECT_GT(p1, &a);
- EXPECT_GT(p2, &b);
+ EXPECT_GE(p1, &a);
+ EXPECT_GE(p2, &b);
EXPECT_EQ(size_t(p1 - &a), base_size - size);
EXPECT_EQ(size_t(p2 - &b), base_size - size);
}
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index f94b50268a728..90aea2c75cc2e 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -973,10 +973,9 @@ no_sanitize_features = [
cc_library(
name = "string_memory_utils",
hdrs = [
- "src/string/memory_utils/op_aarch64.h",
- "src/string/memory_utils/op_builtin.h",
- "src/string/memory_utils/op_generic.h",
- "src/string/memory_utils/op_x86.h",
+ "src/string/memory_utils/elements.h",
+ "src/string/memory_utils/elements_aarch64.h",
+ "src/string/memory_utils/elements_x86.h",
"src/string/memory_utils/utils.h",
],
textual_hdrs = [
@@ -989,8 +988,6 @@ cc_library(
deps = [
":__support_common",
":__support_cpp_bit",
- ":__support_cpp_type_traits",
- ":__support_cpp_array",
":libc_root",
],
)