[libc-commits] [libc] e49a608 - Revert D148717 "[libc] Improve memcmp latency and codegen"
Guillaume Chatelet via libc-commits
libc-commits at lists.llvm.org
Mon Jun 5 02:51:28 PDT 2023
Author: Guillaume Chatelet
Date: 2023-06-05T09:50:30Z
New Revision: e49a6085111b9cbba69b330f9bfc6e0aae9e53e8
URL: https://github.com/llvm/llvm-project/commit/e49a6085111b9cbba69b330f9bfc6e0aae9e53e8
DIFF: https://github.com/llvm/llvm-project/commit/e49a6085111b9cbba69b330f9bfc6e0aae9e53e8.diff
LOG: Revert D148717 "[libc] Improve memcmp latency and codegen"
This reverts commit 9ec6ebd3ceabb29482aa18a64b943788b65223dc.
The patch broke the RISC-V and AArch64 buildbots.
Added:
Modified:
libc/src/string/CMakeLists.txt
libc/src/string/memory_utils/CMakeLists.txt
libc/src/string/memory_utils/bcmp_implementations.h
libc/src/string/memory_utils/memcmp_implementations.h
libc/src/string/memory_utils/op_generic.h
libc/src/string/memory_utils/op_x86.h
libc/src/string/memory_utils/utils.h
libc/src/string/memory_utils/x86_64/memcmp_implementations.h
libc/test/src/string/memory_utils/op_tests.cpp
utils/bazel/llvm-project-overlay/libc/BUILD.bazel
Removed:
################################################################################
diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt
index e1b1e497a76ba..554f33b21bd0a 100644
--- a/libc/src/string/CMakeLists.txt
+++ b/libc/src/string/CMakeLists.txt
@@ -450,12 +450,6 @@ function(add_implementation name impl_name)
endforeach()
endif()
- if("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
- # Prevent warning when passing x86 SIMD types as template arguments.
- # e.g. "warning: ignoring attributes on template argument ā__m128iā [-Wignored-attributes]"
- list(APPEND ADD_IMPL_COMPILE_OPTIONS "-Wno-ignored-attributes")
- endif()
-
add_entrypoint_object(${impl_name}
NAME ${name}
SRCS ${ADD_IMPL_SRCS}
@@ -570,7 +564,7 @@ endfunction()
if(${LIBC_TARGET_ARCHITECTURE_IS_X86})
add_memcpy(memcpy_x86_64_opt_sse2 COMPILE_OPTIONS -march=k8 REQUIRE SSE2)
add_memcpy(memcpy_x86_64_opt_sse4 COMPILE_OPTIONS -march=nehalem REQUIRE SSE4_2)
- add_memcpy(memcpy_x86_64_opt_avx COMPILE_OPTIONS -march=haswell REQUIRE AVX)
+ add_memcpy(memcpy_x86_64_opt_avx2 COMPILE_OPTIONS -march=haswell REQUIRE AVX2)
add_memcpy(memcpy_x86_64_opt_avx512 COMPILE_OPTIONS -march=skylake-avx512 REQUIRE AVX512F)
add_memcpy(memcpy_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE})
add_memcpy(memcpy)
diff --git a/libc/src/string/memory_utils/CMakeLists.txt b/libc/src/string/memory_utils/CMakeLists.txt
index 5b96218600f95..7bb0e960ee13d 100644
--- a/libc/src/string/memory_utils/CMakeLists.txt
+++ b/libc/src/string/memory_utils/CMakeLists.txt
@@ -24,7 +24,6 @@ add_header_library(
libc.src.__support.CPP.type_traits
libc.src.__support.macros.config
libc.src.__support.macros.optimization
- libc.src.__support.macros.properties.architectures
)
add_header_library(
diff --git a/libc/src/string/memory_utils/bcmp_implementations.h b/libc/src/string/memory_utils/bcmp_implementations.h
index d811c1612689a..070e7794102ff 100644
--- a/libc/src/string/memory_utils/bcmp_implementations.h
+++ b/libc/src/string/memory_utils/bcmp_implementations.h
@@ -22,17 +22,21 @@
namespace __llvm_libc {
[[maybe_unused]] LIBC_INLINE BcmpReturnType
-inline_bcmp_byte_per_byte(CPtr p1, CPtr p2, size_t count, size_t offset = 0) {
- return generic::Bcmp<uint8_t>::loop_and_tail_offset(p1, p2, count, offset);
+inline_bcmp_byte_per_byte(CPtr p1, CPtr p2, size_t offset, size_t count) {
+ LIBC_LOOP_NOUNROLL
+ for (; offset < count; ++offset)
+ if (p1[offset] != p2[offset])
+ return BcmpReturnType::NONZERO();
+ return BcmpReturnType::ZERO();
}
[[maybe_unused]] LIBC_INLINE BcmpReturnType
inline_bcmp_aligned_access_64bit(CPtr p1, CPtr p2, size_t count) {
constexpr size_t kAlign = sizeof(uint64_t);
if (count <= 2 * kAlign)
- return inline_bcmp_byte_per_byte(p1, p2, count);
+ return inline_bcmp_byte_per_byte(p1, p2, 0, count);
size_t bytes_to_p1_align = distance_to_align_up<kAlign>(p1);
- if (auto value = inline_bcmp_byte_per_byte(p1, p2, bytes_to_p1_align))
+ if (auto value = inline_bcmp_byte_per_byte(p1, p2, 0, bytes_to_p1_align))
return value;
size_t offset = bytes_to_p1_align;
size_t p2_alignment = distance_to_align_down<kAlign>(p2 + offset);
@@ -51,16 +55,16 @@ inline_bcmp_aligned_access_64bit(CPtr p1, CPtr p2, size_t count) {
if (a != b)
return BcmpReturnType::NONZERO();
}
- return inline_bcmp_byte_per_byte(p1, p2, count, offset);
+ return inline_bcmp_byte_per_byte(p1, p2, offset, count);
}
[[maybe_unused]] LIBC_INLINE BcmpReturnType
inline_bcmp_aligned_access_32bit(CPtr p1, CPtr p2, size_t count) {
constexpr size_t kAlign = sizeof(uint32_t);
if (count <= 2 * kAlign)
- return inline_bcmp_byte_per_byte(p1, p2, count);
+ return inline_bcmp_byte_per_byte(p1, p2, 0, count);
size_t bytes_to_p1_align = distance_to_align_up<kAlign>(p1);
- if (auto value = inline_bcmp_byte_per_byte(p1, p2, bytes_to_p1_align))
+ if (auto value = inline_bcmp_byte_per_byte(p1, p2, 0, bytes_to_p1_align))
return value;
size_t offset = bytes_to_p1_align;
size_t p2_alignment = distance_to_align_down<kAlign>(p2 + offset);
@@ -76,82 +80,89 @@ inline_bcmp_aligned_access_32bit(CPtr p1, CPtr p2, size_t count) {
if (a != b)
return BcmpReturnType::NONZERO();
}
- return inline_bcmp_byte_per_byte(p1, p2, count, offset);
+ return inline_bcmp_byte_per_byte(p1, p2, offset, count);
}
#if defined(LIBC_TARGET_ARCH_IS_X86) || defined(LIBC_TARGET_ARCH_IS_AARCH64)
[[maybe_unused]] LIBC_INLINE BcmpReturnType
inline_bcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) {
- return generic::Bcmp<uint64_t>::loop_and_tail_align_above(256, p1, p2, count);
+ if (count < 256)
+ return generic::Bcmp<16>::loop_and_tail(p1, p2, count);
+ if (auto value = generic::Bcmp<64>::block(p1, p2))
+ return value;
+ align_to_next_boundary<64, Arg::P1>(p1, p2, count);
+ return generic::Bcmp<64>::loop_and_tail(p1, p2, count);
}
#endif // defined(LIBC_TARGET_ARCH_IS_X86) ||
// defined(LIBC_TARGET_ARCH_IS_AARCH64)
#if defined(LIBC_TARGET_ARCH_IS_X86)
-#if defined(__SSE4_1__)
[[maybe_unused]] LIBC_INLINE BcmpReturnType
-inline_bcmp_x86_sse41_gt16(CPtr p1, CPtr p2, size_t count) {
+inline_bcmp_x86_sse2_gt16(CPtr p1, CPtr p2, size_t count) {
if (count <= 32)
- return generic::Bcmp<__m128i>::head_tail(p1, p2, count);
- return generic::Bcmp<__m128i>::loop_and_tail_align_above(256, p1, p2, count);
+ return x86::sse2::Bcmp<16>::head_tail(p1, p2, count);
+ if (count < 256)
+ return x86::sse2::Bcmp<16>::loop_and_tail(p1, p2, count);
+ if (auto value = x86::sse2::Bcmp<16>::block(p1, p2))
+ return value;
+ align_to_next_boundary<16, Arg::P1>(p1, p2, count);
+ return x86::sse2::Bcmp<64>::loop_and_tail(p1, p2, count);
}
-#endif // __SSE4_1__
-#if defined(__AVX__)
[[maybe_unused]] LIBC_INLINE BcmpReturnType
-inline_bcmp_x86_avx_gt16(CPtr p1, CPtr p2, size_t count) {
+inline_bcmp_x86_avx2_gt16(CPtr p1, CPtr p2, size_t count) {
if (count <= 32)
- return generic::Bcmp<__m128i>::head_tail(p1, p2, count);
+ return x86::sse2::Bcmp<16>::head_tail(p1, p2, count);
if (count <= 64)
- return generic::Bcmp<__m256i>::head_tail(p1, p2, count);
- return generic::Bcmp<__m256i>::loop_and_tail_align_above(256, p1, p2, count);
+ return x86::avx2::Bcmp<32>::head_tail(p1, p2, count);
+ if (count <= 128)
+ return x86::avx2::Bcmp<64>::head_tail(p1, p2, count);
+ if (LIBC_UNLIKELY(count >= 256)) {
+ if (auto value = x86::avx2::Bcmp<64>::block(p1, p2))
+ return value;
+ align_to_next_boundary<64, Arg::P1>(p1, p2, count);
+ }
+ return x86::avx2::Bcmp<64>::loop_and_tail(p1, p2, count);
}
-#endif // __AVX__
-#if defined(__AVX512BW__)
[[maybe_unused]] LIBC_INLINE BcmpReturnType
inline_bcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) {
if (count <= 32)
- return generic::Bcmp<__m128i>::head_tail(p1, p2, count);
+ return x86::sse2::Bcmp<16>::head_tail(p1, p2, count);
if (count <= 64)
- return generic::Bcmp<__m256i>::head_tail(p1, p2, count);
+ return x86::avx2::Bcmp<32>::head_tail(p1, p2, count);
if (count <= 128)
- return generic::Bcmp<__m512i>::head_tail(p1, p2, count);
- return generic::Bcmp<__m512i>::loop_and_tail_align_above(256, p1, p2, count);
+ return x86::avx512bw::Bcmp<64>::head_tail(p1, p2, count);
+ if (LIBC_UNLIKELY(count >= 256)) {
+ if (auto value = x86::avx512bw::Bcmp<64>::block(p1, p2))
+ return value;
+ align_to_next_boundary<64, Arg::P1>(p1, p2, count);
+ }
+ return x86::avx512bw::Bcmp<64>::loop_and_tail(p1, p2, count);
}
-#endif // __AVX512BW__
[[maybe_unused]] LIBC_INLINE BcmpReturnType inline_bcmp_x86(CPtr p1, CPtr p2,
size_t count) {
if (count == 0)
return BcmpReturnType::ZERO();
if (count == 1)
- return generic::Bcmp<uint8_t>::block(p1, p2);
+ return generic::Bcmp<1>::block(p1, p2);
if (count == 2)
- return generic::Bcmp<uint16_t>::block(p1, p2);
- if (count == 3)
- return generic::BcmpSequence<uint16_t, uint8_t>::block(p1, p2);
- if (count == 4)
- return generic::Bcmp<uint32_t>::block(p1, p2);
- if (count == 5)
- return generic::BcmpSequence<uint32_t, uint8_t>::block(p1, p2);
- if (count == 6)
- return generic::BcmpSequence<uint32_t, uint16_t>::block(p1, p2);
- if (count == 7)
- return generic::BcmpSequence<uint32_t, uint16_t, uint8_t>::block(p1, p2);
- if (count == 8)
- return generic::Bcmp<uint64_t>::block(p1, p2);
+ return generic::Bcmp<2>::block(p1, p2);
+ if (count <= 4)
+ return generic::Bcmp<2>::head_tail(p1, p2, count);
+ if (count <= 8)
+ return generic::Bcmp<4>::head_tail(p1, p2, count);
if (count <= 16)
- return generic::Bcmp<uint64_t>::head_tail(p1, p2, count);
-#if defined(__AVX512BW__)
- return inline_bcmp_x86_avx512bw_gt16(p1, p2, count);
-#elif defined(__AVX__)
- return inline_bcmp_x86_avx_gt16(p1, p2, count);
-#elif defined(__SSE4_1__)
- return inline_bcmp_x86_sse41_gt16(p1, p2, count);
-#else
- return inline_bcmp_generic_gt16(p1, p2, count);
-#endif
+ return generic::Bcmp<8>::head_tail(p1, p2, count);
+ if constexpr (x86::kAvx512BW)
+ return inline_bcmp_x86_avx512bw_gt16(p1, p2, count);
+ else if constexpr (x86::kAvx2)
+ return inline_bcmp_x86_avx2_gt16(p1, p2, count);
+ else if constexpr (x86::kSse2)
+ return inline_bcmp_x86_sse2_gt16(p1, p2, count);
+ else
+ return inline_bcmp_generic_gt16(p1, p2, count);
}
#endif // defined(LIBC_TARGET_ARCH_IS_X86)
@@ -214,7 +225,7 @@ LIBC_INLINE BcmpReturnType inline_bcmp(CPtr p1, CPtr p2, size_t count) {
#elif defined(LIBC_TARGET_ARCH_IS_RISCV32)
return inline_bcmp_aligned_access_32bit(p1, p2, count);
#else
- return inline_bcmp_byte_per_byte(p1, p2, count);
+ return inline_bcmp_byte_per_byte(p1, p2, 0, count);
#endif
}
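
For context, a minimal standalone sketch of the byte-per-byte fallback restored above, assuming plain standard types in place of the libc CPtr and BcmpReturnType wrappers (all names here are illustrative only):

#include <cstddef>
#include <cstdint>

// Compares [p1 + offset, p1 + count) against [p2 + offset, p2 + count) one
// byte at a time; returns 0 iff every byte matches. The explicit 'offset'
// parameter lets the aligned-access variants hand off their unaligned tail
// without adjusting the pointers at the call site.
inline uint32_t bcmp_byte_per_byte_sketch(const unsigned char *p1,
                                          const unsigned char *p2,
                                          size_t offset, size_t count) {
  for (; offset < count; ++offset)
    if (p1[offset] != p2[offset])
      return 1; // any non-zero value signals a mismatch
  return 0;
}
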
diff --git a/libc/src/string/memory_utils/memcmp_implementations.h b/libc/src/string/memory_utils/memcmp_implementations.h
index 39208bd6c1e44..d870ec4144020 100644
--- a/libc/src/string/memory_utils/memcmp_implementations.h
+++ b/libc/src/string/memory_utils/memcmp_implementations.h
@@ -26,17 +26,21 @@
namespace __llvm_libc {
[[maybe_unused]] LIBC_INLINE MemcmpReturnType
-inline_memcmp_byte_per_byte(CPtr p1, CPtr p2, size_t count, size_t offset = 0) {
- return generic::Memcmp<uint8_t>::loop_and_tail_offset(p1, p2, count, offset);
+inline_memcmp_byte_per_byte(CPtr p1, CPtr p2, size_t offset, size_t count) {
+ LIBC_LOOP_NOUNROLL
+ for (; offset < count; ++offset)
+ if (auto value = generic::Memcmp<1>::block(p1 + offset, p2 + offset))
+ return value;
+ return MemcmpReturnType::ZERO();
}
[[maybe_unused]] LIBC_INLINE MemcmpReturnType
inline_memcmp_aligned_access_64bit(CPtr p1, CPtr p2, size_t count) {
constexpr size_t kAlign = sizeof(uint64_t);
if (count <= 2 * kAlign)
- return inline_memcmp_byte_per_byte(p1, p2, count);
+ return inline_memcmp_byte_per_byte(p1, p2, 0, count);
size_t bytes_to_p1_align = distance_to_align_up<kAlign>(p1);
- if (auto value = inline_memcmp_byte_per_byte(p1, p2, bytes_to_p1_align))
+ if (auto value = inline_memcmp_byte_per_byte(p1, p2, 0, bytes_to_p1_align))
return value;
size_t offset = bytes_to_p1_align;
size_t p2_alignment = distance_to_align_down<kAlign>(p2 + offset);
@@ -52,20 +56,21 @@ inline_memcmp_aligned_access_64bit(CPtr p1, CPtr p2, size_t count) {
b = load64_aligned<uint8_t, uint16_t, uint16_t, uint16_t, uint8_t>(
p2, offset);
uint64_t a = load64_aligned<uint64_t>(p1, offset);
- if (a != b)
- return cmp_neq_uint64_t(Endian::to_big_endian(a),
- Endian::to_big_endian(b));
+ if (a != b) {
+ // TODO use cmp_neq_uint64_t from D148717 once it's submitted.
+ return Endian::to_big_endian(a) < Endian::to_big_endian(b) ? -1 : 1;
+ }
}
- return inline_memcmp_byte_per_byte(p1, p2, count, offset);
+ return inline_memcmp_byte_per_byte(p1, p2, offset, count);
}
[[maybe_unused]] LIBC_INLINE MemcmpReturnType
inline_memcmp_aligned_access_32bit(CPtr p1, CPtr p2, size_t count) {
constexpr size_t kAlign = sizeof(uint32_t);
if (count <= 2 * kAlign)
- return inline_memcmp_byte_per_byte(p1, p2, count);
+ return inline_memcmp_byte_per_byte(p1, p2, 0, count);
size_t bytes_to_p1_align = distance_to_align_up<kAlign>(p1);
- if (auto value = inline_memcmp_byte_per_byte(p1, p2, bytes_to_p1_align))
+ if (auto value = inline_memcmp_byte_per_byte(p1, p2, 0, bytes_to_p1_align))
return value;
size_t offset = bytes_to_p1_align;
size_t p2_alignment = distance_to_align_down<kAlign>(p2 + offset);
@@ -78,10 +83,16 @@ inline_memcmp_aligned_access_32bit(CPtr p1, CPtr p2, size_t count) {
else
b = load32_aligned<uint8_t, uint16_t, uint8_t>(p2, offset);
uint32_t a = load32_aligned<uint32_t>(p1, offset);
- if (a != b)
- return cmp_uint32_t(Endian::to_big_endian(a), Endian::to_big_endian(b));
+ if (a != b) {
+ // TODO use cmp_uint32_t from D148717 once it's submitted.
+      // We perform the difference as an uint64_t.
+      const int64_t diff = static_cast<int64_t>(Endian::to_big_endian(a)) -
+                           static_cast<int64_t>(Endian::to_big_endian(b));
+      // And reduce the uint64_t into an uint32_t.
+      return static_cast<int32_t>((diff >> 1) | (diff & 0xFFFF));
+ }
}
- return inline_memcmp_byte_per_byte(p1, p2, count, offset);
+ return inline_memcmp_byte_per_byte(p1, p2, offset, count);
}
LIBC_INLINE MemcmpReturnType inline_memcmp(CPtr p1, CPtr p2, size_t count) {
@@ -94,7 +105,7 @@ LIBC_INLINE MemcmpReturnType inline_memcmp(CPtr p1, CPtr p2, size_t count) {
#elif defined(LIBC_TARGET_ARCH_IS_RISCV32)
return inline_memcmp_aligned_access_32bit(p1, p2, count);
#else
- return inline_memcmp_byte_per_byte(p1, p2, count);
+ return inline_memcmp_byte_per_byte(p1, p2, 0, count);
#endif
}
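
The interim reduction used in the 32-bit aligned-access path above (until cmp_uint32_t from D148717 is resubmitted) turns a 64-bit signed difference into a 32-bit memcmp result while preserving its sign and zero-ness. A hedged standalone sketch, with 'reduce_sketch' as a hypothetical name for the inline expression:

#include <cassert>
#include <cstdint>

// 'diff' is in [-(2^32 - 1), 2^32 - 1]. Shifting right by one keeps the sign
// and brings the magnitude into int32 range; OR-ing the low 16 bits back in
// keeps the result non-zero when the shift alone would drop the only set bit
// (e.g. diff == 1).
inline int32_t reduce_sketch(int64_t diff) {
  return static_cast<int32_t>((diff >> 1) | (diff & 0xFFFF));
}

int main() {
  assert(reduce_sketch(0) == 0);
  assert(reduce_sketch(1) > 0);
  assert(reduce_sketch(-1) < 0);
  assert(reduce_sketch(int64_t(UINT32_MAX)) > 0);
  assert(reduce_sketch(-int64_t(UINT32_MAX)) < 0);
  return 0;
}
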
diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
index 323772fef94c0..663f42809ecc9 100644
--- a/libc/src/string/memory_utils/op_generic.h
+++ b/libc/src/string/memory_utils/op_generic.h
@@ -33,14 +33,6 @@
#include <stdint.h>
-static_assert((UINTPTR_MAX == 4294967295U) ||
- (UINTPTR_MAX == 18446744073709551615UL),
- "We currently only support 32- or 64-bit platforms");
-
-#if defined(LIBC_TARGET_ARCH_IS_X86_64) || defined(LIBC_TARGET_ARCH_IS_AARCH64)
-#define LLVM_LIBC_HAS_UINT64
-#endif
-
namespace __llvm_libc {
// Compiler types using the vector attributes.
using uint8x1_t = uint8_t __attribute__((__vector_size__(1)));
@@ -53,31 +45,19 @@ using uint8x64_t = uint8_t __attribute__((__vector_size__(64)));
} // namespace __llvm_libc
namespace __llvm_libc::generic {
-
// We accept three types of values as elements for generic operations:
-// - scalar : unsigned integral types,
-// - vector : compiler types using the vector attributes or platform builtins,
+// - scalar : unsigned integral types
+// - vector : compiler types using the vector attributes
// - array : a cpp::array<T, N> where T is itself either a scalar or a vector.
// The following traits help discriminate between these cases.
+template <typename T>
+constexpr bool is_scalar_v = cpp::is_integral_v<T> && cpp::is_unsigned_v<T>;
-template <typename T> struct is_scalar : cpp::false_type {};
-template <> struct is_scalar<uint8_t> : cpp::true_type {};
-template <> struct is_scalar<uint16_t> : cpp::true_type {};
-template <> struct is_scalar<uint32_t> : cpp::true_type {};
-#ifdef LLVM_LIBC_HAS_UINT64
-template <> struct is_scalar<uint64_t> : cpp::true_type {};
-#endif // LLVM_LIBC_HAS_UINT64
-template <typename T> constexpr bool is_scalar_v = is_scalar<T>::value;
-
-template <typename T> struct is_vector : cpp::false_type {};
-template <> struct is_vector<uint8x1_t> : cpp::true_type {};
-template <> struct is_vector<uint8x2_t> : cpp::true_type {};
-template <> struct is_vector<uint8x4_t> : cpp::true_type {};
-template <> struct is_vector<uint8x8_t> : cpp::true_type {};
-template <> struct is_vector<uint8x16_t> : cpp::true_type {};
-template <> struct is_vector<uint8x32_t> : cpp::true_type {};
-template <> struct is_vector<uint8x64_t> : cpp::true_type {};
-template <typename T> constexpr bool is_vector_v = is_vector<T>::value;
+template <typename T>
+constexpr bool is_vector_v =
+ cpp::details::is_unqualified_any_of<T, uint8x1_t, uint8x2_t, uint8x4_t,
+ uint8x8_t, uint8x16_t, uint8x32_t,
+ uint8x64_t>();
template <class T> struct is_array : cpp::false_type {};
template <class T, size_t N> struct is_array<cpp::array<T, N>> {
@@ -89,7 +69,7 @@ template <typename T>
constexpr bool is_element_type_v =
is_scalar_v<T> || is_vector_v<T> || is_array_v<T>;
-// Helper struct to retrieve the number of elements of an array.
+//
template <class T> struct array_size {};
template <class T, size_t N>
struct array_size<cpp::array<T, N>> : cpp::integral_constant<size_t, N> {};
@@ -134,15 +114,105 @@ template <typename T> T splat(uint8_t value) {
}
}
+static_assert((UINTPTR_MAX == 4294967295U) ||
+ (UINTPTR_MAX == 18446744073709551615UL),
+ "We currently only support 32- or 64-bit platforms");
+
+#if defined(LIBC_TARGET_ARCH_IS_X86_64) || defined(LIBC_TARGET_ARCH_IS_AARCH64)
+#define LLVM_LIBC_HAS_UINT64
+#endif
+
+namespace details {
+// Checks that each type is sorted in strictly decreasing order of size.
+// i.e. sizeof(First) > sizeof(Second) > ... > sizeof(Last)
+template <typename First> constexpr bool is_decreasing_size() {
+ return sizeof(First) == 1;
+}
+template <typename First, typename Second, typename... Next>
+constexpr bool is_decreasing_size() {
+ if constexpr (sizeof...(Next) > 0)
+ return sizeof(First) > sizeof(Second) && is_decreasing_size<Next...>();
+ else
+ return sizeof(First) > sizeof(Second) && is_decreasing_size<Second>();
+}
+
+template <size_t Size, typename... Ts> struct Largest;
+template <size_t Size> struct Largest<Size> : cpp::type_identity<uint8_t> {};
+template <size_t Size, typename T, typename... Ts>
+struct Largest<Size, T, Ts...> {
+ using next = Largest<Size, Ts...>;
+ using type = cpp::conditional_t<(Size >= sizeof(T)), T, typename next::type>;
+};
+
+} // namespace details
+
+// 'SupportedTypes' holds a list of natively supported types.
+// The types are instanciations of ScalarType or VectorType.
+// They should be ordered in strictly decreasing order.
+// The 'TypeFor<Size>' type retrieves is the largest supported type that can
+// handle 'Size' bytes. e.g.
+//
+// using ST = SupportedTypes<ScalarType<uint16_t>, ScalarType<uint8_t>>;
+// using Type = ST::TypeFor<10>;
+// static_assert(cpp:is_same_v<Type, ScalarType<uint16_t>>);
+
+template <typename First, typename... Ts> struct SupportedTypes {
+ static_assert(details::is_decreasing_size<First, Ts...>());
+
+ using MaxType = First;
+
+ template <size_t Size>
+ using TypeFor = typename details::Largest<Size, First, Ts...>::type;
+};
+
+// Map from sizes to structures offering static load, store and splat methods.
+// Note: On platforms lacking vector support, we use the ArrayType below and
+// decompose the operation in smaller pieces.
+
+// Lists a generic native types to use for Memset and Memmove operations.
+// TODO: Inject the native types within Memset and Memmove depending on the
+// target architectures and derive MaxSize from it.
+using NativeTypeMap = SupportedTypes<uint8x64_t, //
+ uint8x32_t, //
+ uint8x16_t,
+#if defined(LLVM_LIBC_HAS_UINT64)
+ uint64_t, // Not available on 32bit
+#endif
+ uint32_t, //
+ uint16_t, //
+ uint8_t>;
+
+namespace details {
+
+// Helper to test if a type is void.
+template <typename T> inline constexpr bool is_void_v = cpp::is_same_v<T, void>;
+
+// In case the 'Size' is not supported we can fall back to a sequence of smaller
+// operations using the largest natively supported type.
+template <size_t Size, size_t MaxSize> static constexpr bool useArrayType() {
+ return (Size > MaxSize) && ((Size % MaxSize) == 0) &&
+ !details::is_void_v<NativeTypeMap::TypeFor<MaxSize>>;
+}
+
+// Compute the type to handle an operation of 'Size' bytes knowing that the
+// underlying platform only support native types up to MaxSize bytes.
+template <size_t Size, size_t MaxSize>
+using getTypeFor = cpp::conditional_t<
+ useArrayType<Size, MaxSize>(),
+ cpp::array<NativeTypeMap::TypeFor<MaxSize>, Size / MaxSize>,
+ NativeTypeMap::TypeFor<Size>>;
+
+} // namespace details
+
///////////////////////////////////////////////////////////////////////////////
// Memset
///////////////////////////////////////////////////////////////////////////////
template <typename T> struct Memset {
- static_assert(is_element_type_v<T>);
static constexpr size_t SIZE = sizeof(T);
LIBC_INLINE static void block(Ptr dst, uint8_t value) {
+ static_assert(is_element_type_v<T>);
if constexpr (is_scalar_v<T> || is_vector_v<T>) {
store<T>(dst, splat<T>(value));
} else if constexpr (is_array_v<T>) {
@@ -177,8 +247,9 @@ template <typename T, typename... TS> struct MemsetSequence {
static constexpr size_t SIZE = (sizeof(T) + ... + sizeof(TS));
LIBC_INLINE static void block(Ptr dst, uint8_t value) {
Memset<T>::block(dst, value);
- if constexpr (sizeof...(TS) > 0)
+ if constexpr (sizeof...(TS) > 0) {
return MemsetSequence<TS...>::block(dst + sizeof(T), value);
+ }
}
};
@@ -187,7 +258,6 @@ template <typename T, typename... TS> struct MemsetSequence {
///////////////////////////////////////////////////////////////////////////////
template <typename T> struct Memmove {
- static_assert(is_element_type_v<T>);
static constexpr size_t SIZE = sizeof(T);
LIBC_INLINE static void block(Ptr dst, CPtr src) {
@@ -320,312 +390,136 @@ template <typename T> struct Memmove {
};
///////////////////////////////////////////////////////////////////////////////
-// Low level operations for Bcmp and Memcmp that operate on memory locations.
-///////////////////////////////////////////////////////////////////////////////
-
-// Same as load above but with an offset to the pointer.
-// Making the offset explicit hints the compiler to use relevant addressing mode
-// consistently.
-template <typename T> LIBC_INLINE static T load(CPtr ptr, size_t offset) {
- return ::__llvm_libc::load<T>(ptr + offset);
-}
-
-// Same as above but also makes sure the loaded value is in big endian format.
-// This is useful when implementing lexicograhic comparisons as big endian
-// scalar comparison directly maps to lexicographic byte comparisons.
-template <typename T> LIBC_INLINE static T load_be(CPtr ptr, size_t offset) {
- return Endian::to_big_endian(load<T>(ptr, offset));
-}
-
-// Equality: returns true iff values at locations (p1 + offset) and (p2 +
-// offset) compare equal.
-template <typename T> static bool eq(CPtr p1, CPtr p2, size_t offset);
-
-// Not equals: returns non-zero iff values at locations (p1 + offset) and (p2 +
-// offset) differ.
-template <typename T> static uint32_t neq(CPtr p1, CPtr p2, size_t offset);
-
-// Lexicographic comparison:
-// - returns 0 iff values at locations (p1 + offset) and (p2 + offset) compare
-// equal.
-// - returns a negative value if value at location (p1 + offset) is
-// lexicographically less than value at (p2 + offset).
-// - returns a positive value if value at location (p1 + offset) is
-// lexicographically greater than value at (p2 + offset).
-template <typename T>
-static MemcmpReturnType cmp(CPtr p1, CPtr p2, size_t offset);
-
-// Lexicographic comparison of non-equal values:
-// - returns a negative value if value at location (p1 + offset) is
-// lexicographically less than value at (p2 + offset).
-// - returns a positive value if value at location (p1 + offset) is
-// lexicographically greater than value at (p2 + offset).
-template <typename T>
-static MemcmpReturnType cmp_neq(CPtr p1, CPtr p2, size_t offset);
-
-///////////////////////////////////////////////////////////////////////////////
-// Memcmp implementation
-//
-// When building memcmp, not all types are considered equals.
-//
-// For instance, the lexicographic comparison of two uint8_t can be implemented
-// as a simple subtraction, but for wider operations the logic can be much more
-// involving, especially on little endian platforms.
-//
-// For such wider types it is a good strategy to test for equality first and
-// only do the expensive lexicographic comparison if necessary.
-//
-// Decomposing the algorithm like this for wider types allows us to have
-// efficient implementation of higher order functions like 'head_tail' or
-// 'loop_and_tail'.
+// Bcmp
///////////////////////////////////////////////////////////////////////////////
-
-// Type traits to decide whether we can use 'cmp' directly or if we need to
-// split the computation.
-template <typename T> struct cmp_is_expensive;
-template <> struct cmp_is_expensive<uint8_t> : public cpp::false_type {};
-template <> struct cmp_is_expensive<uint16_t> : public cpp::false_type {};
-template <> struct cmp_is_expensive<uint32_t> : public cpp::false_type {};
-#ifdef LLVM_LIBC_HAS_UINT64
-template <> struct cmp_is_expensive<uint64_t> : public cpp::true_type {};
-#endif // LLVM_LIBC_HAS_UINT64
-
-template <typename T> struct Memcmp {
- static_assert(is_element_type_v<T>);
- static constexpr size_t SIZE = sizeof(T);
-
-private:
- LIBC_INLINE static MemcmpReturnType block_offset(CPtr p1, CPtr p2,
- size_t offset) {
- if constexpr (cmp_is_expensive<T>::value) {
- if (!eq<T>(p1, p2, offset))
- return cmp_neq<T>(p1, p2, offset);
- return MemcmpReturnType::ZERO();
- } else {
- return cmp<T>(p1, p2, offset);
- }
+template <size_t Size> struct Bcmp {
+ static constexpr size_t SIZE = Size;
+ static constexpr size_t MaxSize = LLVM_LIBC_IS_DEFINED(LLVM_LIBC_HAS_UINT64)
+ ? sizeof(uint64_t)
+ : sizeof(uint32_t);
+
+ template <typename T> LIBC_INLINE static uint32_t load_xor(CPtr p1, CPtr p2) {
+ static_assert(sizeof(T) <= sizeof(uint32_t));
+ return load<T>(p1) ^ load<T>(p2);
}
-public:
- LIBC_INLINE static MemcmpReturnType block(CPtr p1, CPtr p2) {
- return block_offset(p1, p2, 0);
+ template <typename T>
+ LIBC_INLINE static uint32_t load_not_equal(CPtr p1, CPtr p2) {
+ return load<T>(p1) != load<T>(p2);
}
- LIBC_INLINE static MemcmpReturnType head_tail(CPtr p1, CPtr p2,
- size_t count) {
- if constexpr (cmp_is_expensive<T>::value) {
- if (!eq<T>(p1, p2, 0))
- return cmp_neq<T>(p1, p2, 0);
+ LIBC_INLINE static BcmpReturnType block(CPtr p1, CPtr p2) {
+ if constexpr (Size == 1) {
+ return load_xor<uint8_t>(p1, p2);
+ } else if constexpr (Size == 2) {
+ return load_xor<uint16_t>(p1, p2);
+ } else if constexpr (Size == 4) {
+ return load_xor<uint32_t>(p1, p2);
+ } else if constexpr (Size == 8) {
+ return load_not_equal<uint64_t>(p1, p2);
+ } else if constexpr (details::useArrayType<Size, MaxSize>()) {
+ for (size_t offset = 0; offset < Size; offset += MaxSize)
+ if (auto value = Bcmp<MaxSize>::block(p1 + offset, p2 + offset))
+ return value;
} else {
- if (const auto value = cmp<T>(p1, p2, 0))
- return value;
+ deferred_static_assert("Unimplemented Size");
}
- return block_offset(p1, p2, count - SIZE); // tail
+ return BcmpReturnType::ZERO();
}
- LIBC_INLINE static MemcmpReturnType loop_and_tail(CPtr p1, CPtr p2,
- size_t count) {
- return loop_and_tail_offset(p1, p2, count, 0);
- }
-
- LIBC_INLINE static MemcmpReturnType
- loop_and_tail_offset(CPtr p1, CPtr p2, size_t count, size_t offset) {
- if constexpr (SIZE > 1) {
- const size_t limit = count - SIZE;
- LIBC_LOOP_NOUNROLL
- for (; offset < limit; offset += SIZE) {
- if (!eq<T>(p1, p2, offset)) {
- if constexpr (cmp_is_expensive<T>::value) {
- return cmp_neq<T>(p1, p2, offset);
- } else {
- return cmp<T>(p1, p2, offset);
- }
- }
- }
- return block_offset(p1, p2, limit); // tail
- } else {
- // No need for a tail operation when SIZE == 1.
- LIBC_LOOP_NOUNROLL
- for (; offset < count; offset += SIZE)
- if (auto value = cmp<T>(p1, p2, offset))
- return value;
- return MemcmpReturnType::ZERO();
- }
+ LIBC_INLINE static BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
+ return block(p1 + count - SIZE, p2 + count - SIZE);
}
- LIBC_INLINE static MemcmpReturnType
- loop_and_tail_align_above(size_t threshold, CPtr p1, CPtr p2, size_t count) {
- const AlignHelper<sizeof(T)> helper(p1);
- if (LIBC_UNLIKELY(count >= threshold) && helper.not_aligned()) {
- if (auto value = block(p1, p2))
- return value;
- adjust(helper.offset(), p1, p2, count);
- }
- return loop_and_tail(p1, p2, count);
+ LIBC_INLINE static BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
+ return block(p1, p2) | tail(p1, p2, count);
}
-};
-template <typename T, typename... TS> struct MemcmpSequence {
- static constexpr size_t SIZE = (sizeof(T) + ... + sizeof(TS));
- LIBC_INLINE static MemcmpReturnType block(CPtr p1, CPtr p2) {
- // TODO: test suggestion in
- // https://reviews.llvm.org/D148717?id=515724#inline-1446890
- // once we have a proper way to check memory operation latency.
- if constexpr (cmp_is_expensive<T>::value) {
- if (!eq<T>(p1, p2, 0))
- return cmp_neq<T>(p1, p2, 0);
- } else {
- if (auto value = cmp<T>(p1, p2, 0))
+ LIBC_INLINE static BcmpReturnType loop_and_tail(CPtr p1, CPtr p2,
+ size_t count) {
+ static_assert(Size > 1, "a loop of size 1 does not need tail");
+ size_t offset = 0;
+ do {
+ if (auto value = block(p1 + offset, p2 + offset))
return value;
- }
- if constexpr (sizeof...(TS) > 0)
- return MemcmpSequence<TS...>::block(p1 + sizeof(T), p2 + sizeof(T));
- else
- return MemcmpReturnType::ZERO();
+ offset += SIZE;
+ } while (offset < count - SIZE);
+ return tail(p1, p2, count);
}
};
///////////////////////////////////////////////////////////////////////////////
-// Bcmp
+// Memcmp
///////////////////////////////////////////////////////////////////////////////
-template <typename T> struct Bcmp {
- static_assert(is_element_type_v<T>);
- static constexpr size_t SIZE = sizeof(T);
-
- LIBC_INLINE static BcmpReturnType block(CPtr p1, CPtr p2) {
- return neq<T>(p1, p2, 0);
+template <size_t Size> struct Memcmp {
+ static constexpr size_t SIZE = Size;
+ static constexpr size_t MaxSize = LLVM_LIBC_IS_DEFINED(LLVM_LIBC_HAS_UINT64)
+ ? sizeof(uint64_t)
+ : sizeof(uint32_t);
+
+ template <typename T> LIBC_INLINE static T load_be(CPtr ptr) {
+ return Endian::to_big_endian(load<T>(ptr));
}
- LIBC_INLINE static BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
- const size_t tail_offset = count - SIZE;
- return neq<T>(p1, p2, tail_offset);
+ template <typename T>
+  LIBC_INLINE static MemcmpReturnType load_be_diff(CPtr p1, CPtr p2) {
+ return load_be<T>(p1) - load_be<T>(p2);
}
- LIBC_INLINE static BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
- if (const auto value = neq<T>(p1, p2, 0))
- return value;
- return tail(p1, p2, count);
- }
-
- LIBC_INLINE static BcmpReturnType loop_and_tail(CPtr p1, CPtr p2,
- size_t count) {
- return loop_and_tail_offset(p1, p2, count, 0);
+ template <typename T>
+ LIBC_INLINE static MemcmpReturnType load_be_cmp(CPtr p1, CPtr p2) {
+ const auto la = load_be<T>(p1);
+ const auto lb = load_be<T>(p2);
+ return la > lb ? 1 : la < lb ? -1 : 0;
}
- LIBC_INLINE static BcmpReturnType
- loop_and_tail_offset(CPtr p1, CPtr p2, size_t count, size_t offset) {
- if constexpr (SIZE > 1) {
- const size_t limit = count - SIZE;
- LIBC_LOOP_NOUNROLL
- for (; offset < limit; offset += SIZE)
- if (const auto value = neq<T>(p1, p2, offset))
- return value;
- return tail(p1, p2, count);
+ LIBC_INLINE static MemcmpReturnType block(CPtr p1, CPtr p2) {
+ if constexpr (Size == 1) {
+      return load_be_diff<uint8_t>(p1, p2);
+ } else if constexpr (Size == 2) {
+      return load_be_diff<uint16_t>(p1, p2);
+ } else if constexpr (Size == 4) {
+ return load_be_cmp<uint32_t>(p1, p2);
+ } else if constexpr (Size == 8) {
+ return load_be_cmp<uint64_t>(p1, p2);
+ } else if constexpr (details::useArrayType<Size, MaxSize>()) {
+ for (size_t offset = 0; offset < Size; offset += MaxSize)
+ if (Bcmp<MaxSize>::block(p1 + offset, p2 + offset))
+ return Memcmp<MaxSize>::block(p1 + offset, p2 + offset);
+ return MemcmpReturnType::ZERO();
+ } else if constexpr (Size == 3) {
+ if (auto value = Memcmp<2>::block(p1, p2))
+ return value;
+ return Memcmp<1>::block(p1 + 2, p2 + 2);
} else {
- // No need for a tail operation when SIZE == 1.
- LIBC_LOOP_NOUNROLL
- for (; offset < count; offset += SIZE)
- if (const auto value = neq<T>(p1, p2, offset))
- return value;
- return BcmpReturnType::ZERO();
+ deferred_static_assert("Unimplemented Size");
}
}
- LIBC_INLINE static BcmpReturnType
- loop_and_tail_align_above(size_t threshold, CPtr p1, CPtr p2, size_t count) {
- static_assert(SIZE > 1,
- "No need to align when processing one byte at a time");
- const AlignHelper<sizeof(T)> helper(p1);
- if (LIBC_UNLIKELY(count >= threshold) && helper.not_aligned()) {
- if (auto value = block(p1, p2))
- return value;
- adjust(helper.offset(), p1, p2, count);
- }
- return loop_and_tail(p1, p2, count);
+ LIBC_INLINE static MemcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
+ return block(p1 + count - SIZE, p2 + count - SIZE);
}
-};
-template <typename T, typename... TS> struct BcmpSequence {
- static constexpr size_t SIZE = (sizeof(T) + ... + sizeof(TS));
- LIBC_INLINE static BcmpReturnType block(CPtr p1, CPtr p2) {
- if (auto value = neq<T>(p1, p2, 0))
+ LIBC_INLINE static MemcmpReturnType head_tail(CPtr p1, CPtr p2,
+ size_t count) {
+ if (auto value = block(p1, p2))
return value;
- if constexpr (sizeof...(TS) > 0)
- return BcmpSequence<TS...>::block(p1 + sizeof(T), p2 + sizeof(T));
- else
- return BcmpReturnType::ZERO();
+ return tail(p1, p2, count);
}
-};
-
-///////////////////////////////////////////////////////////////////////////////
-// Specializations for uint8_t
-template <> LIBC_INLINE bool eq<uint8_t>(CPtr p1, CPtr p2, size_t offset) {
- return load<uint8_t>(p1, offset) == load<uint8_t>(p2, offset);
-}
-template <> LIBC_INLINE uint32_t neq<uint8_t>(CPtr p1, CPtr p2, size_t offset) {
- return load<uint8_t>(p1, offset) ^ load<uint8_t>(p2, offset);
-}
-template <>
-LIBC_INLINE MemcmpReturnType cmp<uint8_t>(CPtr p1, CPtr p2, size_t offset) {
- return static_cast<int32_t>(load<uint8_t>(p1, offset)) -
- static_cast<int32_t>(load<uint8_t>(p2, offset));
-}
-template <>
-LIBC_INLINE MemcmpReturnType cmp_neq<uint8_t>(CPtr p1, CPtr p2, size_t offset);
-
-///////////////////////////////////////////////////////////////////////////////
-// Specializations for uint16_t
-template <> LIBC_INLINE bool eq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
- return load<uint16_t>(p1, offset) == load<uint16_t>(p2, offset);
-}
-template <>
-LIBC_INLINE uint32_t neq<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
- return load<uint16_t>(p1, offset) ^ load<uint16_t>(p2, offset);
-}
-template <>
-LIBC_INLINE MemcmpReturnType cmp<uint16_t>(CPtr p1, CPtr p2, size_t offset) {
- return static_cast<int32_t>(load_be<uint16_t>(p1, offset)) -
- static_cast<int32_t>(load_be<uint16_t>(p2, offset));
-}
-template <>
-LIBC_INLINE MemcmpReturnType cmp_neq<uint16_t>(CPtr p1, CPtr p2, size_t offset);
-///////////////////////////////////////////////////////////////////////////////
-// Specializations for uint32_t
-template <> LIBC_INLINE bool eq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
- return load<uint32_t>(p1, offset) == load<uint32_t>(p2, offset);
-}
-template <>
-LIBC_INLINE uint32_t neq<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
- return load<uint32_t>(p1, offset) ^ load<uint32_t>(p2, offset);
-}
-template <>
-LIBC_INLINE MemcmpReturnType cmp<uint32_t>(CPtr p1, CPtr p2, size_t offset) {
- const auto a = load_be<uint32_t>(p1, offset);
- const auto b = load_be<uint32_t>(p2, offset);
- return cmp_uint32_t(a, b);
-}
-template <>
-LIBC_INLINE MemcmpReturnType cmp_neq<uint32_t>(CPtr p1, CPtr p2, size_t offset);
+ LIBC_INLINE static MemcmpReturnType loop_and_tail(CPtr p1, CPtr p2,
+ size_t count) {
+ static_assert(Size > 1, "a loop of size 1 does not need tail");
+ size_t offset = 0;
+ do {
+ if (auto value = block(p1 + offset, p2 + offset))
+ return value;
+ offset += SIZE;
+ } while (offset < count - SIZE);
+ return tail(p1, p2, count);
+ }
+};
-///////////////////////////////////////////////////////////////////////////////
-// Specializations for uint64_t
-template <> LIBC_INLINE bool eq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
- return load<uint64_t>(p1, offset) == load<uint64_t>(p2, offset);
-}
-template <>
-LIBC_INLINE uint32_t neq<uint64_t>(CPtr p1, CPtr p2, size_t offset) {
- return !eq<uint64_t>(p1, p2, offset);
-}
-template <>
-LIBC_INLINE MemcmpReturnType cmp<uint64_t>(CPtr p1, CPtr p2, size_t offset);
-template <>
-LIBC_INLINE MemcmpReturnType cmp_neq<uint64_t>(CPtr p1, CPtr p2,
- size_t offset) {
- const auto a = load_be<uint64_t>(p1, offset);
- const auto b = load_be<uint64_t>(p2, offset);
- return cmp_neq_uint64_t(a, b);
-}
} // namespace __llvm_libc::generic
#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H
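
Both generic Bcmp<Size> and Memcmp<Size> above rely on the same head_tail trick: any count in (Size, 2 * Size] is covered by exactly two fixed-size comparisons, one anchored at the start and one at the end, with a harmless overlap in between. A rough standalone sketch of the Bcmp flavour, using std::memcmp as a stand-in for the fixed-size block comparison:

#include <cstddef>
#include <cstdint>
#include <cstring>

// Non-zero iff the two Size-byte blocks differ.
template <size_t Size>
inline uint32_t block_sketch(const unsigned char *p1, const unsigned char *p2) {
  return static_cast<uint32_t>(std::memcmp(p1, p2, Size) != 0);
}

// Valid for Size < count <= 2 * Size: the head block and the tail block
// together cover every byte of the range, possibly overlapping in the middle.
template <size_t Size>
inline uint32_t head_tail_sketch(const unsigned char *p1,
                                 const unsigned char *p2, size_t count) {
  return block_sketch<Size>(p1, p2) |
         block_sketch<Size>(p1 + count - Size, p2 + count - Size);
}
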
diff --git a/libc/src/string/memory_utils/op_x86.h b/libc/src/string/memory_utils/op_x86.h
index 92344381386b4..dcf7405240c73 100644
--- a/libc/src/string/memory_utils/op_x86.h
+++ b/libc/src/string/memory_utils/op_x86.h
@@ -40,13 +40,11 @@
namespace __llvm_libc::x86 {
// A set of constants to check compile time features.
-static LIBC_INLINE constexpr bool kSse2 = LLVM_LIBC_IS_DEFINED(__SSE2__);
-static LIBC_INLINE constexpr bool kSse41 = LLVM_LIBC_IS_DEFINED(__SSE4_1__);
-static LIBC_INLINE constexpr bool kAvx = LLVM_LIBC_IS_DEFINED(__AVX__);
-static LIBC_INLINE constexpr bool kAvx2 = LLVM_LIBC_IS_DEFINED(__AVX2__);
-static LIBC_INLINE constexpr bool kAvx512F = LLVM_LIBC_IS_DEFINED(__AVX512F__);
-static LIBC_INLINE constexpr bool kAvx512BW =
- LLVM_LIBC_IS_DEFINED(__AVX512BW__);
+static inline constexpr bool kSse2 = LLVM_LIBC_IS_DEFINED(__SSE2__);
+static inline constexpr bool kAvx = LLVM_LIBC_IS_DEFINED(__AVX__);
+static inline constexpr bool kAvx2 = LLVM_LIBC_IS_DEFINED(__AVX2__);
+static inline constexpr bool kAvx512F = LLVM_LIBC_IS_DEFINED(__AVX512F__);
+static inline constexpr bool kAvx512BW = LLVM_LIBC_IS_DEFINED(__AVX512BW__);
///////////////////////////////////////////////////////////////////////////////
// Memcpy repmovsb implementation
@@ -56,142 +54,220 @@ struct Memcpy {
}
};
-} // namespace __llvm_libc::x86
+///////////////////////////////////////////////////////////////////////////////
+// Bcmp
-namespace __llvm_libc::generic {
+// Base implementation for the Bcmp specializations.
+// - BlockSize is either 16, 32 or 64 depending on the available compile time
+// features, it is used to switch between "single native operation" or a
+// "sequence of native operations".
+// - BlockBcmp is the function that implements the bcmp logic.
+template <size_t Size, size_t BlockSize, auto BlockBcmp> struct BcmpImpl {
+ static constexpr size_t SIZE = Size;
+ LIBC_INLINE static BcmpReturnType block(CPtr p1, CPtr p2) {
+ if constexpr (Size == BlockSize) {
+ return BlockBcmp(p1, p2);
+ } else if constexpr (Size % BlockSize == 0) {
+ for (size_t offset = 0; offset < Size; offset += BlockSize)
+ if (auto value = BlockBcmp(p1 + offset, p2 + offset))
+ return value;
+ } else {
+ deferred_static_assert("SIZE not implemented");
+ }
+ return BcmpReturnType::ZERO();
+ }
-///////////////////////////////////////////////////////////////////////////////
-// Specializations for __m128i
-#if defined(__SSE4_1__)
-template <> struct is_vector<__m128i> : cpp::true_type {};
-template <> struct cmp_is_expensive<__m128i> : cpp::true_type {};
-LIBC_INLINE __m128i bytewise_max(__m128i a, __m128i b) {
- return _mm_max_epu8(a, b);
-}
-LIBC_INLINE __m128i bytewise_reverse(__m128i value) {
- return _mm_shuffle_epi8(value, _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, //
- 8, 9, 10, 11, 12, 13, 14, 15));
-}
-LIBC_INLINE uint16_t big_endian_cmp_mask(__m128i max, __m128i value) {
- return _mm_movemask_epi8(bytewise_reverse(_mm_cmpeq_epi8(max, value)));
+ LIBC_INLINE static BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
+ return block(p1 + count - Size, p2 + count - Size);
+ }
+
+ LIBC_INLINE static BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
+ return block(p1, p2) | tail(p1, p2, count);
+ }
+
+ LIBC_INLINE static BcmpReturnType loop_and_tail(CPtr p1, CPtr p2,
+ size_t count) {
+ static_assert(Size > 1, "a loop of size 1 does not need tail");
+ size_t offset = 0;
+ do {
+ if (auto value = block(p1 + offset, p2 + offset))
+ return value;
+ offset += Size;
+ } while (offset < count - Size);
+ return tail(p1, p2, count);
+ }
+};
+
+namespace sse2 {
+LIBC_INLINE BcmpReturnType bcmp16(CPtr p1, CPtr p2) {
+#if defined(__SSE2__)
+ using T = char __attribute__((__vector_size__(16)));
+  // A mask indicating which bytes differ after loading 16 bytes from p1 and p2.
+ const int mask =
+ _mm_movemask_epi8(cpp::bit_cast<__m128i>(load<T>(p1) != load<T>(p2)));
+ return static_cast<uint32_t>(mask);
+#else
+ (void)p1;
+ (void)p2;
+ return BcmpReturnType::ZERO();
+#endif // defined(__SSE2__)
}
-template <> LIBC_INLINE bool eq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
- const auto a = load<__m128i>(p1, offset);
- const auto b = load<__m128i>(p2, offset);
- const auto xored = _mm_xor_si128(a, b);
- return _mm_testz_si128(xored, xored) == 1; // 1 iff xored == 0
+template <size_t Size> using Bcmp = BcmpImpl<Size, 16, bcmp16>;
+} // namespace sse2
+
+namespace avx2 {
+LIBC_INLINE BcmpReturnType bcmp32(CPtr p1, CPtr p2) {
+#if defined(__AVX2__)
+ using T = char __attribute__((__vector_size__(32)));
+  // A mask indicating which bytes differ after loading 32 bytes from p1 and p2.
+ const int mask =
+ _mm256_movemask_epi8(cpp::bit_cast<__m256i>(load<T>(p1) != load<T>(p2)));
+ // _mm256_movemask_epi8 returns an int but it is to be interpreted as a 32-bit
+ // mask.
+ return static_cast<uint32_t>(mask);
+#else
+ (void)p1;
+ (void)p2;
+ return BcmpReturnType::ZERO();
+#endif // defined(__AVX2__)
}
-template <> LIBC_INLINE uint32_t neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
- const auto a = load<__m128i>(p1, offset);
- const auto b = load<__m128i>(p2, offset);
- const auto xored = _mm_xor_si128(a, b);
- return _mm_testz_si128(xored, xored) == 0; // 0 iff xored != 0
+template <size_t Size> using Bcmp = BcmpImpl<Size, 32, bcmp32>;
+} // namespace avx2
+
+namespace avx512bw {
+LIBC_INLINE BcmpReturnType bcmp64(CPtr p1, CPtr p2) {
+#if defined(__AVX512BW__)
+ using T = char __attribute__((__vector_size__(64)));
+  // A mask indicating which bytes differ after loading 64 bytes from p1 and p2.
+ const uint64_t mask = _mm512_cmpneq_epi8_mask(
+ cpp::bit_cast<__m512i>(load<T>(p1)), cpp::bit_cast<__m512i>(load<T>(p2)));
+ const bool mask_is_set = mask != 0;
+ return static_cast<uint32_t>(mask_is_set);
+#else
+ (void)p1;
+ (void)p2;
+ return BcmpReturnType::ZERO();
+#endif // defined(__AVX512BW__)
}
-template <>
-LIBC_INLINE MemcmpReturnType cmp_neq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
- const auto a = load<__m128i>(p1, offset);
- const auto b = load<__m128i>(p2, offset);
- const auto vmax = bytewise_max(a, b);
- const auto le = big_endian_cmp_mask(vmax, b);
- const auto ge = big_endian_cmp_mask(vmax, a);
- static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint16_t>);
- return static_cast<int32_t>(ge) - static_cast<int32_t>(le);
+template <size_t Size> using Bcmp = BcmpImpl<Size, 64, bcmp64>;
+} // namespace avx512bw
+
+// Assuming that the mask is non zero, the index of the first mismatching byte
+// is the number of trailing zeros in the mask. Trailing zeros and not leading
+// zeros because the x86 architecture is little endian.
+LIBC_INLINE MemcmpReturnType char_diff_no_zero(CPtr p1, CPtr p2,
+                                               uint64_t mask) {
+  const size_t diff_index = __builtin_ctzll(mask);
+  const int16_t ca = cpp::to_integer<uint8_t>(p1[diff_index]);
+  const int16_t cb = cpp::to_integer<uint8_t>(p2[diff_index]);
+ return ca - cb;
}
-#endif // __SSE4_1__
///////////////////////////////////////////////////////////////////////////////
-// Specializations for __m256i
-#if defined(__AVX__)
-template <> struct is_vector<__m256i> : cpp::true_type {};
-template <> struct cmp_is_expensive<__m256i> : cpp::true_type {};
-template <> LIBC_INLINE bool eq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
- const auto a = load<__m256i>(p1, offset);
- const auto b = load<__m256i>(p2, offset);
- const auto xored = _mm256_castps_si256(
- _mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
- return _mm256_testz_si256(xored, xored) == 1; // 1 iff xored == 0
-}
-template <> LIBC_INLINE uint32_t neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
- const auto a = load<__m256i>(p1, offset);
- const auto b = load<__m256i>(p2, offset);
- const auto xored = _mm256_castps_si256(
- _mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
- return _mm256_testz_si256(xored, xored) == 0; // 0 iff xored != 0
+// Memcmp
+
+// Base implementation for the Memcmp specializations.
+// - BlockSize is either 16, 32 or 64 depending on the available compile time
+// features, it is used to switch between "single native operation" or a
+// "sequence of native operations".
+// - BlockMemcmp is the function that implements the memcmp logic.
+// - BlockBcmp is the function that implements the bcmp logic.
+template <size_t Size, size_t BlockSize, auto BlockMemcmp, auto BlockBcmp>
+struct MemcmpImpl {
+ static constexpr size_t SIZE = Size;
+ LIBC_INLINE static MemcmpReturnType block(CPtr p1, CPtr p2) {
+ if constexpr (Size == BlockSize) {
+ return BlockMemcmp(p1, p2);
+ } else if constexpr (Size % BlockSize == 0) {
+ for (size_t offset = 0; offset < Size; offset += BlockSize)
+ if (auto value = BlockBcmp(p1 + offset, p2 + offset))
+ return BlockMemcmp(p1 + offset, p2 + offset);
+ } else {
+ deferred_static_assert("SIZE not implemented");
+ }
+ return MemcmpReturnType::ZERO();
+ }
+
+ LIBC_INLINE static MemcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
+ return block(p1 + count - Size, p2 + count - Size);
+ }
+
+ LIBC_INLINE static MemcmpReturnType head_tail(CPtr p1, CPtr p2,
+ size_t count) {
+ if (auto value = block(p1, p2))
+ return value;
+ return tail(p1, p2, count);
+ }
+
+ LIBC_INLINE static MemcmpReturnType loop_and_tail(CPtr p1, CPtr p2,
+ size_t count) {
+ static_assert(Size > 1, "a loop of size 1 does not need tail");
+ size_t offset = 0;
+ do {
+ if (auto value = block(p1 + offset, p2 + offset))
+ return value;
+ offset += Size;
+ } while (offset < count - Size);
+ return tail(p1, p2, count);
+ }
+};
+
+namespace sse2 {
+LIBC_INLINE MemcmpReturnType memcmp16(CPtr p1, CPtr p2) {
+#if defined(__SSE2__)
+ using T = char __attribute__((__vector_size__(16)));
+  // A mask indicating which bytes differ after loading 16 bytes from p1 and p2.
+ if (int mask =
+ _mm_movemask_epi8(cpp::bit_cast<__m128i>(load<T>(p1) != load<T>(p2))))
+    return char_diff_no_zero(p1, p2, mask);
+ return MemcmpReturnType::ZERO();
+#else
+ (void)p1;
+ (void)p2;
+ return MemcmpReturnType::ZERO();
+#endif // defined(__SSE2__)
}
-#endif // __AVX__
+template <size_t Size> using Memcmp = MemcmpImpl<Size, 16, memcmp16, bcmp16>;
+} // namespace sse2
+namespace avx2 {
+LIBC_INLINE MemcmpReturnType memcmp32(CPtr p1, CPtr p2) {
#if defined(__AVX2__)
-LIBC_INLINE __m256i bytewise_max(__m256i a, __m256i b) {
- return _mm256_max_epu8(a, b);
-}
-LIBC_INLINE __m256i bytewise_reverse(__m256i value) {
- return _mm256_shuffle_epi8(value,
- _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, //
- 8, 9, 10, 11, 12, 13, 14, 15, //
- 16, 17, 18, 19, 20, 21, 22, 23, //
- 24, 25, 26, 27, 28, 29, 30, 31));
+ using T = char __attribute__((__vector_size__(32)));
+  // A mask indicating which bytes differ after loading 32 bytes from p1 and p2.
+ if (int mask = _mm256_movemask_epi8(
+ cpp::bit_cast<__m256i>(load<T>(p1) != load<T>(p2))))
+    return char_diff_no_zero(p1, p2, mask);
+ return MemcmpReturnType::ZERO();
+#else
+ (void)p1;
+ (void)p2;
+ return MemcmpReturnType::ZERO();
+#endif // defined(__AVX2__)
}
-LIBC_INLINE uint32_t big_endian_cmp_mask(__m256i max, __m256i value) {
- return _mm256_movemask_epi8(bytewise_reverse(_mm256_cmpeq_epi8(max, value)));
-}
-template <>
-LIBC_INLINE MemcmpReturnType cmp_neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
- const auto a = load<__m256i>(p1, offset);
- const auto b = load<__m256i>(p2, offset);
- const auto vmax = bytewise_max(a, b);
- const auto le = big_endian_cmp_mask(vmax, b);
- const auto ge = big_endian_cmp_mask(vmax, a);
- static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint32_t>);
- return cmp_uint32_t(ge, le);
-}
-#endif // __AVX2__
+template <size_t Size> using Memcmp = MemcmpImpl<Size, 32, memcmp32, bcmp32>;
+} // namespace avx2
-///////////////////////////////////////////////////////////////////////////////
-// Specializations for __m512i
+namespace avx512bw {
+LIBC_INLINE MemcmpReturnType memcmp64(CPtr p1, CPtr p2) {
#if defined(__AVX512BW__)
-template <> struct is_vector<__m512i> : cpp::true_type {};
-template <> struct cmp_is_expensive<__m512i> : cpp::true_type {};
-LIBC_INLINE __m512i bytewise_max(__m512i a, __m512i b) {
- return _mm512_max_epu8(a, b);
-}
-LIBC_INLINE __m512i bytewise_reverse(__m512i value) {
- return _mm512_shuffle_epi8(value,
- _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, //
- 8, 9, 10, 11, 12, 13, 14, 15, //
- 16, 17, 18, 19, 20, 21, 22, 23, //
- 24, 25, 26, 27, 28, 29, 30, 31, //
- 32, 33, 34, 35, 36, 37, 38, 39, //
- 40, 41, 42, 43, 44, 45, 46, 47, //
- 48, 49, 50, 51, 52, 53, 54, 55, //
- 56, 57, 58, 59, 60, 61, 62, 63));
-}
-LIBC_INLINE uint64_t big_endian_cmp_mask(__m512i max, __m512i value) {
- return _mm512_cmpeq_epi8_mask(bytewise_reverse(max), bytewise_reverse(value));
+ using T = char __attribute__((__vector_size__(64)));
+  // A mask indicating which bytes differ after loading 64 bytes from p1 and p2.
+ if (uint64_t mask =
+ _mm512_cmpneq_epi8_mask(cpp::bit_cast<__m512i>(load<T>(p1)),
+ cpp::bit_cast<__m512i>(load<T>(p2))))
+    return char_diff_no_zero(p1, p2, mask);
+ return MemcmpReturnType::ZERO();
+#else
+ (void)p1;
+ (void)p2;
+ return MemcmpReturnType::ZERO();
+#endif // defined(__AVX512BW__)
}
-template <> LIBC_INLINE bool eq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
- const auto a = load<__m512i>(p1, offset);
- const auto b = load<__m512i>(p2, offset);
- return _mm512_cmpneq_epi8_mask(a, b) == 0;
-}
-template <> LIBC_INLINE uint32_t neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
- const auto a = load<__m512i>(p1, offset);
- const auto b = load<__m512i>(p2, offset);
- const uint64_t xored = _mm512_cmpneq_epi8_mask(a, b);
- return (xored >> 32) | (xored & 0xFFFFFFFF);
-}
-template <>
-LIBC_INLINE MemcmpReturnType cmp_neq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
- const auto a = load<__m512i>(p1, offset);
- const auto b = load<__m512i>(p2, offset);
- const auto vmax = bytewise_max(a, b);
- const auto le = big_endian_cmp_mask(vmax, b);
- const auto ge = big_endian_cmp_mask(vmax, a);
- static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint64_t>);
- return cmp_neq_uint64_t(ge, le);
-}
-#endif // __AVX512BW__
+template <size_t Size> using Memcmp = MemcmpImpl<Size, 64, memcmp64, bcmp64>;
+} // namespace avx512bw
-} // namespace __llvm_libc::generic
+} // namespace __llvm_libc::x86
#endif // LIBC_TARGET_ARCH_IS_X86_64
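
The x86 specializations above reduce a 16/32/64-byte block to a byte-inequality mask and, for memcmp, locate the first mismatch with a count-trailing-zeros, since x86 is little endian. A hedged SSE2-only sketch using intrinsics directly (the libc code goes through its own load<T> and cpp::bit_cast helpers instead):

#include <cstdint>
#if defined(__SSE2__)
#include <emmintrin.h>

// bcmp-style: non-zero iff the two 16-byte blocks differ.
inline uint32_t bcmp16_sketch(const unsigned char *p1, const unsigned char *p2) {
  const __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(p1));
  const __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(p2));
  // Each equal byte sets its bit; 0xFFFF therefore means "all 16 bytes equal".
  const int eq_mask = _mm_movemask_epi8(_mm_cmpeq_epi8(a, b));
  return static_cast<uint32_t>(eq_mask ^ 0xFFFF);
}

// memcmp-style: given a non-zero "bytes differ" mask, the index of the first
// mismatching byte is the number of trailing zeros in the mask.
inline int32_t char_diff_sketch(const unsigned char *p1, const unsigned char *p2,
                                uint32_t neq_mask) {
  const int i = __builtin_ctz(neq_mask);
  return static_cast<int32_t>(p1[i]) - static_cast<int32_t>(p2[i]);
}
#endif // defined(__SSE2__)
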
diff --git a/libc/src/string/memory_utils/utils.h b/libc/src/string/memory_utils/utils.h
index 5b211a47e4867..f26944ca48ab7 100644
--- a/libc/src/string/memory_utils/utils.h
+++ b/libc/src/string/memory_utils/utils.h
@@ -11,12 +11,10 @@
#include "src/__support/CPP/bit.h"
#include "src/__support/CPP/cstddef.h"
-#include "src/__support/CPP/limits.h" // cpp::numeric_limits
#include "src/__support/CPP/type_traits.h"
#include "src/__support/endian.h"
#include "src/__support/macros/attributes.h" // LIBC_INLINE
#include "src/__support/macros/config.h" // LIBC_HAS_BUILTIN
-#include "src/__support/macros/properties/architectures.h"
#include <stddef.h> // size_t
#include <stdint.h> // intptr_t / uintptr_t
@@ -151,39 +149,6 @@ template <typename T> struct StrictIntegralType {
using MemcmpReturnType = StrictIntegralType<int32_t>;
using BcmpReturnType = StrictIntegralType<uint32_t>;
-// This implements the semantic of 'memcmp' returning a negative value when 'a'
-// is less than 'b', '0' when 'a' equals 'b' and a positive number otherwise.
-LIBC_INLINE MemcmpReturnType cmp_uint32_t(uint32_t a, uint32_t b) {
-  // We perform the difference as an uint64_t.
-  const int64_t diff = static_cast<int64_t>(a) - static_cast<int64_t>(b);
-  // And reduce the uint64_t into an uint32_t.
-  // TODO: provide a detailed explanation.
-  return static_cast<int32_t>((diff >> 1) | (diff & 0xFFFF));
-}
-
-// Returns a negative value if 'a' is less than 'b' and a positive value
-// otherwise. This implements the semantic of 'memcmp' when we know that 'a' and
-// 'b' differ.
-LIBC_INLINE MemcmpReturnType cmp_neq_uint64_t(uint64_t a, uint64_t b) {
-#if defined(LIBC_TARGET_ARCH_IS_X86_64)
-  // On x86, we choose the returned values so that they are just one unit apart
- // as this allows for better code generation.
- static constexpr int32_t POSITIVE = cpp::numeric_limits<int32_t>::max();
- static constexpr int32_t NEGATIVE = cpp::numeric_limits<int32_t>::min();
- static_assert(cpp::bit_cast<uint32_t>(NEGATIVE) -
- cpp::bit_cast<uint32_t>(POSITIVE) ==
- 1);
-#else
- // On RISC-V we simply use '1' and '-1' as it leads to branchless code.
- // On ARMv8, both strategies lead to the same performance.
- static constexpr int32_t POSITIVE = 1;
- static constexpr int32_t NEGATIVE = -1;
-#endif
- static_assert(POSITIVE > 0);
- static_assert(NEGATIVE < 0);
- return a < b ? NEGATIVE : POSITIVE;
-}
-
// Loads bytes from memory (possibly unaligned) and materializes them as
// type.
template <typename T> LIBC_INLINE T load(CPtr ptr) {
@@ -315,16 +280,6 @@ void align_to_next_boundary(T1 *__restrict &p1, T2 *__restrict &p2,
deferred_static_assert("AlignOn must be either Arg::P1 or Arg::P2");
}
-template <size_t SIZE> struct AlignHelper {
- AlignHelper(CPtr ptr) : offset_(distance_to_next_aligned<SIZE>(ptr)) {}
-
- LIBC_INLINE bool not_aligned() const { return offset_ != SIZE; }
- LIBC_INLINE uintptr_t offset() const { return offset_; }
-
-private:
- uintptr_t offset_;
-};
-
} // namespace __llvm_libc
#endif // LLVM_LIBC_SRC_MEMORY_UTILS_UTILS_H
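
The removed cmp_neq_uint64_t above makes its two return values target-dependent: on x86-64 it picks INT32_MAX and INT32_MIN, which are exactly one apart when reinterpreted as unsigned and so cheap to materialize, while other targets keep plain 1 and -1 for branchless code. A hedged standalone restatement of that choice:

#include <cstdint>
#include <limits>

// Assumes a != b; implements the memcmp contract of "negative if a < b,
// positive otherwise" on big-endian-converted words.
inline int32_t cmp_neq_u64_sketch(uint64_t a, uint64_t b) {
#if defined(__x86_64__)
  constexpr int32_t kPositive = std::numeric_limits<int32_t>::max();
  constexpr int32_t kNegative = std::numeric_limits<int32_t>::min();
  // One unit apart when viewed as unsigned, which helps x86 codegen.
  static_assert(static_cast<uint32_t>(kNegative) -
                    static_cast<uint32_t>(kPositive) ==
                1);
#else
  constexpr int32_t kPositive = 1;
  constexpr int32_t kNegative = -1;
#endif
  return a < b ? kNegative : kPositive;
}
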
diff --git a/libc/src/string/memory_utils/x86_64/memcmp_implementations.h b/libc/src/string/memory_utils/x86_64/memcmp_implementations.h
index 7502a8d98f53e..26de1d9b7b971 100644
--- a/libc/src/string/memory_utils/x86_64/memcmp_implementations.h
+++ b/libc/src/string/memory_utils/x86_64/memcmp_implementations.h
@@ -18,76 +18,79 @@ namespace __llvm_libc {
[[maybe_unused]] LIBC_INLINE MemcmpReturnType
inline_memcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) {
- return generic::Memcmp<uint64_t>::loop_and_tail_align_above(384, p1, p2,
- count);
+ if (LIBC_UNLIKELY(count >= 384)) {
+ if (auto value = generic::Memcmp<16>::block(p1, p2))
+ return value;
+ align_to_next_boundary<16, Arg::P1>(p1, p2, count);
+ }
+ return generic::Memcmp<16>::loop_and_tail(p1, p2, count);
}
-#if defined(__SSE4_1__)
[[maybe_unused]] LIBC_INLINE MemcmpReturnType
-inline_memcmp_x86_sse41_gt16(CPtr p1, CPtr p2, size_t count) {
- return generic::Memcmp<__m128i>::loop_and_tail_align_above(384, p1, p2,
- count);
+inline_memcmp_x86_sse2_gt16(CPtr p1, CPtr p2, size_t count) {
+ if (LIBC_UNLIKELY(count >= 384)) {
+ if (auto value = x86::sse2::Memcmp<16>::block(p1, p2))
+ return value;
+ align_to_next_boundary<16, Arg::P1>(p1, p2, count);
+ }
+ return x86::sse2::Memcmp<16>::loop_and_tail(p1, p2, count);
}
-#endif // __SSE4_1__
-#if defined(__AVX2__)
[[maybe_unused]] LIBC_INLINE MemcmpReturnType
inline_memcmp_x86_avx2_gt16(CPtr p1, CPtr p2, size_t count) {
if (count <= 32)
- return generic::Memcmp<__m128i>::head_tail(p1, p2, count);
+ return x86::sse2::Memcmp<16>::head_tail(p1, p2, count);
if (count <= 64)
- return generic::Memcmp<__m256i>::head_tail(p1, p2, count);
- return generic::Memcmp<__m256i>::loop_and_tail_align_above(384, p1, p2,
- count);
+ return x86::avx2::Memcmp<32>::head_tail(p1, p2, count);
+ if (count <= 128)
+ return x86::avx2::Memcmp<64>::head_tail(p1, p2, count);
+ if (LIBC_UNLIKELY(count >= 384)) {
+ if (auto value = x86::avx2::Memcmp<32>::block(p1, p2))
+ return value;
+ align_to_next_boundary<32, Arg::P1>(p1, p2, count);
+ }
+ return x86::avx2::Memcmp<32>::loop_and_tail(p1, p2, count);
}
-#endif // __AVX2__
-#if defined(__AVX512BW__)
[[maybe_unused]] LIBC_INLINE MemcmpReturnType
inline_memcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) {
if (count <= 32)
- return generic::Memcmp<__m128i>::head_tail(p1, p2, count);
+ return x86::sse2::Memcmp<16>::head_tail(p1, p2, count);
if (count <= 64)
- return generic::Memcmp<__m256i>::head_tail(p1, p2, count);
+ return x86::avx2::Memcmp<32>::head_tail(p1, p2, count);
if (count <= 128)
- return generic::Memcmp<__m512i>::head_tail(p1, p2, count);
- return generic::Memcmp<__m512i>::loop_and_tail_align_above(384, p1, p2,
- count);
+ return x86::avx512bw::Memcmp<64>::head_tail(p1, p2, count);
+ if (LIBC_UNLIKELY(count >= 384)) {
+ if (auto value = x86::avx512bw::Memcmp<64>::block(p1, p2))
+ return value;
+ align_to_next_boundary<64, Arg::P1>(p1, p2, count);
+ }
+ return x86::avx512bw::Memcmp<64>::loop_and_tail(p1, p2, count);
}
-#endif // __AVX512BW__
LIBC_INLINE MemcmpReturnType inline_memcmp_x86(CPtr p1, CPtr p2, size_t count) {
+
if (count == 0)
return MemcmpReturnType::ZERO();
if (count == 1)
- return generic::Memcmp<uint8_t>::block(p1, p2);
+ return generic::Memcmp<1>::block(p1, p2);
if (count == 2)
- return generic::Memcmp<uint16_t>::block(p1, p2);
+ return generic::Memcmp<2>::block(p1, p2);
if (count == 3)
- return generic::MemcmpSequence<uint16_t, uint8_t>::block(p1, p2);
- if (count == 4)
- return generic::Memcmp<uint32_t>::block(p1, p2);
- if (count == 5)
- return generic::MemcmpSequence<uint32_t, uint8_t>::block(p1, p2);
- if (count == 6)
- return generic::MemcmpSequence<uint32_t, uint16_t>::block(p1, p2);
- if (count == 7)
- return generic::Memcmp<uint32_t>::head_tail(p1, p2, 7);
- if (count == 8)
- return generic::Memcmp<uint64_t>::block(p1, p2);
+ return generic::Memcmp<3>::block(p1, p2);
+ if (count <= 8)
+ return generic::Memcmp<4>::head_tail(p1, p2, count);
if (count <= 16)
- return generic::Memcmp<uint64_t>::head_tail(p1, p2, count);
-#if defined(__AVX512BW__)
- return inline_memcmp_x86_avx512bw_gt16(p1, p2, count);
-#elif defined(__AVX2__)
- return inline_memcmp_x86_avx2_gt16(p1, p2, count);
-#elif defined(__SSE4_1__)
- return inline_memcmp_x86_sse41_gt16(p1, p2, count);
-#else
- return inline_memcmp_generic_gt16(p1, p2, count);
-#endif
+ return generic::Memcmp<8>::head_tail(p1, p2, count);
+ if constexpr (x86::kAvx512BW)
+ return inline_memcmp_x86_avx512bw_gt16(p1, p2, count);
+ else if constexpr (x86::kAvx2)
+ return inline_memcmp_x86_avx2_gt16(p1, p2, count);
+ else if constexpr (x86::kSse2)
+ return inline_memcmp_x86_sse2_gt16(p1, p2, count);
+ else
+ return inline_memcmp_generic_gt16(p1, p2, count);
}
-
} // namespace __llvm_libc
#endif // LIBC_SRC_STRING_MEMORY_UTILS_X86_64_MEMCMP_IMPLEMENTATIONS_H
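The restored x86 implementation above dispatches on count: tiny sizes use fixed-size blocks, mid sizes use head_tail (two possibly overlapping block compares), and large sizes (>= 384 bytes) compare one block, align p1 to the block boundary, then loop. A simplified scalar-only sketch of that shape follows; cmp_block, cmp_head_tail and cmp_gt16 are made-up names, memcmp stands in for the SIMD Memcmp<N> ops, and the alignment step of the real large-size path is omitted.

#include <cstddef>
#include <cstring>

// Compare one fixed-size block starting at p1/p2; 0 means equal.
static int cmp_block(const char *p1, const char *p2, std::size_t size) {
  return std::memcmp(p1, p2, size);  // stand-in for Memcmp<N>::block()
}

// head_tail: compare the first and last `size`-byte blocks.  They overlap
// when count < 2 * size, which still covers every byte at least once.
static int cmp_head_tail(const char *p1, const char *p2, std::size_t size,
                         std::size_t count) {
  if (int r = cmp_block(p1, p2, size))
    return r;
  return cmp_block(p1 + count - size, p2 + count - size, size);
}

// Simplified dispatch mirroring inline_memcmp_x86_*_gt16 above, using
// 16-byte scalar blocks instead of SSE2/AVX2/AVX512 vectors.
static int cmp_gt16(const char *p1, const char *p2, std::size_t count) {
  constexpr std::size_t kBlock = 16;
  if (count <= 2 * kBlock)
    return cmp_head_tail(p1, p2, kBlock, count);
  // Loop over full blocks, then finish with one (possibly overlapping) tail.
  std::size_t offset = 0;
  for (; offset + kBlock < count; offset += kBlock)
    if (int r = cmp_block(p1 + offset, p2 + offset, kBlock))
      return r;
  return cmp_block(p1 + count - kBlock, p2 + count - kBlock, kBlock);
}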
diff --git a/libc/test/src/string/memory_utils/op_tests.cpp b/libc/test/src/string/memory_utils/op_tests.cpp
index 376fbeb122b8c..e38edc54e8a9f 100644
--- a/libc/test/src/string/memory_utils/op_tests.cpp
+++ b/libc/test/src/string/memory_utils/op_tests.cpp
@@ -194,34 +194,35 @@ TYPED_TEST(LlvmLibcOpTest, Memset, MemsetImplementations) {
}
using BcmpImplementations = testing::TypeList<
-#ifdef LIBC_TARGET_ARCH_IS_X86_64
-#ifdef __SSE4_1__
- generic::Bcmp<__m128i>,
-#endif // __SSE4_1__
+#ifdef __SSE2__
+ x86::sse2::Bcmp<16>, //
+ x86::sse2::Bcmp<32>, //
+ x86::sse2::Bcmp<64>, //
+ x86::sse2::Bcmp<128>, //
+#endif
#ifdef __AVX2__
- generic::Bcmp<__m256i>,
-#endif // __AVX2__
+ x86::avx2::Bcmp<32>, //
+ x86::avx2::Bcmp<64>, //
+ x86::avx2::Bcmp<128>, //
+#endif
#ifdef __AVX512BW__
- generic::Bcmp<__m512i>,
-#endif // __AVX512BW__
-
-#endif // LIBC_TARGET_ARCH_IS_X86_64
+ x86::avx512bw::Bcmp<64>, //
+ x86::avx512bw::Bcmp<128>, //
+#endif
#ifdef LIBC_TARGET_ARCH_IS_AARCH64
aarch64::Bcmp<16>, //
- aarch64::Bcmp<32>,
-#endif // LIBC_TARGET_ARCH_IS_AARCH64
+ aarch64::Bcmp<32>, //
+#endif
#ifdef LLVM_LIBC_HAS_UINT64
- generic::Bcmp<uint64_t>, //
+ generic::Bcmp<8>, //
#endif
- generic::Bcmp<uint8_t>, //
- generic::Bcmp<uint16_t>, //
- generic::Bcmp<uint32_t>, //
- generic::BcmpSequence<uint8_t, uint8_t>, //
- generic::BcmpSequence<uint8_t, uint8_t, uint8_t>, //
- generic::BcmpSequence<uint16_t, uint8_t>, //
- generic::BcmpSequence<uint32_t, uint8_t>, //
- generic::BcmpSequence<uint32_t, uint16_t>, //
- generic::BcmpSequence<uint32_t, uint16_t, uint8_t>>;
+ generic::Bcmp<1>, //
+ generic::Bcmp<2>, //
+ generic::Bcmp<4>, //
+ generic::Bcmp<16>, //
+ generic::Bcmp<32>, //
+ generic::Bcmp<64> //
+ >;
// Adapt CheckBcmp signature to op implementation signatures.
template <auto FnImpl>
@@ -246,8 +247,7 @@ TYPED_TEST(LlvmLibcOpTest, Bcmp, BcmpImplementations) {
ASSERT_TRUE((CheckBcmp<BlockImpl>(span1, span2, kSize)));
}
}
- if constexpr (has_head_tail<Impl>::value) {
- // Test head tail operations from kSize to 2 * kSize.
+ { // Test head tail operations from kSize to 2 * kSize.
static constexpr auto HeadTailImpl = CmpAdaptor<Impl::head_tail>;
Buffer Buffer1(2 * kSize);
Buffer Buffer2(2 * kSize);
@@ -258,8 +258,7 @@ TYPED_TEST(LlvmLibcOpTest, Bcmp, BcmpImplementations) {
ASSERT_TRUE((CheckBcmp<HeadTailImpl>(span1, span2, size)));
}
}
- if constexpr (has_loop_and_tail<Impl>::value) {
- // Test loop operations from kSize to 3 * kSize.
+ { // Test loop operations from kSize to 3 * kSize.
if constexpr (kSize > 1) {
static constexpr auto LoopImpl = CmpAdaptor<Impl::loop_and_tail>;
Buffer Buffer1(3 * kSize);
@@ -275,27 +274,31 @@ TYPED_TEST(LlvmLibcOpTest, Bcmp, BcmpImplementations) {
}
using MemcmpImplementations = testing::TypeList<
-#ifdef LIBC_TARGET_ARCH_IS_X86_64
#ifdef __SSE2__
- generic::Memcmp<__m128i>, //
+ x86::sse2::Memcmp<16>, //
+ x86::sse2::Memcmp<32>, //
+ x86::sse2::Memcmp<64>, //
+ x86::sse2::Memcmp<128>, //
#endif
#ifdef __AVX2__
- generic::Memcmp<__m256i>, //
+ x86::avx2::Memcmp<32>, //
+ x86::avx2::Memcmp<64>, //
+ x86::avx2::Memcmp<128>, //
#endif
#ifdef __AVX512BW__
- generic::Memcmp<__m512i>, //
+ x86::avx512bw::Memcmp<64>, //
+ x86::avx512bw::Memcmp<128>, //
#endif
-#endif // LIBC_TARGET_ARCH_IS_X86_64
#ifdef LLVM_LIBC_HAS_UINT64
- generic::Memcmp<uint64_t>, //
+ generic::Memcmp<8>, //
#endif
- generic::Memcmp<uint8_t>, //
- generic::Memcmp<uint16_t>, //
- generic::Memcmp<uint32_t>, //
- generic::MemcmpSequence<uint8_t, uint8_t>, //
- generic::MemcmpSequence<uint8_t, uint8_t, uint8_t>, //
- generic::MemcmpSequence<uint16_t, uint8_t>, //
- generic::MemcmpSequence<uint32_t, uint16_t, uint8_t> //
+ generic::Memcmp<1>, //
+ generic::Memcmp<2>, //
+ generic::Memcmp<3>, //
+ generic::Memcmp<4>, //
+ generic::Memcmp<16>, //
+ generic::Memcmp<32>, //
+ generic::Memcmp<64> //
>;
TYPED_TEST(LlvmLibcOpTest, Memcmp, MemcmpImplementations) {
@@ -311,8 +314,7 @@ TYPED_TEST(LlvmLibcOpTest, Memcmp, MemcmpImplementations) {
ASSERT_TRUE((CheckMemcmp<BlockImpl>(span1, span2, kSize)));
}
}
- if constexpr (has_head_tail<Impl>::value) {
- // Test head tail operations from kSize to 2 * kSize.
+ { // Test head tail operations from kSize to 2 * kSize.
static constexpr auto HeadTailImpl = CmpAdaptor<Impl::head_tail>;
Buffer Buffer1(2 * kSize);
Buffer Buffer2(2 * kSize);
@@ -323,8 +325,7 @@ TYPED_TEST(LlvmLibcOpTest, Memcmp, MemcmpImplementations) {
ASSERT_TRUE((CheckMemcmp<HeadTailImpl>(span1, span2, size)));
}
}
- if constexpr (has_loop_and_tail<Impl>::value) {
- // Test loop operations from kSize to 3 * kSize.
+ { // Test loop operations from kSize to 3 * kSize.
if constexpr (kSize > 1) {
static constexpr auto LoopImpl = CmpAdaptor<Impl::loop_and_tail>;
Buffer Buffer1(3 * kSize);
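The op_tests.cpp hunks above restore unconditional head_tail and loop_and_tail coverage (the has_head_tail / has_loop_and_tail guards came in with the reverted patch), with head_tail exercised from kSize to 2 * kSize and the loop path from kSize to 3 * kSize. A rough sketch of that coverage pattern follows, checked against plain memcmp as the reference; check_against_memcmp and check_ranges are illustrative names, not the actual test harness.

#include <cassert>
#include <cstddef>
#include <cstring>
#include <vector>

// Check one entry point (any callable with a (p1, p2, size) shape)
// against libc memcmp: equal buffers, then a difference in the last byte.
template <typename Impl>
void check_against_memcmp(Impl impl, std::size_t size) {
  std::vector<char> b1(size, 'a'), b2(size, 'a');
  assert(impl(b1.data(), b2.data(), size) == 0);
  if (size == 0)
    return;
  b2[size - 1] = 'b';
  int expected = std::memcmp(b1.data(), b2.data(), size);
  int actual = impl(b1.data(), b2.data(), size);
  assert((expected < 0) == (actual < 0) && (expected == 0) == (actual == 0));
}

// Mirror of the restored ranges: head_tail from kSize to 2 * kSize,
// loop_and_tail from kSize to 3 * kSize.
template <typename HeadTail, typename LoopTail>
void check_ranges(HeadTail head_tail, LoopTail loop_and_tail,
                  std::size_t kSize) {
  for (std::size_t size = kSize; size <= 2 * kSize; ++size)
    check_against_memcmp(head_tail, size);
  for (std::size_t size = kSize; size <= 3 * kSize; ++size)
    check_against_memcmp(loop_and_tail, size);
}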
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 68c4619bdb767..0df29f5f3b8c5 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -1857,7 +1857,6 @@ libc_support_library(
":__support_cpp_array",
":__support_cpp_bit",
":__support_cpp_cstddef",
- ":__support_cpp_limits",
":__support_cpp_type_traits",
":__support_macros_attributes",
":__support_macros_config",