[libc-commits] [libc] 9ca6e5b - [libc] Fix buggy AVX2 / AVX512 `memcmp` (#77081)
via libc-commits
libc-commits at lists.llvm.org
Thu Jan 11 02:45:41 PST 2024
Author: Guillaume Chatelet
Date: 2024-01-11T11:45:37+01:00
New Revision: 9ca6e5bb86963eed00108d7da57033691bc21dbc
URL: https://github.com/llvm/llvm-project/commit/9ca6e5bb86963eed00108d7da57033691bc21dbc
DIFF: https://github.com/llvm/llvm-project/commit/9ca6e5bb86963eed00108d7da57033691bc21dbc.diff
LOG: [libc] Fix buggy AVX2 / AVX512 `memcmp` (#77081)
Fixes #77080.
Added:
Modified:
libc/src/string/memory_utils/op_x86.h
libc/test/src/string/memcmp_test.cpp
Removed:
################################################################################
diff --git a/libc/src/string/memory_utils/op_x86.h b/libc/src/string/memory_utils/op_x86.h
index 1d2ae5cd15409d..a6529a6d424a30 100644
--- a/libc/src/string/memory_utils/op_x86.h
+++ b/libc/src/string/memory_utils/op_x86.h
@@ -129,7 +129,8 @@ LIBC_INLINE __m128i bytewise_reverse(__m128i value) {
8, 9, 10, 11, 12, 13, 14, 15));
}
LIBC_INLINE uint16_t big_endian_cmp_mask(__m128i max, __m128i value) {
- return static_cast<uint16_t>(_mm_movemask_epi8(bytewise_reverse(_mm_cmpeq_epi8(max, value))));
+ return static_cast<uint16_t>(
+ _mm_movemask_epi8(bytewise_reverse(_mm_cmpeq_epi8(max, value))));
}
template <> LIBC_INLINE bool eq<__m128i>(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m128i>(p1, offset);
@@ -180,15 +181,41 @@ template <> LIBC_INLINE uint32_t neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
LIBC_INLINE __m256i bytewise_max(__m256i a, __m256i b) {
return _mm256_max_epu8(a, b);
}
-LIBC_INLINE __m256i bytewise_reverse(__m256i value) {
- return _mm256_shuffle_epi8(value,
- _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, //
- 8, 9, 10, 11, 12, 13, 14, 15, //
- 16, 17, 18, 19, 20, 21, 22, 23, //
- 24, 25, 26, 27, 28, 29, 30, 31));
-}
LIBC_INLINE uint32_t big_endian_cmp_mask(__m256i max, __m256i value) {
- return _mm256_movemask_epi8(bytewise_reverse(_mm256_cmpeq_epi8(max, value)));
+ // Bytewise comparison of 'max' and 'value'.
+ const __m256i little_endian_byte_mask = _mm256_cmpeq_epi8(max, value);
+ // Because x86 is little endian, bytes in the vector must be reversed before
+ // using movemask.
+#if defined(__AVX512VBMI__) && defined(__AVX512VL__)
+ // When AVX512BMI is available we can completely reverse the vector through
+ // VPERMB __m256i _mm256_permutexvar_epi8( __m256i idx, __m256i a);
+ const __m256i big_endian_byte_mask =
+ _mm256_permutexvar_epi8(_mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, //
+ 8, 9, 10, 11, 12, 13, 14, 15, //
+ 16, 17, 18, 19, 20, 21, 22, 23, //
+ 24, 25, 26, 27, 28, 29, 30, 31),
+ little_endian_byte_mask);
+ // And turn the byte vector mask into an 'uint32_t' for direct scalar
+ // comparison.
+ return _mm256_movemask_epi8(big_endian_byte_mask);
+#else
+ // We can't byte-reverse '__m256i' in a single instruction with AVX2.
+ // '_mm256_shuffle_epi8' can only shuffle within each 16-byte lane
+ // leading to:
+ // ymm = ymm[15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ // 31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16]
+ // So we first shuffle each 16-byte lane leading to half-reversed vector mask.
+ const __m256i half_reversed = _mm256_shuffle_epi8(
+ little_endian_byte_mask, _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, //
+ 8, 9, 10, 11, 12, 13, 14, 15, //
+ 0, 1, 2, 3, 4, 5, 6, 7, //
+ 8, 9, 10, 11, 12, 13, 14, 15));
+ // Then we turn the vector into an uint32_t.
+ const uint32_t half_reversed_scalar = _mm256_movemask_epi8(half_reversed);
+ // And swap the lower and upper parts. This is optimized into a single `rorx`
+ // instruction.
+ return (half_reversed_scalar << 16) | (half_reversed_scalar >> 16);
+#endif
}
template <>
LIBC_INLINE MemcmpReturnType cmp_neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
@@ -198,7 +225,7 @@ LIBC_INLINE MemcmpReturnType cmp_neq<__m256i>(CPtr p1, CPtr p2, size_t offset) {
const auto le = big_endian_cmp_mask(vmax, b);
const auto ge = big_endian_cmp_mask(vmax, a);
static_assert(cpp::is_same_v<cpp::remove_cv_t<decltype(le)>, uint32_t>);
- return cmp_uint32_t(ge, le);
+ return cmp_neq_uint64_t(ge, le);
}
#endif // __AVX2__
@@ -210,19 +237,48 @@ template <> struct cmp_is_expensive<__m512i> : cpp::true_type {};
LIBC_INLINE __m512i bytewise_max(__m512i a, __m512i b) {
return _mm512_max_epu8(a, b);
}
-LIBC_INLINE __m512i bytewise_reverse(__m512i value) {
- return _mm512_shuffle_epi8(value,
- _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, //
- 8, 9, 10, 11, 12, 13, 14, 15, //
- 16, 17, 18, 19, 20, 21, 22, 23, //
- 24, 25, 26, 27, 28, 29, 30, 31, //
- 32, 33, 34, 35, 36, 37, 38, 39, //
- 40, 41, 42, 43, 44, 45, 46, 47, //
- 48, 49, 50, 51, 52, 53, 54, 55, //
- 56, 57, 58, 59, 60, 61, 62, 63));
-}
LIBC_INLINE uint64_t big_endian_cmp_mask(__m512i max, __m512i value) {
- return _mm512_cmpeq_epi8_mask(bytewise_reverse(max), bytewise_reverse(value));
+ // The AVX512BMI version is disabled due to bad codegen.
+ // https://github.com/llvm/llvm-project/issues/77459
+ // https://github.com/llvm/llvm-project/pull/77081
+ // TODO: Re-enable when clang version meets the fixed version.
+#if false && defined(__AVX512VBMI__)
+ // When AVX512BMI is available we can completely reverse the vector through
+ // VPERMB __m512i _mm512_permutexvar_epi8( __m512i idx, __m512i a);
+ const auto indices = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, //
+ 8, 9, 10, 11, 12, 13, 14, 15, //
+ 16, 17, 18, 19, 20, 21, 22, 23, //
+ 24, 25, 26, 27, 28, 29, 30, 31, //
+ 32, 33, 34, 35, 36, 37, 38, 39, //
+ 40, 41, 42, 43, 44, 45, 46, 47, //
+ 48, 49, 50, 51, 52, 53, 54, 55, //
+ 56, 57, 58, 59, 60, 61, 62, 63);
+ // Then we compute the mask for equal bytes.
+ return _mm512_cmpeq_epi8_mask(_mm512_permutexvar_epi8(indices, max), //
+ _mm512_permutexvar_epi8(indices, value));
+#else
+ // We can't byte-reverse '__m512i' in a single instruction with __AVX512BW__.
+ // '_mm512_shuffle_epi8' can only shuffle within each 16-byte lane.
+ // So we only reverse groups of 8 bytes, these groups are necessarily within a
+ // 16-byte lane.
+ // zmm = | 16 bytes | 16 bytes | 16 bytes | 16 bytes |
+ // zmm = | <8> | <8> | <8> | <8> | <8> | <8> | <8> | <8> |
+ const __m512i indices = _mm512_set_epi8(56, 57, 58, 59, 60, 61, 62, 63, //
+ 48, 49, 50, 51, 52, 53, 54, 55, //
+ 40, 41, 42, 43, 44, 45, 46, 47, //
+ 32, 33, 34, 35, 36, 37, 38, 39, //
+ 24, 25, 26, 27, 28, 29, 30, 31, //
+ 16, 17, 18, 19, 20, 21, 22, 23, //
+ 8, 9, 10, 11, 12, 13, 14, 15, //
+ 0, 1, 2, 3, 4, 5, 6, 7);
+ // Then we compute the mask for equal bytes. In this mask the bits of each
+ // byte are already reversed but the byte themselves should be reversed, this
+ // is done by using a bswap instruction.
+ return __builtin_bswap64(
+ _mm512_cmpeq_epi8_mask(_mm512_shuffle_epi8(max, indices), //
+ _mm512_shuffle_epi8(value, indices)));
+
+#endif
}
template <> LIBC_INLINE bool eq<__m512i>(CPtr p1, CPtr p2, size_t offset) {
const auto a = load<__m512i>(p1, offset);
diff --git a/libc/test/src/string/memcmp_test.cpp b/libc/test/src/string/memcmp_test.cpp
index 03a0ac1c0ba655..ca7a5c7ce37023 100644
--- a/libc/test/src/string/memcmp_test.cpp
+++ b/libc/test/src/string/memcmp_test.cpp
@@ -37,6 +37,13 @@ TEST(LlvmLibcMemcmpTest, LhsAfterRhsLexically) {
EXPECT_GT(LIBC_NAMESPACE::memcmp(lhs, rhs, 2), 0);
}
+TEST(LlvmLibcMemcmpTest, Issue77080) {
+ // https://github.com/llvm/llvm-project/issues/77080
+ constexpr char lhs[35] = "1.069cd68bbe76eb2143a3284d27ebe220";
+ constexpr char rhs[35] = "1.0500185b5d966a544e2d0fa40701b0f3";
+ ASSERT_GE(LIBC_NAMESPACE::memcmp(lhs, rhs, 34), 1);
+}
+
// Adapt CheckMemcmp signature to memcmp.
static inline int Adaptor(cpp::span<char> p1, cpp::span<char> p2, size_t size) {
return LIBC_NAMESPACE::memcmp(p1.begin(), p2.begin(), size);
More information about the libc-commits
mailing list