[libc-commits] [libc] [libc] Fix buggy AVX2 `memcmp` (PR #77081)
Guillaume Chatelet via libc-commits
libc-commits at lists.llvm.org
Tue Jan 9 05:28:23 PST 2024
================
@@ -210,19 +237,43 @@ template <> struct cmp_is_expensive<__m512i> : cpp::true_type {};
LIBC_INLINE __m512i bytewise_max(__m512i a, __m512i b) {
return _mm512_max_epu8(a, b);
}
-LIBC_INLINE __m512i bytewise_reverse(__m512i value) {
- return _mm512_shuffle_epi8(value,
- _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, //
- 8, 9, 10, 11, 12, 13, 14, 15, //
- 16, 17, 18, 19, 20, 21, 22, 23, //
- 24, 25, 26, 27, 28, 29, 30, 31, //
- 32, 33, 34, 35, 36, 37, 38, 39, //
- 40, 41, 42, 43, 44, 45, 46, 47, //
- 48, 49, 50, 51, 52, 53, 54, 55, //
- 56, 57, 58, 59, 60, 61, 62, 63));
-}
LIBC_INLINE uint64_t big_endian_cmp_mask(__m512i max, __m512i value) {
- return _mm512_cmpeq_epi8_mask(bytewise_reverse(max), bytewise_reverse(value));
+#if defined(__AVX512VBMI__)
+  // When AVX512VBMI is available we can completely reverse the vector through
+ // VPERMB __m512i _mm512_permutexvar_epi8( __m512i idx, __m512i a);
+ const auto indices = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, //
+ 8, 9, 10, 11, 12, 13, 14, 15, //
+ 16, 17, 18, 19, 20, 21, 22, 23, //
+ 24, 25, 26, 27, 28, 29, 30, 31, //
+ 32, 33, 34, 35, 36, 37, 38, 39, //
+ 40, 41, 42, 43, 44, 45, 46, 47, //
+ 48, 49, 50, 51, 52, 53, 54, 55, //
+ 56, 57, 58, 59, 60, 61, 62, 63);
+ // Then we compute the mask for equal bytes.
+ return _mm512_cmpeq_epi8_mask(_mm512_permutexvar_epi8(indices, max), //
----------------
gchatelet wrote:
It appears that this code is not well optimized by the compiler, which uses GPRs instead of vector registers:
https://github.com/llvm/llvm-project/issues/77459
According to llvm-mca, the vector version takes 21 cycles whereas the GPR version takes 25.
I'll benchmark both and report back.
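For reference, here is a minimal, self-contained sketch of the VBMI path quoted above that can be compiled in isolation and fed to llvm-mca or inspected in a disassembler. The function name big_endian_cmp_mask_vbmi is mine for illustration, not the identifier in src/string/memory_utils, and it assumes a compiler with -mavx512bw -mavx512vbmi:

#include <immintrin.h>
#include <stdint.h>

// Illustrative standalone version of the AVX512VBMI big-endian compare mask.
uint64_t big_endian_cmp_mask_vbmi(__m512i max, __m512i value) {
  // indices[i] = 63 - i, so VPERMB (_mm512_permutexvar_epi8) fully reverses
  // the 64 bytes of its input vector.
  const __m512i indices = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,         //
                                          8, 9, 10, 11, 12, 13, 14, 15,   //
                                          16, 17, 18, 19, 20, 21, 22, 23, //
                                          24, 25, 26, 27, 28, 29, 30, 31, //
                                          32, 33, 34, 35, 36, 37, 38, 39, //
                                          40, 41, 42, 43, 44, 45, 46, 47, //
                                          48, 49, 50, 51, 52, 53, 54, 55, //
                                          56, 57, 58, 59, 60, 61, 62, 63);
  // Compare the two reversed vectors byte for byte; bit i of the resulting
  // mask is set when the bytes at reversed position i are equal.
  return _mm512_cmpeq_epi8_mask(_mm512_permutexvar_epi8(indices, max),
                                _mm512_permutexvar_epi8(indices, value));
}

Something like `clang++ -O2 -mavx512bw -mavx512vbmi -S` on this snippet makes it easy to check whether the shuffle and compare stay in vector registers or get lowered to GPR code.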
https://github.com/llvm/llvm-project/pull/77081