[libc-commits] [libc] [libc] Fix buggy AVX2 `memcmp` (PR #77081)

Tue Jan 9 05:28:23 PST 2024

================
@@ -210,19 +237,43 @@ template <> struct cmp_is_expensive<__m512i> : cpp::true_type {};
 LIBC_INLINE __m512i bytewise_max(__m512i a, __m512i b) {
   return _mm512_max_epu8(a, b);
 }
-LIBC_INLINE __m512i bytewise_reverse(__m512i value) {
-  return _mm512_shuffle_epi8(value,
-                             _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,         //
-                                             8, 9, 10, 11, 12, 13, 14, 15,   //
-                                             16, 17, 18, 19, 20, 21, 22, 23, //
-                                             24, 25, 26, 27, 28, 29, 30, 31, //
-                                             32, 33, 34, 35, 36, 37, 38, 39, //
-                                             40, 41, 42, 43, 44, 45, 46, 47, //
-                                             48, 49, 50, 51, 52, 53, 54, 55, //
-                                             56, 57, 58, 59, 60, 61, 62, 63));
-}
 LIBC_INLINE uint64_t big_endian_cmp_mask(__m512i max, __m512i value) {
-  return _mm512_cmpeq_epi8_mask(bytewise_reverse(max), bytewise_reverse(value));
+#if defined(__AVX512VBMI__)
+  // When AVX512VBMI is available we can completely reverse the vector through
+  // VPERMB __m512i _mm512_permutexvar_epi8( __m512i idx, __m512i a);
+  const auto indices = _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,         //
+                                       8, 9, 10, 11, 12, 13, 14, 15,   //
+                                       16, 17, 18, 19, 20, 21, 22, 23, //
+                                       24, 25, 26, 27, 28, 29, 30, 31, //
+                                       32, 33, 34, 35, 36, 37, 38, 39, //
+                                       40, 41, 42, 43, 44, 45, 46, 47, //
+                                       48, 49, 50, 51, 52, 53, 54, 55, //
+                                       56, 57, 58, 59, 60, 61, 62, 63);
+  // Then we compute the mask for equal bytes.
+  return _mm512_cmpeq_epi8_mask(_mm512_permutexvar_epi8(indices, max), //
----------------
gchatelet wrote:

It appears that this code is not well optimized by the compiler, using GPRs instead of vector registers.
https://github.com/llvm/llvm-project/issues/77459

The vector code takes 21 cycles, whereas the GPR code takes 25, according to llvm-mca.
I'll benchmark both of them and report.

https://github.com/llvm/llvm-project/pull/77081