[libc-commits] [libc] [libc] Fix buggy AVX2 `memcmp` (PR #77081)
Craig Topper via libc-commits
libc-commits at lists.llvm.org
Fri Jan 5 10:07:49 PST 2024
================
@@ -181,11 +182,31 @@ LIBC_INLINE __m256i bytewise_max(__m256i a, __m256i b) {
return _mm256_max_epu8(a, b);
}
LIBC_INLINE __m256i bytewise_reverse(__m256i value) {
- return _mm256_shuffle_epi8(value,
- _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, //
- 8, 9, 10, 11, 12, 13, 14, 15, //
- 16, 17, 18, 19, 20, 21, 22, 23, //
- 24, 25, 26, 27, 28, 29, 30, 31));
+ const __m256i indices = _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, //
+ 8, 9, 10, 11, 12, 13, 14, 15, //
+ 16, 17, 18, 19, 20, 21, 22, 23, //
+ 24, 25, 26, 27, 28, 29, 30, 31);
+#if defined(__AVX512VBMI__) && defined(__AVX512VL__)
+ // AVX512 allows full __m256i byte permutation.
+ // ymm = ymm[31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,
+ // 15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
+ return _mm256_permutexvar_epi8(value, indices);
+#else
+ // We can't byte-reverse __m256i in a single instruction with AVX2.
+ // '_mm256_shuffle_epi8' can only shuffle within each xmm lane
+ // leading to:
+ // ymm = ymm[15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+ // 31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16]
+ const __m256i tmp = _mm256_shuffle_epi8(value, indices);
+ // Then we shuffle accross lanes using 64 bit values.
----------------
topperc wrote:
Typo in the comment above: "accross" -> "across".
https://github.com/llvm/llvm-project/pull/77081
More information about the libc-commits mailing list