<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href=https://github.com/llvm/llvm-project/issues/127054>127054</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
x86/avx512 vperm optimizations can produce incorrect permutations
</td>
</tr>
<tr>
<th>Labels</th>
<td>
new issue
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
jlainema
</td>
</tr>
</table>
<pre>
// this code should not report mismatches when run on a cpu with AVX512F support:
#include <cstdint>
#include <cstdlib>
#include <cstdio>
#include <immintrin.h>
alignas(64) static const __m512i swizzles_512[10] = {
_mm512_set_epi32( 0, 1, 2, 3, 4, 5, 6, 7,16,17,18,19,20,21,22,23),
_mm512_set_epi32( 8, 9,10,11,12,13,14,15,24,25,26,27,28,29,30,31),
_mm512_set_epi32( 0, 1, 2, 3,16,17,18,19, 8, 9,10,11,24,25,26,27),
_mm512_set_epi32( 4, 5, 6, 7,20,21,22,23,12,13,14,15,28,29,30,31),
_mm512_set_epi32( 0, 1,16,17, 4, 5,20,21, 8, 9,24,25,12,13,28,29),
_mm512_set_epi32( 2, 3,18,19, 6, 7,22,23,10,11,26,27,14,15,30,31),
_mm512_set_epi32( 0,16, 2,18, 4,20, 6,22, 8,24,10,26,12,28,14,30),
_mm512_set_epi32( 1,17, 3,19, 5,21, 7,23, 9,25,11,27,13,29,15,31),
_mm512_set_epi32( 0,16, 1,17, 2,18, 3,19, 4,20, 5,21, 6,22, 7,23),
_mm512_set_epi32( 8,24, 9,25,10,26,11,27,12,28,13,29,14,30,15,31)
};
alignas(64) uint32_t swizzles[10][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7,16,17,18,19,20,21,22,23},{ 8, 9,10,11,12,13,14,15,24,25,26,27,28,29,30,31},
{ 0, 1, 2, 3,16,17,18,19, 8, 9,10,11,24,25,26,27},{ 4, 5, 6, 7,20,21,22,23,12,13,14,15,28,29,30,31},
{ 0, 1,16,17, 4, 5,20,21, 8, 9,24,25,12,13,28,29},{ 2, 3,18,19, 6, 7,22,23,10,11,26,27,14,15,30,31},
{ 0,16, 2,18, 4,20, 6,22, 8,24,10,26,12,28,14,30},{ 1,17, 3,19, 5,21, 7,23, 9,25,11,27,13,29,15,31},
{ 0,16, 1,17, 2,18, 3,19, 4,20, 5,21, 6,22, 7,23},{ 8,24, 9,25,10,26,11,27,12,28,13,29,14,30,15,31}};
void reorder_gather512(uint32_t *_a) {
__m512i a,b;
a = _mm512_i32gather_epi32(_mm512_load_epi32(swizzles[0]), _a, 4);
b = _mm512_i32gather_epi32(_mm512_load_epi32(swizzles[1]), _a, 4);
_mm512_store_epi32(_a,_mm512_min_epi32(a,b));
_mm512_store_epi32(_a+16,_mm512_max_epi32(a,b));
a = _mm512_i32gather_epi32(_mm512_load_epi32(swizzles[2]), _a, 4);
b = _mm512_i32gather_epi32(_mm512_load_epi32(swizzles[3]), _a, 4);
_mm512_store_epi32(_a,_mm512_min_epi32(a,b));
_mm512_store_epi32(_a+16,_mm512_max_epi32(a,b));
a = _mm512_i32gather_epi32(_mm512_load_epi32(swizzles[4]), _a, 4);
b = _mm512_i32gather_epi32(_mm512_load_epi32(swizzles[5]), _a, 4);
_mm512_store_epi32(_a,_mm512_min_epi32(a,b));
_mm512_store_epi32(_a+16,_mm512_max_epi32(a,b));
a = _mm512_i32gather_epi32(_mm512_load_epi32(swizzles[6]), _a, 4);
b = _mm512_i32gather_epi32(_mm512_load_epi32(swizzles[7]), _a, 4);
_mm512_store_epi32(_a,_mm512_min_epi32(a,b));
_mm512_store_epi32(_a+16,_mm512_max_epi32(a,b));
a = _mm512_i32gather_epi32(_mm512_load_epi32(swizzles[8]), _a, 4);
b = _mm512_i32gather_epi32(_mm512_load_epi32(swizzles[9]), _a, 4);
_mm512_store_epi32(_a,a);
_mm512_store_epi32(_a+16,b);
}
#define PERM_ACCESS(n) swizzles_512[n]
// #define PERM_ACCESS(n) _mm512_load_epi32(swizzles[n])
void reorder_perm512(uint32_t *_a) {
__m512i a = _mm512_load_epi32(_a), b = _mm512_load_epi32(_a+16);
__m512i x,y;
x = _mm512_permutex2var_epi32(a,PERM_ACCESS(0),b);
y = _mm512_permutex2var_epi32(a,PERM_ACCESS(1),b);
a = _mm512_min_epi32(x,y);
b = _mm512_max_epi32(x,y);
x = _mm512_permutex2var_epi32(a,PERM_ACCESS(2),b);
y = _mm512_permutex2var_epi32(a,PERM_ACCESS(3),b);
a = _mm512_min_epi32(x,y);
b = _mm512_max_epi32(x,y);
x = _mm512_permutex2var_epi32(a,PERM_ACCESS(4),b);
y = _mm512_permutex2var_epi32(a,PERM_ACCESS(5),b);
a = _mm512_min_epi32(x,y);
b = _mm512_max_epi32(x,y);
x = _mm512_permutex2var_epi32(a,PERM_ACCESS(6),b);
y = _mm512_permutex2var_epi32(a,PERM_ACCESS(7),b);
a = _mm512_min_epi32(x,y);
b = _mm512_max_epi32(x,y);
_mm512_store_epi32(_a,_mm512_permutex2var_epi32(a,PERM_ACCESS(8),b));
_mm512_store_epi32(_a+16,_mm512_permutex2var_epi32(a,PERM_ACCESS(9),b));
}
int main() {
alignas(64) uint32_t aval[32], bval[32];
for(unsigned i=0; i<32; i++) aval[i] = bval[i] = rand();
reorder_perm512(aval);
reorder_gather512(bval);
for (unsigned i=0; i<10; i++) {
__m512i dx = _mm512_sub_epi32(swizzles_512[i], _mm512_load_epi32(swizzles[i]));
if (_mm512_reduce_add_epi32(dx))
printf("index mismatch for register %d\n", i);
}
for (unsigned i=0; i<32; i++) {
if (aval[i] != bval[i]) {
printf("gather/permute mismatch %08x != %08x (@%d)\n", aval[i], bval[i], i);
}
}
return 0;
}
</pre>
<img width="1" height="1" alt="" src="http://email.email.llvm.org/o/eJzcWU2P4jgQ_TXmYk0rLhOSHDjwMdxWWu1Iq70hJzHgUeIgx6GZ-fUrO4mdNDT09tCrUUvIJLGrXr0Xu8oGVtdiLzmfo3CJwvWENfpQqfn3ggnJSzZJq_zHHMEGwQbrg6hxVuUc14eqKXIsK40VP1ZK41LUJdPZgdf4-cAlVo3ElcQMZ8cGPwt9wIu__wkJbHDdHI0FogsU2A9QIbOiyTlGdJXVOhdSI_r1alch0te6RHWtR5SlkFoJ-XTouoMFK8ReshpBPJsiSHCtmRYZzipZa7zdliEBgetn8fNnwettSACFSxKgcI0RXWMULVGwwHhbmoHbmustPwoKCGIcIFhhYhowDTXN1DShaWamiRCsiLki9io2TYJgBcYWjC0YW6AIzONXoYwhNiOIMSTGkBhDYkCJASUGFMwV2CsDCgYUjK31To0tJXegrrC6xuBaSFfgb0JdEeuKMNeZvpeV5-LhPahn5bl4eAd6E8rL5sXy_DwrL5t7V57f21lZQhbU4llWlpAFtXiWlSVkQS2eZWUJWVCDdxOKONmoYxU62aKeVatd6KhFTrvEUXs7Kw_q-Xl4z9QH4jlHb11XVphB3F4iz8CL5bn0sg1ZBQsUrRFdXk89jZCawla7dNOnGvM9e5lyULR8bIqJ1kaKaPnYbNJ6vRHw-7OHC_ihicIFPIr3EXnBxfvQFHAp8MOWvAv4oav7QuCHLebRDH7Yuo3Ww0V7qkSOFa9UztV2z_SBK7MpgNitXgSLLTPr2W0Oul0EQ7BKW0cYM7uYu6QjKLSuXOrpOoqK5e7ZIC3YrGDjxQbL6pMgusTYOk9_yTl5xblx3WdJXSnu_ZlRXU8ppHveEm4r4h3zpZ0EvQ92vuHjF6WDj5SOfmrpph8pXfippZt9pHTRp5Yu_kjpkndJxwbk7iiUuqGmivSH3JzvhOT4z69__bFdrFZfv31DEEt7_hwfNaWJz5jYQ_dNyztMZcf0sooduSrv1DBXwoZij5DscCNgenOIFcVr17k9I1j96B-ehw5MbI3mZzgxNZpiYwG640k68P3jHW7IpZsR4-FaaWP2A0e8hwvi5cD38IPH8KO_K7_pY_iFvyu_2WP4Rf8Dv7sF442xxoNY_2M5eSNCcgXBJVkhNS6ZkMjs7_1e_NUzNzuxwmzg2u3hCqfD-37zj_GuUiZPSvtDaY4FouvA1CKB6IpCewVL-0l6n6I_uqcv7hWTeRvfAOEyK1s3TsJrZ490OKKPE78aKAleBNqr4zNyPprldZNelJSuQIlOrzvVR_R11s1HjMUO-wKteN5kfMtyb56fO4t2OMZHJaTeWcVAyJyf3S_Nlq_ie1FrboiHOQpXErWnQzFAdfPjjkQX79JL1IY9erdAXrzescU48vbFIdh009yTQBAG8bl35-5iNA0sJUgGrNgQbTUGH3PuWY_YK64bJXHgls0kn9M8oQmb8DmJaDINCaVkcpiTPMhS2M1mNJnmNGd0OqOMhLADFsaE7iZiDgGEARBKIAwhfuLpNJqFuziJ-SyaJgxNA14yUTwVxal8qtR-Iuq64XMCURBOJwVLeVHbPx0AJH_GttfQDNcTNTdGX9JmX6NpUIha196NFrrg87NJrxt2OocE8MmoiqujFqX4ybSoZI0zJvFRVWaCYSGzSimeadzK3w6ZNKqYH7Q-1oh2m6290IcmfcqqEsHGIHZfX46q-s4zjWBj46wRbDoipzn8GwAA__8qwpmh">