<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href=https://github.com/llvm/llvm-project/issues/61061>61061</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
Unnecessary store and reload emitted after shuffle operation is vectorized on Clang on x86 with SSE2
</td>
</tr>
<tr>
<th>Labels</th>
<td>
new issue
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
johnplatts
</td>
</tr>
</table>
<pre>
Here is a snippet of C++ code for which Clang generates an unnecessary store and reload when optimizations are enabled on x86 with SSE2:
```
#include <stdint.h>
#include <emmintrin.h>
// Scalar emulation of a 16-entry byte table lookup on SSE2 vectors.
// Byte i of the return value is byte (from[i] & 15) of `bytes`: only the
// low four bits of each index byte in `from` are used.
static inline __m128i SSE2TableLookupBytes(__m128i bytes, __m128i from) {
// 16-byte aligned scratch buffers, as required by the aligned
// _mm_store_si128 / _mm_load_si128 accesses below.
alignas(16) uint8_t result_bytes[16];
alignas(16) uint8_t u8_bytes[16];
alignas(16) uint8_t from_bytes[16];
// Spill both vector arguments to memory so they can be indexed per byte.
_mm_store_si128(reinterpret_cast<__m128i*>(u8_bytes), bytes);
_mm_store_si128(reinterpret_cast<__m128i*>(from_bytes), from);
// Masking with 15 keeps every lookup inside the 16-element table.
for(int i = 0; i < 16; i++)
result_bytes[i] = u8_bytes[from_bytes[i] & 15];
// Reload the assembled bytes as a single 128-bit vector.
return _mm_load_si128(reinterpret_cast<const __m128i*>(result_bytes));
}
// Reproducer entry point: performs the byte table lookup on `vect` with a
// compile-time-constant index vector.
__m128i SSE2ShuffleI8ByConstantTest(__m128i vect) {
// _mm_set_epi32 takes its arguments from the highest 32-bit element to the
// lowest, so in memory order the index bytes are 0-3, 0-3, 4-7, 0-3; the
// lookup therefore selects 32-bit elements 0, 0, 1, 0 of `vect`.
const __m128i idx = _mm_set_epi32(0x03020100, 0x07060504, 0x03020100, 0x03020100);
return SSE2TableLookupBytes(vect, idx);
}
```
Here is the assembly code that is generated when the above code is compiled with clang 15.0.0:
```
_Z27SSE2ShuffleI8ByConstantTestDv2_x: # @_Z27SSE2ShuffleI8ByConstantTestDv2_x
pshufd $16, %xmm0, %xmm0 # xmm0 = xmm0[0,0,1,0]
movdqa %xmm0, -24(%rsp)
movaps -24(%rsp), %xmm0
retq
```
The movdqa store and the movaps reload in the generated code above are unnecessary, because the result of the shuffle is already in the xmm0 register.
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJycVU1zozgT_jXypSsuIcyHDxxie1LvW7W3ZC97oQS0jWZBYqTGcebXb0k4DM7Gqd11GZBK3Y-6n_6SzqmTRixYsmPJYSVHao0tvptWD50kcqvKNG_F_9AiKAcSnFbDgATmCHsmdkzsoDYNArWS4IQarSR0IDWMWmONzkn7Bo6MRZC6AYudkQ28tqjBDKR69VOSMtqBtAioZdVhA0bDJU_hVVELz8_fBIsfGT8w_shSfv1PWxErXXdjg8DivaNGaVq3LP722TH2vdJklV5KhLcjSaoGpTulEcqyj0SuwsUv3p7fjPlzHHZvhI6J_P24mvb7Wf5oTc_EFli2m2BBduqkpVeKUn8yKk15SWDRjR2VE0Kyi1KWHFj8rnVXbcz_tYq36Y7SVbXs-zKEp3QqEjkTuUWlCe1gkcpaOmLx_uoiE4-eOZHPloitZ2Bezwb9J9SFsRPuldGPJh-NZSJXmkABiw_AWbwLyz1EaVhPmelVr_zAR84VSw5BeUHqDVmTgEghSj5hDSzSaHXw0yf0V27WRjuCj87e2OPdXbiZHZaXLfPxuR2Pxw7_n-_e9h5WanpBR4u0PGNNt1kINwaAai7B8RAipBIHFQsmcn7hMRc84txTzy884ylP-Oa6uz2bd4uYv3Nyp24mw_b-_s-d_VDa4f3eeKhFkM5hX3Vvi4aj3Nxzrj0lCFbmjJOUclCbflC-qYRuUndSnyBK1nzN73WV8g-RfUH24SzKC4t9VjERA9vwf6QwZ6L_Da4dj40H2PiC3QMTyaXv-WIZwKdFfAgLluy8gH-i8E0Ot6i9OTc_JCzBHsSG-cRMrBt-1UNvznJw8LfThSE3wBbpxxdRemnx_W7f46_oZvCBmXv7ch6oKVB3AjnFT05Rn-rEzxu_cxPDYRZ1FmUzYwWmLJ6UI7TrVVPEzTbeyhUWUZplyTZLeLpqi41MZM15nm15s9nmKKI0wyavEtFUWZ3jShWCi5gLkUdJlIt0HdepPHqAZpvHSb1hG469VN2668792tjTSjk3YpFGPI1Wnaywc2GcCqHxFcIhE8JPV1t4nYdqPDm24Z1y5H6hkKIOi9-_mprYKwoEHQntzMXMs2fFV5mx6uc0RPch2z9O09Vou6IlGpyvAPHExNNJUTtW69r32ydv0vXzMFjzPRTuU3DEMfEUHP0rAAD__zoLfu0">