<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href=https://github.com/llvm/llvm-project/issues/74380>74380</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
Missing vectorization opportunity (SLPVectorizer)
</td>
</tr>
<tr>
<th>Labels</th>
<td>
new issue
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
dcci
</td>
</tr>
</table>
<pre>
The codegen for the function `cvt_8_32()` is perfect when it is outlined, but once clang inlines that function into its caller, the vectorization opportunity is missed. (Testcase by Nick Terrell.)
It looks like the SLPVectorizer can't recognize the pattern after inlining.
https://gcc.godbolt.org/z/PbGfe6da7
```
#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
/* Loads 8 bytes starting at ptr and returns them as a uint64_t in host byte
 * order. Going through memcpy avoids a typed load through a cast pointer. */
static uint64_t read64(uint8_t const* ptr)
{
uint64_t value;
memcpy(&value, ptr, sizeof(value));
return value;
}
// Widens 16 bytes read from `ints` into 16 uint32_t values written to `op`.
// Input and output are staged in local arrays and moved with memcpy, so the
// body performs no direct typed access through either pointer.
void cvt_8_32(uint32_t* op, uint8_t const* ints)
{
// When this is outlined it generates correct code
uint32_t out[16];
uint8_t in[16];
memcpy(in, ints, sizeof(in));
// Element-wise zero-extension from uint8_t to uint32_t.
for (int i = 0; i < 16; ++i) {
out[i] = in[i];
}
memcpy(op, out, sizeof(out));
}
// Unpacks bit-packed values from `ip` into one uint32_t per value at `op`,
// 16 values per main-loop iteration. Only the bulk loop is present here, and
// srcSize is returned regardless of how much input the loop consumed.
//
// op      destination; advanced by 16 elements per loop iteration
// nbElts  element count; used to compute srcSize and oend
// ip      packed source bytes; advanced by nbBits bytes per 8 values
// nbBits  number of bits per packed value
// returns (nbElts * nbBits + 7) / 8, the packed input size in bytes
//
// NOTE(review): the name suggests nbBits is below 8, but nothing here checks
// it; the shift that builds `mask` is undefined for nbBits of 64 or more.
size_t bitunpack_less_than_8_to_32(
uint32_t* restrict op,
size_t nbElts,
uint8_t const* restrict ip,
size_t nbBits)
{
// Packed size in bytes, rounded up to a whole byte.
size_t const srcSize = (nbElts * nbBits + 7) / 8;
uint8_t const* const iend = ip + srcSize;
// Not referenced again in this function.
uint32_t* const oend = op + nbElts;
{
size_t const bytesPerLoop = nbBits;
// While ip is below ilimit, both 8-byte reads of one iteration (at ip and
// at ip + nbBits) end before iend.
// NOTE(review): if srcSize is smaller than 7 + nbBits this pointer is
// computed before the start of the input - confirm callers never hit that.
uint8_t const* ilimit = iend - 7 - bytesPerLoop;
// Low nbBits bits of every byte set (for nbBits up to 8).
uint64_t const mask = ((1ull << nbBits) - 1) * 0x0101010101010101ULL;
while (ip < ilimit) {
uint8_t ints[16];
// Two rounds of 8 values each: pdep spreads successive nbBits-bit fields of
// `bits` into the low bits of the 8 bytes of `bytes`.
for (size_t i = 0; i < 16; i += 8) {
uint64_t const bits = read64(ip);
// 8 values of nbBits bits each occupy nbBits bytes of input.
ip += bytesPerLoop;
uint64_t const bytes = _pdep_u64(bits, mask);
memcpy(ints + i, &bytes, 8);
}
// The dead branch is kept as the hand-written reference for the desired
// code; the else branch is the path actually compiled.
if (0) {
// This generates the assembly I want
__m128i const intsV = _mm_loadu_si128((__m128i_u*)ints);
__m256i const loV = _mm256_cvtepu8_epi32(intsV);
__m256i const hiV =
_mm256_cvtepu8_epi32(_mm_srli_si128(intsV, 8));
_mm256_storeu_si256((__m256i_u*)op, loV);
_mm256_storeu_si256((__m256i_u*)(op + 8), hiV);
} else {
// When cvt_8_32 is inlined it generates bad code
cvt_8_32(op, ints);
}
op += 16;
}
}
return srcSize;
}
```
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJycVt1u4zYTfRr6ZhBDpmxZvvBFHK8_LJCvWGDT9FKgKNqeRiIFcpQ0efqCpOS_aNOiTGDLImfmzOGZIYVzeNBKrdliwxbbiejoaOy6khInpane109HBdJU6qA07I0FOqIDdNAqu1eSADXQUcG-05LQaGBZIl-pyIuUM54zvmJZAm9HpQHJ25mOatSqYvwByo7AaKlA1kIfALWfcUBHQSeHfh0SNOicciA0mJawwQ8Ropm2NZY6jfQ-BcbzJ-VICqegfIffUL7Ak7JW1TXjqylLtiy5j5_fCWpjXhzU-KJCAj8ffzwrScbih7IghWZ8SWCVNAeNH3FNK4iU1SD2pGyEi_pw5fhI1DqW3jO-Y3x3kHJ6MFVpapoae2B898H47kf5v73KKrG8NGRZ0v9fvuQpall3lQKWPmDToCaLenpk6bexFY6qSu2_mkZNX0xbn87FdPh0JAgldKgpmxeeE1Flc8Zz_yYvCKTRjhi_h5as3_Bou9zEh7Nh5Yilw1sAaFQj2_egkszP8Yfo4QGCbi6XWkWd1Vce2HJ7ifLVYAUXyvNRU14EXKb1Tj_BRU1uBG_cOvjDa3ZQ-6BaL8WD0soKUg6ksdbXgK-PM9Yhsjdii80sY4vtVTIDENSj0ydaMIo_oHwAhx_K7PvXqyuCfF2GCQIElm4hYekmPD7ALPPPjG8Y3yDjKzgn2o8IE9liG2wDKrwFdWL7DC-y6q0v0YXfV_BuNsqvLAhKpE63Qr4UtXKuoKPQRV6Q6fvGFcLLvbTKy1RS3NR-Xe9Tl9_qQNZn86uNP_nA9tPik6sNjqrjvCS4A2flT98fwvD8MZ5HHOBDRT-ef1gG9vkO8lE1nNBFv6h0FfejDdZ9nE-mAy_RynirExQTTXtarrdzM5p29FK-k3I_lH003kO6HdhIN_9ALNbYIJ3ihxzuYAl3Vy7Pfk6tIcZthHsZOGQ8n3V17TXsZXzaD7iDWSTyHpK_ktn13--Pj59Qvh2xVqE-2lAREeR1KZxLktxoUQ6jL7Werl9UG4Z6S7ehj8FIJ-xpDspIt-eO6vW4Go3rR1SCNxhn82bcRvM2IVzRVqotuhCxxNhdPPVfxb5oSr2c0ZsxngW_o017GOfegXtPXjLahk6rY_t98p333Gr98SucU01Zv8N3eBOaxs2LopnxHIcq0uSeY9JNU9RGVF3hcBbkxXjeLy46xu8ZXw0Hwq9IKIqGL7LBd22ee50XjX9fyFdSbZcXqsXQxELwf-_viAFpv_QXPn0aztZ4yqIP0tN_Gaz34MhY5bPmi-yUtQ87ZB37eG2e_5N1OAiCICKAB5_HF0oAVTsF46ftcHz7EzdeBm8O3FJUl4ftxXEfk_hqA88qvBjmVFKhcgdMl0uvfvT3kNtmfD7ihlvcpFqn1SpdiYlaz5bJbMnzRZ5NjutUKCnmyULN5nkmFnlerRZlKZbLtNynySyZ4JonPJ3xZM45T5NsOs9KKRbZTMllymfpis0T1Qisp3X92vhr5QSd69R6OU_zZFKLUtUuXOc51-oNwiTj3N_u7drb3JXdwbF5UqMjd_ZCSLVa_x-dQ32A1_4y_Ome7Sv46rLM-GrS2Xp9c_VFOnblVJqG8Z2P0X_dtdb8qSQxvgvIHOO7gPzvAAAA___wqo2a">