<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href=https://github.com/llvm/llvm-project/issues/57940>57940</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
[X86] Terrible codegen for splat2 i32 memory copies
</td>
</tr>
<tr>
<th>Labels</th>
<td>
backend:X86,
vectorization,
missed-optimization
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
RKSimon
</td>
</tr>
</table>
<pre>
Pulled out of Issue #41369, as many of these regressions are different problems.
https://gcc.godbolt.org/z/TznK9P9je
```
#include <x86intrin.h>
template <typename T, int N>
void splatN_copy(size_t n, T* __restrict dst, const T* __restrict src) {
for (size_t i = 0; i < n; ++i) {
auto s = src[i];
for (int j = 0; j != N; ++j) {
dst[N*i+j] = s;
}
}
}
void splat2_copy32(size_t h, size_t w, unsigned* __restrict dst, const unsigned* __restrict src) {
splatN_copy<unsigned, 2>(h * w, dst, src);
}
```
The inner AVX1 loop is working with <2 x i32> vectors instead of <4 x i32>:
```
.LBB0_8: # =>This Inner Loop Header: Depth=1
vmovsd (%rcx,%rsi,4), %xmm0 # xmm0 = mem[0],zero
vmovsd 16(%rcx,%rsi,4), %xmm2 # xmm2 = mem[0],zero
vmovsd 8(%rcx,%rsi,4), %xmm1 # xmm1 = mem[0],zero
vmovsd 24(%rcx,%rsi,4), %xmm3 # xmm3 = mem[0],zero
vpermilps $80, %xmm0, %xmm0 # xmm0 = xmm0[0,0,1,1]
vpermilps $80, %xmm1, %xmm4 # xmm4 = xmm1[0,0,1,1]
vpermilps $80, %xmm2, %xmm2 # xmm2 = xmm2[0,0,1,1]
vpermilps $80, %xmm3, %xmm1 # xmm1 = xmm3[0,0,1,1]
vmovups %xmm0, (%rdx,%rsi,8)
vmovups %xmm4, 16(%rdx,%rsi,8)
vmovups %xmm2, 32(%rdx,%rsi,8)
vmovups %xmm1, 48(%rdx,%rsi,8)
addq $8, %rsi
cmpq %rsi, %rax
jne .LBB0_8
jmp .LBB0_9
```
We should be able to manage:
```
.LBB0_8: # =>This Inner Loop Header: Depth=1
vmovups (%rcx,%rsi,4), %xmm0
vmovups 16(%rcx,%rsi,4), %xmm2
vpermilps $250, %xmm0, %xmm1 # xmm1 = xmm0[2,2,3,3]
vpermilps $80, %xmm0, %xmm0 # xmm0 = xmm0[0,0,1,1]
vpermilps $250, %xmm2, %xmm3 # xmm3 = xmm2[2,2,3,3]
vpermilps $80, %xmm2, %xmm2 # xmm2 = xmm2[0,0,1,1]
vmovups %xmm0, (%rdx,%rsi,8)
vmovups %xmm1, 16(%rdx,%rsi,8)
vmovups %xmm2, 32(%rdx,%rsi,8)
vmovups %xmm3, 48(%rdx,%rsi,8)
addq $8, %rsi
cmpq %rsi, %rax
jne .LBB0_8
jmp .LBB0_9
```
or even something similar with 256-bit vectors (although probably not worth it).
But it's even worse for AVX2 loops:
```
.LBB0_8: # =>This Inner Loop Header: Depth=1
vpermpd $216, (%rcx,%rsi,4), %ymm0 # ymm0 = mem[0,2,1,3]
vpermpd $216, 32(%rcx,%rsi,4), %ymm2 # ymm2 = mem[0,2,1,3]
vpermpd $216, 64(%rcx,%rsi,4), %ymm3 # ymm3 = mem[0,2,1,3]
vpermpd $216, 96(%rcx,%rsi,4), %ymm4 # ymm4 = mem[0,2,1,3]
vpermilps $80, %ymm0, %ymm1 # ymm1 = ymm0[0,0,1,1,4,4,5,5]
vpermilps $250, %ymm0, %ymm0 # ymm0 = ymm0[2,2,3,3,6,6,7,7]
vpermilps $250, %ymm2, %ymm5 # ymm5 = ymm2[2,2,3,3,6,6,7,7]
vmovups %ymm0, 32(%rdx,%rsi,8)
vpermilps $80, %ymm2, %ymm0 # ymm0 = ymm2[0,0,1,1,4,4,5,5]
vmovups %ymm1, (%rdx,%rsi,8)
vmovups %ymm5, 96(%rdx,%rsi,8)
vpermilps $250, %ymm4, %ymm2 # ymm2 = ymm4[2,2,3,3,6,6,7,7]
vmovups %ymm0, 64(%rdx,%rsi,8)
vpermilps $80, %ymm3, %ymm0 # ymm0 = ymm3[0,0,1,1,4,4,5,5]
vpermilps $250, %ymm3, %ymm3 # ymm3 = ymm3[2,2,3,3,6,6,7,7]
vmovups %ymm0, 128(%rdx,%rsi,8)
vpermilps $80, %ymm4, %ymm0 # ymm0 = ymm4[0,0,1,1,4,4,5,5]
vmovups %ymm3, 160(%rdx,%rsi,8)
vmovups %ymm2, 224(%rdx,%rsi,8)
vmovups %ymm0, 192(%rdx,%rsi,8)
addq $32, %rsi
cmpq %rsi, %rax
jne .LBB0_8
jmp .LBB0_9
```
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJzNWN1vozgQ_2vIi9UIbCDwkIcmudWtdlWt7qq9fasIOIm7gFnspKV__c2YEPJBEppVT1fh1tjj-Xm-p8xlUo2_rdOUJ0SuNZEL8lmpNScWZa7D_JBEimRRXuGOXnHFScmXJVdKyFyRqOQkEYsFL3muSVHKecozZdkzy76vf6-0LpTF7i36CZ5lHA-XMpnLVA9luYSVNxiPb_mX8Fv4zPcPWr69fepXykQep-sErsamr4Evcl2KfLiy2B_7xzTPijTShkpXBc-jjJNHi04JHCAPO-qNFAlRSPrwFMuismigxBt_0iRHYjhxT56eQFBAiTVJlMblGITWJ5uqjC0aEms0qXkT-FnIkrQ8BVxnRmyLTcx0CiAwtegEHnFyFn-itZZEmWPI3gMybwaHDqm2KCjacwsBU-rg20OL8tyJQoxg3uQBJBKGypvVmCdQ1mjWLuxe2smRVqnRKqOtDlaowO38BefrXIllzpNLqj5D06XxfWOyaXtwSihanQYrglwM9BajZrMTtRXm0PceVxy8J-cluf_-wyGplIUiQpEXWf4U-ZK8CL1Cq1LySgRDNLLhsZYlUIEQPEoweoDA3RF04gy_Tib2UwDRguGHdgDKxxUgfTboXwGY_AnseIk0M15ocP-Z0-hgk8mNStAhLOqV8SuIiBMFlp26KCiIDAuvWWYbgHoC1s54Bk5go4fR6Rsv5RFHx-_BkzY8aS-eQQ-WTsPS6cWSuj14soYnu8Kz4GUmUrC0Rd3A3tPdWTWaTWRGpzgcM7zZVY5OO3Ubjm7D0bmFIz1rFrN5A0d21ipm8wJHsM3a8NtTnrFScmClAK3UecbFMzsX7HvKqMDkn_ecMqZwg6unoiT5ZVS0VQtSbLfirPhFdofMLHptNp9zTpowb5ayYrsUdmaFfzhRK7lOEzLnJIICS6AyQEmOlhzr6kclklon1zPJ8Zl-uaLT3yz2CZ-uSDv1OYw0tDEOZkYPL74Yu70j7fiee9F2lF220fbue16M34-NNuc_jDb2_4s2aKj4hudEyYzrFVZ3JcBEUVlXeer5d3Ohd_Udrh6lGsJzuTLtL8RnRXKpsTcAcgFtRjjc75Am0GQLrWoQIIKGGps4aCxo3Vh8WEyjrxXYHbgUzTu9Gt1VEyTVcZ9Qe7PT5c0HCDuHuIRBGwx6E4Z_tehXTVhWx0W_L0Z4NadVTemumtLdA-Mk8qs2Q1VNzquanFd1dRfmEmZ4ZpxBoF43xJF9q660Sqf-dozM6ANB26nXQHgNxGlGvADR5ovm5n2yzDnt0kuin_ZFF7R7cC_nvXkW1XHgWD1lOVCyezaGzOZvKXkXVTcomV1S8mnxusWF9yCOQnsL8RuiO_R6QToru3tJdvdmB2N1Ubbf6WPG3SntYcsONYTX46ypy6yJqw8pzAM-dnyfhR6l9miQjFkSsjAaaKFTPgaV_gh8_GzxyMtSYHscy4QvobpiXa2_ReA_3ZiQZVnBbiG4GqzLdHz0bQrq-3o-jGUGL2m6af7cQV1_hnIPrwI_jSmYeKPQtQersR2yYOFFicM4G3luFC9o5MztmNle7Ns8GaTRnKcKb2lROo_inzxPABGvTLdKo3UvId4iLWTeLmeAxpM7WWhoQNpNbzYQY2qDLkLKHN_zXX_IfRqMEsd3_Lm7cJ3Acm2eRSId4v3xQ9ugHBtR5uulgs1UKK3azUiZbyVGmcg_WkNHU47_-vK3yGQ-MFKPjcj_AuQGYyk">