<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href=https://github.com/llvm/llvm-project/issues/136574>136574</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
[X86] Unnecessary sequences of 8 GPR `mov`s back and forth
</td>
</tr>
<tr>
<th>Labels</th>
<td>
new issue
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
dzaima
</td>
</tr>
</table>
<pre>
This C code (heavily reduced from real-world code implementing a SIMD transpose), compiled with `-O3 -march=haswell`:
```c
#include <immintrin.h>
#include <stdint.h>
// Shorthand for the unaligned 256-bit load/store intrinsics. The (void *)
// cast lets the call sites pass plain char * arithmetic as the address.
#define load(x) _mm256_loadu_si256((void *)(x))
#define store(x, v) _mm256_storeu_si256((void *)(x), v)
// Reduced test case for the redundant GPR mov sequence reported in this issue.
// It is a codegen reproducer, not a useful routine: it reads 32-byte vectors
// from p2 and p3 at offsets built from i, x and y, and writes two vectors to
// p1 at offsets scaled by z. Keep the statements exactly as written; the
// reported assembly depends on this exact shape.
void f(char *p1, char *p2, char *p3, uint64_t x, uint64_t y, uint64_t z) {
// The outer loop has no exit: whenever the inner loop breaks, i restarts at 0.
while (1) {
uint64_t i = 0;
while (1) {
if (i >= x)
break;
// Parsed as (4 * i) ? (4 * i) : x, so j falls back to x only when 4 * i is 0.
uint64_t j = 4 * i ? 4 * i : x;
__m256i a = load(p2 + y * 5);
__m256i l0 = load(p2);
__m256i l1 = load(p2 + j + 3 * y);
// NOTE(review): `+` on __m256i is not an intrinsic; presumably this relies on
// the GCC/Clang vector extension for element-wise addition -- confirm.
__m256i b = l0 + l1;
__m256i l2 = load(p2 + y);
__m256i l3 = load(p3 + y + j);
__m256i c = l2 + l3;
__m256i l4 = load(p3 + 6 * y);
__m256i l5 = load(p2 + j + 7 * y);
__m256i d = l4 + l5;
// Immediate 49 is 0x31: per the Intel intrinsics guide, the result's low
// 128 bits are the high half of the first operand and its high 128 bits are
// the high half of the second operand.
store(p1 + j * z + 16 * z, _mm256_permute2x128_si256(a, b, 49));
store(p1 + j * z, _mm256_permute2x128_si256(c, d, 49));
i++;
}
}
}
```
results in this segment of assembly:
```asm
...
mov rcx, rdi
mov rdi, r8
mov r8, r15
mov r15, r14
mov r14, r11
mov r11, r9
mov r9, rdx
mov rdx, r10
mov r10, qword ptr [rsp - 8]
vmovdqu ymm1, ymmword ptr [r10 + rbp]
mov r10, rdx
mov rdx, r9
mov r9, r11
mov r11, r14
mov r14, r15
mov r15, r8
mov r8, rdi
mov rdi, rcx
...
```
which could be just:
```asm
mov rcx, qword ptr [rsp - 8]
vmovdqu ymm1, ymmword ptr [rcx + rbp]
```
https://godbolt.org/z/P66GdGzaM
Similar to #81391, but for GPRs, not SIMD registers (though SIMD is still involved).
</pre>
<img width="1" height="1" alt="" src="http://email.email.llvm.org/o/eJykVk2P4ygQ_TXkUkpk44_YhxySyWS0h9GOdnalvUUYSEwPGDdg5-PXr8DpdJJ2eg7bshwXr96roqBoiLVi33C-QNkKZesJ6VytzYKdiVBkUml2WvxdCwtfgGrGAeGi5qQX8gSGs45yBjujFRhO5PSgjWSDn1Ct5Io3TjR7-PnH9zU4QxrbassRLhH2cqoVkjPoBQGUR9M_E5gqYmiNknVN7IFLifIIJUsULf1HeKg3cCIaKjufTvJFKCUaZ0Qzq1Hy9QNsHRONu8EY34mGg9SEIVwcES5hqxTO8q0f6rZW4CxHuEC46LVggPAyZBxc_XOrYp02fMC-QH-jFYDfig0cFC0DuEO4oDUx3quNQ43eLHxnJd7qROPydOvgeGed7qyzTwnNVyhaAhxqIcMKxrej8O4tACVriFASkGfuAGLnh733V084DnOAy19lOPl10YA7_Zegn_o5hFibm-8lHG85260vowASKJfFajEgvIJTIGU-amB8IMnonnX1fHCLR8RfwjsJIU5PiNXAi4KrjJ9lgcdyf5ZLcuecXCe6gpcnFDowBl2ZPMsiHRHOH6d39c6elmT-aUnYwEuHZLL3ZN46pI2vWks4h-94SOPsd-ylbVpuVOc4Psa4uDYP8Q6Vf6XlpQeHDMa1fydHvQP7KPe-vRFe-WcIgubrgAy_l_fbcTTQDLeddBZEA86flZbv_dEHegfEWq4qeXo4xohVNx0zm828pXQfTENDSxsmbnyuIBMBLMawIkBxdqcWZ8NoOsaI0wGMR8FwCJnyTq4ccjuO5jYkHkejapEHXw_aMGidAZStjG1hCgXK1jeEXumevXZwUirEPyl1x4mHvjNVe8_zgS5BPk-vHMPK8Tq81eDz6mWjYDa6UJdF-nRx6fHD7rjdcoda0Bqo7iSDisNLZ92n--thY_3_JaDH-yV46Ifaudb6jPAG4c1es0pLN9Nmj_DmjPDmR55_Y9_O5Pvg_lMoIYkBpwHhpIiTMgStOgc7beDbj7-stxvthpuE4XthHTfW_xdyte729QD41nNCShBNr2XPGcLlbMIWCSuTkkz4Ip6nGc6ibJ5P6kUe5_OS78p0RxnlOMvnJSMlYXFBKduV0UQscISzKMVxnKVlks_KPKZFkuyiXUxJkkUojbgiQs6k7JWf30RY2_FFnOTZPJ1IUnFpw8UK44YfIKAIY3_PMgtPmlbd3qI0ksI6-y7jhJPhRvZvkaNsDf80DafcWmJOYPlrxxvKrT9eCl8df3tSukd5ZKEi9BeQhvnKuXrSGbl4WA3h6q6aUa0Q3viAl59pa_QLpw7hTUjTIry5zKNf4P8CAAD__0j9u2s">