<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href=https://github.com/llvm/llvm-project/issues/57661>57661</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
Suboptimal x86 codegen with multiple stores to neighboring addresses
</td>
</tr>
<tr>
<th>Labels</th>
<td>
new issue
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
tellowkrinkle
</td>
</tr>
</table>
<pre>
Godbolt link: https://gcc.godbolt.org/z/7vcboTPW1
First, in test0, clang decides to calculate `i * 4 + 1`, `i * 4 + 2`, etc., before doing the `<< 4` to index the `uint4` array, requiring the full calculation for all four stores. I'd expect it to first calculate `t0 = (i * 4) << 4`, then index with `t0 + 16`, `t0 + 32`, etc. This seems to be a missing optimization relating to integer overflow being UB, as changing `i` to an `unsigned long` fixes this codegen.
Second, in test1, clang notices that the index doesn't change in the inner loop, and hoists the calculation out to the outer loop. It somehow fails to include the final `<< 4` in this hoisting, resulting in the majority of the actual calculation still happening in the inner loop. This happens regardless of whether `i` is an `unsigned long` or `int`.
In addition, it hoists the four offsets separately, even though they're just constant offsets from each other. This makes it run out of registers, spilling some to the stack, when it could have just hoisted a single `t0 = (i * 4) << 4` out of the loop and indexed with `[dst + t0]`, `[dst + t0 + 16]`, etc., freeing up 3 registers for other things (gcc is guilty of this one too).
<details>
<summary>Full input and output</summary>
test0:
```cpp
typedef unsigned int uint4 __attribute__((vector_size(16)));
typedef unsigned int uint4_packed __attribute__((vector_size(16), aligned(1)));
// Rearranges four 4-lane vectors in place using two-input shuffles.
// In __builtin_shufflevector(x, y, ...), indices 0-3 pick lanes of x and
// indices 4-7 pick lanes of y, so on return:
//   a = { a[0], a[1], b[0], b[1] }
//   b = { c[0], c[1], d[0], d[1] }
//   c = { a[2], a[3], b[2], b[3] }
//   d = { c[2], c[3], d[2], d[3] }
// (right-hand sides refer to the values the arguments had on entry).
static void sw64(uint4& a, uint4& b, uint4& c, uint4& d) {
// All four shuffles read the original inputs; results are held in temporaries
// so no argument is overwritten before it has been consumed.
uint4 t0 = __builtin_shufflevector(a, b, 0, 1, 4, 5);
uint4 t1 = __builtin_shufflevector(c, d, 0, 1, 4, 5);
uint4 t2 = __builtin_shufflevector(a, b, 2, 3, 6, 7);
uint4 t3 = __builtin_shufflevector(c, d, 2, 3, 6, 7);
a = t0;
b = t1;
c = t2;
d = t3;
}
// Loads four 16-byte vectors from src at byte offsets srcpitch * 0..3,
// rearranges them with sw64, and stores them to the four consecutive
// elements dst[i * 4 + 0] .. dst[i * 4 + 3].
//
// dst      - destination array of uint4, indexed in units of 16-byte elements
// src      - source bytes; read through uint4_packed, which is declared aligned(1)
// srcpitch - byte distance between the four source vectors
// i        - group index; the four stores target elements i * 4 .. i * 4 + 3
void test0(uint4* __restrict__ dst, const char* __restrict__ src, int srcpitch, int i) {
uint4 v0 = *reinterpret_cast<const uint4_packed*>(&src[srcpitch * 0]);
uint4 v1 = *reinterpret_cast<const uint4_packed*>(&src[srcpitch * 1]);
uint4 v2 = *reinterpret_cast<const uint4_packed*>(&src[srcpitch * 2]);
uint4 v3 = *reinterpret_cast<const uint4_packed*>(&src[srcpitch * 3]);
sw64(v0, v1, v2, v3);
// The four indices below are computed in int and differ only by a constant;
// these neighboring stores are the subject of the codegen report.
dst[i * 4 + 0] = v0;
dst[i * 4 + 1] = v1;
dst[i * 4 + 2] = v2;
dst[i * 4 + 3] = v3;
}
```
```asm
test0(unsigned int __vector(4)*, char const*, int, int): # @test0(unsigned int __vector(4)*, char const*, int, int)
movups xmm0, xmmword ptr [rsi]
movsxd rax, edx
movups xmm1, xmmword ptr [rsi + rax]
movups xmm2, xmmword ptr [rsi + 2*rax]
lea rax, [rax + 2*rax]
movups xmm3, xmmword ptr [rsi + rax]
movaps xmm4, xmm0
movlhps xmm4, xmm1 # xmm4 = xmm4[0],xmm1[0]
movaps xmm5, xmm2
movlhps xmm5, xmm3 # xmm5 = xmm5[0],xmm3[0]
unpckhpd xmm0, xmm1 # xmm0 = xmm0[1],xmm1[1]
unpckhpd xmm2, xmm3 # xmm2 = xmm2[1],xmm3[1]
lea eax, [4*rcx]
cdqe
shl rax, 4
movaps xmmword ptr [rdi + rax], xmm4
lea eax, [4*rcx + 1]
cdqe
shl rax, 4
movaps xmmword ptr [rdi + rax], xmm5
lea eax, [4*rcx + 2]
cdqe
shl rax, 4
movaps xmmword ptr [rdi + rax], xmm0
lea eax, [4*rcx + 3]
cdqe
shl rax, 4
movaps xmmword ptr [rdi + rax], xmm2
ret
```
test1:
```cpp
// Calls test0 once per (x, y) pair over a width x height iteration space.
//
// dst      - destination bytes; each call writes relative to
//            dst + ((y >> 4) << 8) + (x << 4)
// src      - source bytes; advanced by srcpitch * 4 after every test0 call
// srcpitch - forwarded to test0 and used for the per-call source advance
// width    - inner-loop trip count (x)
// height   - outer-loop trip count (y)
void test1(char* __restrict__ dst, const char* __restrict__ src, int srcpitch, int width, int height) {
for (int y = 0; y < height; y++) {
for (int x = 0; x < width; x++) {
// Destination base for this call: the byte offset combines a y-derived term
// and an x-derived term.
uint4* vdst = reinterpret_cast<uint4*>(&dst[((y >> 4) << 8) + (x << 4)]);
// The index argument (y >> 2) & 3 depends only on y, so it is invariant
// across the inner x loop.
test0(vdst, src, srcpitch, (y >> 2) & 3);
// src is not reset between rows; it keeps advancing across the whole nest.
src += srcpitch * 4;
}
}
}
```asm
test1(char*, char const*, int, int, int): # @test1(char*, char const*, int, int, int)
push rbp
push r15
push r14
push r13
push r12
push rbx
mov dword ptr [rsp - 12], r8d # 4-byte Spill
test r8d, r8d
jle .LBB1_6
test ecx, ecx
jle .LBB1_6
movsxd rax, edx
shl edx, 2
lea r9, [rax + 2*rax]
movsxd r15, edx
mov ecx, ecx
mov qword ptr [rsp - 8], rcx # 8-byte Spill
xor r11d, r11d
xor r14d, r14d
.LBB1_3: # =>This Loop Header: Depth=1
mov ebx, r11d
and ebx, -256
mov r8d, r14d
and r8d, 12
lea r12d, [r8 + 1]
lea r13d, [r8 + 2]
mov ecx, r8d
or ecx, 3
mov rdx, qword ptr [rsp - 8] # 8-byte Reload
.LBB1_4: # Parent Loop BB1_3 Depth=1
movsxd rbx, ebx
lea r10, [rdi + rbx]
movups xmm0, xmmword ptr [rsi]
movups xmm1, xmmword ptr [rsi + rax]
movups xmm2, xmmword ptr [rsi + 2*rax]
movups xmm3, xmmword ptr [rsi + r9]
movaps xmm4, xmm0
movlhps xmm4, xmm1 # xmm4 = xmm4[0],xmm1[0]
movaps xmm5, xmm2
movlhps xmm5, xmm3 # xmm5 = xmm5[0],xmm3[0]
unpckhpd xmm0, xmm1 # xmm0 = xmm0[1],xmm1[1]
unpckhpd xmm2, xmm3 # xmm2 = xmm2[1],xmm3[1]
mov rbp, r8
shl rbp, 4
movaps xmmword ptr [rbp + r10], xmm4
mov rbp, r12
shl rbp, 4
movaps xmmword ptr [rbp + r10], xmm5
mov rbp, r13
shl rbp, 4
movaps xmmword ptr [rbp + r10], xmm0
mov rbp, rcx
shl rbp, 4
movaps xmmword ptr [rbp + r10], xmm2
add rsi, r15
add ebx, 16
dec rdx
jne .LBB1_4
inc r14d
add r11d, 16
cmp r14d, dword ptr [rsp - 12] # 4-byte Folded Reload
jne .LBB1_3
.LBB1_6:
pop rbx
pop r12
pop r13
pop r14
pop r15
pop rbp
ret
```
</details>
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJztGl1zqzb21zgvmpsxYBz7wQ9x0ux2pg-dve3so0cgYZTIiIuE49xf33MkhAGb2NmmtzOd9TiApPP9pSOTRLG31b8US5Q0RIriZRLdk9yYUsPDJHyC7zZNb7cO4lZVW5j5Dn93-zRRv_3632AyfZxM7931SVTaTMIHIgpiuDZTfE4lLbaE8VQwrolRJKUyrSU1nEzmU0Em4T2ZwXVNAhgjxmA6bKa5SUnCM1VxwpQAmia3JCbRA3zJDB6RvCgYP_i1WhTGLtCqom9IpeLfalF59KyWshVIqIIAeUJhLlN1RbQBZvqWkJ8n4R0j_FDy1BBhkE2GuvZ1MVMyiR5B5kUj_iRcko50yB6YFo2Ir8LkHg21nx_Vb6aijuogxW-50ERzvrNmTDihZCe0Rl1UacROfHc6VByVQQ3RGoZveUXUnleZVK-Ahiu_r5Es1STNwTs4g0ZvLEgLa7pCi23BGZGq2OJKJg7oQBQiVQyoFrdd53_lqSpYx_vB0fuFMiK1yNRYuzsLMMV1AaY1TgxuMe1qASJLpUorZcFIroQ22i52vaVq6wuchscGB_1liFY7noO-GRVSO0uksmbcuV0UVA5jxzIH5SwvMImLFl1La8tGtB19VpUwb0RldkxTU9N-CAEyBFBOy5IXHcyjUt6VDkQDky2tmORaI9XXnAN41ToEAEccohxQYeDac8XPBaGMCZTG-sN07WcjW2WZ5gajqaQVhK-0ucH3HGVV9TZHUJi7g2R7rjHQVaENLUyLmVVqRzhNc6JQXK_Tjr6An4FjVTv3gEagH3DnlUYeugTroFnQQd55QDp9wdXXXEiO6KmqJbid7hv-VgHQnRKMd3lVvnn-yAHtbiPJRh4Q8tk3idcM6GO6Qb2KH49J2FvxKdoCQEriLau4Tai6JNFRUVtGrF0wpIqtRiGhkKIzt7WQPn5gqAq0ggLRvQujB8YNRu0k-qmd0vVuR6s3mHrCkiWKEnRDfUBHeEStw6cjUDcaXCWOmhGKb79pWTbrbyVnPCNtfEFEEVs4yWZDjalEArm12YAK8N1DCVTVRovvHEZgkXDZfKP1JXqbEtwME9eShdyXlgROnTJyV4gdKC5krwQj-nUOQbBwVT-cE4o02lHSG6W9EbOxc9dQJvBxFmhibLNJ0G2i2Oi8zjLJnbzAy7KwlG1Y2KI3w0vcFbVDMLhA0MrFPkAwvFrCEC8RXuZ4uRshGF0t4SWC1FLC6OtMJm4y6E2mbjLsTTI3GR09fvfYdb31edNoeKffg-BQtSG8UrPZEOYaElu-cJepTiB0lbpNy-BjKUya-7EYCYq9Lzz3FccNtiorbjYp1ZiHjlU33gEOUxLjfI7c4rVnZMuWrTpnPbEPPpNPMM4n_Ew-4Tif6DP5RGf4NAVgb1Nnb3Nnb4N0Hw1BMTLidbfRREdYAffTC5BBCxlcgAxbyPACZNRCnot3X7UHQ6p33SIPWdAtuptNm7MzWzvvbS5AFriEaCawg_C3JR4AznwmIfhuNv08Nq0l8LNT-7rUhBx2O-s5uL-qipHSQIsTryst0NdDFH1ghFT0YLdjdhglGZwnaa2O6GdIe9TwHdQQ4_gMuuTU3hvJEIMe3sXoMIw-LittUGcN6vQEQuYA0oEIznnY-RihbBRa8HjtitODtWIzGuMfN9TDUf4eInqXf-z5xz3-0Vn-dVGmL3nJ_LgTQe9rOfVcsM8LeloG13AJr9Il9FzCHpfoLBcfOLwNHNzPqvTU6yn7xvszOpf23gTdbNRLvcBivcByCs2uFKothD9OtPgjooU_VLTpR0SLfqhog3yELffsnuKO7OPHhLbZCrAFPNdJ_ale61Uw0w5yLra5Oem-8FSFpz2AeLOJhfu0fXzwKDgGC9jvAHtA4XCkcLAUnAQ4fIcAftpOc2_Ph0DlTDvjgdoOxu347rSD
Iv8E396RdWGfwXsAcDieY2GvPO1z_Mfvx_vG9o2Bu8btsgsdizk56YbasKtSlAGV6jVbsxPotjfpDT7WsXRC6WLLcKFBIb0e5X8i3FOvrHVu0yUpRxaCeGxhNrYQjS2EY8xPuxp7Z_3-oCRfSBA2KV8t_D6FBpl9Sd4MJ1_xZ5c-LbSTZbJgDVp__Vlye7_9Zb0ONvPzyDx17Vd6-ADypd7N1z5csKfM8_3V8vr2yrEL4rFWkbyni1__dmr0hbc5VPaO0RejRj9AAXIuD5zV8T4CMWsgZg2Es2U0ngDtx2ZC9Agpb3-U-wV__Po3p4xXiPzISyx0j8GIGZLDecnw56YOwJcwPvWrE30xkHxIoQEYhn3r2SBk3reLkT7jCBsNYE83_oGDTyK9sbhfj0a0ctE4EgVd0zfu_w-XivZ8N7vSd4T8SisOm5T1nPX6O15zwe2cwocV42inqTeTbxKSd08iHziI_U0HrWuPTcv_n5r-waemNj2T0iX3SB_tlq_so5PShU4wHT0YDfkOi9nnMY4vMI7-KsanOdFjPNwnP4_xwJSUNdsGlB-rcXx-vdmYgsG2xHjq0Icb_3PRbVIG8oqiwTrdxbw4zR4-5JfuyiMm_lQ-0qw56E6T9qQk46y3b5wXNepuKvP20OaBS9UIMNwM2oWThrNdGLao7cKwqW0Xhm1wy3zQOI8dPN27q-M7rxu-Cubz8C5eLoP5DVtFbBkt6Y0RRvLV1zqx77qpJIfF3L-Idm_zdviqtpS8eW-PLxcLPBUmyr7vB7fBrOb6pq7kavBPDoBfJ7ep2sFAyr2_fSkr9cxTOB48Ca1rruEhvpvPg5t8lSzjeRqmcKTJgizkwSKaT6dzGi0SmsXL7O5G0oRLvbKnvrDgr8SSgGfw_Y1YhdMwnC6nyzCYxcHsNplN03Q6Y_NkyYNokcFJhu_AJLcoB_7nxU21siIl9VbDosQ3usdFqu0vstyyA_q0NrmqVoZLqV5fwAAvkt9YAVZWgT8AmRw_pQ">