<table border="1" cellspacing="0" cellpadding="8">

    <tr>

        <th>Issue</th>

        <td>

            <a href="https://github.com/llvm/llvm-project/issues/57661">57661</a>

        </td>

    </tr>

    <tr>

        <th>Summary</th>

        <td>

            Suboptimal x86 codegen with multiple stores to neighboring addresses

        </td>

    </tr>

    <tr>

      <th>Labels</th>

      <td>

            new issue

      </td>

    </tr>

    <tr>

      <th>Assignees</th>

      <td>

      </td>

    </tr>

    <tr>

      <th>Reporter</th>

      <td>

          tellowkrinkle

      </td>

    </tr>

</table>

<pre>

    Godbolt link: https://gcc.godbolt.org/z/7vcboTPW1

First, in test0, clang decides to calculate `i * 4 + 1`, `i * 4 + 2`, etc. before doing the `<< 4` to index the `uint4` array, so the full index calculation is repeated for each of the four stores.  I'd expect it to first calculate `t0 = (i * 4) << 4`, then index with `t0 + 16`, `t0 + 32`, etc.  This seems to be a missed optimization related to signed integer overflow being UB, as changing `i` to an `unsigned long` fixes the codegen.

Second, in test1, clang notices that the index doesn't change in the inner loop, and hoists the calculation out to the outer loop.  It somehow fails to include the final `<< 4` in this hoisting, resulting in the majority of the actual calculation still happening in the inner loop.  This happens regardless of whether `i` is an `unsigned long` or `int`.

In addition, it hoists the four offsets separately, even though they differ from each other only by constants.  This makes it run out of registers and spill some to the stack, when it could have hoisted a single `t0 = (i * 4) << 4` out of the loop and indexed with `[dst + t0]`, `[dst + t0 + 16]`, etc., freeing up 3 registers for other things (GCC is guilty of this one too).

<details>

<summary>Full input and output</summary>

test0:

```cpp

// 16-byte vector of unsigned int (Clang/GCC vector extension).
typedef unsigned int uint4 __attribute__((vector_size(16)));

// Same 16-byte vector type, but with its alignment lowered to 1 byte so it
// can be read from arbitrarily aligned addresses (test0 loads through it
// from a `const char*`).
typedef unsigned int uint4_packed __attribute__((vector_size(16), aligned(1)));

// Rearranges the 64-bit halves of four vectors in place.
//
// With __builtin_shufflevector, indices 0-3 select lanes of the first operand
// and 4-7 select lanes of the second, so on return:
//   a = { a[0], a[1], b[0], b[1] }   (low halves of a and b)
//   b = { c[0], c[1], d[0], d[1] }   (low halves of c and d)
//   c = { a[2], a[3], b[2], b[3] }   (high halves of a and b)
//   d = { c[2], c[3], d[2], d[3] }   (high halves of c and d)
// where the right-hand sides refer to the values on entry.
static void sw64(uint4& a, uint4& b, uint4& c, uint4& d) {

    // All four results are computed into temporaries first, so every shuffle
    // reads the original inputs rather than an already-overwritten reference.
    uint4 t0 = __builtin_shufflevector(a, b, 0, 1, 4, 5);

    uint4 t1 = __builtin_shufflevector(c, d, 0, 1, 4, 5);

    uint4 t2 = __builtin_shufflevector(a, b, 2, 3, 6, 7);

    uint4 t3 = __builtin_shufflevector(c, d, 2, 3, 6, 7);

    a = t0;

    b = t1;

    c = t2;

    d = t3;

}

// Reproducer for the first reported codegen issue.
//
// Loads four 16-byte vectors from `src` at byte offsets srcpitch * 0..3,
// rearranges them with sw64, and stores them to four consecutive elements of
// `dst` starting at element index i * 4.
//
// NOTE(review): the expression forms below are what the report is about;
// rewriting them (e.g. changing the type of `i`) changes the generated code.
void test0(uint4* __restrict__ dst, const char* __restrict__ src, int srcpitch, int i) {

    // Loads go through uint4_packed (1-byte alignment) because `src` is a
    // char pointer with no alignment guarantee visible here.
    uint4 v0 = *reinterpret_cast<const uint4_packed*>(&src[srcpitch * 0]);

    uint4 v1 = *reinterpret_cast<const uint4_packed*>(&src[srcpitch * 1]);

    uint4 v2 = *reinterpret_cast<const uint4_packed*>(&src[srcpitch * 2]);

    uint4 v3 = *reinterpret_cast<const uint4_packed*>(&src[srcpitch * 3]);

    sw64(v0, v1, v2, v3);

    // Each index is evaluated in `int` arithmetic (i * 4 + k) before being
    // used to subscript the 16-byte-element array; the four stores target
    // adjacent elements.
    dst[i * 4 + 0] = v0;

    dst[i * 4 + 1] = v1;

    dst[i * 4 + 2] = v2;

    dst[i * 4 + 3] = v3;

}

```

```asm

test0(unsigned int __vector(4)*, char const*, int, int):                    # @test0(unsigned int __vector(4)*, char const*, int, int)

        movups  xmm0, xmmword ptr [rsi]

        movsxd  rax, edx

        movups  xmm1, xmmword ptr [rsi + rax]

        movups  xmm2, xmmword ptr [rsi + 2*rax]

        lea     rax, [rax + 2*rax]

        movups  xmm3, xmmword ptr [rsi + rax]

        movaps  xmm4, xmm0

        movlhps xmm4, xmm1                      # xmm4 = xmm4[0],xmm1[0]

        movaps  xmm5, xmm2

        movlhps xmm5, xmm3                      # xmm5 = xmm5[0],xmm3[0]

        unpckhpd        xmm0, xmm1                      # xmm0 = xmm0[1],xmm1[1]

        unpckhpd        xmm2, xmm3                      # xmm2 = xmm2[1],xmm3[1]

        lea     eax, [4*rcx]

        cdqe

        shl     rax, 4

        movaps  xmmword ptr [rdi + rax], xmm4

        lea     eax, [4*rcx + 1]

        cdqe

        shl     rax, 4

        movaps  xmmword ptr [rdi + rax], xmm5

        lea     eax, [4*rcx + 2]

        cdqe

        shl     rax, 4

        movaps  xmmword ptr [rdi + rax], xmm0

        lea     eax, [4*rcx + 3]

        cdqe

        shl     rax, 4

        movaps  xmmword ptr [rdi + rax], xmm2

        ret

```

test1:

```cpp

// Reproducer for the second reported codegen issue: calls test0 from a
// doubly nested loop where the element index passed to test0 depends only on
// the outer loop variable `y`.
void test1(char* __restrict__ dst, const char* __restrict__ src, int srcpitch, int width, int height) {

    for (int y = 0; y < height; y++) {

        for (int x = 0; x < width; x++) {

            // Destination base for this iteration: byte offset
            // ((y >> 4) << 8) + (x << 4) into `dst`, reinterpreted as uint4*.
            uint4* vdst = reinterpret_cast<uint4*>(&dst[((y >> 4) << 8) + (x << 4)]);

            // (y >> 2) & 3 does not change within the inner loop, so the
            // store indices computed inside test0 are invariant across `x`.
            test0(vdst, src, srcpitch, (y >> 2) & 3);

            // Advance past the four source rows just consumed; `src` keeps
            // advancing across outer iterations as well (it is never reset).
            src += srcpitch * 4;

        }

    }

}

```

```asm

test1(char*, char const*, int, int, int):                       # @test1(char*, char const*, int, int, int)

        push    rbp

        push    r15

        push    r14

        push    r13

        push    r12

        push    rbx

        mov     dword ptr [rsp - 12], r8d       # 4-byte Spill

        test    r8d, r8d

        jle     .LBB1_6

        test    ecx, ecx

        jle     .LBB1_6

        movsxd  rax, edx

        shl     edx, 2

        lea     r9, [rax + 2*rax]

        movsxd  r15, edx

        mov     ecx, ecx

        mov     qword ptr [rsp - 8], rcx        # 8-byte Spill

        xor     r11d, r11d

        xor     r14d, r14d

.LBB1_3:                                # =>This Loop Header: Depth=1

        mov     ebx, r11d

        and     ebx, -256

        mov     r8d, r14d

        and     r8d, 12

        lea     r12d, [r8 + 1]

        lea     r13d, [r8 + 2]

        mov     ecx, r8d

        or      ecx, 3

        mov     rdx, qword ptr [rsp - 8]        # 8-byte Reload

.LBB1_4:                                #   Parent Loop BB1_3 Depth=1

        movsxd  rbx, ebx

        lea     r10, [rdi + rbx]

        movups  xmm0, xmmword ptr [rsi]

        movups  xmm1, xmmword ptr [rsi + rax]

        movups  xmm2, xmmword ptr [rsi + 2*rax]

        movups  xmm3, xmmword ptr [rsi + r9]

        movaps  xmm4, xmm0

        movlhps xmm4, xmm1                      # xmm4 = xmm4[0],xmm1[0]

        movaps  xmm5, xmm2

        movlhps xmm5, xmm3                      # xmm5 = xmm5[0],xmm3[0]

        unpckhpd        xmm0, xmm1                      # xmm0 = xmm0[1],xmm1[1]

        unpckhpd        xmm2, xmm3                      # xmm2 = xmm2[1],xmm3[1]

        mov     rbp, r8

        shl     rbp, 4

        movaps  xmmword ptr [rbp + r10], xmm4

        mov     rbp, r12

        shl     rbp, 4

        movaps  xmmword ptr [rbp + r10], xmm5

        mov     rbp, r13

        shl     rbp, 4

        movaps  xmmword ptr [rbp + r10], xmm0

        mov     rbp, rcx

        shl     rbp, 4

        movaps  xmmword ptr [rbp + r10], xmm2

        add     rsi, r15

        add     ebx, 16

        dec     rdx

        jne     .LBB1_4

        inc     r14d

        add     r11d, 16

        cmp     r14d, dword ptr [rsp - 12]      # 4-byte Folded Reload

        jne     .LBB1_3

.LBB1_6:

        pop     rbx

        pop     r12

        pop     r13

        pop     r14

        pop     r15

        pop     rbp

        ret

```

</details>

</pre>

<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJztGl1zqzb21zgvmpsxYBz7wQ9x0ux2pg-dve3so0cgYZTIiIuE49xf33MkhAGb2NmmtzOd9TiApPP9pSOTRLG31b8US5Q0RIriZRLdk9yYUsPDJHyC7zZNb7cO4lZVW5j5Dn93-zRRv_3632AyfZxM7931SVTaTMIHIgpiuDZTfE4lLbaE8VQwrolRJKUyrSU1nEzmU0Em4T2ZwXVNAhgjxmA6bKa5SUnCM1VxwpQAmia3JCbRA3zJDB6RvCgYP_i1WhTGLtCqom9IpeLfalF59KyWshVIqIIAeUJhLlN1RbQBZvqWkJ8n4R0j_FDy1BBhkE2GuvZ1MVMyiR5B5kUj_iRcko50yB6YFo2Ir8LkHg21nx_Vb6aijuogxW-50ERzvrNmTDihZCe0Rl1UacROfHc6VByVQQ3RGoZveUXUnleZVK-Ahiu_r5Es1STNwTs4g0ZvLEgLa7pCi23BGZGq2OJKJg7oQBQiVQyoFrdd53_lqSpYx_vB0fuFMiK1yNRYuzsLMMV1AaY1TgxuMe1qASJLpUorZcFIroQ22i52vaVq6wuchscGB_1liFY7noO-GRVSO0uksmbcuV0UVA5jxzIH5SwvMImLFl1La8tGtB19VpUwb0RldkxTU9N-CAEyBFBOy5IXHcyjUt6VDkQDky2tmORaI9XXnAN41ToEAEccohxQYeDac8XPBaGMCZTG-sN07WcjW2WZ5gajqaQVhK-0ucH3HGVV9TZHUJi7g2R7rjHQVaENLUyLmVVqRzhNc6JQXK_Tjr6An4FjVTv3gEagH3DnlUYeugTroFnQQd55QDp9wdXXXEiO6KmqJbid7hv-VgHQnRKMd3lVvnn-yAHtbiPJRh4Q8tk3idcM6GO6Qb2KH49J2FvxKdoCQEriLau4Tai6JNFRUVtGrF0wpIqtRiGhkKIzt7WQPn5gqAq0ggLRvQujB8YNRu0k-qmd0vVuR6s3mHrCkiWKEnRDfUBHeEStw6cjUDcaXCWOmhGKb79pWTbrbyVnPCNtfEFEEVs4yWZDjalEArm12YAK8N1DCVTVRovvHEZgkXDZfKP1JXqbEtwME9eShdyXlgROnTJyV4gdKC5krwQj-nUOQbBwVT-cE4o02lHSG6W9EbOxc9dQJvBxFmhibLNJ0G2i2Oi8zjLJnbzAy7KwlG1Y2KI3w0vcFbVDMLhA0MrFPkAwvFrCEC8RXuZ4uRshGF0t4SWC1FLC6OtMJm4y6E2mbjLsTTI3GR09fvfYdb31edNoeKffg-BQtSG8UrPZEOYaElu-cJepTiB0lbpNy-BjKUya-7EYCYq9Lzz3FccNtiorbjYp1ZiHjlU33gEOUxLjfI7c4rVnZMuWrTpnPbEPPpNPMM4n_Ew-4Tif6DP5RGf4NAVgb1Nnb3Nnb4N0Hw1BMTLidbfRREdYAffTC5BBCxlcgAxbyPACZNRCnot3X7UHQ6p33SIPWdAtuptNm7MzWzvvbS5AFriEaCawg_C3JR4AznwmIfhuNv08Nq0l8LNT-7rUhBx2O-s5uL-qipHSQIsTryst0NdDFH1ghFT0YLdjdhglGZwnaa2O6GdIe9TwHdQQ4_gMuuTU3hvJEIMe3sXoMIw-LittUGcN6vQEQuYA0oEIznnY-RihbBRa8HjtitODtWIzGuMfN9TDUf4eInqXf-z5xz3-0Vn-dVGmL3nJ_LgTQe9rOfVcsM8LeloG13AJr9Il9FzCHpfoLBcfOLwNHNzPqvTU6yn7xvszOpf23gTdbNRLvcBivcByCs2uFKothD9OtPgjooU_VLTpR0SLfqhog3yELffsnuKO7OPHhLbZCrAFPNdJ_ale61Uw0w5yLra5Oem-8FSFpz2AeLOJhfu0fXzwKDgGC9jvAHtA4XCkcLAUnAQ4fIcAftpOc2_Ph0DlTDvjgdoOxu347rSD
Iv8E396RdWGfwXsAcDieY2GvPO1z_Mfvx_vG9o2Bu8btsgsdizk56YbasKtSlAGV6jVbsxPotjfpDT7WsXRC6WLLcKFBIb0e5X8i3FOvrHVu0yUpRxaCeGxhNrYQjS2EY8xPuxp7Z_3-oCRfSBA2KV8t_D6FBpl9Sd4MJ1_xZ5c-LbSTZbJgDVp__Vlye7_9Zb0ONvPzyDx17Vd6-ADypd7N1z5csKfM8_3V8vr2yrEL4rFWkbyni1__dmr0hbc5VPaO0RejRj9AAXIuD5zV8T4CMWsgZg2Es2U0ngDtx2ZC9Agpb3-U-wV__Po3p4xXiPzISyx0j8GIGZLDecnw56YOwJcwPvWrE30xkHxIoQEYhn3r2SBk3reLkT7jCBsNYE83_oGDTyK9sbhfj0a0ctE4EgVd0zfu_w-XivZ8N7vSd4T8SisOm5T1nPX6O15zwe2cwocV42inqTeTbxKSd08iHziI_U0HrWuPTcv_n5r-waemNj2T0iX3SB_tlq_so5PShU4wHT0YDfkOi9nnMY4vMI7-KsanOdFjPNwnP4_xwJSUNdsGlB-rcXx-vdmYgsG2xHjq0Icb_3PRbVIG8oqiwTrdxbw4zR4-5JfuyiMm_lQ-0qw56E6T9qQk46y3b5wXNepuKvP20OaBS9UIMNwM2oWThrNdGLao7cKwqW0Xhm1wy3zQOI8dPN27q-M7rxu-Cubz8C5eLoP5DVtFbBkt6Y0RRvLV1zqx77qpJIfF3L-Idm_zdviqtpS8eW-PLxcLPBUmyr7vB7fBrOb6pq7kavBPDoBfJ7ep2sFAyr2_fSkr9cxTOB48Ca1rruEhvpvPg5t8lSzjeRqmcKTJgizkwSKaT6dzGi0SmsXL7O5G0oRLvbKnvrDgr8SSgGfw_Y1YhdMwnC6nyzCYxcHsNplN03Q6Y_NkyYNokcFJhu_AJLcoB_7nxU21siIl9VbDosQ3usdFqu0vstyyA_q0NrmqVoZLqV5fwAAvkt9YAVZWgT8AmRw_pQ">