<table border="1" cellspacing="0" cellpadding="8">
    <tr>
        <th>Issue</th>
        <td>
            <a href=https://github.com/llvm/llvm-project/issues/71512>71512</a>
        </td>
    </tr>

    <tr>
        <th>Summary</th>
        <td>
            [AArch64] suboptimal vectorisation (tsvc, s128)
        </td>
    </tr>

    <tr>
      <th>Labels</th>
      <td>
            backend:AArch64,
            new issue,
            vectorization
      </td>
    </tr>

    <tr>
      <th>Assignees</th>
      <td>
      </td>
    </tr>

    <tr>
      <th>Reporter</th>
      <td>
          sjoerdmeijer
      </td>
    </tr>
</table>

<pre>
    Clang tip of tree is 120% behind GCC on the TSVC kernel s128. To reproduce, compile the input below with `-O3 -ffast-math -mcpu=neoverse-v2`:

```
/* Global arrays for the s128 reproducer, declared with aligned(64).
 * s128 reads/writes a, b, c and d in its inner loop and passes e, aa, bb and cc
 * on to dummy(); x and tt are not referenced by s128. */
__attribute__((aligned(64))) float x[32000];

__attribute__((aligned(64))) float a[32000],b[32000],c[32000],d[32000],e[32000],
 aa[256][256],bb[256][256],cc[256][256],tt[256][256];

/* Prototype only; no definition appears in this snippet.
 * NOTE(review): presumably an opaque call that keeps the compiler from
 * removing or collapsing the repeated outer-loop iterations - confirm against TSVC. */
int dummy(float[32000], float[32000], float[32000], float[32000], float[32000], float[256][256], float[256][256], float[256][256], float);

/*
 * Kernel s128: repeats the inner loop 2*100000 times, calling dummy() after
 * each repetition.
 *
 * Inner loop: j is reset to -1 before the loop, then each iteration sets
 * k = j + 1 and j = k + 1, so k takes the values 0, 2, 4, ... (k == 2*i).
 * b and c are therefore indexed with stride 2, while a and d are indexed
 * with stride 1.
 *
 * NOTE(review): func_args is never used, struct args_t is not declared in
 * this snippet, and the function has no return statement although its return
 * type is float. Left unchanged on purpose: the codegen quoted below was
 * produced from exactly this input.
 */
float s128(struct args_t * func_args)
{
    int j, k;
    for (int nl = 0; nl < 2*100000; nl++) {
        j = -1;
        for (int i = 0; i < 32000/2; i++) {
            k = j + 1;
            /* a[i] is stored first and then re-read for the b[k] update. */
            a[i] = b[k] - d[i];
            j = k + 1;
            b[k] = a[i] + c[k];
 }
        dummy(a, b, c, d, e, aa, bb, cc, 1.);
    } 
}
```
GCC's codegen:

```
.L3:
        ld2     {v28.4s - v29.4s}, [x0]
        mov     x6, x0
        add     x5, x0, 16
 add     x4, x0, 24
        add     x0, x0, 32
        ldr     q27, [x8], 16
        ld2     {v30.4s - v31.4s}, [x7], 32
        fsub v28.4s, v28.4s, v27.4s
        fadd    v30.4s, v28.4s, v30.4s
 str     q28, [x9], 16
        str     s30, [x6], 8
        st1 {v30.s}[1], [x6]
        st1     {v30.s}[2], [x5]
        st1 {v30.s}[3], [x4]
        cmp     x7, x23
        bne     .L3
``` 

Clang's codegen:

```
.LBB0_2: //   Parent Loop BB0_1 Depth=1
        mov     z2.d, z0.d
        add z2.d, z2.d, #1                  // =0x1
        adr     z3.d, [z7.d, z2.d, lsl #2]
        fmov    x9, d3
        ld2w    { z3.s, z4.s }, p0/z, [x9]
        ld1w    { z5.s }, p0/z, [x21, x8, lsl #2]
 fmov    x9, d2
        fsub    z3.s, z3.s, z5.s
        st1w    { z3.s }, p0, [x22, x8, lsl #2]
        add     x8, x8, x28
        ld2w    { z4.s, z5.s }, p0/z, [x20, x9, lsl #2]
        add     x9, x19, #4
 cmp     x23, x8
        fadd    z2.s, z4.s, z3.s
        uunpklo z3.d, z2.s
        uunpkhi z2.d, z2.s
        st1w    { z3.d }, p1, [x9, z0.d, lsl #2]
        add     z0.d, z0.d, z6.d
        st1w    { z2.d }, p1, [x9, z1.d, lsl #2]
        add     z1.d, z1.d, z6.d
 b.ne    .LBB0_2
 ```

See also:

https://godbolt.org/z/154McGMve

TODO: root-cause analysis.
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJy0V02v2joT_jVmMyKyxwlJFixOiOim1ftKre4WOYkBH0LCjR1K-fVXdj4gwDlqdXUjZMfz8cyM_XgAobXaVVIuSZCQIJ2J1uzrZqnfa9kUR6neZTPL6uLXclWKagdGnaDegmmkBKWBISUYQCb3qirgy2oF27qBg2wqWYJmGIGq4Mf3v1YerOrjSZUSzF5pUNWpNfBTmT2QBZ3_j8N8uxXazI_C7GF-zE8t4Wkl67NstJyfkSwo4W-EpoQO44L2H7fcbIQxjcpaIzcbghHBSJS2soJgtPAJxt0HtmUtDFxIkHCklJIgJTy5B_5DJHGPhKtsusyny2K6lNNlFx-EhcRgYYXjC66y7KU4z1-KjXkhnhaqKgNFezz-Ihi5WqbZwH8le072X6kwfqirOxfLPoKRNk2bGxDNTm8MEHyDbVvlG7u2np1b2PsDANhdebfohxHWii2vCUZWW5VAeAqU8KR7XwESfGPUPp2QYOI-MUyw7fPunOdsgv4QQd0CKIffbSOu0Yk-QbfPwXm_A8EEnsPYxxJMkSB1hpZWB7uYQ9HLXzp1iR8-gR2RrOEtBiaQ95qbGwnTKcDARGH3PrNDbofCDtIOotN0Kqdj3v3hWxASpjCcafqyT3xZrQiGGvK6kDtZfd5UvK98NBjyLAuELlRyxsjzNczhjLHnaxsSV0CC5OIoP3E71mc3XxbW5kKnWlEUnTbotba6xdAPBqV_U6L_AQC92XB8zLxx898YDnlG_U0aY70oktO-SM6mRYa982OcrW4z6PbGqu_fQvs2Ne4z78I82Heyzl6bIftoSCD-IPvBVHM6mA4tI3q0ZEONrrIgYb3h6PRkf7cvvQ_e-QQvfab2_M7ef7LPj6fuKN0pXZBP1Vkl3Wy5OeUs3DPZfV3_CdOThG6Q8DcguCa4BoD_i0ZWBr7W9QmslkEqT2ZPeMpec_uKnruuV-oVz_Qctf1MkDN4evrohKf0wh5BumO98h4gSK7hFLPUpcXFp03d9jleYtdU-BPdf_bHatEd-a6-p6Fn-8n23uuEdg_-7OYffOSHzB1o9DrRhwxf3amu9i67YQ48_US3SS33ufR54Gd53J2YSye6GV8w-mTf_FtGH9Tf9ab49-I6swuLe6oM_W68Hcj7xF62kyveTnHcr4ll21anQ1mPbHIezxZ7dU_cz_e6GOtmN64M9-F3ih4Mx3nxeJEmEfHjiOx3I_aG43yLmHldpxlaQ_-9Pe0b3fhdShClrh96zN6Yk7Yyd6d3dZHVpfHqZtexYs0C_1v-5dtZ3jv9qAuLA01dG8hFqyWISpS_tNKzYsmLmMdiJpdsEcc88EPfn-2XURFugxC32YJhFG9zyUSGkaRUSBFlLJqpJVLkjNGQ0SCmvhdlzN8Gki2ySAjMM-JTeRSq9MryfLQpzpTWrVyGLGA4K0UmS-3-IiFmIj_IqiD87e2tyff2H4G9UASxkj_BuY2Ss8xN3airMKqurDRIZ83Sxphn7U4Tn5ZKG32LapQp3X-xATtIQbdZfTLqKEro8bTDs78WjT67n0Ldr9141jbl8mHbldm3mZfXR4JrG6ef5qemfpe5Ibh2OWuCa1ftPwEAAP__vZh2SQ">