<table border="1" cellspacing="0" cellpadding="8">
    <tr>
        <th>Issue</th>
        <td>
            <a href=https://github.com/llvm/llvm-project/issues/71520>71520</a>
        </td>
    </tr>

    <tr>
        <th>Summary</th>
        <td>
            [AArch64] Missed fmla vectorisation opportunity (tsvc, s2275)
        </td>
    </tr>

    <tr>
      <th>Labels</th>
      <td>
            backend:AArch64,
            vectorization
      </td>
    </tr>

    <tr>
      <th>Assignees</th>
      <td>
      </td>
    </tr>

    <tr>
      <th>Reporter</th>
      <td>
          sjoerdmeijer
      </td>
    </tr>
</table>

<pre>
    Clang is a lot behind GCC 12 (300%) on kernel s2275 in TSVC: GCC vectorises the kernel using fmla, whereas Clang only emits scalar fmadd instructions.

Compile this input (the compiler options were left blank in the original report; see the Compiler Explorer link below):

```
/*
 * Global float arrays used by the reproducer, declared with a 64-byte
 * alignment attribute: five 1-D arrays of 32000 elements and four 256x256
 * 2-D arrays. In this snippet e is only passed to dummy() and tt is never
 * referenced.
 */
__attribute__((aligned(64))) float a[32000],b[32000],c[32000],d[32000],e[32000],
 aa[256][256],bb[256][256],cc[256][256],tt[256][256];

/*
 * Prototype only; no definition is visible in this snippet.
 * NOTE(review): presumably an opaque external call that keeps the repeated
 * kernel from being optimised away -- confirm against the TSVC sources.
 */
int dummy(float[32000], float[32000], float[32000], float[32000], float[32000], float[256][256], float[256][256], float[256][256], float);

/*
 * TSVC kernel s2275 (the loop nest this report is about).
 *
 * For each of the 100*(100000/256) = 39000 repetitions (integer division),
 * and for each column i in [0, 256):
 *   - walk down the rows j of column i, computing
 *     aa[j][i] += bb[j][i] * cc[j][i];
 *   - then compute a[i] = b[i] + c[i] * d[i].
 * dummy() is called once after each repetition.
 *
 * NOTE(review): func_args is never used in the body, and struct args_t is not
 * declared anywhere in this snippet.
 * NOTE(review): the function is declared to return float but contains no
 * return statement.
 */
float s2275(struct args_t * func_args)
{
    for (int nl = 0; nl < 100*(100000/256); nl++) {
        for (int i = 0; i < 256; i++) {
            /* j is the innermost index but selects the row, so consecutive
             * iterations access elements one full row (256 floats) apart. */
 for (int j = 0; j < 256; j++) {
                aa[j][i] = aa[j][i] + bb[j][i] * cc[j][i];
            }
            a[i] = b[i] + c[i] * d[i];
        }
        /* 0. is a double constant; the prototype's last parameter is float. */
        dummy(a, b, c, d, e, aa, bb, cc, 0.);
 }
}
```

Clang's codegen (scalar fmadd on s registers; the inner loop is unrolled by two):

```
.LBB0_2: //   Parent Loop BB0_1 Depth=1
        mov     x12, xzr
.LBB0_3:                                //   Parent Loop BB0_1 Depth=1
        add     x14, x10, x12
        add     x13, x9, x12
 ldr     s2, [x14]
        ldr     s3, [x14, #1024]
        add x14, x11, x12
        ldr     s0, [x13]
        ldr     s1, [x13, #1024]
        add     x12, x12, #2048
        ldr     s4, [x14]
 ldr     s5, [x14, #1024]
        cmp     x12, #64, lsl #12 // =262144
        fmadd   s0, s4, s2, s0
        fmadd   s1, s5, s3, s1
        str     s0, [x13]
        str     s1, [x13, #1024]
 b.ne    .LBB0_3
        ldr     s0, [x22, x8, lsl #2]
        ldr s1, [x23, x8, lsl #2]
        ldr     s2, [x24, x8, lsl #2]
        add x11, x11, #4
        add     x10, x10, #4
        add     x9, x9, #4
        fmadd   s0, s2, s1, s0
        str     s0, [x25, x8, lsl #2]
        add     x8, x8, #1
        cmp     x8, #256
        b.ne .LBB0_2
```

vs. GCC's codegen (4-lane fmla on q registers, i.e. four floats per instruction):

```
.L6:
        mov x0, 0
.L3:
        ldr     q29, [x10, x0]
        ldr     q30, [x9, x0]
        ldr     q31, [x8, x0]
        fmla    v31.4s, v29.4s, v30.4s
        str     q31, [x8, x0]
        add     x0, x0, 1024
 cmp     x0, 262144
        bne     .L3
        ldr     q29, [x27, x11]
        add     x8, x8, 16
        add     x10, x10, 16
 add     x9, x9, 16
        ldr     q30, [x26, x11]
        ldr     q31, [x25, x11]
        fmla    v31.4s, v29.4s, v30.4s
        str     q31, [x19, x11]
        add     x11, x11, 16
        cmp     x11, 1024
 bne     .L6
```

See also:
https://godbolt.org/z/8E3fexn5o

TODO:
Root cause analysis.
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJy0V12zorgW_TXxZVdbyQ4oPPjgx_W-9K2-Nd0182gFiBobiU2Cc07_-qkEQVA82jUzFgUk2Vn7Y61sVRijdoWUMxIuSLgaicrudTkzBy3L7CjVQZajRGfvsz8kiFKCgFxbSOReFRkQjDilBEOCMWx1Cd9lWcgcDOI0BFXAt6-_LyHVx5MoZQZWw3-XS4ZjQleEzuv7Uh9PKpdg98qAKk6VhT-V3QOZLvy1chefd7eQCb1cfrjZCGtLlVRWbjYEI4KRyF1SGcFoEhCM6wu2uRYWBAkXHCmlJFwRXCb9YdofZv2h7A9r_yAcJIYTN9m-4DJJBqfTdHDa2oFpvujmrQoLWXU8vhOMfC79aODfmrsP9m8tYXyTV82LFw3ByNiySi2Icmc2FgjOYVsV6caN3dZ63_QCAABeeAQjV50iB8JXQAlf1O9LYE6gc4IRo-5DcO3i8TFAkRNc-CuGHuQNrLqiKg_qINxgaHtn3-G679Ddd_jAbfPxojrUFVQkXHmo-0lcgNdZf3IOXmWdybbkXR_uaN1Niq7DpOso7TrIHgHfgTaSFU4Cibul7pa5m3Q3Ua_US36NjrsquUJeX_ot4NJKclHsCE4NpDqTO1l83DfGnxcLukHC50BwTXANAP8XpSwsfNb6BG6VwUqe7J7wFesnddRn_3xj6AJ--1l2QbkDffL5dZ8iyy4-A--T0fqBj8y4X4_7VnlW-mXjAyfhwsGFN5y1Rrxj5N6QM4r39s5nGxYbDKuFpC0kf-yXdYye-O3SUD8IcqRB9AA6GMq7XQ1fSzg9nrqOCfKJt89N7vdgwy_hK5wgC4Kb7nKsQ6-rUcdUM2LoA0tfkjq-mhZzow9jX6hwa_Sswsm4kM6wUfQTMrEmIOoUAQfpvTpG_uKWvlwxeGFbLUjWESRBHjwSD-2cpw_s4ut5ure6YRQvDA0wes8Thi-m5MOIrsaOtQfCbNbdd07PwhPbNL_H3fRsxu7n2q_000lr0PhybfLNp9ka8TujhuIfGLearBmhD-Xwg7fFi5-atoqLHphuj7lwzzNn48A4ozPGzRun7m2QwRegW9aajND9JsFGPC1dfmGoUyT1MQRXuWdlw2kj-FfEwyavnIfWaugY3EIM0IOTRzENEHQ5CAPG_wBFLH5anV7HuE3u2vPZDYtXjiYfnKivUoLIjW4PwN7ak3Ej_1Wx01miczvW5Y7g-ifBdfQfvpVvRai7KN--rL60CL9pbSEVlZEgCpG_G2XGo2zGs5jHYiRnbBLHPAyndDraz2jCQwxklHARxJhOOE-nQRAkIhIYZZyN1AwpcsbolDEW0ul4S4Mg3SZJGiWZYBiSgMqjUPk4z89HF-hIGVPJ2ZSFSEe5SGRu_D9JxESk32WRET6fz8t0774dXUskiGeZWl2qn8IqXbjZcDUqZw7xU1LtDAlorow1Vx9W2dz_QW2QwhX8Txkjs1oVF0DjAUGfTrq0VaHsu_sdbs05rRuy_3sRj6oyn90UXtl9lYxTfSS4dl4vj0-nUh9kagmufZqG4Npn-lcAAAD__0b5pwc">