<table border="1" cellspacing="0" cellpadding="8">
    <tr>
        <th>Issue</th>
        <td>
            <a href=https://github.com/llvm/llvm-project/issues/61284>61284</a>
        </td>
    </tr>

    <tr>
        <th>Summary</th>
        <td>
            Missing optimization on aarch64 for types like `float32x4x2_t`
        </td>
    </tr>

    <tr>
      <th>Labels</th>
      <td>
            new issue
      </td>
    </tr>

    <tr>
      <th>Assignees</th>
      <td>
      </td>
    </tr>

    <tr>
      <th>Reporter</th>
      <td>
          tuxzz
      </td>
    </tr>
</table>

<pre>
    Here is a simple example: [Godbolt](https://godbolt.org/z/n1vjfdnaj)

````c
#include <arm_neon.h>
#include <stddef.h>
#include <stdbool.h>

/*
 * Minimal reproducer: accumulates k multiply-add steps into an 8-float
 * accumulator (two float32x4_t halves held in a float32x4x2_t) and stores the
 * result to `out`.
 *
 *   out      - destination for 8 floats; also read when zero_out is false.
 *   a        - NOTE(review): never read in this body; see the note on a0 below.
 *   b        - source of 8 floats per iteration, advanced by 8 each time.
 *   k        - number of loop iterations.
 *   zero_out - when false, the existing contents of `out` are added to the
 *              accumulator before the store; when true they are overwritten.
 */
void simple_gemm(
  float* restrict out,
  float const* restrict a,
 float const* restrict b,
  size_t k, bool zero_out
) {
  // Accumulator: both halves start at zero.
  register float32x4x2_t o0;
  o0.val[0] = vdupq_n_f32(0.0f);
  o0.val[1] = vdupq_n_f32(0.0f);

  // begin dot
  {
    // NOTE(review): a0 has no initializer and is read on the first iteration
    // before anything is assigned to it. Possibly `a[0]` / `a += 1` was
    // intended below - confirm with the author before relying on the result.
    register float32x4_t a0;
    register float32x4x2_t b0;

    while (k >= 1) {
      // Load 8 consecutive floats from b as two 4-lane vectors.
      b0 = vld1q_f32_x2(b);
      // Broadcast lane 0 of a0 to all four lanes.
      a0 = vdupq_n_f32(a0[0]);

      // o0.val[i] += a0 * b0.val[i], one fused multiply-add per half.
      o0.val[0] = vfmaq_f32(o0.val[0], a0, b0.val[0]);
      o0.val[1] = vfmaq_f32(o0.val[1], a0, b0.val[1]);

      b += 8;
      // Vector-extension arithmetic: adds 1 to every lane of a0
      // (this is not a pointer increment).
      a0 += 1;
      k -= 1;
    }
  } // end dot

  // begin writeback
  {
    if (!zero_out) {
      // Only t0 is used; t1..t5 are declared but never referenced.
      register float32x4x2_t t0, t1, t2, t3, t4, t5;
      t0 = vld1q_f32_x2(out);
      
 o0.val[0] = vaddq_f32(o0.val[0], t0.val[0]);
      o0.val[1] = vaddq_f32(o0.val[1], t0.val[1]);
    }

    // TODO: both clang and gcc generates redundant mov because of bad register allocation.
    // Store both halves of the accumulator (8 floats) to out in one call.
 vst1q_f32_x2(out, o0);
  } // end writeback
}

````

The assembly generated:
````
simple_gemm:                            // @simple_gemm
        cbz     x3, .LBB0_6
        movi    v0.2d, #0000000000000000
        fmov    v3.4s, #1.00000000
        mov v1.16b, v0.16b
.LBB0_2:                                // =>This Inner Loop Header: Depth=1
        dup     v5.4s, v4.s[0]
        subs    x3, x3, #1
        ld1     { v6.4s, v7.4s }, [x2], #32
        fmla v1.4s, v6.4s, v4.s[0]
        fmla    v0.4s, v7.4s, v4.s[0]
 fadd    v4.4s, v5.4s, v3.4s
        b.ne    .LBB0_2
        tbnz    w4, #0, .LBB0_5
.LBB0_4:
        ld1     { v3.4s, v4.4s }, [x0]
 fadd    v1.4s, v3.4s, v1.4s
        fadd    v0.4s, v4.4s, v0.4s
.LBB0_5:
        mov     v2.16b, v0.16b
        st1     { v1.4s, v2.4s }, [x0]
        ret
.LBB0_6:
        movi    v1.2d, #0000000000000000
        mov     v0.16b, v1.16b
        tbz     w4, #0, .LBB0_4
        b       .LBB0_5
````

The two halves of `float32x4x2_t o0` are assigned to `v1` (`o0.val[0]`) and `v0` (`o0.val[1]`), so a redundant `mov v2.16b, v0.16b` is needed at `.LBB0_5` to form the consecutive register pair `{ v1, v2 }` required by `st1`. If the halves were assigned to a pair of neighboring registers in the correct order, such as `o0.val[0] -> v0` and `o0.val[1] -> v1`, the redundant `mov` could be eliminated.
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJyUV1-Tm6AW_zTk5cw6CGqShzxsNs1tZ3qnL33PoGCkSyAVdHfz6e-AJlHjtr3MDricf79zfiegzFp51EJsULpF6W7BGleZeuOa98tlkRv-sfkqagHSAgMrT2cloDBcIPoMKN3-x_DcKIfSHSKryrmzRfQZkT0i-2Mnikx9RGR_QWSv4_ZXyTX7hcga4R3Cz_2c4dtf0W8RKnWhGi4A0RdWnw5aGB1ViH6ZU7COc1H-SZwbo4byMLdG8j6pw1GcToisOgFAqQxziDxDLayrZeHANA6Rl5EcCqPtWIvddT5TyQdurLyIg4NXRF7AQ4SLqM3Bh-rTWANabq_atThK60TduabkPXknBwcGI3rTMThqmULpFqN0B4juoOXN-fdBH0pKEFnhCJe-_jMW8b9ZXO06miEXR6mBG3fbvwOeg3xwwIaAP00rxw8hAd4qqQQgsnoFTybdQTwpkh857vJQPP7tszi8-0Tycd5-MDyTMMN9-eay9mOmxuWJ_e7NR1JPLMOB3vH2FMkMC3M-43mf8R_Q5oDI1jtczSTfSeKJ5BWeHrfRcneneHelX2h-J3-2Nd5q6UTOitfZBpGlZxOR-Nb5j3R-0iAu1MDFYSZhpmFOwpxOknKzTdFFnKj2zzM8M84_5dn93wzPeYun3qbcjrgYbHU1__lj98MfzrlxFRSK6SMwzeFYFHAUWtTMCQu14I3mTDs4mRZyUbDGCjAl5Izfq82UMgVz0uioD9Na91C7F3_-jOBNumPC_wT68PAf7v-sBDBrxSlXHzfk3N8un5gNz3Hal3x-9NhQgoc2A6YAivwS1vfQUNH37RYfsrHKybTSry2OCPdaiFA8GWOL0hfbW9Aosb1FHM3retU2juLM3xY-hn8KGh0Y8pcch3nSHaJfflbSwjetRQ3fjTnDV8G4qL2XnTi7CtFdPEbAm3NY27RH2yaRvTb3SNM2ub0Xq5t9amMtxeMO1XILbXb1uYwSG7rZ26Tbd9J3PyKUkmn5FPM16S2zv8EK-h1Bg2DzJiXjPOgmV5Vb1oGskeM80sKvVyZGQpfr0DpvybUn7h2UDhlMbr08WyF6xzqu0BzqeATWP8QPqG_KeOi5766r8hXnFFrfudCSuZa8NYIbJHDDRD5PoB-1cMPw2Vx42Sf6j7-1G2B8Axw_Anb9z3yOrGRCer-OmPzT4eXeDLRMNcL6gxVl-OGlLcPA6nDK-TdwDs54tTYOAs3DP8EzefFvj41yUh9BamCDAxxlOJwVU2K8jyDtS5rhCL6V4CrxAW9iEpbBmcna49RCHqvc1D7S9SKwPqar_It_XQv_Klz7o4O8gG2KCpj1YcZ35ROiX6DFg0zGt18nj_vkvO9pRt60MI3ikAsQSp6k9hdAtOAbytd0zRZiE2fLFU3TNFkuqs1yyQpG1qsEp0talgXFtFiRnJRrVohlmS_khmBCMcVrvE4JWUZkWbCY5SVjZZKyMkUJFicmVaRUe_LfLQtpbSM2WUxWyUKxXCgbPpMI0eINghARf2It6o23ecqbo0UJVtI6e_fipFNi819pra-qOTt5kpdwr4LRwFhdVFkCpanBfZyFBSVfxUPDoAwvmlptJh9Z0lVNHhXmhMjeB-yXp3NtfonCIbIPMC0i-5DG_wIAAP__U7Sqyg">