<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href=https://github.com/llvm/llvm-project/issues/61284>61284</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
Missing optimization on aarch64 for types like `float32x4x2_t`
</td>
</tr>
<tr>
<th>Labels</th>
<td>
new issue
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
tuxzz
</td>
</tr>
</table>
<pre>
Here is a simple example: [Godbolt](https://godbolt.org/z/n1vjfdnaj)
````c
#include <arm_neon.h>
#include <stddef.h>
#include <stdbool.h>
// Reproducer kernel: accumulates into a pair of 4-lane float vectors (o0) over
// k iterations, then stores the 8 resulting floats to `out`.
//
// out      - destination of the final 8-float store; its current 8 floats are
//            also loaded and added to the accumulator when zero_out is false.
// a        - NOTE(review): never referenced by the body as written (see a0).
// b        - 8 floats are loaded from here per iteration; advanced by 8 each time.
// k        - number of loop iterations.
// zero_out - when true, the existing contents of `out` are not added in.
void simple_gemm(
float* restrict out,
float const* restrict a,
float const* restrict b,
size_t k, bool zero_out
) {
// Accumulator pair; both halves start at 0.0f.
register float32x4x2_t o0;
o0.val[0] = vdupq_n_f32(0.0f);
o0.val[1] = vdupq_n_f32(0.0f);
// begin dot
{
register float32x4_t a0;
register float32x4x2_t b0;
while (k >= 1) {
b0 = vld1q_f32_x2(b);
// NOTE(review): a0 is read here before it has ever been assigned, and `a`
// is unused; presumably `vdupq_n_f32(a[0])` was intended - confirm with the
// reporter. Left as-is because the reported assembly was generated from
// exactly this source.
a0 = vdupq_n_f32(a0[0]);
o0.val[0] = vfmaq_f32(o0.val[0], a0, b0.val[0]);
o0.val[1] = vfmaq_f32(o0.val[1], a0, b0.val[1]);
b += 8;
// Vector arithmetic: adds 1 to every lane of a0; it does not advance `a`.
// NOTE(review): presumably `a += 1` was intended - confirm.
a0 += 1;
k -= 1;
}
} // end dot
// begin writeback
{
if (!zero_out) {
// Only t0 is used; t1..t5 are declared but never referenced.
register float32x4x2_t t0, t1, t2, t3, t4, t5;
t0 = vld1q_f32_x2(out);
o0.val[0] = vaddq_f32(o0.val[0], t0.val[0]);
o0.val[1] = vaddq_f32(o0.val[1], t0.val[1]);
}
// TODO: both clang and gcc generate a redundant mov here because of bad register allocation.
vst1q_f32_x2(out, o0);
} // end writeback
}
````
The assembly generated:
````
simple_gemm: // @simple_gemm
cbz x3, .LBB0_6
movi v0.2d, #0000000000000000
fmov v3.4s, #1.00000000
mov v1.16b, v0.16b
.LBB0_2: // =>This Inner Loop Header: Depth=1
dup v5.4s, v4.s[0]
subs x3, x3, #1
ld1 { v6.4s, v7.4s }, [x2], #32
fmla v1.4s, v6.4s, v4.s[0]
fmla v0.4s, v7.4s, v4.s[0]
fadd v4.4s, v5.4s, v3.4s
b.ne .LBB0_2
tbnz w4, #0, .LBB0_5
.LBB0_4:
ld1 { v3.4s, v4.4s }, [x0]
fadd v1.4s, v3.4s, v1.4s
fadd v0.4s, v4.4s, v0.4s
.LBB0_5:
mov v2.16b, v0.16b
st1 { v1.4s, v2.4s }, [x0]
ret
.LBB0_6:
movi v1.2d, #0000000000000000
mov v0.16b, v1.16b
tbz w4, #0, .LBB0_4
b .LBB0_5
````
The two values of `float32x4x2_t o0` are assigned to `v1` (`o0.val[0]`) and `v0` (`o0.val[1]`). Because `st1 { v1.4s, v2.4s }` requires consecutively numbered registers, this results in a redundant `mov v2.16b, v0.16b` at `.LBB0_5`. If they were assigned to a pair of neighboring registers in the correct order, such as `o0.val[0] -> v0` and `o0.val[1] -> v1`, the redundant `mov` could be eliminated.
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJyUV1-Tm6AW_zTk5cw6CGqShzxsNs1tZ3qnL33PoGCkSyAVdHfz6e-AJlHjtr3MDricf79zfiegzFp51EJsULpF6W7BGleZeuOa98tlkRv-sfkqagHSAgMrT2cloDBcIPoMKN3-x_DcKIfSHSKryrmzRfQZkT0i-2Mnikx9RGR_QWSv4_ZXyTX7hcga4R3Cz_2c4dtf0W8RKnWhGi4A0RdWnw5aGB1ViH6ZU7COc1H-SZwbo4byMLdG8j6pw1GcToisOgFAqQxziDxDLayrZeHANA6Rl5EcCqPtWIvddT5TyQdurLyIg4NXRF7AQ4SLqM3Bh-rTWANabq_atThK60TduabkPXknBwcGI3rTMThqmULpFqN0B4juoOXN-fdBH0pKEFnhCJe-_jMW8b9ZXO06miEXR6mBG3fbvwOeg3xwwIaAP00rxw8hAd4qqQQgsnoFTybdQTwpkh857vJQPP7tszi8-0Tycd5-MDyTMMN9-eay9mOmxuWJ_e7NR1JPLMOB3vH2FMkMC3M-43mf8R_Q5oDI1jtczSTfSeKJ5BWeHrfRcneneHelX2h-J3-2Nd5q6UTOitfZBpGlZxOR-Nb5j3R-0iAu1MDFYSZhpmFOwpxOknKzTdFFnKj2zzM8M84_5dn93wzPeYun3qbcjrgYbHU1__lj98MfzrlxFRSK6SMwzeFYFHAUWtTMCQu14I3mTDs4mRZyUbDGCjAl5Izfq82UMgVz0uioD9Na91C7F3_-jOBNumPC_wT68PAf7v-sBDBrxSlXHzfk3N8un5gNz3Hal3x-9NhQgoc2A6YAivwS1vfQUNH37RYfsrHKybTSry2OCPdaiFA8GWOL0hfbW9Aosb1FHM3retU2juLM3xY-hn8KGh0Y8pcch3nSHaJfflbSwjetRQ3fjTnDV8G4qL2XnTi7CtFdPEbAm3NY27RH2yaRvTb3SNM2ub0Xq5t9amMtxeMO1XILbXb1uYwSG7rZ26Tbd9J3PyKUkmn5FPM16S2zv8EK-h1Bg2DzJiXjPOgmV5Vb1oGskeM80sKvVyZGQpfr0DpvybUn7h2UDhlMbr08WyF6xzqu0BzqeATWP8QPqG_KeOi5766r8hXnFFrfudCSuZa8NYIbJHDDRD5PoB-1cMPw2Vx42Sf6j7-1G2B8Axw_Anb9z3yOrGRCer-OmPzT4eXeDLRMNcL6gxVl-OGlLcPA6nDK-TdwDs54tTYOAs3DP8EzefFvj41yUh9BamCDAxxlOJwVU2K8jyDtS5rhCL6V4CrxAW9iEpbBmcna49RCHqvc1D7S9SKwPqar_It_XQv_Klz7o4O8gG2KCpj1YcZ35ROiX6DFg0zGt18nj_vkvO9pRt60MI3ikAsQSp6k9hdAtOAbytd0zRZiE2fLFU3TNFkuqs1yyQpG1qsEp0talgXFtFiRnJRrVohlmS_khmBCMcVrvE4JWUZkWbCY5SVjZZKyMkUJFicmVaRUe_LfLQtpbSM2WUxWyUKxXCgbPpMI0eINghARf2It6o23ecqbo0UJVtI6e_fipFNi819pra-qOTt5kpdwr4LRwFhdVFkCpanBfZyFBSVfxUPDoAwvmlptJh9Z0lVNHhXmhMjeB-yXp3NtfonCIbIPMC0i-5DG_wIAAP__U7Sqyg">