<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href="https://github.com/llvm/llvm-project/issues/71512">71512</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
[AArch64] suboptimal vectorisation (tsvc, s128)
</td>
</tr>
<tr>
<th>Labels</th>
<td>
backend:AArch64,
new issue,
vectorization
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
sjoerdmeijer
</td>
</tr>
</table>
<pre>
Clang (tip of tree) is 120% behind GCC on kernel s128 from TSVC. To reproduce, compile this input with `-O3 -ffast-math -mcpu=neoverse-v2`:
```
// Global working arrays for the kernel. The aligned(64) attribute prefixes
// both array declarations below.
__attribute__((aligned(64))) float x[32000];
// 1-D arrays of 32000 floats and 2-D arrays of 256x256 floats.
__attribute__((aligned(64))) float a[32000],b[32000],c[32000],d[32000],e[32000],
aa[256][256],bb[256][256],cc[256][256],tt[256][256];
// External routine called once per outer iteration of s128; its definition is
// not part of this snippet. Presumably an opaque call that stops the compiler
// from discarding the repeated kernel work -- TODO confirm.
int dummy(float[32000], float[32000], float[32000], float[32000], float[32000], float[256][256], float[256][256], float[256][256], float);
// TSVC kernel s128 (reproducer). Each inner iteration reads b and c at an
// even index k = 2*i, reads d at index i and writes a at index i:
//   a[i]   = b[2*i] - d[i]
//   b[2*i] = a[i] + c[2*i]
// NOTE(review): func_args is never used, and struct args_t is not declared in
// this snippet. The function is declared to return float but contains no
// return statement.
float s128(struct args_t * func_args)
{
int j, k;
// Repeat the kernel 2*100000 times.
for (int nl = 0; nl < 2*100000; nl++) {
// j starts at -1; each iteration sets k = j + 1 and then j = k + 1, so k
// takes the values 0, 2, 4, ...
j = -1;
for (int i = 0; i < 32000/2; i++) {
k = j + 1;
a[i] = b[k] - d[i];
j = k + 1;
// Store back to the same element of b that was loaded above.
b[k] = a[i] + c[k];
}
// 1. is a double literal; the last parameter in the dummy prototype is float.
dummy(a, b, c, d, e, aa, bb, cc, 1.);
}
}
```
GCC's codegen:
```
.L3:
ld2 {v28.4s - v29.4s}, [x0]
mov x6, x0
add x5, x0, 16
add x4, x0, 24
add x0, x0, 32
ldr q27, [x8], 16
ld2 {v30.4s - v31.4s}, [x7], 32
fsub v28.4s, v28.4s, v27.4s
fadd v30.4s, v28.4s, v30.4s
str q28, [x9], 16
str s30, [x6], 8
st1 {v30.s}[1], [x6]
st1 {v30.s}[2], [x5]
st1 {v30.s}[3], [x4]
cmp x7, x23
bne .L3
```
Clang's codegen:
```
.LBB0_2: // Parent Loop BB0_1 Depth=1
mov z2.d, z0.d
add z2.d, z2.d, #1 // =0x1
adr z3.d, [z7.d, z2.d, lsl #2]
fmov x9, d3
ld2w { z3.s, z4.s }, p0/z, [x9]
ld1w { z5.s }, p0/z, [x21, x8, lsl #2]
fmov x9, d2
fsub z3.s, z3.s, z5.s
st1w { z3.s }, p0, [x22, x8, lsl #2]
add x8, x8, x28
ld2w { z4.s, z5.s }, p0/z, [x20, x9, lsl #2]
add x9, x19, #4
cmp x23, x8
fadd z2.s, z4.s, z3.s
uunpklo z3.d, z2.s
uunpkhi z2.d, z2.s
st1w { z3.d }, p1, [x9, z0.d, lsl #2]
add z0.d, z0.d, z6.d
st1w { z2.d }, p1, [x9, z1.d, lsl #2]
add z1.d, z1.d, z6.d
b.ne .LBB0_2
```
See also:
https://godbolt.org/z/154McGMve
TODO: root-cause analysis.
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJy0V02v2joT_jVmMyKyxwlJFixOiOim1ftKre4WOYkBH0LCjR1K-fVXdj4gwDlqdXUjZMfz8cyM_XgAobXaVVIuSZCQIJ2J1uzrZqnfa9kUR6neZTPL6uLXclWKagdGnaDegmmkBKWBISUYQCb3qirgy2oF27qBg2wqWYJmGIGq4Mf3v1YerOrjSZUSzF5pUNWpNfBTmT2QBZ3_j8N8uxXazI_C7GF-zE8t4Wkl67NstJyfkSwo4W-EpoQO44L2H7fcbIQxjcpaIzcbghHBSJS2soJgtPAJxt0HtmUtDFxIkHCklJIgJTy5B_5DJHGPhKtsusyny2K6lNNlFx-EhcRgYYXjC66y7KU4z1-KjXkhnhaqKgNFezz-Ihi5WqbZwH8le072X6kwfqirOxfLPoKRNk2bGxDNTm8MEHyDbVvlG7u2np1b2PsDANhdebfohxHWii2vCUZWW5VAeAqU8KR7XwESfGPUPp2QYOI-MUyw7fPunOdsgv4QQd0CKIffbSOu0Yk-QbfPwXm_A8EEnsPYxxJMkSB1hpZWB7uYQ9HLXzp1iR8-gR2RrOEtBiaQ95qbGwnTKcDARGH3PrNDbofCDtIOotN0Kqdj3v3hWxASpjCcafqyT3xZrQiGGvK6kDtZfd5UvK98NBjyLAuELlRyxsjzNczhjLHnaxsSV0CC5OIoP3E71mc3XxbW5kKnWlEUnTbotba6xdAPBqV_U6L_AQC92XB8zLxx898YDnlG_U0aY70oktO-SM6mRYa982OcrW4z6PbGqu_fQvs2Ne4z78I82Heyzl6bIftoSCD-IPvBVHM6mA4tI3q0ZEONrrIgYb3h6PRkf7cvvQ_e-QQvfab2_M7ef7LPj6fuKN0pXZBP1Vkl3Wy5OeUs3DPZfV3_CdOThG6Q8DcguCa4BoD_i0ZWBr7W9QmslkEqT2ZPeMpec_uKnruuV-oVz_Qctf1MkDN4evrohKf0wh5BumO98h4gSK7hFLPUpcXFp03d9jleYtdU-BPdf_bHatEd-a6-p6Fn-8n23uuEdg_-7OYffOSHzB1o9DrRhwxf3amu9i67YQ48_US3SS33ufR54Gd53J2YSye6GV8w-mTf_FtGH9Tf9ab49-I6swuLe6oM_W68Hcj7xF62kyveTnHcr4ll21anQ1mPbHIezxZ7dU_cz_e6GOtmN64M9-F3ih4Mx3nxeJEmEfHjiOx3I_aG43yLmHldpxlaQ_-9Pe0b3fhdShClrh96zN6Yk7Yyd6d3dZHVpfHqZtexYs0C_1v-5dtZ3jv9qAuLA01dG8hFqyWISpS_tNKzYsmLmMdiJpdsEcc88EPfn-2XURFugxC32YJhFG9zyUSGkaRUSBFlLJqpJVLkjNGQ0SCmvhdlzN8Gki2ySAjMM-JTeRSq9MryfLQpzpTWrVyGLGA4K0UmS-3-IiFmIj_IqiD87e2tyff2H4G9UASxkj_BuY2Ss8xN3airMKqurDRIZ83Sxphn7U4Tn5ZKG32LapQp3X-xATtIQbdZfTLqKEro8bTDs78WjT67n0Ldr9141jbl8mHbldm3mZfXR4JrG6ef5qemfpe5Ibh2OWuCa1ftPwEAAP__vZh2SQ">