<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href="https://github.com/llvm/llvm-project/issues/55438">55438</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
[Aarch64][SVE] Bad code generation of llvm.fmuladd.* for SVE
</td>
</tr>
<tr>
<th>Labels</th>
<td>
new issue
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
jsetoain
</td>
</tr>
</table>
<pre>
I'm trying to compile code that does this:
c += a * broadcast(b[0])
For fixed length vectors, if I use this:
```
declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) #3
define void @fp_convert_combine_crash(float *%arg0, float *%arg1, float* %arg2) #0 {
%ap = bitcast float* %arg0 to <4 x float>*
%bp = bitcast float* %arg1 to <4 x float>*
%cp = bitcast float* %arg2 to <4 x float>*
%a = load <4 x float>, <4 x float>* %ap
%b = load <4 x float>, <4 x float>* %bp
%c = load <4 x float>, <4 x float>* %cp
%b0splat = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
%mad = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %a, <4 x float> %b0splat, <4 x float> %c) #3
store <4 x float> %mad, <4 x float>* %cp
ret void
}
```
And, when I compile it, I get this assembly:
```
ldr q0, [x1] // Load b
ldr q1, [x0] // Load a
ldr q2, [x2] // Load c
fmla v2.4s, v1.4s, v0.s[0] // mad = c + a * splat(b[0])
str q2, [x2] // store mad in c
ret
```
Which looks good to me, although one might argue that it should not need to load the whole vector `b` if it's going to splat its first element.
But, if I write a scalable version of the same code:
```
declare <vscale x 4 x float> @llvm.fmuladd.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>) #3
define void @fp_convert_combine_crash(float *%arg0, float *%arg1, float* %arg2) #0 {
%ap = bitcast float* %arg0 to <vscale x 4 x float>*
%bp = bitcast float* %arg1 to <vscale x 4 x float>*
%cp = bitcast float* %arg2 to <vscale x 4 x float>*
%a = load <vscale x 4 x float>, <vscale x 4 x float>* %ap
%b = load <vscale x 4 x float>, <vscale x 4 x float>* %bp
%c = load <vscale x 4 x float>, <vscale x 4 x float>* %cp
%b0splat = shufflevector <vscale x 4 x float> %b, <vscale x 4 x float> undef, <vscale x 4 x i32> zeroinitializer
%mad = call <vscale x 4 x float> @llvm.fmuladd.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b0splat, <vscale x 4 x float> %c) #3
store <vscale x 4 x float> %mad, <vscale x 4 x float>* %cp
ret void
}
```
I obtain this:
```
ptrue p0.s
ld1w { z0.s }, p0/z, [x0] // Load a
ld1w { z1.s }, p0/z, [x1] // Load b
ld1w { z2.s }, p0/z, [x2] // Load c
mov z1.s, s1 // Splat b[0]
fmad z0.s, p0/m, z1.s, z2.s // mad = a + (splat b[0]) * c
st1w { z0.s }, p0, [x2] // store mad in c
ret
```
Which is semantically incorrect (it's reading the arguments of `fmuladd` in the wrong order) and instead of using the indexed version of fmad is using a splat, which might have performance implications.
Additionally, if I try to use vscale_range to generate SVE from fixed-length vector code:
```
declare <8 x float> @llvm.fmuladd.v4f32(<8 x float>, <8 x float>, <8 x float>) #3
define void @fp_convert_combine_crash(float *%arg0, float *%arg1, float* %arg2) #0 {
%ap = bitcast float* %arg0 to <8 x float>*
%bp = bitcast float* %arg1 to <8 x float>*
%cp = bitcast float* %arg2 to <8 x float>*
%a = load <8 x float>, <8 x float>* %ap
%b = load <8 x float>, <8 x float>* %bp
%c = load <8 x float>, <8 x float>* %cp
%b0splat = shufflevector <8 x float> %b, <8 x float> undef, <8 x i32> zeroinitializer
%mad = call <8 x float> @llvm.fmuladd.v4f32(<8 x float> %a, <8 x float> %b0splat, <8 x float> %c) #3
store <8 x float> %mad, <8 x float>* %cp
ret void
}
attributes #0 = { vscale_range(2,2) "target-features"="+sve" }
attributes #3 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
```
I get this:
```
stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
mov x29, sp
sub x9, sp, #48
and sp, x9, #0xffffffffffffffe0
ptrue p0.s
ld1w { z0.s }, p0/z, [x0] // Load a
ld1w { z1.s }, p0/z, [x1] // Load b
ld1w { z2.s }, p0/z, [x2] // Load c
stp s1, s1, [sp, #24] // Splat b[0] to the stack
stp s1, s1, [sp, #16] // Splat b[0] to the stack
stp s1, s1, [sp, #8] // Splat b[0] to the stack
stp s1, s1, [sp] // Splat b[0] to the stack
ld1w { z1.s }, p0/z, [sp] // Load splatted b[0] from the stack
fmad z0.s, p0/m, z1.s, z2.s // mad = splat(b[0]) * c + a
st1w { z0.s }, p0, [x2] // Store mad in c
mov sp, x29
ldp x29, x30, [sp], #16 // 16-byte Folded Reload
ret
```
Which introduces additional performance issues by doing the splat through the stack, instead of simply `mov z1.s, s1`, or something to that effect.
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJzlWUuPozgQ_jXkYnXEI68-5JBMpqWW9rQt7R5HBkzwDmDWNnnMr98qAwkQSJj07mG0UU8gYH9VLrvq--zxRXhev1vuMiVannm2J1qQQKQ5TxhcQ0Z0TDUJBVNwx5XlbSx7Z9mbgFju1vJ2hMLNhvhS0DCgSlvuyrfmW9ua7yz3tWxbfr8JSSJ-YiFJWLbXMTmwQAupLPcL4RF5J4VibRvWwq7-zM-QBQmVjFjelxk5kSgRVFveV2LN7CQ5pNMoLRIahtPDLPJc8KPTDu2MefQKA_K8puchi3jGyEHwEI1F-bdAZAcmNVxTH159CyRVMZg0KBgQy51TubfRQOeZc3mGcSufuZVVm1jLbWmTVB9skBMMtM81Brjb18YZuxnE5gbFv4vijEQJ7qK491Bux2WgoFk4al5KI_ntwJ6A8W9hgidggnxgaL6t8gRnHSBVXERRwsrVfrt4oXGPAVJksOoaLzgsaXj8g0nBM645TTjcD5hPzSB2JKBJ8my6mGj3eXYd3sDboC-Hau8UBKEnhY3PjwNdo0imTTpWFpa7pqlO2dhkBvgYswyKTF3buPH-newBCYsOoUqx1E_OQ9UHzSahJH-bpIYSd3KgxpHbj-W-wR_5DVeS3-nr1H3tx31pp69b93Uf9w2ufaM0oXg9uNOZqbUHp76xp6oq1Zfel5WD5b2q7dVcm7rOm3W9nM_xrpVzjyZ41nQRZvPO_P0Z8yCGxBTfFdkLEWKNSRlapImORbGPiYDqnPJ9rAkUoaLiLK4h9USRhCQTmmSMmZ4mwXXMYD0IWAZ1Vi5sH_4hD-HCWKKligzLROZaAXlJKHksYSnL9LTp8bbQFxY7Sq4ZRE5B8lHfmJCKi4yIyNhVNC2ZdQTNHRCEQTrcTeDs1Ejh3i5VYj3x7hclw4HRPMuKo-HG0eN9uAc8-cQkjmPOZ4EfcumzwM-x60DOXGm2v0GTb1stPkW8_1YCN8l4eIAtVh5sNo6eB7tfeXrM3NW4Q4TdLn3l9zsRvqZAEe3NwBAp51pCyYcr0lmDMp2jmZTllvyANwRtgt85lKG3H10aHuTdBogzBOL0gPgDIO4QiNsD0qDIVByMB9hcOT0UW3f8MJlx2YI1ZAAAwgdjcbGe4l0Na3zr6gBqdACsTNXGNUto0_RQ6eGAd8XBaD3QtzxKPQCKDfQazTTHXDsDQiCkhCqAzlYcLhkNDYsD7aIwQN5WyMOAWOWeIf2sFARSQFshQ8huGB7N0C2lAQO7FKpG4lApcBfb4HUTXPCobETJJQ-PxtdSmsT0wEjOZCQk-B0AUJon4L0GENWSE5sw5PgUB3bRFbA9R_rATXKZeN8kzfYMn-1ZxiQF1fHxx1cSSZGW--yX1j67ozkeKI_VyB3D6rakP3z0i2qK9iCelRKPUMYpiF6UB8Lh4byMkwnjYB6KgnEwz0mA1QDzr4YIf_VZnn8mXZqcfuNxi8q7b8cxeLfXlbjvBPoRX5tvqrXkfqGZqnIKIoFlv1mXYLy4K6wSz9WweJl-iRjVhWTANxDsHX67W3WAxi652Gijexf0TGC0fRp8h9tIMgYXdc4CuBTZEaqyqfcZ7gVVzgKIvTb7ryNPEhhOITPySHfUhwH3zgCUzskJdsAQypNXc5vKzY3rvTgLQ49Niq7Yzlm8-Gco0m8iCYE-PnLwq03wFaxqzIUqfHKqn5YmZqvraySp8kXZCKfjFLU-7P8mlXCClFMKpe70uLP2AUWvaMIaa3bqGtbaOFwz6_8B7qpznvJ53NvzmZ_BHDXLFyvNCTL1TLPwasEIlR4bTyrV5ilVS6KWJ1mfEqofA0IVk7ZKv-aZWBL2l4j57rJc-magUyB-Z8iVP6mKMy1FWARQO-lFRLZFp1IFvPXPJBS1oi2pVMfSHKZdpwSl51UBK9SrZ9TO7c0IOgI3QL1KpAzK
Z3luZs7gGNSfQE8n4doLX71XOtFcJ2wN4dhQGcQLTEf4AboVQ77FRMb_c6oEbaWuW2SKUwqDQak7KWSyjrXOTb02MdxzHRf-FFQj_MB-1eUll-Iv8AR-lgGAm_l85q0m8Tr0Q282j1YL31lSNgsZ8-353F3ajHkLe2ZPEuqzRKHTwFMZO5YxRPKa7yZ87dqua88dz1m6C9ee0le69NzF4tWfUdtdOKAGYJ_Ck6kZhpD7iVwbl_xir1AqcKXV9SVViu8zZmKE-LTQsZDrvxTTAvbEE2N7bXz_Bz_5n4Q">