<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href="https://github.com/llvm/llvm-project/issues/55438">55438</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
[Aarch64][SVE] Bad code generation of llvm.fmuladd.* for SVE
</td>
</tr>
<tr>
<th>Labels</th>
<td>
new issue
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
jsetoain
</td>
</tr>
</table>
<pre>
I'm trying to compile code that does this:
c += a * broadcast(b[0])
For fixed length vectors, if I use this:
```
declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) #3
define void @fp_convert_combine_crash(float *%arg0, float *%arg1, float* %arg2) #0 {
%ap = bitcast float* %arg0 to <4 x float>*
%bp = bitcast float* %arg1 to <4 x float>*
%cp = bitcast float* %arg2 to <4 x float>*
%a = load <4 x float>, <4 x float>* %ap
%b = load <4 x float>, <4 x float>* %bp
%c = load <4 x float>, <4 x float>* %cp
%b0splat = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32> zeroinitializer
%mad = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %a, <4 x float> %b0splat, <4 x float> %c) #3
store <4 x float> %mad, <4 x float>* %cp
ret void
}
```
And, when I compile it, I get this assembly:
```
ldr q0, [x1] // Load b
ldr q1, [x0] // Load a
ldr q2, [x2] // Load c
fmla v2.4s, v1.4s, v0.s[0] // mad = c + a * splat(b[0])
str q2, [x2] // store mad in c
ret
```
Which looks good to me, although one might argue that it should not need to load the whole vector `b` if it's going to splat its first element.
But, if I write a scalable version of the same code:
```
declare <vscale x 4 x float> @llvm.fmuladd.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>) #3
define void @fp_convert_combine_crash(float *%arg0, float *%arg1, float* %arg2) #0 {
%ap = bitcast float* %arg0 to <vscale x 4 x float>*
%bp = bitcast float* %arg1 to <vscale x 4 x float>*
%cp = bitcast float* %arg2 to <vscale x 4 x float>*
%a = load <vscale x 4 x float>, <vscale x 4 x float>* %ap
%b = load <vscale x 4 x float>, <vscale x 4 x float>* %bp
%c = load <vscale x 4 x float>, <vscale x 4 x float>* %cp
%b0splat = shufflevector <vscale x 4 x float> %b, <vscale x 4 x float> undef, <vscale x 4 x i32> zeroinitializer
%mad = call <vscale x 4 x float> @llvm.fmuladd.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b0splat, <vscale x 4 x float> %c) #3
store <vscale x 4 x float> %mad, <vscale x 4 x float>* %cp
ret void
}
```
I obtain this:
```
ptrue p0.s
ld1w { z0.s }, p0/z, [x0] // Load a
ld1w { z1.s }, p0/z, [x1] // Load b
ld1w { z2.s }, p0/z, [x2] // Load c
mov z1.s, s1 // Splat b[0]
fmad z0.s, p0/m, z1.s, z2.s // mad = a + (splat b[0]) * c
st1w { z0.s }, p0, [x2] // store mad in c
ret
```
Which is semantically incorrect (it's reading the arguments of `fmuladd` in the wrong order) and instead of using the indexed version of fmad is using a splat, which might have performance implications.
Additionally, if I try to use vscale_range to generate SVE from fixed-length vector code:
```
declare <8 x float> @llvm.fmuladd.v4f32(<8 x float>, <8 x float>, <8 x float>) #3
define void @fp_convert_combine_crash(float *%arg0, float *%arg1, float* %arg2) #0 {
%ap = bitcast float* %arg0 to <8 x float>*
%bp = bitcast float* %arg1 to <8 x float>*
%cp = bitcast float* %arg2 to <8 x float>*
%a = load <8 x float>, <8 x float>* %ap
%b = load <8 x float>, <8 x float>* %bp
%c = load <8 x float>, <8 x float>* %cp
%b0splat = shufflevector <8 x float> %b, <8 x float> undef, <8 x i32> zeroinitializer
%mad = call <8 x float> @llvm.fmuladd.v4f32(<8 x float> %a, <8 x float> %b0splat, <8 x float> %c) #3
store <8 x float> %mad, <8 x float>* %cp
ret void
}
attributes #0 = { vscale_range(2,2) "target-features"="+sve" }
attributes #3 = { nocallback nofree nosync nounwind readnone speculatable willreturn }
```
I get this:
```
stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
mov x29, sp
sub x9, sp, #48
and sp, x9, #0xffffffffffffffe0
ptrue p0.s
ld1w { z0.s }, p0/z, [x0] // Load a
ld1w { z1.s }, p0/z, [x1] // Load b
ld1w { z2.s }, p0/z, [x2] // Load c
stp s1, s1, [sp, #24] // Splat b[0] to the stack
stp s1, s1, [sp, #16] // Splat b[0] to the stack
stp s1, s1, [sp, #8] // Splat b[0] to the stack
stp s1, s1, [sp] // Splat b[0] to the stack
ld1w { z1.s }, p0/z, [sp] // Load splatted b[0] from the stack
fmad z0.s, p0/m, z1.s, z2.s // mad = splat(b[0]) * c + a
st1w { z0.s }, p0, [x2] // Store mad in c
mov sp, x29
ldp x29, x30, [sp], #16 // 16-byte Folded Reload
ret
```
Which introduces additional performance issues by doing the splat through the stack, instead of simply `mov z1.s, s1`, or something to that effect.
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJzlWUuPozgQ_jXkYnXEI68-5JBMpqWW9rQt7R5HBkzwDmDWNnnMr98qAwkQSJj07mG0UU8gYH9VLrvq--zxRXhev1vuMiVannm2J1qQQKQ5TxhcQ0Z0TDUJBVNwx5XlbSx7Z9mbgFju1vJ2hMLNhvhS0DCgSlvuyrfmW9ua7yz3tWxbfr8JSSJ-YiFJWLbXMTmwQAupLPcL4RF5J4VibRvWwq7-zM-QBQmVjFjelxk5kSgRVFveV2LN7CQ5pNMoLRIahtPDLPJc8KPTDu2MefQKA_K8puchi3jGyEHwEI1F-bdAZAcmNVxTH159CyRVMZg0KBgQy51TubfRQOeZc3mGcSufuZVVm1jLbWmTVB9skBMMtM81Brjb18YZuxnE5gbFv4vijEQJ7qK491Bux2WgoFk4al5KI_ntwJ6A8W9hgidggnxgaL6t8gRnHSBVXERRwsrVfrt4oXGPAVJksOoaLzgsaXj8g0nBM645TTjcD5hPzSB2JKBJ8my6mGj3eXYd3sDboC-Hau8UBKEnhY3PjwNdo0imTTpWFpa7pqlO2dhkBvgYswyKTF3buPH-newBCYsOoUqx1E_OQ9UHzSahJH-bpIYSd3KgxpHbj-W-wR_5DVeS3-nr1H3tx31pp69b93Uf9w2ufaM0oXg9uNOZqbUHp76xp6oq1Zfel5WD5b2q7dVcm7rOm3W9nM_xrpVzjyZ41nQRZvPO_P0Z8yCGxBTfFdkLEWKNSRlapImORbGPiYDqnPJ9rAkUoaLiLK4h9USRhCQTmmSMmZ4mwXXMYD0IWAZ1Vi5sH_4hD-HCWKKligzLROZaAXlJKHksYSnL9LTp8bbQFxY7Sq4ZRE5B8lHfmJCKi4yIyNhVNC2ZdQTNHRCEQTrcTeDs1Ejh3i5VYj3x7hclw4HRPMuKo-HG0eN9uAc8-cQkjmPOZ4EfcumzwM-x60DOXGm2v0GTb1stPkW8_1YCN8l4eIAtVh5sNo6eB7tfeXrM3NW4Q4TdLn3l9zsRvqZAEe3NwBAp51pCyYcr0lmDMp2jmZTllvyANwRtgt85lKG3H10aHuTdBogzBOL0gPgDIO4QiNsD0qDIVByMB9hcOT0UW3f8MJlx2YI1ZAAAwgdjcbGe4l0Na3zr6gBqdACsTNXGNUto0_RQ6eGAd8XBaD3QtzxKPQCKDfQazTTHXDsDQiCkhCqAzlYcLhkNDYsD7aIwQN5WyMOAWOWeIf2sFARSQFshQ8huGB7N0C2lAQO7FKpG4lApcBfb4HUTXPCobETJJQ-PxtdSmsT0wEjOZCQk-B0AUJon4L0GENWSE5sw5PgUB3bRFbA9R_rATXKZeN8kzfYMn-1ZxiQF1fHxx1cSSZGW--yX1j67ozkeKI_VyB3D6rakP3z0i2qK9iCelRKPUMYpiF6UB8Lh4byMkwnjYB6KgnEwz0mA1QDzr4YIf_VZnn8mXZqcfuNxi8q7b8cxeLfXlbjvBPoRX5tvqrXkfqGZqnIKIoFlv1mXYLy4K6wSz9WweJl-iRjVhWTANxDsHX67W3WAxi652Gijexf0TGC0fRp8h9tIMgYXdc4CuBTZEaqyqfcZ7gVVzgKIvTb7ryNPEhhOITPySHfUhwH3zgCUzskJdsAQypNXc5vKzY3rvTgLQ49Niq7Yzlm8-Gco0m8iCYE-PnLwq03wFaxqzIUqfHKqn5YmZqvraySp8kXZCKfjFLU-7P8mlXCClFMKpe70uLP2AUWvaMIaa3bqGtbaOFwz6_8B7qpznvJ53NvzmZ_BHDXLFyvNCTL1TLPwasEIlR4bTyrV5ilVS6KWJ1mfEqofA0IVk7ZKv-aZWBL2l4j57rJc-magUyB-Z8iVP6mKMy1FWARQO-lFRLZFp1IFvPXPJBS1oi2pVMfSHKZdpwSl51UBK9SrZ9TO7c0IOgI3QL1KpAzK
Z3luZs7gGNSfQE8n4doLX71XOtFcJ2wN4dhQGcQLTEf4AboVQ77FRMb_c6oEbaWuW2SKUwqDQak7KWSyjrXOTb02MdxzHRf-FFQj_MB-1eUll-Iv8AR-lgGAm_l85q0m8Tr0Q282j1YL31lSNgsZ8-353F3ajHkLe2ZPEuqzRKHTwFMZO5YxRPKa7yZ87dqua88dz1m6C9ee0le69NzF4tWfUdtdOKAGYJ_Ck6kZhpD7iVwbl_xir1AqcKXV9SVViu8zZmKE-LTQsZDrvxTTAvbEE2N7bXz_Bz_5n4Q">