<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href="https://github.com/llvm/llvm-project/issues/55254">55254</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
AArch64 vfma_laneq_f16 uses dup instead of lane
</td>
</tr>
<tr>
<th>Labels</th>
<td>
new issue
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
fbarchard
</td>
</tr>
</table>
<pre>
clang does not use the lane (by-element) form of fmla for the vfma_laneq_f16 intrinsic.
Actual output: a separate dup of the lane followed by a vector fmla
3df54: 73 00 c0 3d ldr q19, [x3]
3df74: 64 06 02 0e dup v4.4h, v19.h[0]
3df78: 65 06 0a 0e dup v5.4h, v19.h[2]
...
3dff0: 9f 1c a4 4e mov v31.16b, v4.16b
3dff4: 7f 0f 45 0e fmla v31.4h, v27.4h, v5.4h
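Expected output: fmla with lane (hand-edited sketch; the addresses and encoding bytes are carried over from the listing above and do not match the edited operands)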
3df54: 73 00 c0 3d ldr q4, [x3]
..
3df74: 64 06 02 0e dup v31.4h, v4.h[0]
3dff4: 7f 0f 45 0e fmla v31.4h, v27.4h, v4.h[2]
This function: https://github.com/google/XNNPACK/blob/master/src/f16-dwconv2d-chw/gen/3x3p1-minmax-neonfp16arith-1x4.c
llvm-objdump -d --mcpu=cortex-a76 3x3p1-minmax-neonfp16arith-1x4.o
0000000000000000 <xnn_f16_dwconv2d_chw_ukernel_3x3p1__neonfp16arith_1x4>:
0: 68 40 00 91 add x8, x3, #16
4: 15 e4 00 2f movi d21, #0000000000000000
8: e2 20 40 fc ldur d2, [x7, #2]
c: e3 00 40 fd ldr d3, [x7]
10: 73 00 c0 3d ldr q19, [x3]
14: 15 81 40 0d ld1 { v21.s }[0], [x8]
18: e0 0c 40 fd ldr d0, [x7, #24]
1c: 29 1c 00 91 add x9, x1, #7
20: 01 e4 00 2f movi d1, #0000000000000000
24: 28 f1 7d 92 and x8, x9, #0xfffffffffffffff8
28: 42 04 02 0e dup v2.4h, v2.h[0]
2c: 63 04 02 0e dup v3.4h, v3.h[0]
30: 64 06 02 0e dup v4.4h, v19.h[0]
34: 65 06 0a 0e dup v5.4h, v19.h[2]
38: 66 06 16 0e dup v6.4h, v19.h[5]
3c: 67 06 06 0e dup v7.4h, v19.h[1]
40: 70 06 12 0e dup v16.4h, v19.h[4]
44: 71 06 1e 0e dup v17.4h, v19.h[7]
48: 72 06 0e 0e dup v18.4h, v19.h[3]
4c: 73 06 1a 0e dup v19.4h, v19.h[6]
50: b4 06 02 0e dup v20.4h, v21.h[0]
54: b5 06 06 0e dup v21.4h, v21.h[1]
58: ea 03 04 aa mov x10, x4
5c: 49 00 01 8b add x9, x2, x1
60: 1f 08 00 f1 cmp x0, #2
64: 89 30 89 9a csel x9, x4, x9, lo
68: 56 01 40 fd ldr d22, [x10]
6c: 57 84 40 fc ldr d23, [x2], #8
70: 38 85 40 fc ldr d24, [x9], #8
74: 3f 24 00 f1 cmp x1, #9
78: 63 05 00 54 b.lo 0x124 <xnn_f16_dwconv2d_chw_ukernel_3x3p1__neonfp16arith_1x4+0x124>
7c: eb 03 1f aa mov x11, xzr
80: 4c 21 00 91 add x12, x10, #8
84: 1c e4 00 2f movi d28, #0000000000000000
88: 1d e4 00 2f movi d29, #0000000000000000
8c: 1e e4 00 2f movi d30, #0000000000000000
90: ea 03 01 aa mov x10, x1
94: db 1e b6 4e mov v27.16b, v22.16b
98: 9f 1c a4 4e mov v31.16b, v4.16b
9c: fa 1e b7 4e mov v26.16b, v23.16b
a0: 7f 0f 45 0e fmla v31.4h, v27.4h, v5.4h
a4: 19 1f b8 4e mov v25.16b, v24.16b
a8: 96 69 6b fc ldr d22, [x12, x11]
ac: 5f 0f 46 0e fmla v31.4h, v26.4h, v6.4h
b0: 57 68 6b fc ldr d23, [x2, x11]
b4: 9c 33 1b 2e ext v28.8b, v28.8b, v27.8b, #6
b8: 9f 0e 59 0e fmla v31.4h, v20.4h, v25.4h
bc: 38 69 6b fc ldr d24, [x9, x11]
c0: bd 33 1a 2e ext v29.8b, v29.8b, v26.8b, #6
c4: 9f 0f 47 0e fmla v31.4h, v28.4h, v7.4h
c8: de 33 19 2e ext v30.8b, v30.8b, v25.8b, #6
cc: bf 0f 50 0e fmla v31.4h, v29.4h, v16.4h
d0: 7c 13 16 2e ext v28.8b, v27.8b, v22.8b, #2
d4: df 0f 51 0e fmla v31.4h, v30.4h, v17.4h
d8: 5d 13 17 2e ext v29.8b, v26.8b, v23.8b, #2
dc: 9f 0f 52 0e fmla v31.4h, v28.4h, v18.4h
e0: 3e 13 18 2e ext v30.8b, v25.8b, v24.8b, #2
e4: bf 0f 53 0e fmla v31.4h, v29.4h, v19.4h
e8: bf 0e 5e 0e fmla v31.4h, v21.4h, v30.4h
ec: fc 37 43 0e fmax v28.4h, v31.4h, v3.4h
f0: 4a 21 00 d1 sub x10, x10, #8
f4: 9c 37 c2 0e fmin v28.4h, v28.4h, v2.4h
f8: 5f 21 00 f1 cmp x10, #8
fc: bc 68 2b fc str d28, [x5, x11]
100: 6b 21 00 91 add x11, x11, #8
104: 7c 1f bb 4e mov v28.16b, v27.16b
108: 5d 1f ba 4e mov v29.16b, v26.16b
10c: 3e 1f b9 4e mov v30.16b, v25.16b
110: 28 fc ff 54 b.hi 0x94 <xnn_f16_dwconv2d_chw_ukernel_3x3p1__neonfp16arith_1x4+0x94>
114: 29 01 0b 8b add x9, x9, x11
118: 42 00 0b 8b add x2, x2, x11
11c: a5 00 0b 8b add x5, x5, x11
120: 05 00 00 14 b 0x134 <xnn_f16_dwconv2d_chw_ukernel_3x3p1__neonfp16arith_1x4+0x134>
124: ea 03 01 aa mov x10, x1
128: 19 e4 00 2f movi d25, #0000000000000000
12c: 1a e4 00 2f movi d26, #0000000000000000
130: 1b e4 00 2f movi d27, #0000000000000000
134: 16 1c 36 0e and v22.8b, v0.8b, v22.8b
138: 9c 1c a4 4e mov v28.16b, v4.16b
13c: 17 1c 37 0e and v23.8b, v0.8b, v23.8b
140: dc 0e 45 0e fmla v28.4h, v22.4h, v5.4h
144: 18 1c 38 0e and v24.8b, v0.8b, v24.8b
148: fc 0e 46 0e fmla v28.4h, v23.4h, v6.4h
14c: 7b 33 16 2e ext v27.8b, v27.8b, v22.8b, #6
150: 9c 0e 58 0e fmla v28.4h, v20.4h, v24.4h
154: 5a 33 17 2e ext v26.8b, v26.8b, v23.8b, #6
158: 7c 0f 47 0e fmla v28.4h, v27.4h, v7.4h
15c: 39 33 18 2e ext v25.8b, v25.8b, v24.8b, #6
160: 5c 0f 50 0e fmla v28.4h, v26.4h, v16.4h
164: d6 12 01 2e ext v22.8b, v22.8b, v1.8b, #2
168: 3c 0f 51 0e fmla v28.4h, v25.4h, v17.4h
16c: f7 12 01 2e ext v23.8b, v23.8b, v1.8b, #2
170: dc 0e 52 0e fmla v28.4h, v22.4h, v18.4h
174: 18 13 01 2e ext v24.8b, v24.8b, v1.8b, #2
178: fc 0e 53 0e fmla v28.4h, v23.4h, v19.4h
17c: bc 0e 58 0e fmla v28.4h, v21.4h, v24.4h
180: 96 37 43 0e fmax v22.4h, v28.4h, v3.4h
184: 5f 21 00 f1 cmp x10, #8
188: d6 36 c2 0e fmin v22.4h, v22.4h, v2.4h
18c: e1 00 00 54 b.ne 0x1a8 <xnn_f16_dwconv2d_chw_ukernel_3x3p1__neonfp16arith_1x4+0x1a8>
190: b6 84 00 fc str d22, [x5], #8
194: 4a 00 08 cb sub x10, x2, x8
198: 00 04 00 f1 subs x0, x0, #1
19c: 22 01 08 cb sub x2, x9, x8
1a0: e1 f5 ff 54 b.ne 0x5c <xnn_f16_dwconv2d_chw_ukernel_3x3p1__neonfp16arith_1x4+0x5c>
1a4: 07 00 00 14 b 0x1c0 <xnn_f16_dwconv2d_chw_ukernel_3x3p1__neonfp16arith_1x4+0x1c0>
1a8: 6a 00 10 36 tbz w10, #2, 0x1b4 <xnn_f16_dwconv2d_chw_ukernel_3x3p1__neonfp16arith_1x4+0x1b4>
1ac: b6 80 9f 0d st1 { v22.s }[0], [x5], #4
1b0: d6 22 16 2e ext v22.8b, v22.8b, v22.8b, #4
1b4: 0a ff 0f 36 tbz w10, #1, 0x194 <xnn_f16_dwconv2d_chw_ukernel_3x3p1__neonfp16arith_1x4+0x194>
1b8: b6 40 9f 0d st1 { v22.h }[0], [x5], #2
1bc: f6 ff ff 17 b 0x194 <xnn_f16_dwconv2d_chw_ukernel_3x3p1__neonfp16arith_1x4+0x194>
1c0: c0 03 5f d6 ret
clang --version
Android (7284624, based on r416183b) clang version 12.0.5 (https://android.googlesource.com/toolchain/llvm-project c935d99d7cf2016289302412d708641d52d2f7ee)
Target: x86_64-unknown-linux-gnu
Thread model: posix
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJzNWklz28oR_jXUZYqs2TAYHHiQrbiylfMOqcQ3FVYRfiQgA6RE69enZxrrYCAzTzmE5aIhgl_v2zSY1NnPfXqMqydStqSqz-TSlvAHfJK3pKgb8lKc4kfz54_HgilSVuemrNoy3dCHDb3H979U7TmPM1IXJLs8b_in4nSMp18ghIisCORG3JNQEEpJSuETsnhtaHTMGnj_waIN_0w2waer2AQPUzKhJaMkoYpQTmjuJWMEodGL3MmDIfTCot0BqFGXmLbEAkss_gWxwCHGB2K73W5KtaCGalQQlpJYEumneqpfDFXBdkwllq60V1NCaLKC0ILIYE08NLel1AnIw_7Kyuz3lYGR1_J8sO7-Q_6CF_gL3n_IpbvmNrnBbfACS8P7RBPpddsNdoGXVZB4zSIdB-L7Pw-QBcWlSs9lXRkOh_P5uYWLDf8C_57AVpdkl9Yn80ddPx1zuPj29etv95__BlfJsQY_fjnFYOIGLtomhXdIm232mtbVC8-26eHVYPMK3sVVPLPtqaxO8XVb5XVVPDMVN8Bky65yN0ux4_HltK2T79nl9Ey2GdluT-nzZSMe0ro559dtHCryC3r1lB51XmQjPl-ryuT4Yy_sIwj7ePk9b6r8-GiJPz7OyD4C2Y34k7FP7xvzssGvNJHURE7E4KM4w-C5amN9iBETLFwwNQNap7KA5NIAeQEfQY6UNi446zCu5DMKNptzTjg13IvUxuelQQp9hIYdJT4NKvNKLdrGu0FnQ3QDWozoOYrRZZb0qLUqZmC9sppZQyGMYUqHnyBW2a6Fq4c--js62qWDKgOFdCk0XagsXbjVmUemUi2cZYW_9oYPpzhutabM66tbXMWt_lyTgpEwIxE3jKtZlEQ9mWsxf-kZIWsACRVFDnUFqwikPh8yf1lGDNZqr4QXK3qs8GIFXdazAftu2zFguWw7A3i9zXRg7FnKgKElz8HKAQcuGFUOLWcXHDpg5oAlxjq1nF2dmcvajTWJRZtZdO6iXd5unkmrdcg7qR20dtBuusm0z1Lg7RocMHO0ctCB1Tvx-5rTIciY19nYR5PAa3LOHLRr8wATHGS2YRrHmGmYKMxmOFThKcCqKiOTl5CgOlmmNO8Se4JSVkUGHVUbYGEKQXpCIa90qJhThFVLR5AJ5j0ygqVtfpzwkWMeH-sZ1ioVKCPgomrxoVIz15TK6haERMtJge9xQ43mfcHkYlYrQquk0EQHHvgwwkRrcKuxKKB4LW3UF71ohtB9gQkMIpDwWbI71uYevTKg84f7Lv9kCZj-O2WIPSwx0QLOdKLFynh9a6YIbW0iU8LZsgOwLlKozxwaW1jq79f6hiagrXlY5qcQ3ULB6gvVxEdB0BsoRHSSX8yfX7NMiazaWWK4JgqH-x5gJsx-mOd8Ns0bpF4eCwbk6jHAAK2WRWxZhi5LNbIULjKmy0EZx-KVwXg8L3QE0MmRiaZEu6yDkfVC6Bi1VURFRCVuso053oWYW_diTHWUXK1LPnQd5Uie0K5WwCi65D-pFV72idU7SomAREoIN-zz67nTW-90p_Z4FXZXEG-zsTbpvQ4qBNG6ImMbcX2QpF3Z8lpyUra8mqTYvDKrSexqEg3yj1dqRZNU9pqAS8J1TYZmHDqKpNYUWW5FiRxRBO0FGK_AFCuiWJMkVpSArosydnY3PDLMjJQwYaaoVQeHwxUfZZk1wgwLAsrCVmURg4OZa5cM22FmZQnXXaSGK7EmSzq6KOA3uAgnpwmFHHtkbmXRqz4aPGMy3y9LLic-Erf4KHJl0T0FSJ18nYJj4ykFrJyQxlA2Bxnia-_lHjdScAjgKkfGXXu0J7T2kth7Q3vwtsdiqCAhSQdXlJXDenLlstZdBUTWzrjh54lpkZqax7tK0Z77SqH7ShF4KwWjeKRJ_KMA60FLvozKPpmgSyRu
l9BjlwidLsHoEPqAjF1kNCLVApn2gQrIyO2mdEQGLhKP7ObsmZKi6Keygx0a6DX64FAWOTMZw6M-nLFhvKCJdyAfSvcUNhxs6RLGp3P8HGbNEgdeGDo-8MC6Az3CKGHWJhhUZlIVH51UhWsVXADcPncxPOhD2_AOi8GvRz2Gx33ogV4K6gYKeOiHecBLIbyFAs5TysyAohtr-r3H2GJe6LzrTAnorqz4hshJqrnzGMOTP7QXZkuSw1ksOQuHM57-s9RAF7PkpIwNSxdnjmG4AICmYiTQrgRyKYF0JdBdNTcSuDPhRIJhdaNcCXANkNgBZNH0w_eb_nQAYbgSiKwogV4XZZzqpCMLrgWC2MqyaPrq_aY_l0V31dc3l01kGYb80BUFS2lkRXF7_qTTe3v-TBRcIwSpdy6biDLuilwX4VohwwUTc2XhC8e8MP_4wXDHIFLvWDYRZVy3uWbBbUMR-kURC7-sihJOMmcxlfkyx53KWDikjvCIIhd-WRdlkkKLocyXQu5QxsJ-xvhF5LPVyMeVA5wMvUMZX05GwiUg_9vJiOGqAcIKiu5yGuNLF3CXJ25WWNcg7dAAs1Syq3K7LtLzZ0Wvr687fETUPS-6NMeN-PJjIx7MF4fvWTxXbQw3HsxFfWnSHP5ILk9xVb6ZB0nq0j7BJ_f_ePlX_Mqi9if_c_XbT_3t7eu3t39Hp7__9TuQiIyU0Ghj_cFObc7ts06NG5JEmaUbpe5QORzkA__WjOG-BCZoYzhNUjNVuBM0DjJzmHWYwYzLNoC1Foao3tGzMQF3Jdxm7IIdn0xbM264JwHvFsEwEv4fehcK64ecawr9zLe43qHhytiXfuDBoA0ms3yYM8SNqI0GRk02EnJO3uzNVzYumuF_gCcfHTsTd-zEtZIJZmqPyZkN5ulzN-597jaJ7um2neGWCeoKRNxyovC0q-lEMaeErohN_EHLWrEM6yzz0VMKWxxTcE9lNprrljm8b5lZp8GtVaGMPvAPRpxZaP3vFcBFF4QsnCigN2TGgE1-nj72xp-6bLcvedOWdYW62Tv3VdbUZQZa6JBrqXCtlsRtnhH4YiOZYlokJhORSE-C8R3dBWRRImIk2JUJTPyuWJzr-pge4tL8CMA-2H9u6u95eiZpJIIsirIwLThliutIUC4Zz0KqlWRZwDNehLmpB90PFuLmCTQEta9aPSq5vVS_V_VrtT2W1eW6faou_Q8bGvOjj1Od5Ufz7ee6La946y7biywSUXx3Ls_HfH9_36QHJd0f_lzavLVPrcrxByTm7h1Uvv07P5Mw-jlqwp9l217yFi6CgAfy7rAXYaGzQmpVMJlzKROl8oLlcUFzJWQu7o5xkh_bPQTchvMqfyWWBFxD9N2Ve045pwEVHIbhQO1EHgRBVkQ5WFEAxY2k-Skujzsjx65unu6avRUJ6nALN49le27Hm3Hblk9Vnlt2QD--nA91sy-SGGwTN9mdZb63wv8HgFIbQA">