<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href=https://github.com/llvm/llvm-project/issues/91863>91863</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
clang-18: Aarch64: macos: memset pattern & always_inline attribute prevents copy elision of float constants in Neon code
</td>
</tr>
<tr>
<th>Labels</th>
<td>
new issue
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
angushewlett
</td>
</tr>
</table>
<pre>
clang 18.1 (homebrew) generates memset_pattern16 function calls when assigning a float to multiple Neon f32x4 elements in an array. This causes a serious performance regression in the scenario outlined below.
clang 17 (homebrew) and clang 18 (trunk, 18.1.0rc, aarch64-unknown-linux-gnu) do not do this and instead perform copy elision, which generates much more performant code.
The behaviour only seems to happen when __attribute__((always_inline)) is set.
The two output examples below demonstrate the bug. You can see that the second output example (force_inline_unroll set to 1) is much less performant: it is considerably larger, spills its operands to the stack, and calls out to memset_pattern16 twice.
clang 17 does not exhibit this behaviour.
Compile with:
```
clang simdtest2.cpp -std=c++20 -stdlib=libc++ -O3 -funroll-loops -g -target aarch64-unknown-macos -o test.o && objdump -dS test.o
```
Example program:
```
#include <arm_neon.h>
// Reproducer toggle. 0: member functions are plain `inline` and loops carry no unroll hint.
// 1: member functions are forced always_inline (with nodebug) and each loop gets an
// OpenCL unroll hint. The report's second disassembly listing was produced with 1.
#define force_inline_unroll 0
#if force_inline_unroll
// Qualifier placed in front of every vf member function.
#define simd_forceinline inline __attribute__((always_inline, nodebug))
// Attribute placed in front of every loop over the N vectors.
#define unroll_n __attribute__((opencl_unroll_hint))
#else
// Fallback definitions: ordinary inline, and no loop attribute at all.
#define simd_forceinline inline
#define unroll_n
#endif
// Array-of-Neon-vector class: N float32x4_t values, with the class aligned to 16 bytes.
// Offers a converting constructor from float and element-wise multiply operators.
template <int N> class alignas(16) vf
{
public:
// Storage: N four-lane float vectors.
float32x4_t m[N];
// Broadcast constructor: sets every lane of each of the N vectors to x.
// Not marked explicit, so a float operand converts implicitly (dosomething relies on
// this for expressions such as `a * 5.f`).
simd_forceinline vf (float x)
{
unroll_n for (int i = 0; i < N; i++) m[i] = vdupq_n_f32(x); // <= correct: detect potential copy elision. incorrect: memset_pattern16
}
// Binary-operation constructor: for each index i, stores oper(q1.m[i], q2.m[i]) into m[i].
// oper is a pointer to a function taking and returning float32x4_t.
simd_forceinline vf(const vf& q1, const vf& q2, float32x4_t(*oper)(float32x4_t, float32x4_t)) { unroll_n for (int i = 0; i < N; i++) m[i] = oper(q1.m[i], q2.m[i]); }
// In-place element-wise multiply by other. Returns *this as a const reference.
simd_forceinline const vf& operator*=(const vf<N>& other)
{
unroll_n for (int i = 0; i < N; i++) m[i] = vmulq_f32(m[i], other.m[i]);
return *this;
}
// Element-wise product of *this and m2, returned by value; built through the
// binary-operation constructor with vmulq_f32 as the operation.
simd_forceinline vf operator*(const vf& m2) const { return vf (*this, m2, vmulq_f32); }
};
// simd: vf instantiated with N = 4, i.e. four float32x4_t vectors (16 floats in total).
typedef vf<4> simd;
// Function whose generated code is compared in the two listings of this report.
// Scales a by 5 and b by 3, multiplies the two results element-wise, then scales by c.
// Each float operand (5.f, 3.f and c) is converted to a temporary simd through the
// vf(float) constructor before the multiply. Parameter d is unused.
simd dosomething (simd a, simd b, float c, float d)
{
return (a * 5.f) * (b * 3.f) * c;
}
// Entry point; does no work and never calls dosomething. Returns exit status 1.
// NOTE(review): presumably present only so the compile command can link an executable.
int main()
{
return 1;
}
```
Output with #define force_inline_unroll 0:
```
test.o: file format mach-o arm64
Disassembly of section __TEXT,__text:
0000000100003f50 <__Z11dosomething2vfILi4EES0_ff>:
; {
100003f50: bd4003f0 ldr s16, [sp]
; simd_forceinline vf(const vf& q1, const vf& q2, float32x4_t(*oper)(float32x4_t, float32x4_t)) { unroll_n for (int i = 0; i < N; i++) m[i] = oper(q1.m[i], q2.m[i]); }
100003f54: 4f00f691 fmov.4s v17, #5.00000000
100003f58: 6e31dc00 fmul.4s v0, v0, v17
100003f5c: 6e31dc21 fmul.4s v1, v1, v17
100003f60: 6e31dc42 fmul.4s v2, v2, v17
100003f64: 6e31dc63 fmul.4s v3, v3, v17
100003f68: 4f00f511 fmov.4s v17, #3.00000000
100003f6c: 6e31dc84 fmul.4s v4, v4, v17
100003f70: 6e31dca5 fmul.4s v5, v5, v17
100003f74: 6e31dcc6 fmul.4s v6, v6, v17
100003f78: 6e31dce7 fmul.4s v7, v7, v17
100003f7c: 6e24dc00 fmul.4s v0, v0, v4
100003f80: 6e25dc21 fmul.4s v1, v1, v5
100003f84: 6e26dc42 fmul.4s v2, v2, v6
100003f88: 6e27dc63 fmul.4s v3, v3, v7
100003f8c: 4f909000 fmul.4s v0, v0, v16[0]
100003f90: 4f909021 fmul.4s v1, v1, v16[0]
100003f94: 4f909042 fmul.4s v2, v2, v16[0]
100003f98: 4f909063 fmul.4s v3, v3, v16[0]
; return (a * 5.f) * (b * 3.f) * c;
100003f9c: d65f03c0 ret
0000000100003fa0 <_main>:
; return 1;
100003fa0: 52800020 mov w0, #1
100003fa4: d65f03c0 ret
```
Output with #define force_inline_unroll 1:
```
test.o: file format mach-o arm64
Disassembly of section __TEXT,__text:
0000000100003e78 <__Z11dosomething2vfILi4EES0_ff>:
; {
100003e78: d104c3ff sub sp, sp, #304
100003e7c: a9116ffc stp x28, x27, [sp, #272]
100003e80: a9127bfd stp x29, x30, [sp, #288]
100003e84: 910483fd add x29, sp, #288
100003e88: ad039fe6 stp q6, q7, [sp, #112]
100003e8c: ad0217e4 stp q4, q5, [sp, #64]
100003e90: ad000fe2 stp q2, q3, [sp]
100003e94: ad0103e1 stp q1, q0, [sp, #32]
100003e98: bd4013a0 ldr s0, [x29, #16]
100003e9c: 3d801be0 str q0, [sp, #96]
; return (a * 5.f) * (b * 3.f) * c;
100003ea0: 90000001 adrp x1, 0x100003000 <__Z11dosomething2vfILi4EES0_ff+0x28>
100003ea4: 913e0021 add x1, x1, #3968
100003ea8: d10143a0 sub x0, x29, #80
100003eac: 52800802 mov w2, #64
100003eb0: 94000031 bl 0x100003f74 <_memset_pattern16+0x100003f74>
100003eb4: ad7d87a0 ldp q0, q1, [x29, #-80]
100003eb8: 3dc00fe2 ldr q2, [sp, #48]
100003ebc: 6e20dc40 fmul.4s v0, v2, v0
100003ec0: 3d800fe0 str q0, [sp, #48]
100003ec4: 3dc00be0 ldr q0, [sp, #32]
100003ec8: 6e21dc00 fmul.4s v0, v0, v1
100003ecc: 3d800be0 str q0, [sp, #32]
; return (a * 5.f) * (b * 3.f) * c;
100003ed0: ad7e87a0 ldp q0, q1, [x29, #-48]
100003ed4: 3dc003e2 ldr q2, [sp]
100003ed8: 6e20dc40 fmul.4s v0, v2, v0
100003edc: 3d8003e0 str q0, [sp]
100003ee0: 3dc007e0 ldr q0, [sp, #16]
100003ee4: 6e21dc00 fmul.4s v0, v0, v1
100003ee8: 3d8007e0 str q0, [sp, #16]
; return (a * 5.f) * (b * 3.f) * c;
100003eec: 90000001 adrp x1, 0x100003000 <__Z11dosomething2vfILi4EES0_ff+0x74>
100003ef0: 913e4021 add x1, x1, #3984
100003ef4: 910243e0 add x0, sp, #144
100003ef8: 52800802 mov w2, #64
100003efc: 9400001e bl 0x100003f74 <_memset_pattern16+0x100003f74>
100003f00: ad4487e0 ldp q0, q1, [sp, #144]
100003f04: 3dc013e2 ldr q2, [sp, #64]
100003f08: 6e20dc40 fmul.4s v0, v2, v0
100003f0c: 3dc017e2 ldr q2, [sp, #80]
100003f10: 6e21dc41 fmul.4s v1, v2, v1
100003f14: ad458fe2 ldp q2, q3, [sp, #176]
100003f18: 3dc01fe4 ldr q4, [sp, #112]
; return (a * 5.f) * (b * 3.f) * c;
100003f1c: 6e22dc82 fmul.4s v2, v4, v2
100003f20: 3dc023e4 ldr q4, [sp, #128]
100003f24: 6e23dc83 fmul.4s v3, v4, v3
100003f28: 3dc00fe4 ldr q4, [sp, #48]
100003f2c: 6e20dc80 fmul.4s v0, v4, v0
100003f30: 3dc00be4 ldr q4, [sp, #32]
100003f34: 6e21dc81 fmul.4s v1, v4, v1
100003f38: 3dc003e4 ldr q4, [sp]
100003f3c: 6e22dc82 fmul.4s v2, v4, v2
100003f40: 3dc007e4 ldr q4, [sp, #16]
100003f44: 6e23dc83 fmul.4s v3, v4, v3
100003f48: 3dc01be4 ldr q4, [sp, #96]
100003f4c: 4f849000 fmul.4s v0, v0, v4[0]
100003f50: 4f849021 fmul.4s v1, v1, v4[0]
100003f54: 4f849042 fmul.4s v2, v2, v4[0]
100003f58: 4f849063 fmul.4s v3, v3, v4[0]
100003f5c: a9527bfd ldp x29, x30, [sp, #288]
100003f60: a9516ffc ldp x28, x27, [sp, #272]
100003f64: 9104c3ff add sp, sp, #304
100003f68: d65f03c0 ret
0000000100003f6c <_main>:
; return 1;
100003f6c: 52800020 mov w0, #1
100003f70: d65f03c0 ret
Disassembly of section __TEXT,__stubs:
0000000100003f74 <__stubs>:
100003f74: b0000010 adrp x16, 0x100004000 <__stubs+0x4>
100003f78: f9400210 ldr x16, [x16]
100003f7c: d61f0200 br x16
```
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJzkWkuPozoW_jWujZXIrxBY1KKrqlsaaXTv4vZiZjaRMXbCvYApbOoxv37kB6-QVLp7ajHStFoUAZ_3Od85GLgx5bGR8h7sHsDu6Y739qS7e94ce3OSr5W09i7Xxfu9qHhzhDjdYghIetK1zDv5CkgGj7KRHbfSwFrWRtpDy62VXYMTqPpG2FI3UPCqMvD1JBsYJJbNEXKoKs0ttBrWfWXLtpLwN6kbqCh5Y1BWspaNNbBsIG8g7zr-voXfT6WBgvdGGsihkV2pewNb2Snd1bwREnby2EljnNiygfYkoRGy4V2poe5tVTaygLms9OsWoCeAvoRjtG9_bh1vCjjY7u7Zrm_-AuTRu2KLOuHOOe_EKWGbvvmr0a_Npiqb_m1zbHrHodCw0db9safS-OVNAcvGWMmLQXModPsOZVV6vV9PpTjNHduLE6x1JydDLRS6kAsTvp8kzOWJv5S676BuqndopKyNc_CJt61sQgQOB25tV-a9lYcDICkgKa9e-bs5lI1zDyCZ07s00Ei7kmBfvR_b3kL5xuu2kia4Exay1o2xTmXv9rw_buE_dQ8Fb5wm0J5ctF1ApNBNccbGCfSGVtLMImqdx4peOisq3h1lB035bxkC49NK9yGHfPZBacXlwBZamhCJhZ6lmZy2IHzUdVtWEr6W9gRovAgSFP_PmJuyLqw0lmxF28KNsQWgTwKQB0AeCPIXqjIH9Kkq83gZbn6ncKP6ptNVtam0bg3cHCHcWGeiXWVUzYU2cKOhk7PVEJAEkATq_M-ir1u4Kf6Idy6qGY5fo5_bTh87Xk82XSQhtGxE1RcSAvrIu_rQSN1sT4B-XVARWkhVNhIq3QkZM-gQ7ILobGmpLi0LtyGEcOIGoffqwS8Pq2H8czt9H2GjC5n3x5DJV_gH4YeL9aBb2Ygq6nc4lS4LJ1aAUFkZeYnvFaVvqDCxbYpSLZwW73wD5Bt8CoQDGG602ji83LxIYXXnYMoYn66QQ6GbF9l5NFGdrgPUDuDDB8B9h7p1GKOH1LeybitXGIA-lo2FvwH6NTLmVXlsuAEkxYnDh5dB0f1DOGn7vCrFmFbOQC_VofnBwhrsHn4DuydAH6YFK3-9KIeyoTG8LYO3n9EtAqh054icviUE9AkiQB_86aMz4AGWoeic1k6LEuye_LqXom-fD81BUQJI6sXRBxjC5D0O6KNbJ3TXSWEB_QILaaWwsNVWNrbk1QK4t7BsZmvP--HclqfZj4-8AUgqHFr50wQ-YxfD-RX4TNylmad9Cn9xgfVJmy5una30SA_2D5_iziAyfcbb4bKT90xmP72DL1m_Mn1u45CkgHwB9GnuEvroMtSvsadg8IWE-YRMqfvqOebJ3Dgv9cy-KLSTtu8aCMgX3_fnWf_j4V-YvkyFmjgtw5UYwigy1NAgmDz6pY9zG5ZhcCeDegvA4ZBt4hgG7Xsr_dTihyytxvp3NwqpQjSYwwtnxxlDdwkW2uha2pMb_gBJ_TXuNPNn-ZicUEynxYS6-5VnU-7cC3db5dOYfHHXcn9Cp2ti0mW02B9dGtS8bLyzVlJGd-IL9Jf66-9hnPEADG81xisDRWzi7m6m3PThhyCnpjhtNORdnbC5zKfScGNknTsoV26w8vP24fD96z--A_J4OFj5Zs9aPQr_sDtQtUMu_Q-Hf2E8CxB5UX_7e8m-fv0DHZRyNTby8KkTPTLycHiXF8z9QCGrs6roAMqM6xaPEOweTOtKZOTx_wh4g7uYcxdTCKkkw9FdqtYvW2YAyl7w3ruM0N02xgot6VNHn0iKC4HQSN9XkR75cg9HvF-SiomU4DUpDkSXSBM0kTKyJg0gQy6Ssok0oWtS6onoRdJ09NUOf-QretlXyczglK1FMy-UXRK9nxnMd2vSnSfaXSSdGSySNamviXg8J50FV-7XpN7eeDwn
jbYSdjsv2IIyjaaS3e202C0po6UkuZ0VyZIyGkr2PimuZ8TSyFSEhMhQhm4mfwJ2D2iEncghQxOHmzVwmQObONwshcsc0onDh8afkXs4Qdkvt8BBvHdikewUomJwYift9TbBQ5vw_fKsG6z65EjihOxIihAig5BavwCUvaJYtXhJwRZqLXS61XI_7rj4f6fjyn36X3dcGVCiwIgJqlT0relzd2z9TNUOuIjYktKHnmcYJ0oJT2ZbgLI310Uf4RvZT_06cCB7cpa-MuAFzzDZ56oYxEc-medD0YpPmq74-IBnGLGUjnx4UUx8FtQLUu8BXiCaKZlM8p89qj6vrMB4bYWILAjeS7aw4tl3hOfdOZeEnTMJaMILhJCSZMnEI8AzXQ9AAy2LtBhRiWeEHnyeVx6kKxMCjrjJC1N-NnkN5NGTzgfJit67gBYpwrlEo_aOfi0-Sz4Zh2SAiCyWxxj_zieSdwJ6C0sd1t-uGvKAXB4Pm1SDlJhlVKIJ8GOWeSHh6BycJcss40OdYTb5N9TZGwr1Mjg3RUtKMaJfisgS_ciUTXOSPHiD-V-DnnkFUDZ4Qe1ZgOHzrQVn-Gz2WNqfxzTbF-l-liTtGOSQb4tU2aTnPUvmacgVMc_0kGsh0-e5wla1ng_TCSoEu9q4SWzfc0qBhiRF6maSrgULNuo95XjU-wdKTAxzSpi2P5g2lmRjZaHblTWT-kmVVURY2sufiPnad8XkO_pBzM-p0kWofzTOxeQy-oHLzoRJNKq4vxneNQJK9kvhlemo6_5mePGnA6cUnw-ca-BQaABO5oDzOmqmSyBTQ1snbApkJEWLro7ZGWX606ipxISaWH4OZCoUy4ex1IX3Wu0s7Dgb8dFYOvij0rkyVyiU_iJcKiRGwXtJPpK6wniF0VQN7OqzEVkXhMKxybBdOu8Ol-eg6LT9eTUqPPYYrCSb6c4-muc-6akIDx2KFCK9-ljHogvmlGTEIEJvqE3OMVaRAYFoIdKruyMsPhQuKOcdmS1TbCV5Be6KzDpyehn62IX8omjWT29JXfVTRWeAm15NMXYhxWg6a0ZXvHwu7JdDyuZt5ZaVq7ai2DKoPxpRNuX_bd9ma6lxcyRltzdH2KV9iR2aGNzaG7nMgE0Mbm2NXGaQTgxu7RVeZhAfcHfzZ9MARD_zbBr3O3m2Cw_KSz4__qwcNz-z5aN6aIYfP6rHvc-f2aVJxNVdmksvNBZ7pD-zURO2Ri9v1PzgNomxfW4-fDMRG_ewcjJnscmaR5LzGSiZDUFsHIICL9f6Vy0_bKkoN0gQvJwj38aXGG_rSt_HvTSsEBlrLh_Irm9e3RX3tMhoxu_kPd7jHdsljOC70z3LUJHlGd1RhBjJCkIFkooVSYZ5xhN8V94TRBjaYYwzN5htc0wlxgnFe06wZAowJGteVtuqeqm3ujvelcb08j7DaULvKp7LyvhPvghp5Cv0NwFxmXvX3TuaTd4fDWCoKo01Exdb2kqGb8E2oVN_CR-q-NfdXGgzvfeGcdKCgCRw8YUGHL-3gG0nX_xHXovvn7QaXgRq_8IxfATmvw0TupB3fVfdn6xtffr4V5XH0p76fCt0Dcg3p278s2k7_acUFpBv3kgDyDfvhP8EAAD__5c-OwQ">