<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href=https://github.com/llvm/llvm-project/issues/127134>127134</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
[AArch64] auto-vectorizer regression
</td>
</tr>
<tr>
<th>Labels</th>
<td>
new issue
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
appujee
</td>
</tr>
</table>
<pre>
Baseline: c4c5e79dd4b4c78eee7cffd9b0d7394b5bedcf12
Regressing version: 3c92011b600bdf70424e2547594dd461fe411a41 or more recently f142f8afe21bceb00fb495468aa0b5043e98c419
Command to repro:
```
$CC -c -nostdlibinc -D__BIONIC_NO_PAGE_SIZE_MACRO -O2 -DANDROID -DNDEBUG -UDEBUG -D__compiler_offsetof=__builtin_offsetof -D__ANDROID_UNAVAILABLE_SYMBOLS_ARE_WEAK__ -faddrsig -fcolor-diagnostics -ffp-contract=off -fno-exceptions -fno-strict-aliasing -fmessage-length=0 -gsimple-template-names -gz=zstd -no-canonical-prefixes -ftrivial-auto-var-init=zero -ffunction-sections -fdata-sections -fno-short-enums -funwind-tables -fstack-protector-strong -Wa,--noexecstack -D_FORTIFY_SOURCE=2 -Wstrict-aliasing=2 -march=armv8.2-a -mcpu=cortex-a55 -target aarch64-linux-android10000 -DANDROID_STRICT -fPIE -Wimplicit-fallthrough -D_LIBCPP_ENABLE_THREAD_SAFETY_ANNOTATIONS -Wno-gnu-include-next -fvisibility-inlines-hidden -Isystem/media/audio_utils/benchmarks -D__LIBC_API__=10000 -D__LIBM_API__=10000 -D__LIBDL_API__=10000 -Iexternal/google-benchmark/include -Iprebuilts/clang/host/linux-x86/clang-${cver}/android_libc++/platform/aarch64/include/c++/v1 -Iprebuilts/clang/host/linux-x86/clang-${cver}/include/c++/v1 -Ibionic/libc/async_safe/include -Isystem/logging/liblog/include -Ibionic/libc/system_properties/include -Isystem/core/property_service/libpropertyinfoparser/include -Ibionic/libdl/include_private -isystem bionic/libc/include -isystem bionic/libc/kernel/uapi/asm-arm64 -isystem bionic/libc/kernel/uapi -isystem bionic/libc/kernel/android/scsi -isystem bionic/libc/kernel/android/uapi -fsanitize=memtag-heap -fsanitize-trap=all -std=gnu++20 -fno-rtti -Isystem/core/include -Isystem/logging/liblog/include -Isystem/media/audio/include -Ihardware/libhardware/include -Ihardware/libhardware_legacy/include -Ihardware/ril/include -Iframeworks/native/include -Iframeworks/native/opengl/include -Iframeworks/av/include -o audio_vectorization_benchmark.o system/media/audio_utils/benchmarks/audio_vectorization_benchmark.cpp --save-temps
```
$ grep -c '[uf]cvt' audio_vectorization_benchmark_new.s
24
$ grep -c '[uf]cvt' audio_vectorization_benchmark_baseline.s
42
Haven't reduced it yet, but here is the full source code.
```cpp
#include <functional>
#include <random>
#include <vector>
#include <benchmark/benchmark.h>
// A small subset of code from audio_utils/intrinsic_utils.h
// We conditionally include neon optimizations for ARM devices
#pragma push_macro("USE_NEON")
#undef USE_NEON
#if defined(__ARM_NEON__) || defined(__aarch64__)
#include <arm_neon.h>
#define USE_NEON
#endif
// Variable template whose value is `false` for every T; the value depends on a
// template parameter only syntactically. Nothing else in this excerpt refers to it.
template <typename T>
inline constexpr bool dependent_false_v = false;
// Fixed-length built-in array wrapped in a single-member aggregate, so it can be
// passed to and returned from the vector helper templates below by value.
// The member `v` is a C array, i.e. std::is_array_v<decltype(v)> holds.
template <typename Elem, size_t Count>
struct internal_array_t {
    // Element count, usable in constant expressions.
    static constexpr size_t size() { return Count; }

    Elem v[Count];
};
#ifdef USE_NEON
// Maps a lane count N to a vector type through a nested `t` member.
// Only the empty primary template appears in this excerpt; no specialization
// that defines `t` is shown here.
template<int N>
struct vfloat_struct {};
template<int N>
using vfloat_t = typename vfloat_struct<N>::t; // typename required for Android 14 and earlier.
// Selects vfloat_t<N> when F is float, and internal_array_t<F, N> for any other F.
template<typename F, int N>
using vector_hw_t = std::conditional_t<
std::is_same_v<F, float>, vfloat_t<N>, internal_array_t<F, N>>;
#else
// use loop vectorization if no HW type exists.
template<typename F, int N>
using vector_hw_t = internal_array_t<F, N>;
#endif
// Element-wise multiply: returns a * b.
//
// The implementation is chosen at compile time from T:
//   - float or double: plain scalar multiplication;
//   - float32x2_t / float32x4_t (and float64x2_t on aarch64), only when USE_NEON
//     is defined: the matching vmul_f32 / vmulq_f32 / vmulq_f64 intrinsic;
//   - any other T: T is decomposed with structured bindings as a single-member
//     struct, and vmul is applied recursively to the parts of that member.
template<typename T>
static inline T vmul(T a, T b) {
if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
return a * b;
#ifdef USE_NEON
} else if constexpr (std::is_same_v<T, float32x2_t>) {
return vmul_f32(a, b);
} else if constexpr (std::is_same_v<T, float32x4_t>) {
return vmulq_f32(a, b);
#if defined(__aarch64__)
} else if constexpr (std::is_same_v<T, float64x2_t>) {
return vmulq_f64(a, b);
#endif
#endif // USE_NEON
} else /* constexpr */ {
// ret is not initialized here; it is filled in through the bindings below.
T ret;
auto &[retval] = ret; // single-member struct
const auto &[aval] = a;
const auto &[bval] = b;
if constexpr (std::is_array_v<decltype(retval)>) {
// The member is a built-in array: multiply element by element.
// The trip count std::size(aval) is the array length, known at compile time.
#pragma unroll
for (size_t i = 0; i < std::size(aval); ++i) {
retval[i] = vmul(aval[i], bval[i]);
}
return ret;
} else /* constexpr */ {
// The member is not an array: it is decomposed into exactly two parts,
// each multiplied with a recursive vmul call.
auto &[r1, r2] = retval;
const auto &[a1, a2] = aval;
const auto &[b1, b2] = bval;
r1 = vmul(a1, b1);
r2 = vmul(a2, b2);
return ret;
}
}
}
#pragma pop_macro("USE_NEON")
// end intrinsics subset
// Frame count passed to Processor::process() on every benchmark iteration; each
// buffer in BM_VectorTest holds kDataSize * channelCount elements.
static constexpr size_t kDataSize = 2048;
// Adds one single-value argument set per channel count, from 1 through 32
// inclusive, to the benchmark `b`.
static void TestArgs(benchmark::internal::Benchmark* b) {
    constexpr int kFirstChannelCount = 1;
    constexpr int kLastChannelCount = 32;
    int channels = kFirstChannelCount;
    while (channels <= kLastChannelCount) {
        b->Args({channels});
        ++channels;
    }
}
// Macro test operator
// Reinterprets out, in1 and in2 as pointers to V<F, N>, stores vmul(*in1, *in2)
// into *out, then advances all three pointers by N elements.
// Requires F, V, out, in1 and in2 to be in scope where it is expanded.
// Expands to several statements, not to a single expression.
#define OPERATOR(N) \
*reinterpret_cast<V<F, N>*>(out) = vmul( \
*reinterpret_cast<const V<F, N>*>(in1), \
*reinterpret_cast<const V<F, N>*>(in2)); \
out += N; \
in1 += N; \
in2 += N;
// Macro to instantiate switch case statements.
// Each expansion is a `case N:` that assigns mFunc a capture-less lambda applying
// OPERATOR(N) `count` times, followed by `break;`.
// The static_assert checks that one V<F, N> occupies exactly N * sizeof(F) bytes.
// Requires F, V and mFunc to be in scope where it is expanded.
#define INSTANTIATE(N) \
case N: \
mFunc = [](F* out, const F* in1, const F* in2, size_t count) { \
static_assert(sizeof(V<F, N>) == N * sizeof(F)); \
for (size_t i = 0; i < count; ++i) { \
OPERATOR(N); \
} \
}; \
break;
// Holds the multiply kernel selected for a channel count behind a std::function.
//
// Traits must provide: data_t (the element type), container_t<T, N> (the type
// handed to vmul through OPERATOR) and the boolean constant loop_.
template <typename Traits>
class Processor {
public:
// shorthand aliases
using F = typename Traits::data_t;
template <typename T, int N>
using V = typename Traits::template container_t<T, N>;
// Chooses the kernel stored in mFunc for the given channel count.
Processor(int channelCount)
: mChannelCount(channelCount) {
if constexpr (Traits::loop_) {
// Two nested run-time loops: `count` outer iterations, channelCount inner
// iterations, one element multiplied per inner step.
mFunc = [channelCount](F* out, const F* in1, const F* in2, size_t count) {
for (size_t i = 0; i < count; ++i) {
// NOTE(review): j is size_t while the captured channelCount is int.
for (size_t j = 0; j < channelCount; ++j) {
OPERATOR(1);
}
}
};
return;
}
// This switch is not inside an `else` of the `if constexpr` above, so it is still
// compiled when Traits::loop_ is true; the `return` only makes it unreachable then.
// There is no default case: for a channelCount outside 1..32 mFunc is left empty.
switch (channelCount) {
INSTANTIATE(1);
INSTANTIATE(2);
INSTANTIATE(3);
INSTANTIATE(4);
INSTANTIATE(5);
INSTANTIATE(6);
INSTANTIATE(7);
INSTANTIATE(8);
INSTANTIATE(9);
INSTANTIATE(10);
INSTANTIATE(11);
INSTANTIATE(12);
INSTANTIATE(13);
INSTANTIATE(14);
INSTANTIATE(15);
INSTANTIATE(16);
INSTANTIATE(17);
INSTANTIATE(18);
INSTANTIATE(19);
INSTANTIATE(20);
INSTANTIATE(21);
INSTANTIATE(22);
INSTANTIATE(23);
INSTANTIATE(24);
INSTANTIATE(25);
INSTANTIATE(26);
INSTANTIATE(27);
INSTANTIATE(28);
INSTANTIATE(29);
INSTANTIATE(30);
INSTANTIATE(31);
INSTANTIATE(32);
}
}
// Forwards to the kernel chosen in the constructor; `frames` is passed as its count.
void process(F* out, const F* in1, const F* in2, size_t frames) {
mFunc(out, in1, in2, frames);
}
// Channel count given to the constructor; stored but not read inside this class.
const size_t mChannelCount;
// Kernel selected in the constructor (assigned in its body, hence not const).
/* const */ std::function<void(F*, const F*, const F*, size_t)> mFunc;
};
// Benchmark body: multiplies two pseudo-random buffers element-wise through a
// Processor<Traits>. The channel count is taken from state.range(0).
template <typename Traits>
static void BM_VectorTest(benchmark::State& state) {
using F = typename Traits::data_t;
const size_t channelCount = state.range(0);
// Each buffer holds kDataSize * channelCount elements of type F.
std::vector<F> input1(kDataSize * channelCount);
std::vector<F> input2(kDataSize * channelCount);
std::vector<F> output(kDataSize * channelCount);
// Initialize input buffer and coefs with deterministic pseudo-random values
std::minstd_rand gen(42);
const F amplitude = 1.;
// With the default template argument the distribution produces double; each value
// is converted to F when assigned below.
std::uniform_real_distribution<> dis(-amplitude, amplitude);
for (auto& in : input1) {
in = dis(gen);
}
for (auto& in : input2) {
in = dis(gen);
}
// channelCount (size_t) is converted to the constructor's int parameter here.
Processor<Traits> processor(channelCount);
// Run the test
while (state.KeepRunning()) {
benchmark::DoNotOptimize(input1.data());
benchmark::DoNotOptimize(input2.data());
benchmark::DoNotOptimize(output.data());
processor.process(output.data(), input1.data(), input2.data(), kDataSize);
benchmark::ClobberMemory();
}
// Report the channel count as N for the benchmark's complexity calculation.
state.SetComplexityN(channelCount);
}
// Parameters are bundled in a traits struct instead of being passed as template
// template arguments, because Clang's -frelaxed-template-template-args handling
// may not follow the C++17 guidelines.
//
// Traits for the test that uses two loops on float data.
struct LoopFloatTraits {
    using data_t = float;
    static constexpr bool loop_ = true;

    template <typename Elem, int Count>
    using container_t = internal_array_t<Elem, Count>;
};
// Runs BM_VectorTest with LoopFloatTraits.
static void BM_VectorTestLoopFloat(benchmark::State& state) {
BM_VectorTest<LoopFloatTraits>(state);
}
// Register the benchmark; TestArgs supplies its argument sets.
BENCHMARK(BM_VectorTestLoopFloat)->Apply(TestArgs);
// google-benchmark macro that provides the program's main().
BENCHMARK_MAIN();
```
</pre>
<img width="1" height="1" alt="" src="http://email.email.llvm.org/o/eJysOltz2jjbv0a50TgDMgRykQtzepfZhnQSup39bjyy_Ri0lSW_kkyS_vpvJBljcwjp9s10miA9es5HCao12wiABzScoOHshlZmK9UDLcvqH4CbRGbvDxOqgTMBKIxwOkiHMLrPskEySEdjABileZ7dJ71sFN4PkmECWZr3CepFz7BRoDUTG7wDpZkUFkGY3pNev5_c9XpJlo96AzIAMhyMhveDLBvc9XMY9Pt00MdS4UIqwApSEIa_47w_IPmY5kD6SQpJr5cng_vh4G5MaS8Z9gYh3I_TQf8e9SLUi6ayKKjIsJFYQakkCu0quuvV_3oRIoPpFAcpDoTUJuMsYSLFGAezOJ4sn1bLabx6ir9G_5nHL8v_m8eP0fT5CQdPBAezaDV7flrOcDBbzeaTb__Bwbf69yyOU1mUjIOKZZ5rMDJH4SyOk4pxw0Sz6EBrPPG3VfRXtPwSTb7M45e_HydPX17i6Hkef59Hf8YxDnKaZUqzDQ7yVHKpgozRjWWbpRoHeV4GqRRG0dSgcCbzHAe5kAG8pVAaJoX2n7VRLDUB5Yw6swR5AVrTDQQcxMZsUTjr4WCjWVFyCAwUJacGAkEL0DjY_ETh7Kc2mVVYkFIhBUspD0oFOXuzELlRbMcoD2hlZLCjKmCCWY5-gpKWzUqklp1AQ7pnK6OGtj9bNrdSmQBEVdiFSrwykQWGJtzR0IamP4JSSQOpkcoKJa0s3yki0yAQEt4gdUBWw4un5_Vy8Xf88vTteTpH4Yzg4PuRHtwqDgqqUqsCqord-JYEFAdFWlYonKVSGXgL6HCIA0PVBgymFvhuEHAmqreAikxJlvV7vV7v4B3xy_p5OV3jIP-6nOPgu1UrS5kJcsq52SpZbbaWxy_LyfTr13i-cuZf__E8j2bxS7SYr_-Oo9XqaR2tl0-rFxx8FzLYiCpgIuVVBoGAN4ODfMc0Sxhn5j1gwgaqDrYsy0DgYKnftYECkUUBGaOILGiVMRlXhnGNyCIBkW4Lqn5o546Wkzj6uoxjFM720rjlx_PLsy_H60t4M6AE5YgsNlJuOAQNEUQWNes4WJYKXEhYNlJOxQaRxVZqg8jCK_VtfLffChAZoNEk3YFCo5mVwis85ixJEZm4fwvrrblUVtraPAeCFlMDuOv_Lv2LWBNmo8LhsJwtqH4XaaxpDh3hG7NwudkwR5uzhMtNB-oYmT8Vl0qWoAwDfR5nKpWlVoO9xxrUjqXg8exXmchlSZUGdZFkxg9bcanYjhrAAfN08DFzDY5LAD9ACbAoK1oyp5oioKq4G3zyyCfAarewqkr1rx3wFHJNBTPsJ6BwVkBh6CbYAi1bG4FRtLRZgnMcaJOhcLYRlXcC0vMJTBnDzhjkV81_PnY7IFuqsleqatu2Pl0DiTlsaPp-CVIx3tnKFS3gVaof1uUENWwHn9iXJYjNB4jorrWHA4l9ctq5xM5-UlsT4iZ73Er86WzWbF3ClZYlDgJNd77O6ePewLUHeKOgxIiM0HBSodE0R8NZujOIjD7mNBbwemtRksFvIkrq1sthG5Cas170B92BQGRksIKsSiHDzOB3MIhMcVIZvAUFmGlstoDzinOsZaVSwKnM4LYta1qWjsVwbwYUTvdlmnIUzk92FRWZLM7teCHqnePNdhE4mGHbgl4gssAR1oUNLV0lGgyWuWMZ50oWuGtsJoxiQrPUr9xuO3i-W1lFxrwc_B3vWREgBZalYUWtbY1zqXD0_IgzsGlSe9ZLRTcFxWWlt3FBUyURGSNCvr3M49X8aYUIQeTeg1Yigxw3O43wOc4gZwIyRMZxHD0_uv04RuQeo9EUjaYdgLpmuf0T9VFVxJb1RmMk9Gc7dEkIImO5Z2Hfv9nj5r0E28bhtT_u-wSrIm3grVQ4kZLjDEoQGQgT55RriHcYhTPs_kbhpKPf9XsJ1jpU
KfqOoUggy6wXCkyxNqpKDTZbaqwPVtq2bnbLeuPK6r9hbe9qGifA5etti8DWuq-lUlTaYE0N0_k7dhk3QmHEdOxoxzsUTlE4v21J3BWYTLFmPyE2eOWFr_ljwjcqNR6D0cjKiDHGa7xDw8kKDWdebrumDTUsbamsRmp_Od-wRp1gBaZSwpKaYNsr9CL7_0F7IctPvaXFOBPHfO5yLqmJ60-WyRbC8ycrP3X5g8ZZsVFIBx0Kp-6IU6mxPONa_-a9FFBQO3_9t2IKMh8mvl7i_gDb4Qqo4gzU7QkvDbWFVf8pZy5TxNvXmrnGqq2YjS1zte69_g-W17QAZ3iH3glk0ZNpI_NeME-9Y-f9MS_4_GAbH0DW2dueXmnAXMoSd3I0ZjkWEv_x3fsovDFt9CUf_JwSPubz4EBnQvxMhNfuWgf6Gu-KiiMyXmM7JuE1TmqHrTXM8pZnIzI-q-11W9v7FHYZMpNVwsFZoU3K_tRRQjEiEU4-DA8LjkYzbA3zq2yG5I3EpstBTdrqI85DgsjYacTq4xDsv0FwcEywK7Kl-9_zhE9rxlFJ-Nds3Q1O9XCWLTsynWGrcbn6z32S6GaxjuIcRNTh0kbUEfW1pX9Qu_2hlZEYkTs0nCgwO8rRcObiw0M2CcrGEIegsLVH1TXHonEU21hoCwft0jqBTVqwSRf2I5UfKlEGKbehiMi45t6qsaX3Q2tRCSU5txRsZrUYfUFhjnrPymr_bEVYXWroHu0E-7mDnZq1ZV4n0oTtxaoTAT0sO3O3P953Ja_terRSO86J_X7RA05s3rfsKNKyumXNkzi1rYOmDTQ9wLZ_Tu3sziXNueRwTvU7avKA_bNKwYp0YEmNtIH9QEeHeKlbhH0C3Deesvy472wKFAjbd9XNsK47Zw9xqWf5MaOGvrCf4NgnvcG4ScH1kZ2t8mvQJlIbjcj40L57n69rlf80OfT20XFhOdC2BfDHdEuFAD6VlTCPTDj6_YNyPoKmbw46JAfwOnAsrI-aY_xNEJ1u0rcPAygJUDivhUejCXP3Pnu7nljNGeLRmgsb0AbLEhS1w9DeqnWz_vR1_hytn54RGa8czaFrcRCJFDidlgpMnFJtq_9fnQ6ARO7_sayMO3nwuz0W605nEXn3P4-OCefbZPqbWJxb1kmpQSQr4zQczuqGuNlhon9xh7R3zmlYYia0ocIwO0PoV2bSLU6pBtekQwHC92Id1S9XL-totV5G6_mR9q3f2cMrFEattWJRidTp2b-NIDJ2Cc0ZYFrnFLfidHi0QlpzR2odbj8g1AR8nMVUa1CmTv4yR2R8pGBnaacM1ys1YItz-m7FxKVi4nk58fwjJPbnyFdPKdlM33IaN5W0FhIF9MfJoNKdRhVlRvuGNeVUa_xVyRS0tjK4cCyrhLPUP940lX8rldna-cPd4LuJ3RL0bfWiO-7sSdg8lVFD41YqvjAhd5v1A-a_LmNuMKVSGMoEKNe_r0_6d4utkbFOXWkrLdWN3l7DYYSLaWd7fAS9bysuNyptPu0gE5_vFjru3iHyv3L9T7Q6F7zzTGNz6ur_HJD945G1hWhw_vMxziPP7x8n_WPY09VmPG8v-k7gLKY6f1007R6um8BajHU3yEmr0t0Pr-wPLiEeXjl4d2V_dAnx-MrB-yv7_d4lzP3Tvu0I4Jq2-uFF3INrR68prH93Effo2tFrOuvfX_SP3pWj5JrSCLmI-5pzkWtKI8OLuK_5F7mmNDK-iPuai4XXlBZejMfw1MVO2n__wfXcpS8O_z7juhcWfZo-XH7f94_TPZr6bHOo2-C2mvI9-m45OlyYtIa9_aDXzK37W14UTnfutc3J1pXi9JMn6OfnmntfRc9cf17sKtrjzOQx_svdudm55nSmebG9IyJ3vonsKvCXW4uOztp5vb7vpAZuFRUbO88ffMs3hrXW9s8p04VVABNlZfqIjFuzm1X4Ue_Quq6-jIb8NhpZmdL60WfQtD1kgZeCGUa5PeF4wUmV
56DcdXIqIdf4lZktzsCAKphg2pqv1FBlMvBPT3hHedU0fQ2DhZ0JstjC4A0IW8m6kVd7F6ZFyZnx7yoz3L89I2wlWC5VESugPM6YNoolVe3BVvyM2QANGkzuEuLwYU-07k9oZaT1KzfuRo0ljwO0noY9bidB90KyO_OeQ0r-FdKTrjScNhG0T0euV_2EdZ8r_8pjh-B653XLOPgLM-v0fwKUz5UQ7tV77IeY49G7G5czuZLmyb_YgWuZrf5ubcQ1GGqZPnOS_JuT3t_Pnmw0dHtI3WfAp_gM3_UiOVpsguoif1MukwTUIxRSvSMyPucrXt8vYKayKDm8MfO-umjFo6uMKadig7dUYyow07oCH5ZBroDTN8gO3w5r_qBqo_HrFhQc8DCDC_qOhTQ4l5zLV-cdU9-J90d4U7HMPXDrW4y_acAUG-d7zSOiPCAr7YDIBC6prVYG1GHI90-GoE2dqc2rdI82DqJG9UXKcsElNd67W053toocP9rUmFsD3idfbQ71wxcJ_6jqX1Emh6uA48dYN6r5gqMqOCp9F8taI-Qv1LduXQynR4ryNzz7Q0f-Mpmvpn88Rs9_IjK-xMi9u0orS2499XCnWKNqMMSP0XLV9uXmSwo32UOY3Yf39AYe-qPwfjAe3I3GN9uHu3F_PArDdEjhbjRMCRnTrEdGIaEkpzDOb9gD6ZFhj_RDQvr9_vg2SUZZPiT3SXbX792FORr0oKCM33K-K26l2tw4b3_ok1E_HNxwmgDX7suxhAh49bGACEHD2Y16sIeCpNpoNOhx9wLYoDHMcPet2ijyX0UbzrD_WmT9jAgKq_obslLcVIo_bI0pXTvh3HnDzLZKblPpvifEd_tfQankP5AaRBaOGY3IouZ290D-PwAA___bGAaC">