<table border="1" cellspacing="0" cellpadding="8">
    <tr>
        <th>Issue</th>
        <td>
            <a href=https://github.com/llvm/llvm-project/issues/127134>127134</a>
        </td>
    </tr>

    <tr>
        <th>Summary</th>
        <td>
            [AArch64] auto-vectorizer regression
        </td>
    </tr>

    <tr>
      <th>Labels</th>
      <td>
            new issue
      </td>
    </tr>

    <tr>
      <th>Assignees</th>
      <td>
      </td>
    </tr>

    <tr>
      <th>Reporter</th>
      <td>
          appujee
      </td>
    </tr>
</table>

<pre>
Baseline: c4c5e79dd4b4c78eee7cffd9b0d7394b5bedcf12
Regressing version: 3c92011b600bdf70424e2547594dd461fe411a41 or, more recently, f142f8afe21bceb00fb495468aa0b5043e98c419

Command to repro:
```
$CC -c -nostdlibinc   -D__BIONIC_NO_PAGE_SIZE_MACRO -O2 -DANDROID -DNDEBUG -UDEBUG -D__compiler_offsetof=__builtin_offsetof -D__ANDROID_UNAVAILABLE_SYMBOLS_ARE_WEAK__ -faddrsig -fcolor-diagnostics -ffp-contract=off -fno-exceptions -fno-strict-aliasing -fmessage-length=0 -gsimple-template-names -gz=zstd -no-canonical-prefixes -ftrivial-auto-var-init=zero -ffunction-sections -fdata-sections -fno-short-enums -funwind-tables -fstack-protector-strong -Wa,--noexecstack -D_FORTIFY_SOURCE=2 -Wstrict-aliasing=2  -march=armv8.2-a -mcpu=cortex-a55 -target aarch64-linux-android10000 -DANDROID_STRICT -fPIE -Wimplicit-fallthrough -D_LIBCPP_ENABLE_THREAD_SAFETY_ANNOTATIONS -Wno-gnu-include-next -fvisibility-inlines-hidden -Isystem/media/audio_utils/benchmarks -D__LIBC_API__=10000 -D__LIBM_API__=10000 -D__LIBDL_API__=10000 -Iexternal/google-benchmark/include -Iprebuilts/clang/host/linux-x86/clang-${cver}/android_libc++/platform/aarch64/include/c++/v1 -Iprebuilts/clang/host/linux-x86/clang-${cver}/include/c++/v1 -Ibionic/libc/async_safe/include -Isystem/logging/liblog/include -Ibionic/libc/system_properties/include -Isystem/core/property_service/libpropertyinfoparser/include -Ibionic/libdl/include_private -isystem bionic/libc/include -isystem bionic/libc/kernel/uapi/asm-arm64 -isystem bionic/libc/kernel/uapi -isystem bionic/libc/kernel/android/scsi -isystem bionic/libc/kernel/android/uapi -fsanitize=memtag-heap -fsanitize-trap=all -std=gnu++20 -fno-rtti -Isystem/core/include -Isystem/logging/liblog/include -Isystem/media/audio/include -Ihardware/libhardware/include -Ihardware/libhardware_legacy/include -Ihardware/ril/include -Iframeworks/native/include -Iframeworks/native/opengl/include -Iframeworks/av/include  -o audio_vectorization_benchmark.o system/media/audio_utils/benchmarks/audio_vectorization_benchmark.cpp --save-temps
```

$ grep -c '[uf]cvt' audio_vectorization_benchmark_new.s
24
$ grep -c '[uf]cvt' audio_vectorization_benchmark_baseline.s
42


Haven't reduced it yet, but here is the full source code.
```cpp
#include <functional>
#include <random>
#include <vector>

#include <benchmark/benchmark.h>

// A small subset of code from audio_utils/intrinsic_utils.h

// We conditionally include neon optimizations for ARM devices
#pragma push_macro("USE_NEON")
#undef USE_NEON

#if defined(__ARM_NEON__) || defined(__aarch64__)
#include <arm_neon.h>
#define USE_NEON
#endif

template <typename T>
inline constexpr bool dependent_false_v = false;

// Fixed-size array wrapped in a struct so that it can be passed to and returned
// from the template functions below by value.
// vmul() unpacks the single member `v` with a structured binding; it is that
// member (a built-in array), not the struct itself, that satisfies std::is_array_v<>.
//
// T: element type.  N: number of elements.
template<typename T, size_t N>
struct internal_array_t {
    T v[N];  // the only data member
    static constexpr size_t size() { return N; }  // element count, usable in constant expressions
};

#ifdef USE_NEON

// Primary template keyed on an element count N.  It is empty here; vfloat_t below
// expects a member type `t`, which would have to come from a specialization.
// No such specialization is visible in this excerpt.
template<int N>
struct vfloat_struct {};

// Shorthand for the member type `t` of vfloat_struct<N>.
template<int N>
using vfloat_t = typename vfloat_struct<N>::t;  // typename required for Android 14 and earlier.

// NEON build: container type for N elements of F.
// F == float selects vfloat_t<N>; every other element type selects internal_array_t<F, N>.
// Both alternatives are spelled out as template arguments of std::conditional_t, so
// vfloat_t<N> has to name a valid type for the given N even when F is not float.
template<typename F, int N>
using vector_hw_t = std::conditional_t<
        std::is_same_v<F, float>, vfloat_t<N>, internal_array_t<F, N>>;


#else

// Non-NEON build: no hardware vector type is available, so every element type uses
// the plain array wrapper and relies on the compiler's loop vectorization.
template<typename F, int N>
using vector_hw_t = internal_array_t<F, N>;

#endif

// Element-wise multiply of two values of the same type T; returns the product as a T.
//
// The implementation is selected at compile time from T:
//   - float / double: plain `a * b`.
//   - float32x2_t / float32x4_t (only when USE_NEON is defined) and float64x2_t
//     (additionally only on __aarch64__): the matching NEON multiply intrinsic.
//   - anything else: T must be decomposable by a one-name structured binding
//     (a struct with exactly one member).  If that member is a built-in array,
//     the elements are multiplied one by one through a recursive vmul() call;
//     otherwise the member must itself decompose into exactly two sub-objects,
//     and each of the two is multiplied recursively.
template<typename T>
static inline T vmul(T a, T b) {
    if constexpr (std::is_same_v<T, float> || std::is_same_v<T, double>) {
        return a * b;

#ifdef USE_NEON
    } else if constexpr (std::is_same_v<T, float32x2_t>) {
 return vmul_f32(a, b);
    } else if constexpr (std::is_same_v<T, float32x4_t>) {
        return vmulq_f32(a, b);
#if defined(__aarch64__)
 } else if constexpr (std::is_same_v<T, float64x2_t>) {
        return vmulq_f64(a, b);
#endif
#endif // USE_NEON

    } else /* constexpr */ {
        // `ret` is default-initialized; both branches below assign every part of it
        // before it is returned.
        T ret;
        auto &[retval] = ret;  // single-member struct
 const auto &[aval] = a;
        const auto &[bval] = b;
        // decltype of a structured binding yields the member's declared type, so this
        // tests whether the wrapped member is a built-in array.
        if constexpr (std::is_array_v<decltype(retval)>) {
            // Requests unrolling of the per-element loop; the trip count
            // std::size(aval) is a compile-time constant.
#pragma unroll
 for (size_t i = 0; i < std::size(aval); ++i) {
                retval[i] = vmul(aval[i], bval[i]);
            }
            return ret;
        } else /* constexpr */ {
             // The wrapped member is not an array: split it into its two halves and
             // multiply each half with a recursive call.
             auto &[r1, r2] = retval;
 const auto &[a1, a2] = aval;
             const auto &[b1, b2] = bval;
 r1 = vmul(a1, b1);
             r2 = vmul(a2, b2);
 return ret;
        }
    }
}

#pragma pop_macro("USE_NEON")

// end intrinsics subset

// Number of frames handed to Processor::process() per benchmark iteration; each
// benchmark buffer holds kDataSize * channelCount elements.
static constexpr size_t kDataSize = 2048;

// Argument generator passed to BENCHMARK(...)->Apply(): registers one benchmark
// run per channel count, for every count from 1 through 32 inclusive.
//
// b: the benchmark being configured; receives one single-element Args({i}) per count.
static void TestArgs(benchmark::internal::Benchmark* b) {
    constexpr int kChannelCountMin = 1;
    constexpr int kChannelCountMax = 32;
    for (int i = kChannelCountMin; i <= kChannelCountMax; ++i) {
        b->Args({i});
 }
}

// Macro test operator: processes one group of N consecutive elements.
//
// Expands to four statements that
//   1. reinterpret `in1` and `in2` as `const V<F, N>*`, multiply the two pointees
//      with vmul(), and store the result through `out` reinterpreted as `V<F, N>*`;
//   2. advance `out`, `in1` and `in2` by N elements each.
// The expansion site must have `out`, `in1`, `in2`, `F` and `V` in scope.
// The expansion already ends in `;`, so writing `OPERATOR(N);` leaves one extra
// empty statement, and the macro is not safe as the sole body of an unbraced
// `if` / `for`.

#define OPERATOR(N) \
 *reinterpret_cast<V<F, N>*>(out) = vmul( \
    *reinterpret_cast<const V<F, N>*>(in1), \
    *reinterpret_cast<const V<F, N>*>(in2)); \
    out += N; \
    in1 += N; \
    in2 += N;

// Macro to instantiate switch case statements.
//
// Expands to `case N:` followed by an assignment of a capture-less lambda to
// `mFunc` and a `break;`.  The lambda runs OPERATOR(N) `count` times, i.e. it
// handles N elements per loop iteration.
// The static_assert checks that V<F, N> has exactly the size of N packed F values,
// which the pointer reinterpretation inside OPERATOR depends on.
// Must be used inside a switch statement in a scope where `mFunc`, `F` and `V`
// are visible.

#define INSTANTIATE(N) \
    case N: \
    mFunc = [](F* out, const F* in1, const F* in2, size_t count) { \
 static_assert(sizeof(V<F, N>) == N * sizeof(F)); \
        for (size_t i = 0; i < count; ++i) { \
            OPERATOR(N); \
        } \
    }; \
    break;

// Multiplies two input buffers element by element into an output buffer, using a
// kernel that is chosen once, in the constructor, and stored in mFunc.
//
// Traits must provide:
//   data_t             - element type (aliased to F below)
//   container_t<T, N>  - container template for N elements of T (aliased to V below)
//   loop_              - bool usable in `if constexpr`: true selects the generic
//                        nested-loop kernel, false selects a kernel specialised
//                        for the channel count
template <typename Traits>
class Processor {
public:
 // shorthand aliases
    using F = typename Traits::data_t;
    template <typename T, int N>
    using V = typename Traits::template container_t<T, N>;

    // Selects the kernel for `channelCount` channels.
    // NOTE(review): the constructor is not `explicit`, so an int converts to a
    // Processor implicitly.
    Processor(int channelCount)
        : mChannelCount(channelCount) {

        if constexpr (Traits::loop_) {
            // Generic kernel: outer loop over `count` frames, inner loop over the
            // channels, one N = 1 OPERATOR step per element.  channelCount is
            // captured by value, so the lambda does not refer back to this object.
            mFunc = [channelCount](F* out, const F* in1, const F* in2, size_t count) {
 for (size_t i = 0; i < count; ++i) {
                    // `j` is size_t while the captured channelCount is int, so this
                    // comparison converts channelCount to an unsigned type.
                    for (size_t j = 0; j < channelCount; ++j) {
                        OPERATOR(1);
 }
                }
            };
            return;
 }
        // Specialised kernels: one lambda per channel count from 1 to 32, each
        // handling a whole frame of N elements per iteration.
        // There is no `default` case, so for any other channelCount mFunc is left
        // empty.
        switch (channelCount) {
        INSTANTIATE(1);
 INSTANTIATE(2);
        INSTANTIATE(3);
        INSTANTIATE(4);
 INSTANTIATE(5);
        INSTANTIATE(6);
        INSTANTIATE(7);
 INSTANTIATE(8);
        INSTANTIATE(9);
        INSTANTIATE(10);
 INSTANTIATE(11);
        INSTANTIATE(12);
        INSTANTIATE(13);
 INSTANTIATE(14);
        INSTANTIATE(15);
        INSTANTIATE(16);
 INSTANTIATE(17);
        INSTANTIATE(18);
        INSTANTIATE(19);
 INSTANTIATE(20);
        INSTANTIATE(21);
        INSTANTIATE(22);
 INSTANTIATE(23);
        INSTANTIATE(24);
        INSTANTIATE(25);
 INSTANTIATE(26);
        INSTANTIATE(27);
        INSTANTIATE(28);
 INSTANTIATE(29);
        INSTANTIATE(30);
        INSTANTIATE(31);
 INSTANTIATE(32);
        }
    }

    // Runs the selected kernel.
    //   out      - destination buffer
    //   in1, in2 - the two source buffers
    //   frames   - passed to the kernel as its `count` argument; each buffer is
    //              advanced by one frame (channel-count elements) per count
    void process(F* out, const F* in1, const F* in2, size_t frames) {
        mFunc(out, in1, in2, frames);
 }

    // Channel count given to the constructor (converted from int to size_t).
    // It is stored but not read anywhere inside this class.
    const size_t mChannelCount;
    // Type-erased kernel installed by the constructor.
    /* const */ std::function<void(F*, const F*, const F*, size_t)> mFunc;
};

// Benchmark body shared by all Traits variants: multiplies two pseudo-random
// buffers of kDataSize frames through Processor<Traits> once per benchmark iteration.
//
// state: Google Benchmark state; state.range(0) supplies the channel count.
template <typename Traits>
static void BM_VectorTest(benchmark::State& state) {
    using F = typename Traits::data_t;
    const size_t channelCount = state.range(0);

    // One element per (frame, channel) pair in every buffer.
 std::vector<F> input1(kDataSize * channelCount);
    std::vector<F> input2(kDataSize * channelCount);
    std::vector<F> output(kDataSize * channelCount);

    // Fill both input buffers with deterministic pseudo-random values: the
    // generator uses the fixed seed 42 and values are drawn from [-1, 1).
    // std::uniform_real_distribution<> defaults to double, so every assignment
    // below converts a double to F.
    // NOTE(review): with F = float these conversions probably contribute to the
    // fcvt/ucvtf matches counted in the report -- confirm in the generated assembly.
    std::minstd_rand gen(42);
    const F amplitude = 1.;
    std::uniform_real_distribution<> dis(-amplitude, amplitude);
 for (auto& in : input1) {
        in = dis(gen);
    }
    for (auto& in : input2) {
        in = dis(gen);
    }

    // channelCount (size_t) is converted to the constructor's int parameter here.
    Processor<Traits> processor(channelCount);

    // Run the test.
    // DoNotOptimize / ClobberMemory are the Google Benchmark barriers intended to
    // keep the compiler from eliding the work or hoisting it out of the timing loop.
    while (state.KeepRunning()) {
        benchmark::DoNotOptimize(input1.data());
 benchmark::DoNotOptimize(input2.data());
 benchmark::DoNotOptimize(output.data());
 processor.process(output.data(), input1.data(), input2.data(), kDataSize);
 benchmark::ClobberMemory();
    }
    // Report the channel count as the problem size N for complexity reporting.
 state.SetComplexityN(channelCount);
}

// Clang has an issue with -frelaxed-template-template-args where
// it may not follow the C++17 guidelines.  Use a traits struct to
// pass in parameters.

// Test using two loops: float elements processed by Processor's generic
// nested-loop kernel (frames outside, channels inside).
struct LoopFloatTraits {
    // Container for N elements of F: the plain array wrapper, no hardware vector type.
    template <typename F, int N>
 using container_t = internal_array_t<F, N>;
    // Element type of all benchmark buffers.
    using data_t = float;
    // true makes Processor take its `if constexpr (Traits::loop_)` branch.
 static constexpr bool loop_ = true;
};
// Non-template entry point for the BENCHMARK macro: runs BM_VectorTest with
// LoopFloatTraits.
static void BM_VectorTestLoopFloat(benchmark::State& state) {
 BM_VectorTest<LoopFloatTraits>(state);
}

// Register the benchmark once per channel count produced by TestArgs, and let
// Google Benchmark provide main().
BENCHMARK(BM_VectorTestLoopFloat)->Apply(TestArgs);
BENCHMARK_MAIN();
```
</pre>
<img width="1" height="1" alt="" src="http://email.email.llvm.org/o/eJysOltz2jjbv0a50TgDMgRykQtzepfZhnQSup39bjyy_Ri0lSW_kkyS_vpvJBljcwjp9s10miA9es5HCao12wiABzScoOHshlZmK9UDLcvqH4CbRGbvDxOqgTMBKIxwOkiHMLrPskEySEdjABileZ7dJ71sFN4PkmECWZr3CepFz7BRoDUTG7wDpZkUFkGY3pNev5_c9XpJlo96AzIAMhyMhveDLBvc9XMY9Pt00MdS4UIqwApSEIa_47w_IPmY5kD6SQpJr5cng_vh4G5MaS8Z9gYh3I_TQf8e9SLUi6ayKKjIsJFYQakkCu0quuvV_3oRIoPpFAcpDoTUJuMsYSLFGAezOJ4sn1bLabx6ir9G_5nHL8v_m8eP0fT5CQdPBAezaDV7flrOcDBbzeaTb__Bwbf69yyOU1mUjIOKZZ5rMDJH4SyOk4pxw0Sz6EBrPPG3VfRXtPwSTb7M45e_HydPX17i6Hkef59Hf8YxDnKaZUqzDQ7yVHKpgozRjWWbpRoHeV4GqRRG0dSgcCbzHAe5kAG8pVAaJoX2n7VRLDUB5Yw6swR5AVrTDQQcxMZsUTjr4WCjWVFyCAwUJacGAkEL0DjY_ETh7Kc2mVVYkFIhBUspD0oFOXuzELlRbMcoD2hlZLCjKmCCWY5-gpKWzUqklp1AQ7pnK6OGtj9bNrdSmQBEVdiFSrwykQWGJtzR0IamP4JSSQOpkcoKJa0s3yki0yAQEt4gdUBWw4un5_Vy8Xf88vTteTpH4Yzg4PuRHtwqDgqqUqsCqord-JYEFAdFWlYonKVSGXgL6HCIA0PVBgymFvhuEHAmqreAikxJlvV7vV7v4B3xy_p5OV3jIP-6nOPgu1UrS5kJcsq52SpZbbaWxy_LyfTr13i-cuZf__E8j2bxS7SYr_-Oo9XqaR2tl0-rFxx8FzLYiCpgIuVVBoGAN4ODfMc0Sxhn5j1gwgaqDrYsy0DgYKnftYECkUUBGaOILGiVMRlXhnGNyCIBkW4Lqn5o546Wkzj6uoxjFM720rjlx_PLsy_H60t4M6AE5YgsNlJuOAQNEUQWNes4WJYKXEhYNlJOxQaRxVZqg8jCK_VtfLffChAZoNEk3YFCo5mVwis85ixJEZm4fwvrrblUVtraPAeCFlMDuOv_Lv2LWBNmo8LhsJwtqH4XaaxpDh3hG7NwudkwR5uzhMtNB-oYmT8Vl0qWoAwDfR5nKpWlVoO9xxrUjqXg8exXmchlSZUGdZFkxg9bcanYjhrAAfN08DFzDY5LAD9ACbAoK1oyp5oioKq4G3zyyCfAarewqkr1rx3wFHJNBTPsJ6BwVkBh6CbYAi1bG4FRtLRZgnMcaJOhcLYRlXcC0vMJTBnDzhjkV81_PnY7IFuqsleqatu2Pl0DiTlsaPp-CVIx3tnKFS3gVaof1uUENWwHn9iXJYjNB4jorrWHA4l9ctq5xM5-UlsT4iZ73Er86WzWbF3ClZYlDgJNd77O6ePewLUHeKOgxIiM0HBSodE0R8NZujOIjD7mNBbwemtRksFvIkrq1sthG5Cas170B92BQGRksIKsSiHDzOB3MIhMcVIZvAUFmGlstoDzinOsZaVSwKnM4LYta1qWjsVwbwYUTvdlmnIUzk92FRWZLM7teCHqnePNdhE4mGHbgl4gssAR1oUNLV0lGgyWuWMZ50oWuGtsJoxiQrPUr9xuO3i-W1lFxrwc_B3vWREgBZalYUWtbY1zqXD0_IgzsGlSe9ZLRTcFxWWlt3FBUyURGSNCvr3M49X8aYUIQeTeg1Yigxw3O43wOc4gZwIyRMZxHD0_uv04RuQeo9EUjaYdgLpmuf0T9VFVxJb1RmMk9Gc7dEkIImO5Z2Hfv9nj5r0E28bhtT_u-wSrIm3grVQ4kZLjDEoQGQgT55RriHcYhTPs_kbhpKPf9XsJ1jpU
KfqOoUggy6wXCkyxNqpKDTZbaqwPVtq2bnbLeuPK6r9hbe9qGifA5etti8DWuq-lUlTaYE0N0_k7dhk3QmHEdOxoxzsUTlE4v21J3BWYTLFmPyE2eOWFr_ljwjcqNR6D0cjKiDHGa7xDw8kKDWdebrumDTUsbamsRmp_Od-wRp1gBaZSwpKaYNsr9CL7_0F7IctPvaXFOBPHfO5yLqmJ60-WyRbC8ycrP3X5g8ZZsVFIBx0Kp-6IU6mxPONa_-a9FFBQO3_9t2IKMh8mvl7i_gDb4Qqo4gzU7QkvDbWFVf8pZy5TxNvXmrnGqq2YjS1zte69_g-W17QAZ3iH3glk0ZNpI_NeME-9Y-f9MS_4_GAbH0DW2dueXmnAXMoSd3I0ZjkWEv_x3fsovDFt9CUf_JwSPubz4EBnQvxMhNfuWgf6Gu-KiiMyXmM7JuE1TmqHrTXM8pZnIzI-q-11W9v7FHYZMpNVwsFZoU3K_tRRQjEiEU4-DA8LjkYzbA3zq2yG5I3EpstBTdrqI85DgsjYacTq4xDsv0FwcEywK7Kl-9_zhE9rxlFJ-Nds3Q1O9XCWLTsynWGrcbn6z32S6GaxjuIcRNTh0kbUEfW1pX9Qu_2hlZEYkTs0nCgwO8rRcObiw0M2CcrGEIegsLVH1TXHonEU21hoCwft0jqBTVqwSRf2I5UfKlEGKbehiMi45t6qsaX3Q2tRCSU5txRsZrUYfUFhjnrPymr_bEVYXWroHu0E-7mDnZq1ZV4n0oTtxaoTAT0sO3O3P953Ja_terRSO86J_X7RA05s3rfsKNKyumXNkzi1rYOmDTQ9wLZ_Tu3sziXNueRwTvU7avKA_bNKwYp0YEmNtIH9QEeHeKlbhH0C3Deesvy472wKFAjbd9XNsK47Zw9xqWf5MaOGvrCf4NgnvcG4ScH1kZ2t8mvQJlIbjcj40L57n69rlf80OfT20XFhOdC2BfDHdEuFAD6VlTCPTDj6_YNyPoKmbw46JAfwOnAsrI-aY_xNEJ1u0rcPAygJUDivhUejCXP3Pnu7nljNGeLRmgsb0AbLEhS1w9DeqnWz_vR1_hytn54RGa8czaFrcRCJFDidlgpMnFJtq_9fnQ6ARO7_sayMO3nwuz0W605nEXn3P4-OCefbZPqbWJxb1kmpQSQr4zQczuqGuNlhon9xh7R3zmlYYia0ocIwO0PoV2bSLU6pBtekQwHC92Id1S9XL-totV5G6_mR9q3f2cMrFEattWJRidTp2b-NIDJ2Cc0ZYFrnFLfidHi0QlpzR2odbj8g1AR8nMVUa1CmTv4yR2R8pGBnaacM1ys1YItz-m7FxKVi4nk58fwjJPbnyFdPKdlM33IaN5W0FhIF9MfJoNKdRhVlRvuGNeVUa_xVyRS0tjK4cCyrhLPUP940lX8rldna-cPd4LuJ3RL0bfWiO-7sSdg8lVFD41YqvjAhd5v1A-a_LmNuMKVSGMoEKNe_r0_6d4utkbFOXWkrLdWN3l7DYYSLaWd7fAS9bysuNyptPu0gE5_vFjru3iHyv3L9T7Q6F7zzTGNz6ur_HJD945G1hWhw_vMxziPP7x8n_WPY09VmPG8v-k7gLKY6f1007R6um8BajHU3yEmr0t0Pr-wPLiEeXjl4d2V_dAnx-MrB-yv7_d4lzP3Tvu0I4Jq2-uFF3INrR68prH93Effo2tFrOuvfX_SP3pWj5JrSCLmI-5pzkWtKI8OLuK_5F7mmNDK-iPuai4XXlBZejMfw1MVO2n__wfXcpS8O_z7juhcWfZo-XH7f94_TPZr6bHOo2-C2mvI9-m45OlyYtIa9_aDXzK37W14UTnfutc3J1pXi9JMn6OfnmntfRc9cf17sKtrjzOQx_svdudm55nSmebG9IyJ3vonsKvCXW4uOztp5vb7vpAZuFRUbO88ffMs3hrXW9s8p04VVABNlZfqIjFuzm1X4Ue_Quq6-jIb8NhpZmdL60WfQtD1kgZeCGUa5PeF4wUmV
56DcdXIqIdf4lZktzsCAKphg2pqv1FBlMvBPT3hHedU0fQ2DhZ0JstjC4A0IW8m6kVd7F6ZFyZnx7yoz3L89I2wlWC5VESugPM6YNoolVe3BVvyM2QANGkzuEuLwYU-07k9oZaT1KzfuRo0ljwO0noY9bidB90KyO_OeQ0r-FdKTrjScNhG0T0euV_2EdZ8r_8pjh-B653XLOPgLM-v0fwKUz5UQ7tV77IeY49G7G5czuZLmyb_YgWuZrf5ubcQ1GGqZPnOS_JuT3t_Pnmw0dHtI3WfAp_gM3_UiOVpsguoif1MukwTUIxRSvSMyPucrXt8vYKayKDm8MfO-umjFo6uMKadig7dUYyow07oCH5ZBroDTN8gO3w5r_qBqo_HrFhQc8DCDC_qOhTQ4l5zLV-cdU9-J90d4U7HMPXDrW4y_acAUG-d7zSOiPCAr7YDIBC6prVYG1GHI90-GoE2dqc2rdI82DqJG9UXKcsElNd67W053toocP9rUmFsD3idfbQ71wxcJ_6jqX1Emh6uA48dYN6r5gqMqOCp9F8taI-Qv1LduXQynR4ryNzz7Q0f-Mpmvpn88Rs9_IjK-xMi9u0orS2499XCnWKNqMMSP0XLV9uXmSwo32UOY3Yf39AYe-qPwfjAe3I3GN9uHu3F_PArDdEjhbjRMCRnTrEdGIaEkpzDOb9gD6ZFhj_RDQvr9_vg2SUZZPiT3SXbX792FORr0oKCM33K-K26l2tw4b3_ok1E_HNxwmgDX7suxhAh49bGACEHD2Y16sIeCpNpoNOhx9wLYoDHMcPet2ijyX0UbzrD_WmT9jAgKq_obslLcVIo_bI0pXTvh3HnDzLZKblPpvifEd_tfQankP5AaRBaOGY3IouZ290D-PwAA___bGAaC">