<table border="1" cellspacing="0" cellpadding="8">

    <tr>

        <th>Issue</th>

        <td>

            <a href="https://github.com/llvm/llvm-project/issues/128424">128424</a>

        </td>

    </tr>

    <tr>

        <th>Summary</th>

        <td>

            [clang] Vectorizer fails to vectorize average round to single average round instruction

        </td>

    </tr>

    <tr>

      <th>Labels</th>

      <td>

            clang

      </td>

    </tr>

    <tr>

      <th>Assignees</th>

      <td>

      </td>

    </tr>

    <tr>

      <th>Reporter</th>

      <td>

          johnplatts

      </td>

    </tr>

</table>

<pre>

    Here is a snippet of C++ code that Clang vectorizes to suboptimal code:

```

#include <stddef.h>

#include <stdint.h>

#include <array>

// Platform detection for the intrinsic-based variant (AverageRoundU8_3).
//
// Includes the target's SIMD intrinsics header and defines
// HAVE_SIMD_AVERAGE_ROUND to 1 when one of the supported targets is detected,
// and to 0 otherwise, so that `#if HAVE_SIMD_AVERAGE_ROUND` is always
// well-defined.
#if defined(__GNUC__) || defined(__clang__)

// Save any existing definitions of `vector`, `pixel` and `bool`, clear them
// while the platform headers are included, and restore them afterwards with
// the matching pop_macro below.
// NOTE(review): presumably this guards against <altivec.h>-style keyword
// macros leaking into the rest of the translation unit -- confirm.
#pragma push_macro("vector")
#pragma push_macro("pixel")
#pragma push_macro("bool")
#undef vector
#undef pixel
#undef bool

#if (defined(__i386__) || defined(__x86_64__)) && defined(__SSE2__)
#include <emmintrin.h>
#define HAVE_SIMD_AVERAGE_ROUND 1
#elif (defined(__powerpc__) || defined(__powerpc64__)) && defined(__ALTIVEC__)
#include <altivec.h>
// Every other supported target sets the flag; without it here the vec_avg
// path guarded by HAVE_SIMD_AVERAGE_ROUND would never be compiled on PowerPC.
#define HAVE_SIMD_AVERAGE_ROUND 1
#elif defined(__s390x__) && defined(__VEC__)
#include <vecintrin.h>
#define HAVE_SIMD_AVERAGE_ROUND 1
#elif (defined(__aarch64__) || defined(__ARM_NEON))
#include <arm_neon.h>
#define HAVE_SIMD_AVERAGE_ROUND 1
#else
// GNU-compatible compiler, but no supported SIMD target detected.
#define HAVE_SIMD_AVERAGE_ROUND 0
#endif

#pragma pop_macro("vector")
#pragma pop_macro("pixel")
#pragma pop_macro("bool")

#else
// Not a GNU-compatible compiler: the intrinsic variant is disabled.
#define HAVE_SIMD_AVERAGE_ROUND 0
#endif

// Scalar reference implementation: returns the per-element rounded-up average,
// (a[i] + b[i] + 1) >> 1, of two 16-element uint8_t arrays.
// This is the loop the accompanying report says GCC 14 vectorizes much better
// than Clang 19 / Clang trunk; the expression shape is part of the reproducer.
std::array<uint8_t, 16> AverageRoundU8_1(std::array<uint8_t, 16> a, std::array<uint8_t, 16> b) {

 std::array<uint8_t, 16> result;

  for (size_t i = 0; i < 16; i++) {

  // Widen both operands to 16 bits before adding so the sum (at most
  // 255 + 255 + 1 = 511) is not truncated before the shift; the shifted
  // value is at most 255 and therefore fits back into uint8_t.
 result[i] = static_cast<uint8_t>((static_cast<uint16_t>(a[i]) +

 static_cast<uint16_t>(b[i]) + uint16_t{1}) >> 1);

  }

  return result;

}

// Alternative scalar formulation of the same per-element rounded-up average,
// written without a widened sum: (a >> 1) + (b >> 1) + ((a | b) & 1).
// Returns a 16-element array holding the result for each index.
std::array<uint8_t, 16> AverageRoundU8_2(std::array<uint8_t, 16> a, std::array<uint8_t, 16> b) {

  std::array<uint8_t, 16> result;

  for (size_t i = 0; i < 16; i++) {

    const uint8_t a_i = a[i];

    const uint8_t b_i = b[i];

    // Halving each operand separately drops its low bit. The rounded average
    // needs an extra 1 exactly when at least one low bit was set, which is
    // what (a_i | b_i) & 1 contributes. The three terms sum to at most
    // 127 + 127 + 1 = 255, so storing into uint8_t loses nothing.
    result[i] =

 static_cast<uint8_t>(static_cast<unsigned>(a_i) >> 1) +

 static_cast<uint8_t>(static_cast<unsigned>(b_i) >> 1) +

 static_cast<uint8_t>((a_i | b_i) & 1u);

  }

  return result;

}

#if HAVE_SIMD_AVERAGE_ROUND
// Intrinsic-based variant: computes the per-element rounded average of two
// 16-element uint8_t arrays with one target intrinsic call
// (_mm_avg_epu8 on x86 SSE2, vec_avg on PowerPC AltiVec and s390x vector,
// vrhaddq_u8 on Arm NEON). Only compiled when HAVE_SIMD_AVERAGE_ROUND is
// nonzero.
//
// Parameters a, b: the two input arrays (taken by value).
// Returns: a 16-element array holding the intrinsic's result.
std::array<uint8_t, 16> AverageRoundU8_3(std::array<uint8_t, 16> a, std::array<uint8_t, 16> b) {
  // 16-byte GNU vector of uint8_t lanes, declared with 1-byte alignment and
  // __may_alias__; the std::array buffers below are read and written through
  // pointers to this type.
  typedef uint8_t GccU8x16Vec __attribute__((__vector_size__(16), __aligned__(1), __may_alias__));

  GccU8x16Vec vec_a = *reinterpret_cast<const GccU8x16Vec*>(a.data());
  GccU8x16Vec vec_b = *reinterpret_cast<const GccU8x16Vec*>(b.data());

  // The target conditions mirror the ones used when selecting the intrinsics
  // header. The PowerPC/s390x condition is kept on a single line on purpose:
  // a backslash continuation followed by a blank line ends the directive
  // early and leaves `||` without a right-hand operand.
#if (defined(__i386__) || defined(__x86_64__)) && defined(__SSE2__)
  GccU8x16Vec result_vec =
      reinterpret_cast<GccU8x16Vec>(
          _mm_avg_epu8(reinterpret_cast<__m128i>(vec_a), reinterpret_cast<__m128i>(vec_b)));
#elif ((defined(__powerpc__) || defined(__powerpc64__)) && defined(__ALTIVEC__)) || (defined(__s390x__) && defined(__VEC__))
  GccU8x16Vec result_vec =
      reinterpret_cast<GccU8x16Vec>(
          vec_avg(reinterpret_cast<__vector unsigned char>(vec_a),
                  reinterpret_cast<__vector unsigned char>(vec_b)));
#elif (defined(__aarch64__) || defined(__ARM_NEON))
  GccU8x16Vec result_vec =
      reinterpret_cast<GccU8x16Vec>(
          vrhaddq_u8(reinterpret_cast<uint8x16_t>(vec_a),
                     reinterpret_cast<uint8x16_t>(vec_b)));
#else
  // Fail with a clear message instead of an "undeclared result_vec" error if
  // the feature flag and the target checks above ever get out of sync.
#error "HAVE_SIMD_AVERAGE_ROUND is set but no average-round intrinsic is selected for this target"
#endif

  std::array<uint8_t, 16> result;
  *reinterpret_cast<GccU8x16Vec*>(result.data()) = result_vec;
  return result;
}
#endif

```

GCC 14 does a much better job of vectorizing AverageRoundU8_1 than Clang 19 (or Clang trunk) does. The output of compiling the above snippet with various GCC and Clang releases can be seen on Compiler Explorer at https://godbolt.org/z/hanoerdPP.

A related issue regarding suboptimal LLVM codegen has been reported over at https://github.com/llvm/llvm-project/issues/128377.

</pre>

<img width="1" height="1" alt="" src="http://email.email.llvm.org/o/eJy8WF1z2rwS_jXiZucwtgzGXHBhCOTtTJt20pPcemR7weqxJR9Jpkl__TuSDTHBtDT9YDID0X7o2WdXqw-mNd8JxAWZLsn0ZsQaU0i1-CILUZfMGD1KZf68-AcVAtfAQAte12hAbmFF6JLQJWQyRzAFM7AqmdjBHjMjFf-GGowE3aSyNrxipVMkQUy8mIRe9-fFhAZcZGWTI5BgpU2e43ZckGA9JOPCDMuYUuy5E7SyLeS45QJzQqMkub17WCUJoXMgsxWZrU6EmcXtpEfzWrFdxaBudJFULFOS0IhQ2sZGKO10LyrW_AnLK_RSKXtqTrMROW47FvsjrcvegLPtxUto1I-KB1F4MeSnKEzCSRuz1aAhoeGJxufPa3rk5IRrrCoujOKil4rWEv6JH9fJ53cfbpL4cX0f366T-48Pdzfgt1pYnsOs5VdUdXYRaSf_Ptj4_X_fPa5Xw3hZafgesx5ah6Nvr4O599RBOPd-0fMes9_IBGMqK7o4h5iI7z8kd-uPdy0NA0ugSgTKt2DReJV-t1xR5Hx7tlJkfd1COdW7uE5O1QaWya-g9mJtctuKgrhrHKuGCxMlhtAV-CEJ1hDvUbEd3stG5A9R4hMa_diI2Z8_VkvbDC-JF1-hrVA3pSGBU4etVLZwNP-GiQEOJLAhBkv3szVZAm97c2-azsl0ycn0xhlpwwzPkoxp05s1WDvGo3OpHx7ErHPTLpcuisvq6ak6HKWzpU9m7XCwtpH6NsFdnFZivxWaRokTElrRGxNJ_1Qi_1YmASCTQhvo3ANLWttjWoJhvbTTS8_0zopjOKXH-nglEu4QkXfFkfBXGb1cI1c5TN_isAViuygc7GkIfvOzBeY21gud5S3VF_yp6jPPNdpDwSHZt1n2ED354SNmkCTMGMXTxqDdXCK3m7QdOnHVZwf90O0rK6tdOvrb4eNoxZ6thOnDRtxxZTnsz7bHLGGu0AiNFXJhUNUKzSFPbVH2LAiNu9IZ58wwB3DeS9Rr5-mbnKdDzv_C2ekUf1tllvzDMhsIog-_rWYvhqSqErbfJVg3NoEDZklS-TTirYnLQpe7a3TTjpYjMy9nlD9-YHvxQ6YrG-urGa88nf1Wut3HkbjfXaK7XUJwaFeQFUydsX90dvi8wdV3kvNLB8gfcDUMdpCuvSpYnv8_uVSbri09vZwJXhF0pcUQDy_Hup_fgYd7yFD3aO1OW4hrQy-sdU4vbyc9sP37rxffrlbgTyCXaK_YVZMVkKIxqOCLTO1V-3Cl5mJ3di61V2_RXb19u0Aiqbp_jWrE_yxQ69lywEQOpkDIZFXzkhkuhXVvh1gq93i83n_lpoA9U1w2Giw8a9k6VVgi06ghYwJShK0FAnKPCriAlfOMCtZPdSkVKmAGCmNqbRNDN4RudjJPZWnGUu0I3XwjdFMwIVHlnz6NWz4siSUzmAPXukFQuGMqt8H3XhPev3_84J4UdiigYBpSREt7LZW1dIDO5-amaNJxJitCN2W5P3z9p1byC2aG0I2bUhO68WkUzGbjUb4I8nkwZyNc-LOJF9CZPwtGxWK2nSAGc3_isyAIgjCd5vl87m8ZDWY5DcMRX1CPTj1KAz_06TQY49b3aEq3PqaTdOoxMvGwYrwcWwiWj5GbfOHTaEIno5KlWGr3NEOpe6OwV6DpzUgtHOa02Wky8UqujX5xYbgp3XtOazG9gcfDg4yCLeOle5Y5PtIAa-sJlMujkaC52JWvx7nQRjWZrZhRo8rFL9Fqg9sv6L8BAAD__xUmkX0">