<table border="1" cellspacing="0" cellpadding="8">
    <tr>
        <th>Issue</th>
        <td>
            <a href=https://github.com/llvm/llvm-project/issues/128424>128424</a>
        </td>
    </tr>

    <tr>
        <th>Summary</th>
        <td>
            [clang] Vectorizer fails to vectorize average round to single average round instruction
        </td>
    </tr>

    <tr>
      <th>Labels</th>
      <td>
            clang
      </td>
    </tr>

    <tr>
      <th>Assignees</th>
      <td>
      </td>
    </tr>

    <tr>
      <th>Reporter</th>
      <td>
          johnplatts
      </td>
    </tr>
</table>

<pre>
    Here is a snippet of C++ code that Clang vectorizes to suboptimal code:
```
#include <stddef.h>
#include <stdint.h>
#include <array>

// Target detection: include the intrinsic header for the current architecture
// and define HAVE_SIMD_AVERAGE_ROUND to 1 when a header providing a vector
// average intrinsic was included, or to 0 otherwise. The macro is always
// defined after this block, so `#if HAVE_SIMD_AVERAGE_ROUND` never relies on
// an undefined identifier silently evaluating to 0.
#if defined(__GNUC__) || defined(__clang__)

// Save and clear any macros named vector/pixel/bool around the includes; the
// matching pop_macro below restores the previous state afterwards, so whatever
// the intrinsic headers do with these names does not leak into the rest of the
// file (presumably aimed at altivec.h / vecintrin.h, which can define them).
#pragma push_macro("vector")
#pragma push_macro("pixel")
#pragma push_macro("bool")

#undef vector
#undef pixel
#undef bool

#if (defined(__i386__) || defined(__x86_64__)) && defined(__SSE2__)
#include <emmintrin.h>
#define HAVE_SIMD_AVERAGE_ROUND 1
#elif (defined(__powerpc__) || defined(__powerpc64__)) && defined(__ALTIVEC__)
#include <altivec.h>
// Previously missing: without this define the AltiVec vec_avg path guarded by
// HAVE_SIMD_AVERAGE_ROUND was never compiled on PowerPC.
#define HAVE_SIMD_AVERAGE_ROUND 1
#elif defined(__s390x__) && defined(__VEC__)
#include <vecintrin.h>
#define HAVE_SIMD_AVERAGE_ROUND 1
#elif (defined(__aarch64__) || defined(__ARM_NEON))
#include <arm_neon.h>
#define HAVE_SIMD_AVERAGE_ROUND 1
#else
#define HAVE_SIMD_AVERAGE_ROUND 0
#endif

#pragma pop_macro("vector")
#pragma pop_macro("pixel")
#pragma pop_macro("bool")

#else
// Not a GCC-compatible compiler: no intrinsic path is selected.
#define HAVE_SIMD_AVERAGE_ROUND 0
#endif


// Scalar reference: per-element rounding average of two 16-element uint8_t
// arrays, result[i] = (a[i] + b[i] + 1) >> 1.
//
// Both operands are widened to uint16_t before the addition, so the
// intermediate sum (at most 255 + 255 + 1 = 511) is not truncated to 8 bits
// before the shift; the final value (at most 255) is narrowed back to uint8_t.
//
// The code is kept exactly as reported: this loop shape is what the compiler
// is expected to vectorize.
std::array<uint8_t, 16> AverageRoundU8_1(std::array<uint8_t, 16> a, std::array<uint8_t, 16> b) {
 std::array<uint8_t, 16> result;
  for (size_t i = 0; i < 16; i++) {
 // Widen, add with the +1 rounding bias, halve, then narrow.
 result[i] = static_cast<uint8_t>((static_cast<uint16_t>(a[i]) +
 static_cast<uint16_t>(b[i]) + uint16_t{1}) >> 1);
  }
  return result;
}


// Alternative scalar formulation of the per-element rounding average that
// halves each operand first:
//   result[i] = (a[i] >> 1) + (b[i] >> 1) + ((a[i] | b[i]) & 1)
//
// Each halved term is at most 127 and the correction term is 0 or 1, so the
// sum is at most 255 and fits in uint8_t. The correction term is 1 exactly
// when at least one operand has its low bit set, which matches the carry that
// (a + b + 1) >> 1 would produce from the discarded low bits.
std::array<uint8_t, 16> AverageRoundU8_2(std::array<uint8_t, 16> a, std::array<uint8_t, 16> b) {
  std::array<uint8_t, 16> result;
  for (size_t i = 0; i < 16; i++) {
    const uint8_t a_i = a[i];
    const uint8_t b_i = b[i];
    // floor(a/2) + floor(b/2) + rounding correction from the low bits.
    result[i] =
 static_cast<uint8_t>(static_cast<unsigned>(a_i) >> 1) +
 static_cast<uint8_t>(static_cast<unsigned>(b_i) >> 1) +
 static_cast<uint8_t>((a_i | b_i) & 1u);
  }
  return result;
}

#if HAVE_SIMD_AVERAGE_ROUND
// Intrinsic version: averages the two 16-byte arrays with one target-specific
// vector average intrinsic (_mm_avg_epu8 on x86 SSE2, vec_avg on
// PowerPC AltiVec and s390x, vrhaddq_u8 on Arm NEON). These intrinsics are
// expected to compute the same per-lane rounding average as the scalar
// versions - confirm against each vendor's intrinsic reference.
std::array<uint8_t, 16> AverageRoundU8_3(std::array<uint8_t, 16> a, std::array<uint8_t, 16> b) {
 // 16-byte GCC/Clang vector of uint8_t. __aligned__(1) and __may_alias__ are
 // requested so the type can be read from and written to the std::array
 // storage through a reinterpret_cast pointer.
 typedef uint8_t GccU8x16Vec __attribute__((__vector_size__(16), __aligned__(1), __may_alias__));

  // Load both inputs as whole vectors.
  GccU8x16Vec vec_a = *reinterpret_cast<const GccU8x16Vec*>(a.data());
  GccU8x16Vec vec_b = *reinterpret_cast<const GccU8x16Vec*>(b.data());

  // Select the intrinsic for the current target; the operands are cast to the
  // vector type each intrinsic takes and the result is cast back.
#if (defined(__i386__) || defined(__x86_64__)) && defined(__SSE2__)
  GccU8x16Vec result_vec =
 reinterpret_cast<GccU8x16Vec>(
 _mm_avg_epu8(reinterpret_cast<__m128i>(vec_a), reinterpret_cast<__m128i>(vec_b)));
#elif ((defined(__powerpc__) || defined(__powerpc64__)) && defined(__ALTIVEC__)) || \
 (defined(__s390x__) && defined(__VEC__))
  GccU8x16Vec result_vec =
 reinterpret_cast<GccU8x16Vec>(
      vec_avg(reinterpret_cast<__vector unsigned char>(vec_a),
              reinterpret_cast<__vector unsigned char>(vec_b)));
#elif (defined(__aarch64__) || defined(__ARM_NEON))
 GccU8x16Vec result_vec =
    reinterpret_cast<GccU8x16Vec>(
 vrhaddq_u8(reinterpret_cast<uint8x16_t>(vec_a),
 reinterpret_cast<uint8x16_t>(vec_b)));
#endif

  // Store the vector result into the returned array.
  std::array<uint8_t, 16> result;
  *reinterpret_cast<GccU8x16Vec*>(result.data()) = result_vec;
 return result;
}
#endif
```

GCC 14 does a much better job of vectorizing AverageRoundU8_1 than Clang 19 (or Clang trunk) does. The output of compiling the above snippet with various GCC and Clang releases can be seen on Compiler Explorer at https://godbolt.org/z/hanoerdPP.

A related issue regarding suboptimal LLVM codegen has been reported over at https://github.com/llvm/llvm-project/issues/128377.
</pre>
<img width="1" height="1" alt="" src="http://email.email.llvm.org/o/eJy8WF1z2rwS_jXiZucwtgzGXHBhCOTtTJt20pPcemR7weqxJR9Jpkl__TuSDTHBtDT9YDID0X7o2WdXqw-mNd8JxAWZLsn0ZsQaU0i1-CILUZfMGD1KZf68-AcVAtfAQAte12hAbmFF6JLQJWQyRzAFM7AqmdjBHjMjFf-GGowE3aSyNrxipVMkQUy8mIRe9-fFhAZcZGWTI5BgpU2e43ZckGA9JOPCDMuYUuy5E7SyLeS45QJzQqMkub17WCUJoXMgsxWZrU6EmcXtpEfzWrFdxaBudJFULFOS0IhQ2sZGKO10LyrW_AnLK_RSKXtqTrMROW47FvsjrcvegLPtxUto1I-KB1F4MeSnKEzCSRuz1aAhoeGJxufPa3rk5IRrrCoujOKil4rWEv6JH9fJ53cfbpL4cX0f366T-48Pdzfgt1pYnsOs5VdUdXYRaSf_Ptj4_X_fPa5Xw3hZafgesx5ah6Nvr4O599RBOPd-0fMes9_IBGMqK7o4h5iI7z8kd-uPdy0NA0ugSgTKt2DReJV-t1xR5Hx7tlJkfd1COdW7uE5O1QaWya-g9mJtctuKgrhrHKuGCxMlhtAV-CEJ1hDvUbEd3stG5A9R4hMa_diI2Z8_VkvbDC-JF1-hrVA3pSGBU4etVLZwNP-GiQEOJLAhBkv3szVZAm97c2-azsl0ycn0xhlpwwzPkoxp05s1WDvGo3OpHx7ErHPTLpcuisvq6ak6HKWzpU9m7XCwtpH6NsFdnFZivxWaRokTElrRGxNJ_1Qi_1YmASCTQhvo3ANLWttjWoJhvbTTS8_0zopjOKXH-nglEu4QkXfFkfBXGb1cI1c5TN_isAViuygc7GkIfvOzBeY21gud5S3VF_yp6jPPNdpDwSHZt1n2ED354SNmkCTMGMXTxqDdXCK3m7QdOnHVZwf90O0rK6tdOvrb4eNoxZ6thOnDRtxxZTnsz7bHLGGu0AiNFXJhUNUKzSFPbVH2LAiNu9IZ58wwB3DeS9Rr5-mbnKdDzv_C2ekUf1tllvzDMhsIog-_rWYvhqSqErbfJVg3NoEDZklS-TTirYnLQpe7a3TTjpYjMy9nlD9-YHvxQ6YrG-urGa88nf1Wut3HkbjfXaK7XUJwaFeQFUydsX90dvi8wdV3kvNLB8gfcDUMdpCuvSpYnv8_uVSbri09vZwJXhF0pcUQDy_Hup_fgYd7yFD3aO1OW4hrQy-sdU4vbyc9sP37rxffrlbgTyCXaK_YVZMVkKIxqOCLTO1V-3Cl5mJ3di61V2_RXb19u0Aiqbp_jWrE_yxQ69lywEQOpkDIZFXzkhkuhXVvh1gq93i83n_lpoA9U1w2Giw8a9k6VVgi06ghYwJShK0FAnKPCriAlfOMCtZPdSkVKmAGCmNqbRNDN4RudjJPZWnGUu0I3XwjdFMwIVHlnz6NWz4siSUzmAPXukFQuGMqt8H3XhPev3_84J4UdiigYBpSREt7LZW1dIDO5-amaNJxJitCN2W5P3z9p1byC2aG0I2bUhO68WkUzGbjUb4I8nkwZyNc-LOJF9CZPwtGxWK2nSAGc3_isyAIgjCd5vl87m8ZDWY5DcMRX1CPTj1KAz_06TQY49b3aEq3PqaTdOoxMvGwYrwcWwiWj5GbfOHTaEIno5KlWGr3NEOpe6OwV6DpzUgtHOa02Wky8UqujX5xYbgp3XtOazG9gcfDg4yCLeOle5Y5PtIAa-sJlMujkaC52JWvx7nQRjWZrZhRo8rFL9Fqg9sv6L8BAAD__xUmkX0">