<table border="1" cellspacing="0" cellpadding="8">
    <tr>
        <th>Issue</th>
        <td>
            <a href=https://github.com/llvm/llvm-project/issues/60684>60684</a>
        </td>
    </tr>

    <tr>
        <th>Summary</th>
        <td>
            Clang fails to optimize out unnecessary vsum4ubs instruction with optimization enabled
        </td>
    </tr>

    <tr>
      <th>Labels</th>
      <td>
            new issue
      </td>
    </tr>

    <tr>
      <th>Assignees</th>
      <td>
      </td>
    </tr>

    <tr>
      <th>Reporter</th>
      <td>
          johnplatts
      </td>
    </tr>
</table>

<pre>
    Here is a C++ program that generates an extra vsum4ubs instruction with clang on the powerpc64le-linux-gnu target:
```
#pragma push_macro("vector")
#pragma push_macro("pixel")
#pragma push_macro("bool")

#undef vector
#undef pixel
#undef bool

#include <altivec.h>

#pragma pop_macro("vector")
#pragma pop_macro("pixel")
#pragma pop_macro("bool")

#include <stdint.h>

namespace detail {

// Wrapper around vec_vsum4ubs(a, b) with a compile-time folding path.
//
// When __OPTIMIZE__ is defined and __builtin_constant_p reports every one of
// the 16 byte lanes of `a` and the 4 word lanes of `b` as constant, the result
// is computed in scalar code instead of calling the intrinsic: output lane i is
// a[4*i] + a[4*i+1] + a[4*i+2] + a[4*i+3] + b[i], accumulated in 64 bits and
// clamped to 0xFFFFFFFF. Presumably this mirrors what the vsum4ubs instruction
// computes - confirm against the ISA description.
//
// In every other case the call is forwarded to vec_vsum4ubs unchanged.
inline __attribute__((__always_inline__))
__vector unsigned int AltivecVsum4ubs(__vector unsigned char a,
 __vector unsigned int b) {
#ifdef __OPTIMIZE__
 // Take the scalar path only if all 20 input lanes are known constants.
 if(__builtin_constant_p(a[0]) && __builtin_constant_p(a[1]) &&
 __builtin_constant_p(a[2]) && __builtin_constant_p(a[3]) &&
 __builtin_constant_p(a[4]) && __builtin_constant_p(a[5]) &&
 __builtin_constant_p(a[6]) && __builtin_constant_p(a[7]) &&
 __builtin_constant_p(a[8]) && __builtin_constant_p(a[9]) &&
 __builtin_constant_p(a[10]) && __builtin_constant_p(a[11]) &&
 __builtin_constant_p(a[12]) && __builtin_constant_p(a[13]) &&
 __builtin_constant_p(a[14]) && __builtin_constant_p(a[15]) &&
 __builtin_constant_p(b[0]) && __builtin_constant_p(b[1]) &&
 __builtin_constant_p(b[2]) && __builtin_constant_p(b[3])) {
    // Sums are widened to 64 bits so that the additions cannot wrap before
    // the clamp below is applied.
    const uint64_t sum0 = static_cast<uint64_t>(a[0]) +
 static_cast<uint64_t>(a[1]) +
 static_cast<uint64_t>(a[2]) +
 static_cast<uint64_t>(a[3]) +
 static_cast<uint64_t>(b[0]);
    const uint64_t sum1 = static_cast<uint64_t>(a[4]) +
 static_cast<uint64_t>(a[5]) +
 static_cast<uint64_t>(a[6]) +
 static_cast<uint64_t>(a[7]) +
 static_cast<uint64_t>(b[1]);
    const uint64_t sum2 = static_cast<uint64_t>(a[8]) +
 static_cast<uint64_t>(a[9]) +
 static_cast<uint64_t>(a[10]) +
 static_cast<uint64_t>(a[11]) +
 static_cast<uint64_t>(b[2]);
    const uint64_t sum3 = static_cast<uint64_t>(a[12]) +
 static_cast<uint64_t>(a[13]) +
 static_cast<uint64_t>(a[14]) +
 static_cast<uint64_t>(a[15]) +
 static_cast<uint64_t>(b[3]);
    // Build the result as a generic 4 x uint32_t vector, clamping each sum to
    // the largest 32-bit unsigned value, then cast it to the AltiVec type.
    typedef uint32_t LoadU32VectType __attribute__((__vector_size__(16)));
    return (__vector unsigned int)((LoadU32VectType){
      static_cast<uint32_t>(sum0 <= 0xFFFFFFFFu ? sum0 : 0xFFFFFFFFu),
      static_cast<uint32_t>(sum1 <= 0xFFFFFFFFu ? sum1 : 0xFFFFFFFFu),
      static_cast<uint32_t>(sum2 <= 0xFFFFFFFFu ? sum2 : 0xFFFFFFFFu),
      static_cast<uint32_t>(sum3 <= 0xFFFFFFFFu ? sum3 : 0xFFFFFFFFu)
      });
  } else
#endif
  {
    // Non-constant inputs (or a build without __OPTIMIZE__): use the intrinsic.
    return vec_vsum4ubs(a, b);
  }
}

// Wrapper around the vsum2sws instruction with a compile-time folding path.
//
// When __OPTIMIZE__ is defined and __builtin_constant_p reports a[0..3],
// b[kDestLaneOffset] and b[kDestLaneOffset + 2] as constant, two saturated
// signed sums are computed in scalar code:
//   sum0 = a[0] + a[1] + b[kDestLaneOffset]
//   sum1 = a[2] + a[3] + b[kDestLaneOffset + 2]
// and returned as the two elements of a 2 x uint64_t vector reinterpreted as
// 4 x signed int. Otherwise the instruction is emitted through inline assembly.
inline __attribute__((__always_inline__))
__vector signed int AltivecVsum2sws(__vector signed int a,
 __vector signed int b) {
#ifdef __OPTIMIZE__
  // Index of the first lane of `b` that takes part in the sums; the second
  // one is two lanes further on. The value depends on the target byte order.
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  constexpr int kDestLaneOffset = 0;
#else
 constexpr int kDestLaneOffset = 1;
#endif
 // Take the scalar path only if every lane that is read is a known constant.
 if(__builtin_constant_p(a[0]) && __builtin_constant_p(a[1]) &&
 __builtin_constant_p(a[2]) && __builtin_constant_p(a[3]) &&
 __builtin_constant_p(b[kDestLaneOffset]) &&
 __builtin_constant_p(b[kDestLaneOffset + 2])) {
    // Accumulate in 64 bits so the additions cannot overflow.
    const int64_t sum0 = static_cast<int64_t>(a[0]) +
 static_cast<int64_t>(a[1]) +
 static_cast<int64_t>(b[kDestLaneOffset]);
    const int64_t sum1 = static_cast<int64_t>(a[2]) +
 static_cast<int64_t>(a[3]) +
 static_cast<int64_t>(b[kDestLaneOffset + 2]);
    // 0 for a non-negative sum, -1 for a negative one.
    const int32_t sign0 = static_cast<int32_t>(sum0 >> 63);
    const int32_t sign1 = static_cast<int32_t>(sum1 >> 63);
    // Each result is converted to uint32_t and stored in one uint64_t
    // element, so the upper 32 bits of that element are zero.
    // NOTE(review): which 32-bit lane of the returned vector then holds the
    // value depends on byte order - presumably this matches the lanes that
    // vsum2sws writes; confirm against the ISA description.
    typedef uint64_t LoadU64VectType __attribute__((__vector_size__(16)));
    return (__vector signed int)((LoadU64VectType){
      // If the sum fits in 32 signed bits (sum >> 31 equals the sign fill),
      // keep its low 32 bits. Otherwise saturate: sign ^ 0x7FFFFFFF gives
      // 0x7FFFFFFF for a positive sum and 0x80000000 for a negative one.
      (sign0 == (sum0 >> 31)) ? static_cast<uint32_t>(sum0) :
 static_cast<uint32_t>(sign0 ^ 0x7FFFFFFF),
      (sign1 == (sum1 >> 31)) ? static_cast<uint32_t>(sum1) :
 static_cast<uint32_t>(sign1 ^ 0x7FFFFFFF)
      });
  } else
#endif
  {
    __vector signed int sum;
    
    // Inline assembly is used for vsum2sws to avoid unnecessary shuffling
    // on little-endian PowerPC targets as the result of the vsum2sws
    // instruction will already be in the correct lanes on little-endian
    // PowerPC targets.
    __asm__("vsum2sws %0,%1,%2"
            : "=v" (sum)
            : "v" (a), "v" (b));
    
    return sum;
 }
}

}  // namespace detail

// Reduces the 16 bytes of `v` in two steps: detail::AltivecVsum4ubs adds each
// group of four bytes (with a zero addend), then detail::AltivecVsum2sws adds
// adjacent pairs of those partial sums (again with a zero addend). The result
// is returned reinterpreted as a vector of unsigned int.
inline __vector unsigned int SumsOf8(const __vector unsigned char v) {
    const __vector unsigned int zero_vect = vec_splats(0u);
    // NOTE(review): `sum` is never read. This direct vec_vsum4ubs call appears
    // to be the redundant instruction that this report is about, so it should
    // stay as it is to keep the reproducer meaningful - confirm with the
    // reporter before removing it.
    __vector unsigned int sum = vec_vsum4ubs(v, zero_vect);
    
    return (__vector unsigned int)
 detail::AltivecVsum2sws((__vector signed int)detail::AltivecVsum4ubs(
        v, zero_vect), (__vector signed int)zero_vect);
}

// Non-inline entry point that forwards its argument to SumsOf8. The input is
// a function parameter, so presumably the __builtin_constant_p checks in the
// helpers fail here and the intrinsic / inline-assembly paths are used.
__vector unsigned int SumsOf8Test_1(const __vector unsigned char v) {
    return SumsOf8(v);
}

// Non-inline entry point that calls SumsOf8 on a vector of literal constants,
// which is the case the constant-folding paths in the helpers are written for.
// The first eight bytes add up to 54 and the last eight to 993.
__vector unsigned int SumsOf8Test_2() {
    const __vector unsigned char vals =
 {1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 121, 98, 219};
 return SumsOf8(vals);
}
```

Here is the assembly code that is generated when the above C++ code is compiled with clang++ 15 with the `--target=powerpc64le-linux-gnu -mcpu=power6 --std=c++11 -maltivec -O2` options:
```
        .text
        .abiversion 2
        .file   "vsum4ubs_snippet_021123.cpp"
        .globl  _Z13SumsOf8Test_1Dv16_h # -- Begin function _Z13SumsOf8Test_1Dv16_h
        .p2align        4
        .type   _Z13SumsOf8Test_1Dv16_h,@function
_Z13SumsOf8Test_1Dv16_h: # @_Z13SumsOf8Test_1Dv16_h
.Lfunc_begin0:
        .cfi_startproc
# %bb.0:
        vxor 3, 3, 3
        vsum4ubs 4, 2, 3
        vsum4ubs 2, 2, 3
        #APP
        vsum2sws 2, 2, 3
        #NO_APP
        blr
        .long   0
        .quad   0
.Lfunc_end0:
        .size   _Z13SumsOf8Test_1Dv16_h, .Lfunc_end0-.Lfunc_begin0
        .cfi_endproc
 # -- End function
        .section        .rodata.cst16,"aM",@progbits,16
        .p2align        4 # -- Begin function _Z13SumsOf8Test_2v
.LCPI1_0:
        .byte   1                               # 0x1
        .byte   1                               # 0x1
        .byte   2 # 0x2
        .byte   3                               # 0x3
        .byte   5                               # 0x5
        .byte   8 # 0x8
        .byte   13                              # 0xd
        .byte   21                              # 0x15
        .byte   34 # 0x22
        .byte   55                              # 0x37
        .byte   89                              # 0x59
        .byte   144 # 0x90
        .byte   233                             # 0xe9
        .byte   121                             # 0x79
        .byte   98 # 0x62
        .byte   219                             # 0xdb
.LCPI1_1:
        .long   54                              # 0x36
        .long   0                               # 0x0
        .long   993 # 0x3e1
        .long   0                               # 0x0
        .text
        .globl  _Z13SumsOf8Test_2v
        .p2align        4
        .type   _Z13SumsOf8Test_2v,@function
_Z13SumsOf8Test_2v: # @_Z13SumsOf8Test_2v
.Lfunc_begin1:
        .cfi_startproc
.Lfunc_gep1:
        addis 2, 12, .TOC.-.Lfunc_gep1@ha
        addi 2, 2, .TOC.-.Lfunc_gep1@l
.Lfunc_lep1:
        .localentry     _Z13SumsOf8Test_2v, .Lfunc_lep1-.Lfunc_gep1
# %bb.0:
        addis 3, 2, .LCPI1_1@toc@ha
        vxor 3, 3, 3
        addi 3, 3, .LCPI1_1@toc@l
        lvx 2, 0, 3
        addis 3, 2, .LCPI1_0@toc@ha
        addi 3, 3, .LCPI1_0@toc@l
        lvx 4, 0, 3
        vsum4ubs 3, 4, 3
        blr
        .long   0
        .quad   0
.Lfunc_end1:
        .size   _Z13SumsOf8Test_2v, .Lfunc_end1-.Lfunc_begin1
        .cfi_endproc
 # -- End function
        .ident  "Ubuntu clang version 15.0.6"
        .section        ".note.GNU-stack","",@progbits
        .addrsig
```

Here is the assembly code that is generated when the above C++ code is compiled with g++ 12 with the `-mcpu=power6 --std=c++11 -maltivec -O2` options:
```
        .file   "vsum4ubs_snippet_021123.cpp"
        .machine power7
        .abiversion 2
        .section        ".text"
        .align 2
        .p2align 4,,15
        .globl _Z13SumsOf8Test_1Dv16_h
        .type   _Z13SumsOf8Test_1Dv16_h, @function
_Z13SumsOf8Test_1Dv16_h:
.LFB3:
        .cfi_startproc
        vxor 0,0,0
        vsum4ubs 2,2,0
#APP
 # 104 "vsum4ubs_snippet_021123.cpp" 1
        vsum2sws 2,2,0
 # 0 "" 2
#NO_APP
        blr
        .long 0
        .byte 0,9,0,0,0,0,0,0
        .cfi_endproc
.LFE3:
        .size   _Z13SumsOf8Test_1Dv16_h,.-_Z13SumsOf8Test_1Dv16_h
        .align 2
        .p2align 4,,15
        .globl _Z13SumsOf8Test_2v
        .type   _Z13SumsOf8Test_2v, @function
_Z13SumsOf8Test_2v:
.LFB4:
        .cfi_startproc
.LCF1:
0:      addis 2,12,.TOC.-.LCF1@ha
        addi 2,2,.TOC.-.LCF1@l
        .localentry     _Z13SumsOf8Test_2v,.-_Z13SumsOf8Test_2v
        addis 9,2,.LC0@toc@ha
        addi 9,9,.LC0@toc@l
        lvx 2,0,9
        blr
        .long 0
        .byte 0,9,0,0,0,0,0,0
        .cfi_endproc
.LFE4:
        .size   _Z13SumsOf8Test_2v,.-_Z13SumsOf8Test_2v
        .section        .rodata.cst16,"aM",@progbits,16
        .align 4
.LC0:
        .long   54
        .long   0
        .long   993
        .long   0
        .ident  "GCC: (Ubuntu 12.1.0-2ubuntu1~22.04) 12.1.0"
        .section        .note.GNU-stack,"",@progbits
```
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJzcWl1zozjW_jXKjcoUEhibi1wkdjJvqjKd1LyZqdq5oQTItmZkwSLhJHOxv31L4sNgwECnZ7dqu7rdMTrn6HzpPA8QIiXbC0pvwfIeLLc3JFeHJLv9IzmIlBOl5E2YxJ-3_0czCpmEBG4Avgf4HqZZss_IEaoDUXBPBc2IohISAemHygg8yfzo5qGETEiV5ZFiiYDvTB1gxInYw0RAdaAwTd5plkaey-mCM5F_LPYih4pke6qAcwfsLbDvgGeXf4uv2Ekzsj8SmObyEBxJlCUArwHGJxqpJAMYA-yPyabsg_JpomGStCUr-VzEdAfLXdsXC_Pta8ZO2wITEc9jCoGzIVyxE42sA3AeLqQqv5J0crRt0WvBtiUHY214KlXMhOo4KsiRypREFMZUEcYhWN0315ngTFAYBESpjIW5okFgNl0HAeHv5FMGhYi-7NceBEERKMyF6dUYMqHgXZGt38o2M0Yu5aIDySABeFMYgv2WQoD9hqvYYTtdrCB4eX17-vnp94cgKPXZzmwT5owrJoIoEVIRoYIU4DUBy3sbLLfGGPYA9uAVSdSSrN0blMeTLTszLbuTLS9nWvYmW17NtLyebNmfaRnNKOHcGqLpRURzq4imlxHNqWM4uavDmV0dTu7q8NzVrZMKIYRGEOZMKM8NFJT50YbA2UKpiGJREBGpgLOp1vW8ujyplbExDTRbA8_WcGZpNGoDnKs5QZNy4s72dzlbw5utsZqdEzQpJ3hSTtaz_fXnd9Z3NOO8bmyctZGsOJOygua3NprX2xczbbLKvI5sTpZmXtRnSjUF0NIODhR8Tkj8q4N_o5F6-0wHKEzBLQLJ_iouIq9kMRfWM6ryTMBexsKEMhra4sWm-npz9MGe0LSzRWjlJNzoctofj-WfHALnsZqSd80Fs-tmhnV0xTr6snV8xTr-snXninWnz3rDNFhdtAtYbSHlktb0kYqY7c6r3cqfaBSczrxVE1RDQttGS3v1Dz-EQvcTaCzfWwS6IdXHnr-HO5slGAT3_3h7CF5-2T78EgR61ugqBOWF56e3t-eH4OHb9unuW026iylFP9LM7Pjnlkr1TAR92e0kVWZe2XXmdPrrWkzQRC3NRuH-16i-HnUXCfiath6xEF-lZSOsbDYpm83JOsO-NwM9kDjCnWZTvdlMb8zzVvZ7_Te4pU_qQOo7UPEAnAfoOeMGBzLSQYcBg01wNUk2OOe5fxu4DkHrecsutOooqtzpaC_S5KCq6zVojMBwIXc3yEca0sWWywdof6xK_OmCWymI2r6h7_ENzfMN9fj2g6CxD1xkfmzVtvETfgT4ET4VaEikpMeQf0ImYS5pDHdJZp4_alyDKoHklLAY5kLQiEpJsk8oD_lux5nYd2wmAnKmFKcL7SwR8DV5p9nrpnwmKSGR5sFlRmXOFUx25lu1W8dc--kn55DwjJL4E4YUsuIRaJRkGY0U5ERQ2dm_Y_HCH6uZQyKP5ZHBdfwAL23dQXiJiv8wwLhZtrJ4zh3UC872BDAum-qiwC3JSowUHdq8FPYczM4RbVV3iO_oDqoCv3y82MuL-h7w_X9-lC87fe6LYTbwoPA0AGP9Rv-iWWIWzDTUlE6mnCjNo-z8Mvh-EzI_1soNPnjS2azNj-bx2n1EIVumy7kDzl2X9Q2OygG10st2X3R9Nh0xYLk3uIvKXy3lG5UqQPMLWqbs3BCnL3mATfomNk3hEuHSzO2q7e_1oYTmA-sPR38s9cfaLJjv2Kw7rlkrFk2CkWsuYcdIoULMXxcqvg6n7pxu5ITL_uAvXraYz-r9j55X9biNkpgW73-YrF8BxfD9QIvBRsLkROsXRkaaSRglx5RxLVe_DCol0LK4pnWBZy8W1Wugbf9bosUxSvNq1YOLhVQxcLZRYQ4huDiW71Tg4gUDz4ZJqgexHHyxZPuW
oh_q_I2E7EQzqac3Pl_dMU6B7ZdTVh-IQAqWplQFNkYIO1aUpvWU1Rp7noQc2H7wO3JaPbw9IS84QIAduFjAe7pnAu5yUQDGgPTZbIoJZ3sBbN9tRKAZzeBWGgNcu9qi7PQBUTPoHQhc-6on1rM2F4TaefucW9u3oh0LpCKZSrMkqlmAhqQwtFqip48kK5q__KgXqveIbuOMdBZxzyLAzt3ra0vWAOKA7LeXoCke8uwcB080U_AbbfLPnMTnK2UGqIjb8Wu6erUUsKG5aOexlUQq4nMKq255EDFsF9LsScsLvpUlMVHEiqTSdHkDMCY_m3d7ugXSLNmHTAPWBnm9LTWxK_GpSsLm9QkF7QyEn0pnAMHrf_RO9gf6YXq4XMGdFWeSRaejt5ykt-zorcuVdTe2EVcKvbgb20hSypx0XXHcKivdtCxH4ivTsurG50_Ki98N363caZ6rKkbnenIKRdpjdSQ7heKqq1gXyusmB6PrQZaVClvnALXOQTlBlu6UbDled_Rc1ysV7Y6e7zuVUYq-brWNjkOoVk2EuRClbyDG0QmfrgDTeRidZykaw6RSeE_TliiJY1bCBTKf1tvLxlo0pV37QFryDXTpk-atDfnFhhZPIsKpUNnnUHZgQ7Vl-xq4FoE4Z8-qDnVtlUTtIAaB2ER3XujYqO_HfH76KPaxuxZ63LD73OjfzR7Yze3ZreYGxoDbXvxOdEdT0L1VJ620aDfj9wI7i6lQBe38NcyFysvfpKoIKlpatuW1aOeZCwCMLZEoav307deFVCT6syQCGHcZwZn-xnEm2f6_cWNQ3xTg9k3B38H55_P5I4kOTJS_vrYau19ol8HM0KYxMyCb8uXMNE2r6dnyYuSO3xiM3gPAGTcB1Sl4vHdG2X05QPRZLP71UnXcWGqwdNP_yHbhaCUg6uf1DcMFeMGiwevsjvN8eMFGTCx-Hc_lv-HTbD0_PjjzbgesxWhlv94sTXS-BsTjLWKQ-Nwd7jjObh7PI7SAqAbKGpCtUFNL9qNrV4rPQdBujhsJKZzxq12eN8Pg5Jdt0RLqgmDRPf-ZbnMnw9PVJHz1FrLqx6rmdj8TvgbAZ_Z6TaqBiD9tNgUrXJfYiLCFLHuBc_MV_Qtjy3YB9suFAZjsYOQ1gKyA5Ca-dWLf8ckNvUXeykOev8L-zeHWt92d46_9MMahi3zkIGIvd34UI7KyvTC-YbfYxo6NEbaxi23f8u0dwbsVRjsaYT-iwLXpkTBucX46Wkm2v2FS5vTWs721e8NJSLk0v7qNsaDv0Cxqd5fbm-xW6yzCfC91YzKp5NmKYorT243hDzvCuHlNopHyyP6iMMlV623J8O9ylyrEXKGChJzGN3nGbw9KpQZzzbP7PVOHPLSi5Ajwo3ai_G-RZskf5jnwo3FdAvxoQvt3AAAA___qGxcj">