<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href="https://github.com/llvm/llvm-project/issues/60684">60684</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
Clang fails to optimize out unnecessary vsum4ubs instruction with optimization enabled
</td>
</tr>
<tr>
<th>Labels</th>
<td>
new issue
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
johnplatts
</td>
</tr>
</table>
<pre>
Here is a C++ program for which clang emits an extra vsum4ubs instruction (one whose result is never used) on the powerpc64le-linux-gnu target, even with optimization enabled:
```
#pragma push_macro("vector")
#pragma push_macro("pixel")
#pragma push_macro("bool")
#undef vector
#undef pixel
#undef bool
#include <altivec.h>
#pragma pop_macro("vector")
#pragma pop_macro("pixel")
#pragma pop_macro("bool")
#include <stdint.h>
namespace detail {
// Wrapper around vec_vsum4ubs with a scalar path for constant operands.
//
// When __OPTIMIZE__ is defined and __builtin_constant_p reports every one of
// the 16 bytes of `a` and the 4 words of `b` as constant, each 32-bit result
// lane i is computed in scalar code as
//   min(a[4*i] + a[4*i+1] + a[4*i+2] + a[4*i+3] + b[i], 0xFFFFFFFF)
// and returned as a vector literal. In every other case the call is forwarded
// to vec_vsum4ubs(a, b).
// NOTE(review): the scalar path is presumably meant to mirror what
// vec_vsum4ubs computes; that equivalence is not established by this snippet.
inline __attribute__((__always_inline__))
__vector unsigned int AltivecVsum4ubs(__vector unsigned char a,
__vector unsigned int b) {
#ifdef __OPTIMIZE__
// Take the scalar path only if every input element is a known constant.
if(__builtin_constant_p(a[0]) && __builtin_constant_p(a[1]) &&
__builtin_constant_p(a[2]) && __builtin_constant_p(a[3]) &&
__builtin_constant_p(a[4]) && __builtin_constant_p(a[5]) &&
__builtin_constant_p(a[6]) && __builtin_constant_p(a[7]) &&
__builtin_constant_p(a[8]) && __builtin_constant_p(a[9]) &&
__builtin_constant_p(a[10]) && __builtin_constant_p(a[11]) &&
__builtin_constant_p(a[12]) && __builtin_constant_p(a[13]) &&
__builtin_constant_p(a[14]) && __builtin_constant_p(a[15]) &&
__builtin_constant_p(b[0]) && __builtin_constant_p(b[1]) &&
__builtin_constant_p(b[2]) && __builtin_constant_p(b[3])) {
// Accumulate in 64 bits so the five-term sum cannot wrap before it is
// clamped to 32 bits below.
const uint64_t sum0 = static_cast<uint64_t>(a[0]) +
static_cast<uint64_t>(a[1]) +
static_cast<uint64_t>(a[2]) +
static_cast<uint64_t>(a[3]) +
static_cast<uint64_t>(b[0]);
const uint64_t sum1 = static_cast<uint64_t>(a[4]) +
static_cast<uint64_t>(a[5]) +
static_cast<uint64_t>(a[6]) +
static_cast<uint64_t>(a[7]) +
static_cast<uint64_t>(b[1]);
const uint64_t sum2 = static_cast<uint64_t>(a[8]) +
static_cast<uint64_t>(a[9]) +
static_cast<uint64_t>(a[10]) +
static_cast<uint64_t>(a[11]) +
static_cast<uint64_t>(b[2]);
const uint64_t sum3 = static_cast<uint64_t>(a[12]) +
static_cast<uint64_t>(a[13]) +
static_cast<uint64_t>(a[14]) +
static_cast<uint64_t>(a[15]) +
static_cast<uint64_t>(b[3]);
typedef uint32_t LoadU32VectType __attribute__((__vector_size__(16)));
// Clamp each sum to the largest uint32_t value and pack the four results
// into a generic 4 x uint32_t vector, then cast to the AltiVec vector type.
return (__vector unsigned int)((LoadU32VectType){
static_cast<uint32_t>(sum0 <= 0xFFFFFFFFu ? sum0 : 0xFFFFFFFFu),
static_cast<uint32_t>(sum1 <= 0xFFFFFFFFu ? sum1 : 0xFFFFFFFFu),
static_cast<uint32_t>(sum2 <= 0xFFFFFFFFu ? sum2 : 0xFFFFFFFFu),
static_cast<uint32_t>(sum3 <= 0xFFFFFFFFu ? sum3 : 0xFFFFFFFFu)
});
} else
#endif
{
return vec_vsum4ubs(a, b);
}
}
// Wrapper around the vsum2sws instruction with a scalar path for constant
// operands.
//
// When __OPTIMIZE__ is defined and __builtin_constant_p reports all four
// words of `a` and the two relevant words of `b` as constant, the result is
// computed in scalar code:
//   r0 = saturate_int32(a[0] + a[1] + b[kDestLaneOffset])
//   r1 = saturate_int32(a[2] + a[3] + b[kDestLaneOffset + 2])
// Each result is zero-extended into one 64-bit element of the returned
// vector, so the other 32-bit word of each 64-bit element is zero.
// In every other case the vsum2sws instruction is issued via inline assembly.
inline __attribute__((__always_inline__))
__vector signed int AltivecVsum2sws(__vector signed int a,
__vector signed int b) {
#ifdef __OPTIMIZE__
// Index of the first word of `b` that takes part in the sums; the second one
// is two words further on. It is 0 on little-endian and 1 otherwise.
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
constexpr int kDestLaneOffset = 0;
#else
constexpr int kDestLaneOffset = 1;
#endif
// Take the scalar path only if every element that is read is a known
// constant.
if(__builtin_constant_p(a[0]) && __builtin_constant_p(a[1]) &&
__builtin_constant_p(a[2]) && __builtin_constant_p(a[3]) &&
__builtin_constant_p(b[kDestLaneOffset]) &&
__builtin_constant_p(b[kDestLaneOffset + 2])) {
// Accumulate in 64 bits so the three-term signed sum cannot overflow.
const int64_t sum0 = static_cast<int64_t>(a[0]) +
static_cast<int64_t>(a[1]) +
static_cast<int64_t>(b[kDestLaneOffset]);
const int64_t sum1 = static_cast<int64_t>(a[2]) +
static_cast<int64_t>(a[3]) +
static_cast<int64_t>(b[kDestLaneOffset + 2]);
// 0 for a non-negative sum, -1 for a negative one (relies on >> of a
// negative int64_t being an arithmetic shift).
const int32_t sign0 = static_cast<int32_t>(sum0 >> 63);
const int32_t sign1 = static_cast<int32_t>(sum1 >> 63);
typedef uint64_t LoadU64VectType __attribute__((__vector_size__(16)));
// A sum fits in int32_t exactly when (sum >> 31) equals its sign (0 or -1).
// Otherwise sign ^ 0x7FFFFFFF gives 0x7FFFFFFF for positive overflow and
// 0x80000000 for negative overflow.
return (__vector signed int)((LoadU64VectType){
(sign0 == (sum0 >> 31)) ? static_cast<uint32_t>(sum0) :
static_cast<uint32_t>(sign0 ^ 0x7FFFFFFF),
(sign1 == (sum1 >> 31)) ? static_cast<uint32_t>(sum1) :
static_cast<uint32_t>(sign1 ^ 0x7FFFFFFF)
});
} else
#endif
{
__vector signed int sum;
// Inline assembly is used for vsum2sws to avoid unnecessary shuffling
// on little-endian PowerPC targets as the result of the vsum2sws
// instruction will already be in the correct lanes on little-endian
// PowerPC targets.
__asm__("vsum2sws %0,%1,%2"
: "=v" (sum)
: "v" (a), "v" (b));
return sum;
}
}
} // namespace detail
// Returns detail::AltivecVsum2sws applied to the result of
// detail::AltivecVsum4ubs(v, 0), with a zero vector as the second operand of
// both steps. Judging by the name and the helpers, this yields the sum of each
// group of 8 bytes of `v`.
inline __vector unsigned int SumsOf8(const __vector unsigned char v) {
const __vector unsigned int zero_vect = vec_splats(0u);
// NOTE: `sum` is never read. This unused direct vec_vsum4ubs call appears to
// be the source of the surplus vsum4ubs instruction in the clang output
// quoted in this report; it is kept on purpose so the reproducer stays intact.
__vector unsigned int sum = vec_vsum4ubs(v, zero_vect);
return (__vector unsigned int)
detail::AltivecVsum2sws((__vector signed int)detail::AltivecVsum4ubs(
v, zero_vect), (__vector signed int)zero_vect);
}
// Test case with a non-constant argument: forwards `v` to SumsOf8, so the
// __builtin_constant_p checks in the helpers cannot succeed and the
// instruction-based paths are expected to be used.
__vector unsigned int SumsOf8Test_1(const __vector unsigned char v) {
return SumsOf8(v);
}
// Test case with a constant argument: calls SumsOf8 on a fixed byte vector so
// the whole computation can be folded at compile time. The first eight bytes
// sum to 54 and the last eight to 993, matching the {54, 0, 993, 0} constant
// that both compilers emit in the listings quoted in this report.
__vector unsigned int SumsOf8Test_2() {
const __vector unsigned char vals =
{1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 121, 98, 219};
return SumsOf8(vals);
}
```
Here is the assembly code that is generated when the above C++ code is compiled with clang++ 15 with the `--target=powerpc64le-linux-gnu -mcpu=power6 --std=c++11 -maltivec -O2` options (note that each function contains a vsum4ubs whose destination register is never read afterwards):
```
.text
.abiversion 2
.file "vsum4ubs_snippet_021123.cpp"
.globl _Z13SumsOf8Test_1Dv16_h # -- Begin function _Z13SumsOf8Test_1Dv16_h
.p2align 4
.type _Z13SumsOf8Test_1Dv16_h,@function
_Z13SumsOf8Test_1Dv16_h: # @_Z13SumsOf8Test_1Dv16_h
.Lfunc_begin0:
.cfi_startproc
# %bb.0:
vxor 3, 3, 3
vsum4ubs 4, 2, 3
vsum4ubs 2, 2, 3
#APP
vsum2sws 2, 2, 3
#NO_APP
blr
.long 0
.quad 0
.Lfunc_end0:
.size _Z13SumsOf8Test_1Dv16_h, .Lfunc_end0-.Lfunc_begin0
.cfi_endproc
# -- End function
.section .rodata.cst16,"aM",@progbits,16
.p2align 4 # -- Begin function _Z13SumsOf8Test_2v
.LCPI1_0:
.byte 1 # 0x1
.byte 1 # 0x1
.byte 2 # 0x2
.byte 3 # 0x3
.byte 5 # 0x5
.byte 8 # 0x8
.byte 13 # 0xd
.byte 21 # 0x15
.byte 34 # 0x22
.byte 55 # 0x37
.byte 89 # 0x59
.byte 144 # 0x90
.byte 233 # 0xe9
.byte 121 # 0x79
.byte 98 # 0x62
.byte 219 # 0xdb
.LCPI1_1:
.long 54 # 0x36
.long 0 # 0x0
.long 993 # 0x3e1
.long 0 # 0x0
.text
.globl _Z13SumsOf8Test_2v
.p2align 4
.type _Z13SumsOf8Test_2v,@function
_Z13SumsOf8Test_2v: # @_Z13SumsOf8Test_2v
.Lfunc_begin1:
.cfi_startproc
.Lfunc_gep1:
addis 2, 12, .TOC.-.Lfunc_gep1@ha
addi 2, 2, .TOC.-.Lfunc_gep1@l
.Lfunc_lep1:
.localentry _Z13SumsOf8Test_2v, .Lfunc_lep1-.Lfunc_gep1
# %bb.0:
addis 3, 2, .LCPI1_1@toc@ha
vxor 3, 3, 3
addi 3, 3, .LCPI1_1@toc@l
lvx 2, 0, 3
addis 3, 2, .LCPI1_0@toc@ha
addi 3, 3, .LCPI1_0@toc@l
lvx 4, 0, 3
vsum4ubs 3, 4, 3
blr
.long 0
.quad 0
.Lfunc_end1:
.size _Z13SumsOf8Test_2v, .Lfunc_end1-.Lfunc_begin1
.cfi_endproc
# -- End function
.ident "Ubuntu clang version 15.0.6"
.section ".note.GNU-stack","",@progbits
.addrsig
```
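Here is the assembly code that is generated when the above C++ code is compiled with g++ 12 with the `-mcpu=power6 --std=c++11 -maltivec -O2` options (GCC emits only the single vsum4ubs whose result is used in SumsOf8Test_1, and no vsum4ubs at all in SumsOf8Test_2):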
```
.file "vsum4ubs_snippet_021123.cpp"
.machine power7
.abiversion 2
.section ".text"
.align 2
.p2align 4,,15
.globl _Z13SumsOf8Test_1Dv16_h
.type _Z13SumsOf8Test_1Dv16_h, @function
_Z13SumsOf8Test_1Dv16_h:
.LFB3:
.cfi_startproc
vxor 0,0,0
vsum4ubs 2,2,0
#APP
# 104 "vsum4ubs_snippet_021123.cpp" 1
vsum2sws 2,2,0
# 0 "" 2
#NO_APP
blr
.long 0
.byte 0,9,0,0,0,0,0,0
.cfi_endproc
.LFE3:
.size _Z13SumsOf8Test_1Dv16_h,.-_Z13SumsOf8Test_1Dv16_h
.align 2
.p2align 4,,15
.globl _Z13SumsOf8Test_2v
.type _Z13SumsOf8Test_2v, @function
_Z13SumsOf8Test_2v:
.LFB4:
.cfi_startproc
.LCF1:
0: addis 2,12,.TOC.-.LCF1@ha
addi 2,2,.TOC.-.LCF1@l
.localentry _Z13SumsOf8Test_2v,.-_Z13SumsOf8Test_2v
addis 9,2,.LC0@toc@ha
addi 9,9,.LC0@toc@l
lvx 2,0,9
blr
.long 0
.byte 0,9,0,0,0,0,0,0
.cfi_endproc
.LFE4:
.size _Z13SumsOf8Test_2v,.-_Z13SumsOf8Test_2v
.section .rodata.cst16,"aM",@progbits,16
.align 4
.LC0:
.long 54
.long 0
.long 993
.long 0
.ident "GCC: (Ubuntu 12.1.0-2ubuntu1~22.04) 12.1.0"
.section .note.GNU-stack,"",@progbits
```
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJzcWl1zozjW_jXKjcoUEhibi1wkdjJvqjKd1LyZqdq5oQTItmZkwSLhJHOxv31L4sNgwECnZ7dqu7rdMTrn6HzpPA8QIiXbC0pvwfIeLLc3JFeHJLv9IzmIlBOl5E2YxJ-3_0czCpmEBG4Avgf4HqZZss_IEaoDUXBPBc2IohISAemHygg8yfzo5qGETEiV5ZFiiYDvTB1gxInYw0RAdaAwTd5plkaey-mCM5F_LPYih4pke6qAcwfsLbDvgGeXf4uv2Ekzsj8SmObyEBxJlCUArwHGJxqpJAMYA-yPyabsg_JpomGStCUr-VzEdAfLXdsXC_Pta8ZO2wITEc9jCoGzIVyxE42sA3AeLqQqv5J0crRt0WvBtiUHY214KlXMhOo4KsiRypREFMZUEcYhWN0315ngTFAYBESpjIW5okFgNl0HAeHv5FMGhYi-7NceBEERKMyF6dUYMqHgXZGt38o2M0Yu5aIDySABeFMYgv2WQoD9hqvYYTtdrCB4eX17-vnp94cgKPXZzmwT5owrJoIoEVIRoYIU4DUBy3sbLLfGGPYA9uAVSdSSrN0blMeTLTszLbuTLS9nWvYmW17NtLyebNmfaRnNKOHcGqLpRURzq4imlxHNqWM4uavDmV0dTu7q8NzVrZMKIYRGEOZMKM8NFJT50YbA2UKpiGJREBGpgLOp1vW8ujyplbExDTRbA8_WcGZpNGoDnKs5QZNy4s72dzlbw5utsZqdEzQpJ3hSTtaz_fXnd9Z3NOO8bmyctZGsOJOygua3NprX2xczbbLKvI5sTpZmXtRnSjUF0NIODhR8Tkj8q4N_o5F6-0wHKEzBLQLJ_iouIq9kMRfWM6ryTMBexsKEMhra4sWm-npz9MGe0LSzRWjlJNzoctofj-WfHALnsZqSd80Fs-tmhnV0xTr6snV8xTr-snXninWnz3rDNFhdtAtYbSHlktb0kYqY7c6r3cqfaBSczrxVE1RDQttGS3v1Dz-EQvcTaCzfWwS6IdXHnr-HO5slGAT3_3h7CF5-2T78EgR61ugqBOWF56e3t-eH4OHb9unuW026iylFP9LM7Pjnlkr1TAR92e0kVWZe2XXmdPrrWkzQRC3NRuH-16i-HnUXCfiath6xEF-lZSOsbDYpm83JOsO-NwM9kDjCnWZTvdlMb8zzVvZ7_Te4pU_qQOo7UPEAnAfoOeMGBzLSQYcBg01wNUk2OOe5fxu4DkHrecsutOooqtzpaC_S5KCq6zVojMBwIXc3yEca0sWWywdof6xK_OmCWymI2r6h7_ENzfMN9fj2g6CxD1xkfmzVtvETfgT4ET4VaEikpMeQf0ImYS5pDHdJZp4_alyDKoHklLAY5kLQiEpJsk8oD_lux5nYd2wmAnKmFKcL7SwR8DV5p9nrpnwmKSGR5sFlRmXOFUx25lu1W8dc--kn55DwjJL4E4YUsuIRaJRkGY0U5ERQ2dm_Y_HCH6uZQyKP5ZHBdfwAL23dQXiJiv8wwLhZtrJ4zh3UC872BDAum-qiwC3JSowUHdq8FPYczM4RbVV3iO_oDqoCv3y82MuL-h7w_X9-lC87fe6LYTbwoPA0AGP9Rv-iWWIWzDTUlE6mnCjNo-z8Mvh-EzI_1soNPnjS2azNj-bx2n1EIVumy7kDzl2X9Q2OygG10st2X3R9Nh0xYLk3uIvKXy3lG5UqQPMLWqbs3BCnL3mATfomNk3hEuHSzO2q7e_1oYTmA-sPR38s9cfaLJjv2Kw7rlkrFk2CkWsuYcdIoULMXxcqvg6n7pxu5ITL_uAvXraYz-r9j55X9biNkpgW73-YrF8BxfD9QIvBRsLkROsXRkaaSRglx5RxLVe_DCol0LK4pnWBZy8W1Wugbf9bosUxSvNq1YOLhVQxcLZRYQ4huDiW71Tg4gUDz4ZJqgexHHyxZPuW
oh_q_I2E7EQzqac3Pl_dMU6B7ZdTVh-IQAqWplQFNkYIO1aUpvWU1Rp7noQc2H7wO3JaPbw9IS84QIAduFjAe7pnAu5yUQDGgPTZbIoJZ3sBbN9tRKAZzeBWGgNcu9qi7PQBUTPoHQhc-6on1rM2F4TaefucW9u3oh0LpCKZSrMkqlmAhqQwtFqip48kK5q__KgXqveIbuOMdBZxzyLAzt3ra0vWAOKA7LeXoCke8uwcB080U_AbbfLPnMTnK2UGqIjb8Wu6erUUsKG5aOexlUQq4nMKq255EDFsF9LsScsLvpUlMVHEiqTSdHkDMCY_m3d7ugXSLNmHTAPWBnm9LTWxK_GpSsLm9QkF7QyEn0pnAMHrf_RO9gf6YXq4XMGdFWeSRaejt5ykt-zorcuVdTe2EVcKvbgb20hSypx0XXHcKivdtCxH4ivTsurG50_Ki98N363caZ6rKkbnenIKRdpjdSQ7heKqq1gXyusmB6PrQZaVClvnALXOQTlBlu6UbDled_Rc1ysV7Y6e7zuVUYq-brWNjkOoVk2EuRClbyDG0QmfrgDTeRidZykaw6RSeE_TliiJY1bCBTKf1tvLxlo0pV37QFryDXTpk-atDfnFhhZPIsKpUNnnUHZgQ7Vl-xq4FoE4Z8-qDnVtlUTtIAaB2ER3XujYqO_HfH76KPaxuxZ63LD73OjfzR7Yze3ZreYGxoDbXvxOdEdT0L1VJ620aDfj9wI7i6lQBe38NcyFysvfpKoIKlpatuW1aOeZCwCMLZEoav307deFVCT6syQCGHcZwZn-xnEm2f6_cWNQ3xTg9k3B38H55_P5I4kOTJS_vrYau19ol8HM0KYxMyCb8uXMNE2r6dnyYuSO3xiM3gPAGTcB1Sl4vHdG2X05QPRZLP71UnXcWGqwdNP_yHbhaCUg6uf1DcMFeMGiwevsjvN8eMFGTCx-Hc_lv-HTbD0_PjjzbgesxWhlv94sTXS-BsTjLWKQ-Nwd7jjObh7PI7SAqAbKGpCtUFNL9qNrV4rPQdBujhsJKZzxq12eN8Pg5Jdt0RLqgmDRPf-ZbnMnw9PVJHz1FrLqx6rmdj8TvgbAZ_Z6TaqBiD9tNgUrXJfYiLCFLHuBc_MV_Qtjy3YB9suFAZjsYOQ1gKyA5Ca-dWLf8ckNvUXeykOev8L-zeHWt92d46_9MMahi3zkIGIvd34UI7KyvTC-YbfYxo6NEbaxi23f8u0dwbsVRjsaYT-iwLXpkTBucX46Wkm2v2FS5vTWs721e8NJSLk0v7qNsaDv0Cxqd5fbm-xW6yzCfC91YzKp5NmKYorT243hDzvCuHlNopHyyP6iMMlV623J8O9ylyrEXKGChJzGN3nGbw9KpQZzzbP7PVOHPLSi5Ajwo3ai_G-RZskf5jnwo3FdAvxoQvt3AAAA___qGxcj">