<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href="https://github.com/llvm/llvm-project/issues/62293">62293</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
std::reduce has bad floating point code generation on x64
</td>
</tr>
<tr>
<th>Labels</th>
<td>
new issue
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
chriselrod
</td>
</tr>
</table>
<pre>
I looked for a previous issue but did not find one. I may have missed it; I'd be surprised if no one reported this before.
Example:
https://godbolt.org/z/vhEcYGMKr
The example uses `std::vector` and shows both `std::reduce` and `std::reduce(std::execution::unseq, ...)`, each with and without `-fassociative-math -fno-signed-zeros`. I added gcc and LLVM-based icx for comparison.
cppreference says
> The behavior is non-deterministic if binary_op is not associative or not commutative.
https://en.cppreference.com/w/cpp/algorithm/reduce
I'm not sure exactly what the standard says, but I'd interpret this as meaning reassociation is allowed for floating point.
However, neither clang nor gcc reordered the floating point operations without the associative flags. icx did, but it also reordered a loop, clearly violating IEEE.
`unseq` doesn't help in this respect, but I guess it's about the legality of executing iterations in parallel, rather than about reassociating a reduction.
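For reference, a minimal sketch of the kind of code under discussion (`sum_reduce` matches the mangled symbol in the assembly below; `sum_reduce_unseq` is a hypothetical name for the `unseq` variant, and the exact godbolt source may differ):
```cpp
// Compile with something like (the target flags are an assumption; the
// zmm registers in the assembly below suggest an AVX-512-capable target):
//   clang++ -O3 -march=skylake-avx512 -fassociative-math -fno-signed-zeros
#include <execution>
#include <numeric>
#include <vector>

// Serial reduction; name matches the symbol in the assembly below.
double sum_reduce(const std::vector<double>& v) {
    return std::reduce(v.begin(), v.end(), 0.0);
}

// Hypothetical name for the unsequenced-policy variant mentioned above.
double sum_reduce_unseq(const std::vector<double>& v) {
    return std::reduce(std::execution::unseq, v.begin(), v.end(), 0.0);
}
```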
With `-fassociative-math -fno-signed-zeros`, clang's code gen was bad:
```asm
sum_reduce(std::vector<double, std::allocator<double> > const&): # @sum_reduce(std::vector<double, std::allocator<double> > const&)
mov rsi, qword ptr [rdi]
mov rax, qword ptr [rdi + 8]
mov rdx, rax
sub rdx, rsi
vxorpd xmm0, xmm0, xmm0
cmp rdx, 25
jl .LBB0_1
add rdx, -25
cmp rdx, 480
jae .LBB0_7
mov rcx, rsi
jmp .LBB0_10
.LBB0_1:
mov rcx, rsi
jmp .LBB0_2
.LBB0_7:
shr rdx, 5
inc rdx
mov rdi, rdx
and rdi, -16
mov rcx, rdi
shl rcx, 5
add rcx, rsi
add rsi, 384
vxorpd xmm0, xmm0, xmm0
mov r8, rdi
vxorpd xmm1, xmm1, xmm1
vxorpd xmm2, xmm2, xmm2
vxorpd xmm3, xmm3, xmm3
.LBB0_8: # =>This Inner Loop Header: Depth=1
vmovupd ymm4, ymmword ptr [rsi - 384]
vmovupd ymm5, ymmword ptr [rsi - 352]
vmovupd ymm6, ymmword ptr [rsi - 320]
vmovupd ymm7, ymmword ptr [rsi - 288]
vperm2f128 ymm8, ymm4, ymm6, 32 # ymm8 = ymm4[0,1],ymm6[0,1]
vperm2f128 ymm9, ymm5, ymm7, 32 # ymm9 = ymm5[0,1],ymm7[0,1]
vperm2f128 ymm4, ymm4, ymm6, 49 # ymm4 = ymm4[2,3],ymm6[2,3]
vperm2f128 ymm5, ymm5, ymm7, 49 # ymm5 = ymm5[2,3],ymm7[2,3]
vunpcklpd ymm6, ymm8, ymm9 # ymm6 = ymm8[0],ymm9[0],ymm8[2],ymm9[2]
vaddpd ymm0, ymm6, ymm0
vunpcklpd ymm6, ymm4, ymm5 # ymm6 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
vunpckhpd ymm7, ymm8, ymm9 # ymm7 = ymm8[1],ymm9[1],ymm8[3],ymm9[3]
vaddpd ymm6, ymm7, ymm6
vaddpd ymm0, ymm0, ymm6
vunpckhpd ymm4, ymm4, ymm5 # ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
vaddpd ymm0, ymm0, ymm4
vmovupd ymm4, ymmword ptr [rsi - 256]
vmovupd ymm5, ymmword ptr [rsi - 224]
vmovupd ymm6, ymmword ptr [rsi - 192]
vmovupd ymm7, ymmword ptr [rsi - 160]
vperm2f128 ymm8, ymm4, ymm6, 32 # ymm8 = ymm4[0,1],ymm6[0,1]
vperm2f128 ymm9, ymm5, ymm7, 32 # ymm9 = ymm5[0,1],ymm7[0,1]
vperm2f128 ymm4, ymm4, ymm6, 49 # ymm4 = ymm4[2,3],ymm6[2,3]
vperm2f128 ymm5, ymm5, ymm7, 49 # ymm5 = ymm5[2,3],ymm7[2,3]
vunpcklpd ymm6, ymm8, ymm9 # ymm6 = ymm8[0],ymm9[0],ymm8[2],ymm9[2]
vaddpd ymm1, ymm6, ymm1
vunpcklpd ymm6, ymm4, ymm5 # ymm6 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
vunpckhpd ymm7, ymm8, ymm9 # ymm7 = ymm8[1],ymm9[1],ymm8[3],ymm9[3]
vaddpd ymm6, ymm7, ymm6
vaddpd ymm1, ymm1, ymm6
vunpckhpd ymm4, ymm4, ymm5 # ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
vaddpd ymm1, ymm1, ymm4
vmovupd ymm4, ymmword ptr [rsi - 128]
vmovupd ymm5, ymmword ptr [rsi - 96]
vmovupd ymm6, ymmword ptr [rsi - 64]
vmovupd ymm7, ymmword ptr [rsi - 32]
vperm2f128 ymm8, ymm4, ymm6, 32 # ymm8 = ymm4[0,1],ymm6[0,1]
vperm2f128 ymm9, ymm5, ymm7, 32 # ymm9 = ymm5[0,1],ymm7[0,1]
vperm2f128 ymm4, ymm4, ymm6, 49 # ymm4 = ymm4[2,3],ymm6[2,3]
vperm2f128 ymm5, ymm5, ymm7, 49 # ymm5 = ymm5[2,3],ymm7[2,3]
vunpcklpd ymm6, ymm8, ymm9 # ymm6 = ymm8[0],ymm9[0],ymm8[2],ymm9[2]
vaddpd ymm2, ymm6, ymm2
vunpcklpd ymm6, ymm4, ymm5 # ymm6 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
vunpckhpd ymm7, ymm8, ymm9 # ymm7 = ymm8[1],ymm9[1],ymm8[3],ymm9[3]
vaddpd ymm6, ymm7, ymm6
vaddpd ymm2, ymm2, ymm6
vunpckhpd ymm4, ymm4, ymm5 # ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
vaddpd ymm2, ymm2, ymm4
vmovupd ymm4, ymmword ptr [rsi]
vmovupd ymm5, ymmword ptr [rsi + 32]
vmovupd ymm6, ymmword ptr [rsi + 64]
vmovupd ymm7, ymmword ptr [rsi + 96]
vperm2f128 ymm8, ymm4, ymm6, 32 # ymm8 = ymm4[0,1],ymm6[0,1]
vperm2f128 ymm9, ymm5, ymm7, 32 # ymm9 = ymm5[0,1],ymm7[0,1]
vperm2f128 ymm4, ymm4, ymm6, 49 # ymm4 = ymm4[2,3],ymm6[2,3]
vperm2f128 ymm5, ymm5, ymm7, 49 # ymm5 = ymm5[2,3],ymm7[2,3]
vunpcklpd ymm6, ymm8, ymm9 # ymm6 = ymm8[0],ymm9[0],ymm8[2],ymm9[2]
vaddpd ymm3, ymm6, ymm3
vunpcklpd ymm6, ymm4, ymm5 # ymm6 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
vunpckhpd ymm7, ymm8, ymm9 # ymm7 = ymm8[1],ymm9[1],ymm8[3],ymm9[3]
vaddpd ymm6, ymm7, ymm6
vaddpd ymm3, ymm3, ymm6
vunpckhpd ymm4, ymm4, ymm5 # ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
vaddpd ymm3, ymm3, ymm4
add rsi, 512
add r8, -16
jne .LBB0_8
vaddpd ymm0, ymm1, ymm0
vaddpd ymm0, ymm2, ymm0
vaddpd ymm0, ymm3, ymm0
vextractf128 xmm1, ymm0, 1
vaddpd xmm0, xmm0, xmm1
vpermilpd xmm1, xmm0, 1 # xmm1 = xmm0[1,0]
vaddsd xmm0, xmm0, xmm1
cmp rdx, rdi
je .LBB0_2
.LBB0_10:
mov rdx, rax
sub rdx, rcx
.LBB0_11: # =>This Inner Loop Header: Depth=1
vmovupd xmm1, xmmword ptr [rcx]
vaddpd xmm1, xmm1, xmmword ptr [rcx + 16]
vpermilpd xmm2, xmm1, 1 # xmm2 = xmm1[1,0]
vaddsd xmm1, xmm1, xmm2
vaddsd xmm0, xmm0, xmm1
add rcx, 32
add rdx, -32
cmp rdx, 24
jg .LBB0_11
.LBB0_2:
cmp rcx, rax
je .LBB0_16
mov rsi, rax
sub rsi, rcx
add rsi, -8
cmp rsi, 248
jae .LBB0_12
mov rdx, rcx
jmp .LBB0_15
.LBB0_12:
shr rsi, 3
inc rsi
mov rdi, rsi
and rdi, -32
lea rdx, [rcx + 8*rdi]
vmovq xmm0, xmm0 # xmm0 = xmm0[0],zero
vxorpd xmm1, xmm1, xmm1
xor r8d, r8d
vxorpd xmm2, xmm2, xmm2
vxorpd xmm3, xmm3, xmm3
.LBB0_13: # =>This Inner Loop Header: Depth=1
vaddpd zmm0, zmm0, zmmword ptr [rcx + 8*r8]
vaddpd zmm1, zmm1, zmmword ptr [rcx + 8*r8 + 64]
vaddpd zmm2, zmm2, zmmword ptr [rcx + 8*r8 + 128]
vaddpd zmm3, zmm3, zmmword ptr [rcx + 8*r8 + 192]
add r8, 32
cmp rdi, r8
jne .LBB0_13
vaddpd zmm0, zmm1, zmm0
vaddpd zmm1, zmm3, zmm2
vaddpd zmm0, zmm1, zmm0
vextractf64x4 ymm1, zmm0, 1
vaddpd zmm0, zmm0, zmm1
vextractf128 xmm1, ymm0, 1
vaddpd xmm0, xmm0, xmm1
vpermilpd xmm1, xmm0, 1 # xmm1 = xmm0[1,0]
vaddsd xmm0, xmm0, xmm1
cmp rsi, rdi
je .LBB0_16
.LBB0_15: # =>This Inner Loop Header: Depth=1
vaddsd xmm0, xmm0, qword ptr [rdx]
add rdx, 8
cmp rdx, rax
jne .LBB0_15
.LBB0_16:
vzeroupper
ret
```
This herculean effort with permutations and unpacking is unnecessary and seriously hurts performance.
Using the full `-funsafe-math-optimizations` doesn't change anything.
Code looks good when adding `std::execution::unseq`, as well as for the simple loop (that is, with associative math and no signed zeros specified in both cases); a sketch of that loop follows.
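A minimal sketch of such a simple loop, for comparison (assumed, not the exact godbolt source):
```cpp
#include <vector>

// Plain accumulation loop; with -fassociative-math -fno-signed-zeros this
// vectorizes into straightforward vaddpd accumulators, with none of the
// permute/unpack shuffling seen in the std::reduce assembly above.
double sum_loop(const std::vector<double>& v) {
    double s = 0.0;
    for (double x : v)
        s += x;
    return s;
}
```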
</pre>
<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJzsWkuTozgS_jXUJcMVIGyMD3Xoeu107OxtNjb21CGjtFE1SLQkXHb9-g2JhwU2rsfM9PahHDNNQEr51ieVMqnWfCsQb4LFbbC4v6K1yaW6yXLFNRZKsqu1ZIebr1BI-R0ZbKQCCpXCHZe1Bq51jbCuDTDOQEgDGy4YSIHX8BVKeoCc7hBKrjUy4CaIb-FrQJYM1gi6VpUVw4BvQEg7CxRWUhlkYHKuYY0bqfA6CO-D8Evz78OellWBQdy-58ZU2r6Rx4A8biVby8JcS7UNyONLQB53-UP233_865-qGf6HZVtr1BAkoTbMzoy_7DAzUgVJCFQw0Ll8HtIVsjrDjn6GQtL-C-4xqw2XonmthcYfQRIG5A7W0uTwzE1uWcw2VGuZcWr4DmclNTnMNkLOXDzY7AWV1J1EO0fWxvqUMoYMtlnmCEWxK2dr6nyY7V10MllWVHEtxcBvWVUp3KBCkSFoetAtMX6AP3KENeZ0x6UCrkFIMWNoUJVccG14ZuOz5oKqwzdZNSMMeNqDVO5TJsuyNu7TQPYwRiiufWWuM1kG5PE5II9ZVQXkkRZbqbjJ7dfWvR4vmz2lk6ZrhfCcUwO4p5kpDmByBG2oYFSxxkbr9Nq0KceFQVUpNE1yUQ0lUsHF1rOFm4M1kBaFfG7TfVNIauyoSnJhWsN-k8-4Q2UFCOQmRwVZQcXWusIGR6FUDJVLZByxAFmhojZFdBdZN8r36KagW33tgso46-zgBmihpced2oVZWXpWIFXFAXZcFo2wrw8PD4M4BEnYpSMwiVoEZGkgx6ICLhqnKNQVZqZ3HGxr1NquXLLUQNdW1wK3tLCOkhuoqKJFgQX0aW-_ctMZaBkp6vxjcipaDkN_yw1QcJG2U4Yau3__874l03iDiq3TOZMMYYsCnqmGNWU9ctiR7j-qy-aLrstvpwu6BYf4jsl6XaDl3tNsnmR0QI4fwP6fSaFNQJKArIL4C7hfQGII5uHfJcbZAO2vlDv3VJpbVj-epWJQGQXB4lYxHizu2_H9QLo_NxACcgvpcfiYPds3Id63dF2vBwTNhxN3e6kqBrAvSxeowXMwMisrnxNZDMlPRfO8_v32NvwWDYmUMX_ubDzZ8m5p83Qk94mix3g5YXc2Mu-p1bZVp-XZKdfl3KtsOh3KquVEfEbLE0Y6V76hIzO5yHrqKNrMpcWR0DlOMJ8-i5LLijM-1qfw6YuJsEyYbeltwsbp_KOJ0-uYDlX0OETtzONzShZpRxyfJ8zilnR8eiFL-9U__XO4EN8H8YM7nnwVAhX8LmUFvyFlqCyLe6xMHsT3Y01LuasrBoeynFvxh7IcLGDNYeZ8OV7A3sTF9MQFuTQxmZ5IwksTl5MTSXqKNbsKVUk2EUmb90NZpi2DzmanSUw6Z9oR1qPNkMWtTZTI8iV3brD35TVJq1ZC56TliaRVJ2lxImn5HknzczbNV56kuW-TTch4YFP_5TVJi3M2zVedmIVv0FDMclpMLarse1Ex6MV06dEFqxeQdAJS55-O-WrwljpRPu1MLlLGrMRDiwWe0PDN6nX-XvjL0ddyPtBrMXibD7RcnNfSyc592csp1yx910QD86OBa-IB7Uw8jq5J_DC799ecGE6MPDVknLRjJw5SNho4Kho4MR7QJu05o-R4n3gLIpJFcpTwJigk5CKGTkNhtLqIodNQGCUehv4EDPwJ4PcTUO8dcHcU8fch3gSk_IXAF42ArzsgfCLeWxAv6r32CyPeWMmPIF5E0nci3ir5GOAlF4FyGu9i8gl3n3D3GtyREdyRT7h7B9yR3mu_MNyNlXw33L0L6AJyO4Cet0OdnfkxsLMzfXj9hLtPuDu_YOMR3MWfcPcOuIt7r_3CcDdWcj5xZ9rciy4iMkF3vj9e2D4J_xo7ffVv_mji4uTk725ycaA_Mp4YiXujaGbcMt77kskdjO83W65nbn3HIy00cLsivHvdhiUcfzaMlu7C6AbYwJG7M9eVlDH9JsmjQsXJnfhTE4hzF_pReLzRnyyptL9xZSXbDzhFQfzlr7hJ9rw32Lay_eSaPHOhPprqdrxovOPxI355N-zRRMxIF7PoNGZesMaakI-G1RUjmkJFPLXm2vLSmD4uXXVL-mnrZ0IU-QEkJ7Wdnkt2PiGesGM0VaRpIGM6k1p6l0ljrJmlEwo1ZDJPL1XOeqAaJ3Y2tmNYOlsM0vrULTpXfZFootw1riuNq14ndSfB-orXOJYFUl97L5_TgHwZlFK7FCvl7gcMUwzO_trMDn00avfiF1Rysij1SvlqL9uiYOo6B-zjveWt05EXq1xR_GfRp4WSl9Zn3vMclDjXn6kUHblE7ezoVS6T5_gjM9IyIW9jNrxrObKJ2-nxG9mcuTl2oJSew6Qj5jQ5Pl6bg-NAFJ9qF448NrG_eyPi3i3vZtYdAZL5fg7Hm7iX0SngQlqME8g_VABMnis-cqBoV9X_61jRovTlY0W3B_Qg-mrZ-U-u1nNGjJpHTk8Mo41zan-ZOAHZHD67SSQnm8TO4mddVaiG3xWa5kPf_dO8OhfkqLK6QCoANxupTNOrZ9PAdbVJod1OUYuKZt-52ALXUAuBGWpN1aFpHkTFZa2LA-S1MtrO3khVUpF1PXH_1q7lrCialqZaaLppuplmsjK85C9t69SgSyvLqdgiUHEwORfbltedZOhaMzVspWTwnKOwTrYS_DbFC02JVMMzFoV9bqRqeuh4WRWObwUBSU1ODXDXytXw17ouh21zCK4byzpASGj6scD1Y7m2K11hxje8bahzbZAZ1agDsrq-YjcxW8UreoU3UZKSKJ4vSXqV3yQMl0hStiBzglFE0jBcRJswXK6y5TJdh1f8hoQkDueEhHGcztPrBV3gKmRJtEqiZbJJg3mIJeXFdVHsymuptleuX_UmIWQVXxV0jYV2ja-ECHxumlkDYkH3St00rZX1VgfzsODa6CMXw02BN6MeUMib_rJxs1_Xf9Z2xIEUsE_mV7Uqbka9q9zk9bpth7Sy2sesUvLJ9eQ9Og11QB6dBf8LAAD__9quAd8">