<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=us-ascii">
<style type="text/css" style="display:none;"> P {margin-top:0;margin-bottom:0;} </style>
</head>
<body dir="ltr">
<div style="font-family: Calibri, Arial, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">
Hi Dennis,</div>
<div style="font-family: Calibri, Arial, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">
<br>
</div>
<div style="font-family: Calibri, Arial, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">
I believe GCC at optimisation level O2 doesn't enable the vectoriser, which is why you only get a scalar loop. With Clang at O2, vectorisation is enabled and you'll get both a scalar and a vector loop. Yes, it's a lot of code, but code size is not the best metric for fast
code. I guess the only way to tell is to run and measure it.</div>
<div style="font-family: Calibri, Arial, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">
<br>
</div>
<div style="font-family: Calibri, Arial, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">
Cheers,<br>
Sjoerd.<br>
</div>
<div id="appendonsend"></div>
<hr style="display:inline-block;width:98%" tabindex="-1">
<div id="divRplyFwdMsg" dir="ltr"><font face="Calibri, sans-serif" style="font-size:11pt" color="#000000"><b>From:</b> cfe-dev &lt;cfe-dev-bounces@lists.llvm.org&gt; on behalf of Dennis Luehring via cfe-dev &lt;cfe-dev@lists.llvm.org&gt;<br>
<b>Sent:</b> 03 December 2021 10:00<br>
<b>To:</b> cfe-dev &lt;cfe-dev@lists.llvm.org&gt;<br>
<b>Subject:</b> [cfe-dev] clang generates way more code than - Optimizer bug?</font>
<div> </div>
</div>
<div class="BodyFragment"><font size="2"><span style="font-size:11pt;">
<div class="PlainText">void decipher(char* text_, int text_len_)<br>
{<br>
for (int i = text_len_ - 1; i >= 0; --i)<br>
{<br>
text_[i] ^= 0xff - (i &lt;&lt; 2);<br>
}<br>
}<br>
<br>
gcc.godbolt.org/gcc-trunk<br>
<br>
<a href="https://gcc.godbolt.org/z/z84d9xdxE">https://gcc.godbolt.org/z/z84d9xdxE</a><br>
<br>
generates this code<br>
<br>
decipher(char*, int):<br>
sub esi, 1<br>
js .L1<br>
movsx rax, esi<br>
add rdi, rax<br>
.L3:<br>
lea eax, [0+rsi*4]<br>
xor al, BYTE PTR [rdi]<br>
sub esi, 1<br>
sub rdi, 1<br>
not eax<br>
mov BYTE PTR [rdi+1], al<br>
cmp esi, -1<br>
jne .L3<br>
.L1:<br>
ret<br>
<br>
gcc.godbolt.org/Clang-trunk<br>
<br>
<a href="https://gcc.godbolt.org/z/1GddGeGhh">https://gcc.godbolt.org/z/1GddGeGhh</a><br>
<br>
generates this huge code - maybe it's better, but I don't think so<br>
<br>
.LCPI0_0:<br>
.quad -3 # 0xfffffffffffffffd<br>
.quad -4 # 0xfffffffffffffffc<br>
.LCPI0_1:<br>
.quad -5 # 0xfffffffffffffffb<br>
.quad -6 # 0xfffffffffffffffa<br>
.LCPI0_2:<br>
.quad -7 # 0xfffffffffffffff9<br>
.quad -8 # 0xfffffffffffffff8<br>
.LCPI0_3:<br>
.quad -9 # 0xfffffffffffffff7<br>
.quad -10 # 0xfffffffffffffff6<br>
.LCPI0_4:<br>
.quad -11 # 0xfffffffffffffff5<br>
.quad -12 # 0xfffffffffffffff4<br>
.LCPI0_5:<br>
.quad -13 # 0xfffffffffffffff3<br>
.quad -14 # 0xfffffffffffffff2<br>
.LCPI0_6:<br>
.quad -15 # 0xfffffffffffffff1<br>
.quad -16 # 0xfffffffffffffff0<br>
.LCPI0_7:<br>
.quad -1 # 0xffffffffffffffff<br>
.quad -2 # 0xfffffffffffffffe<br>
.LCPI0_8:<br>
.byte 255 # 0xff<br>
.byte 0 # 0x0<br>
.byte 0 # 0x0<br>
.byte 0 # 0x0<br>
.byte 0 # 0x0<br>
.byte 0 # 0x0<br>
.byte 0 # 0x0<br>
.byte 0 # 0x0<br>
.byte 255 # 0xff<br>
.byte 0 # 0x0<br>
.byte 0 # 0x0<br>
.byte 0 # 0x0<br>
.byte 0 # 0x0<br>
.byte 0 # 0x0<br>
.byte 0 # 0x0<br>
.byte 0 # 0x0<br>
.LCPI0_9:<br>
.zero 16,252<br>
decipher(char*, int): # @decipher(char*, int)<br>
test esi, esi<br>
jle .LBB0_12<br>
mov r9d, esi<br>
cmp esi, 16<br>
jae .LBB0_3<br>
.LBB0_9:<br>
mov rcx, r9<br>
.LBB0_10:<br>
lea rax, [rcx + 1]<br>
shl cl, 2<br>
mov dl, 3<br>
sub dl, cl<br>
.LBB0_11: # =>This Inner Loop Header: Depth=1<br>
lea ecx, [rax - 2]<br>
xor byte ptr [rdi + rcx], dl<br>
add rax, -1<br>
add dl, 4<br>
cmp rax, 1<br>
ja .LBB0_11<br>
.LBB0_12:<br>
ret<br>
.LBB0_3:<br>
lea rax, [r9 - 1]<br>
add esi, -1<br>
cmp esi, eax<br>
jb .LBB0_9<br>
shr rax, 32<br>
jne .LBB0_9<br>
mov r8d, r9d<br>
and r8d, -16<br>
mov ecx, r9d<br>
and ecx, 15<br>
movdqa xmm8, xmmword ptr [rip + .LCPI0_0] # xmm8 = <br>
[18446744073709551613,18446744073709551612]<br>
movdqa xmm9, xmmword ptr [rip + .LCPI0_1] # xmm9 = <br>
[18446744073709551611,18446744073709551610]<br>
movdqa xmm10, xmmword ptr [rip + .LCPI0_2] # xmm10 = <br>
[18446744073709551609,18446744073709551608]<br>
movdqa xmm11, xmmword ptr [rip + .LCPI0_3] # xmm11 = <br>
[18446744073709551607,18446744073709551606]<br>
movdqa xmm12, xmmword ptr [rip + .LCPI0_4] # xmm12 = <br>
[18446744073709551605,18446744073709551604]<br>
movdqa xmm13, xmmword ptr [rip + .LCPI0_5] # xmm13 = <br>
[18446744073709551603,18446744073709551602]<br>
movdqa xmm14, xmmword ptr [rip + .LCPI0_6] # xmm14 = <br>
[18446744073709551601,18446744073709551600]<br>
movdqa xmm15, xmmword ptr [rip + .LCPI0_7] # xmm15 = <br>
[18446744073709551615,18446744073709551614]<br>
movdqa xmm0, xmmword ptr [rip + .LCPI0_8] # xmm0 = <br>
[255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]<br>
movdqa xmm1, xmmword ptr [rip + .LCPI0_9] # xmm1 = <br>
[252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]<br>
pcmpeqd xmm2, xmm2<br>
pxor xmm3, xmm3<br>
mov rsi, r8<br>
mov rax, r9<br>
.LBB0_6: # =>This Inner Loop Header: Depth=1<br>
movq xmm4, rax<br>
pshufd xmm4, xmm4, 68 # xmm4 = xmm4[0,1,0,1]<br>
movdqa xmm5, xmm4<br>
paddq xmm5, xmm11<br>
movdqa xmm6, xmm4<br>
paddq xmm6, xmm13<br>
movdqa xmm7, xmm4<br>
paddq xmm7, xmm14<br>
pand xmm7, xmm0<br>
pand xmm6, xmm0<br>
packuswb xmm6, xmm7<br>
movdqa xmm7, xmm4<br>
paddq xmm7, xmm12<br>
pand xmm7, xmm0<br>
pand xmm5, xmm0<br>
packuswb xmm5, xmm7<br>
movdqa xmm7, xmm4<br>
paddq xmm7, xmm9<br>
packuswb xmm5, xmm6<br>
movdqa xmm6, xmm4<br>
paddq xmm6, xmm10<br>
pand xmm6, xmm0<br>
pand xmm7, xmm0<br>
packuswb xmm7, xmm6<br>
movdqa xmm6, xmm4<br>
paddq xmm6, xmm8<br>
paddq xmm4, xmm15<br>
movq rdx, xmm4<br>
pand xmm6, xmm0<br>
pand xmm4, xmm0<br>
packuswb xmm4, xmm6<br>
packuswb xmm4, xmm7<br>
mov edx, edx<br>
packuswb xmm4, xmm5<br>
psllw xmm4, 2<br>
pand xmm4, xmm1<br>
pxor xmm4, xmm2<br>
movdqa xmm5, xmm4<br>
punpcklbw xmm5, xmm3 # xmm5 = <br>
xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]<br>
pshufd xmm5, xmm5, 78 # xmm5 = xmm5[2,3,0,1]<br>
pshuflw xmm5, xmm5, 27 # xmm5 = xmm5[3,2,1,0,4,5,6,7]<br>
pshufhw xmm5, xmm5, 27 # xmm5 = xmm5[0,1,2,3,7,6,5,4]<br>
punpckhbw xmm4, xmm3 # xmm4 = <br>
xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]<br>
pshufd xmm4, xmm4, 78 # xmm4 = xmm4[2,3,0,1]<br>
pshuflw xmm4, xmm4, 27 # xmm4 = xmm4[3,2,1,0,4,5,6,7]<br>
pshufhw xmm4, xmm4, 27 # xmm4 = xmm4[0,1,2,3,7,6,5,4]<br>
packuswb xmm4, xmm5<br>
movdqu xmm5, xmmword ptr [rdi + rdx - 15]<br>
pxor xmm4, xmm5<br>
movdqu xmmword ptr [rdi + rdx - 15], xmm4<br>
add rax, -16<br>
add rsi, -16<br>
jne .LBB0_6<br>
cmp r8, r9<br>
jne .LBB0_10<br>
jmp .LBB0_12<br>
<br>
_______________________________________________<br>
cfe-dev mailing list<br>
cfe-dev@lists.llvm.org<br>
<a href="https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-dev">https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-dev</a><br>
</div>
</span></font></div>
</body>
</html>