[cfe-dev] clang generates way more code than - Optimizer bug?
Sjoerd Meijer via cfe-dev
cfe-dev at lists.llvm.org
Fri Dec 3 05:58:37 PST 2021
Hi Dennis,
I believe GCC does not enable the vectoriser at optimisation level -O2, which is why you only get a scalar loop. Clang does enable vectorisation at -O2, so you get both a scalar loop and a vector loop. Yes, it's a lot of code, but code size is not the best metric for how fast the code runs. I guess the only way to tell is to run it and measure.
Cheers,
Sjoerd.
________________________________
From: cfe-dev <cfe-dev-bounces at lists.llvm.org> on behalf of Dennis Luehring via cfe-dev <cfe-dev at lists.llvm.org>
Sent: 03 December 2021 10:00
To: cfe-dev <cfe-dev at lists.llvm.org>
Subject: [cfe-dev] clang generates way more code than - Optimizer bug?
/*
 * XORs every byte of text_ in place with a key derived from its index:
 * text_[i] ^= 0xff - (i << 2), for i from text_len_ - 1 down to 0.
 * Does nothing when text_len_ <= 0 (the loop condition i >= 0 fails
 * immediately).
 *
 * NOTE(review): i is a signed int, so i << 2 presumably overflows for very
 * large indices (i >= 2^29 with a 32-bit int) - confirm whether inputs that
 * large are possible.
 */
void decipher(char* text_, int text_len_)
{
/* Walk backwards from the last byte to the first. */
for (int i = text_len_ - 1; i >= 0; --i)
{
/* The key is computed as int; only its low byte affects the char. */
text_[i] ^= 0xff - (i << 2);
}
}
gcc.godbolt.org/gcc-trunk
https://gcc.godbolt.org/z/z84d9xdxE
generates this code
# GCC output for decipher(char* text_, int text_len_): one scalar loop that
# walks the buffer from the last byte down to the first.
# Register roles as used below (arguments presumably arrive per the
# System V AMD64 convention: rdi = text_, esi = text_len_):
#   esi = i, the current index (starts at text_len_ - 1 and counts down)
#   rdi = address of text_[i]
#   eax = scratch for the key / result byte
decipher(char*, int):
sub esi, 1                   # i = text_len_ - 1
js .L1                       # i < 0: empty input, nothing to do
movsx rax, esi               # sign-extend i to 64 bits
add rdi, rax                 # rdi = &text_[i]
.L3:
lea eax, [0+rsi*4]           # eax = i << 2
xor al, BYTE PTR [rdi]       # al = text_[i] ^ low byte of (i << 2)
sub esi, 1                   # --i
sub rdi, 1                   # step the pointer down with the index
not eax                      # ~(t ^ x) == t ^ ~x, and ~x == 0xff - x in the low byte
mov BYTE PTR [rdi+1], al     # rdi was already decremented: +1 is the byte just read
cmp esi, -1
jne .L3                      # keep going until i has passed 0
.L1:
ret
gcc.godbolt.org/Clang-trunk
https://gcc.godbolt.org/z/1GddGeGhh
generates this huge amount of code - maybe it's better, but I don't think so
# Constant pool referenced by the Clang-generated decipher(char*, int) below.
#
# .LCPI0_0 .. .LCPI0_7 are eight 16-byte constants, each holding a pair of
# 64-bit offsets; together they cover every offset from -1 through -16.
# The function adds them (paddq) to a broadcast 64-bit value to form 16
# consecutive descending values per loop iteration.
.LCPI0_0:
.quad -3 # 0xfffffffffffffffd
.quad -4 # 0xfffffffffffffffc
.LCPI0_1:
.quad -5 # 0xfffffffffffffffb
.quad -6 # 0xfffffffffffffffa
.LCPI0_2:
.quad -7 # 0xfffffffffffffff9
.quad -8 # 0xfffffffffffffff8
.LCPI0_3:
.quad -9 # 0xfffffffffffffff7
.quad -10 # 0xfffffffffffffff6
.LCPI0_4:
.quad -11 # 0xfffffffffffffff5
.quad -12 # 0xfffffffffffffff4
.LCPI0_5:
.quad -13 # 0xfffffffffffffff3
.quad -14 # 0xfffffffffffffff2
.LCPI0_6:
.quad -15 # 0xfffffffffffffff1
.quad -16 # 0xfffffffffffffff0
.LCPI0_7:
.quad -1 # 0xffffffffffffffff
.quad -2 # 0xfffffffffffffffe
# .LCPI0_8: byte 0xff followed by seven zero bytes, twice. Used as a pand
# mask in the function, it keeps only the lowest byte of each 64-bit lane.
.LCPI0_8:
.byte 255 # 0xff
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 255 # 0xff
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
.byte 0 # 0x0
# .LCPI0_9: sixteen bytes, each 252 (0xfc). The function ANDs it in right
# after a 16-bit-lane shift left by 2, clearing the low two bits of every byte.
.LCPI0_9:
.zero 16,252
# Clang output for decipher(char* text_, int text_len_):
#     for (i = text_len_ - 1; i >= 0; --i) text_[i] ^= 0xff - (i << 2);
# Syntax: Intel. Arguments presumably arrive per the System V AMD64
# convention (rdi = text_, esi = text_len_); the code below uses them that way.
#
# Layout:
#   entry      - return at once for text_len_ <= 0; lengths below 16 go
#                straight to the scalar loop.
#   .LBB0_9/10 - scalar loop set-up (also used for the leftover bytes after
#                the vector loop).
#   .LBB0_11   - scalar loop, one byte per iteration.
#   .LBB0_3    - runtime checks plus constant loading for the vector loop.
#   .LBB0_6    - vector loop, 16 bytes per iteration.
#
# Register roles:
#   r9  = text_len_ zero-extended to 64 bits (n)
#   r8  = n rounded down to a multiple of 16 (bytes handled by the vector loop)
#   rcx = number of bytes the scalar loop still has to process
#   xmm8-xmm15 = the offset pairs -1..-16 from .LCPI0_0-.LCPI0_7
#   xmm0 = low-byte-per-qword mask, xmm1 = 0xfc in every byte,
#   xmm2 = all ones, xmm3 = zero
decipher(char*, int): # @decipher(char*, int)
        test esi, esi
        jle .LBB0_12                    # text_len_ <= 0: nothing to do
        mov r9d, esi                    # r9 = n (upper 32 bits cleared)
        cmp esi, 16
        jae .LBB0_3                     # n >= 16: try the vector path
.LBB0_9:
        mov rcx, r9                     # scalar loop handles all n bytes
.LBB0_10:
        # Entered with rcx = count of bytes to process (indices rcx-1 .. 0).
        lea rax, [rcx + 1]              # rax = i + 2 for the first index i = rcx - 1
        shl cl, 2
        mov dl, 3
        sub dl, cl                      # dl = 3 - 4*rcx = 0xff - ((rcx-1) << 2) mod 256
.LBB0_11: # =>This Inner Loop Header: Depth=1
        lea ecx, [rax - 2]              # ecx = i (32-bit, zero-extended into rcx)
        xor byte ptr [rdi + rcx], dl    # text_[i] ^= key
        add rax, -1
        add dl, 4                       # i drops by 1, so the key byte rises by 4
        cmp rax, 1
        ja .LBB0_11                     # stop once i = 0 has been processed
.LBB0_12:
        ret
.LBB0_3:
        # Runtime guards in front of the vector loop: rax = n - 1 in 64 bits,
        # esi = n - 1 in 32 bits. Fall back to the scalar loop if esi is below
        # eax (unsigned) or if rax has any bit set above bit 31.
        lea rax, [r9 - 1]
        add esi, -1
        cmp esi, eax
        jb .LBB0_9
        shr rax, 32
        jne .LBB0_9
        mov r8d, r9d
        and r8d, -16                    # r8 = n & ~15: bytes for the vector loop
        mov ecx, r9d
        and ecx, 15                     # rcx = n & 15: bytes left for the scalar loop
        movdqa xmm8, xmmword ptr [rip + .LCPI0_0] # xmm8 = [18446744073709551613,18446744073709551612]
        movdqa xmm9, xmmword ptr [rip + .LCPI0_1] # xmm9 = [18446744073709551611,18446744073709551610]
        movdqa xmm10, xmmword ptr [rip + .LCPI0_2] # xmm10 = [18446744073709551609,18446744073709551608]
        movdqa xmm11, xmmword ptr [rip + .LCPI0_3] # xmm11 = [18446744073709551607,18446744073709551606]
        movdqa xmm12, xmmword ptr [rip + .LCPI0_4] # xmm12 = [18446744073709551605,18446744073709551604]
        movdqa xmm13, xmmword ptr [rip + .LCPI0_5] # xmm13 = [18446744073709551603,18446744073709551602]
        movdqa xmm14, xmmword ptr [rip + .LCPI0_6] # xmm14 = [18446744073709551601,18446744073709551600]
        movdqa xmm15, xmmword ptr [rip + .LCPI0_7] # xmm15 = [18446744073709551615,18446744073709551614]
        movdqa xmm0, xmmword ptr [rip + .LCPI0_8] # xmm0 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
        movdqa xmm1, xmmword ptr [rip + .LCPI0_9] # xmm1 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
        pcmpeqd xmm2, xmm2              # xmm2 = all ones (register compared with itself)
        pxor xmm3, xmm3                 # xmm3 = 0
        mov rsi, r8                     # rsi = bytes left for the vector loop
        mov rax, r9                     # rax = one past the highest index of this chunk
.LBB0_6: # =>This Inner Loop Header: Depth=1
        # Per iteration (indices rax-1 down to rax-16):
        #  1. Broadcast rax, add the offsets -1..-16, keep the low byte of each
        #     sum and pack the sixteen results into xmm4 (byte k = rax-1-k).
        #  2. Shift left by 2 and invert to get the key byte 0xff - (i << 2).
        #  3. Reverse the 16 key bytes so they line up with ascending addresses.
        #  4. XOR them into the 16 bytes starting at text_[rax-16].
        movq xmm4, rax
        pshufd xmm4, xmm4, 68 # xmm4 = xmm4[0,1,0,1]
        movdqa xmm5, xmm4
        paddq xmm5, xmm11
        movdqa xmm6, xmm4
        paddq xmm6, xmm13
        movdqa xmm7, xmm4
        paddq xmm7, xmm14
        pand xmm7, xmm0
        pand xmm6, xmm0
        packuswb xmm6, xmm7
        movdqa xmm7, xmm4
        paddq xmm7, xmm12
        pand xmm7, xmm0
        pand xmm5, xmm0
        packuswb xmm5, xmm7
        movdqa xmm7, xmm4
        paddq xmm7, xmm9
        packuswb xmm5, xmm6
        movdqa xmm6, xmm4
        paddq xmm6, xmm10
        pand xmm6, xmm0
        pand xmm7, xmm0
        packuswb xmm7, xmm6
        movdqa xmm6, xmm4
        paddq xmm6, xmm8
        paddq xmm4, xmm15
        movq rdx, xmm4                  # rdx = rax - 1, the highest index of this chunk
        pand xmm6, xmm0
        pand xmm4, xmm0
        packuswb xmm4, xmm6
        packuswb xmm4, xmm7
        mov edx, edx                    # keep only the low 32 bits of the index
        packuswb xmm4, xmm5             # xmm4 byte k = low byte of (rax - 1 - k)
        psllw xmm4, 2                   # shift works on 16-bit lanes ...
        pand xmm4, xmm1                 # ... so clear the two bits that crossed into each byte
        pxor xmm4, xmm2                 # bitwise NOT: 0xff - x for every byte
        movdqa xmm5, xmm4
        punpcklbw xmm5, xmm3 # xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
        pshufd xmm5, xmm5, 78 # xmm5 = xmm5[2,3,0,1]
        pshuflw xmm5, xmm5, 27 # xmm5 = xmm5[3,2,1,0,4,5,6,7]
        pshufhw xmm5, xmm5, 27 # xmm5 = xmm5[0,1,2,3,7,6,5,4]
        punpckhbw xmm4, xmm3 # xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
        pshufd xmm4, xmm4, 78 # xmm4 = xmm4[2,3,0,1]
        pshuflw xmm4, xmm4, 27 # xmm4 = xmm4[3,2,1,0,4,5,6,7]
        pshufhw xmm4, xmm4, 27 # xmm4 = xmm4[0,1,2,3,7,6,5,4]
        packuswb xmm4, xmm5             # xmm4 = the 16 key bytes in reversed order
        movdqu xmm5, xmmword ptr [rdi + rdx - 15]
        pxor xmm4, xmm5
        movdqu xmmword ptr [rdi + rdx - 15], xmm4
        add rax, -16
        add rsi, -16
        jne .LBB0_6
        cmp r8, r9
        jne .LBB0_10                    # n not a multiple of 16: finish the low rcx bytes
        jmp .LBB0_12
_______________________________________________
cfe-dev mailing list
cfe-dev at lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-dev
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/cfe-dev/attachments/20211203/4aa3306b/attachment-0001.html>
More information about the cfe-dev
mailing list