[cfe-dev] clang generates way more code than - Optimizer bug?
Dennis Luehring via cfe-dev
cfe-dev at lists.llvm.org
Fri Dec 3 07:28:03 PST 2021
>Yes, it's a lot of code, but that is not the best metric for fast code.
>I guess the only way to tell is to run and measure it.
90% of the generated code is nearly the same as GCC's, and then comes a huge block of code.
It's a trivial for loop with an xor and a shift.
Am 03.12.2021 um 14:58 schrieb Sjoerd Meijer:
> Hi Dennis,
>
> I believe GCC at optimisation level O2 doesn't enable the vectoriser, that's why you only get a scalar loop. With Clang at O2 vectorisation is enabled and you'll get a scalar and vector loop. Yes, it's a lot of code, but that is not the best metric for fast code. I guess the only way to tell is to run and measure it.
>
> Cheers,
> Sjoerd.
> ________________________________
> From: cfe-dev <cfe-dev-bounces at lists.llvm.org> on behalf of Dennis Luehring via cfe-dev <cfe-dev at lists.llvm.org>
> Sent: 03 December 2021 10:00
> To: cfe-dev <cfe-dev at lists.llvm.org>
> Subject: [cfe-dev] clang generates way more code than - Optimizer bug?
>
> void decipher(char* text_, int text_len_)
> {
> for (int i = text_len_ - 1; i >= 0; --i)
> {
> text_[i] ^= 0xff - (i << 2);
> }
> }
>
> gcc.godbolt.org/gcc-trunk
>
> https://gcc.godbolt.org/z/z84d9xdxE
>
> generates this code
>
> decipher(char*, int):
> sub esi, 1
> js .L1
> movsx rax, esi
> add rdi, rax
> .L3:
> lea eax, [0+rsi*4]
> xor al, BYTE PTR [rdi]
> sub esi, 1
> sub rdi, 1
> not eax
> mov BYTE PTR [rdi+1], al
> cmp esi, -1
> jne .L3
> .L1:
> ret
>
> gcc.godbolt.org/Clang-trunk
>
> https://gcc.godbolt.org/z/1GddGeGhh
>
> generates this huge amount of code - maybe it's better, but I don't think so
>
> .LCPI0_0:
> .quad -3 # 0xfffffffffffffffd
> .quad -4 # 0xfffffffffffffffc
> .LCPI0_1:
> .quad -5 # 0xfffffffffffffffb
> .quad -6 # 0xfffffffffffffffa
> .LCPI0_2:
> .quad -7 # 0xfffffffffffffff9
> .quad -8 # 0xfffffffffffffff8
> .LCPI0_3:
> .quad -9 # 0xfffffffffffffff7
> .quad -10 # 0xfffffffffffffff6
> .LCPI0_4:
> .quad -11 # 0xfffffffffffffff5
> .quad -12 # 0xfffffffffffffff4
> .LCPI0_5:
> .quad -13 # 0xfffffffffffffff3
> .quad -14 # 0xfffffffffffffff2
> .LCPI0_6:
> .quad -15 # 0xfffffffffffffff1
> .quad -16 # 0xfffffffffffffff0
> .LCPI0_7:
> .quad -1 # 0xffffffffffffffff
> .quad -2 # 0xfffffffffffffffe
> .LCPI0_8:
> .byte 255 # 0xff
> .byte 0 # 0x0
> .byte 0 # 0x0
> .byte 0 # 0x0
> .byte 0 # 0x0
> .byte 0 # 0x0
> .byte 0 # 0x0
> .byte 0 # 0x0
> .byte 255 # 0xff
> .byte 0 # 0x0
> .byte 0 # 0x0
> .byte 0 # 0x0
> .byte 0 # 0x0
> .byte 0 # 0x0
> .byte 0 # 0x0
> .byte 0 # 0x0
> .LCPI0_9:
> .zero 16,252
> decipher(char*, int): # @decipher(char*, int)
> test esi, esi
> jle .LBB0_12
> mov r9d, esi
> cmp esi, 16
> jae .LBB0_3
> .LBB0_9:
> mov rcx, r9
> .LBB0_10:
> lea rax, [rcx + 1]
> shl cl, 2
> mov dl, 3
> sub dl, cl
> .LBB0_11: # =>This Inner Loop Header: Depth=1
> lea ecx, [rax - 2]
> xor byte ptr [rdi + rcx], dl
> add rax, -1
> add dl, 4
> cmp rax, 1
> ja .LBB0_11
> .LBB0_12:
> ret
> .LBB0_3:
> lea rax, [r9 - 1]
> add esi, -1
> cmp esi, eax
> jb .LBB0_9
> shr rax, 32
> jne .LBB0_9
> mov r8d, r9d
> and r8d, -16
> mov ecx, r9d
> and ecx, 15
> movdqa xmm8, xmmword ptr [rip + .LCPI0_0] # xmm8 = [18446744073709551613,18446744073709551612]
> movdqa xmm9, xmmword ptr [rip + .LCPI0_1] # xmm9 = [18446744073709551611,18446744073709551610]
> movdqa xmm10, xmmword ptr [rip + .LCPI0_2] # xmm10 = [18446744073709551609,18446744073709551608]
> movdqa xmm11, xmmword ptr [rip + .LCPI0_3] # xmm11 = [18446744073709551607,18446744073709551606]
> movdqa xmm12, xmmword ptr [rip + .LCPI0_4] # xmm12 = [18446744073709551605,18446744073709551604]
> movdqa xmm13, xmmword ptr [rip + .LCPI0_5] # xmm13 = [18446744073709551603,18446744073709551602]
> movdqa xmm14, xmmword ptr [rip + .LCPI0_6] # xmm14 = [18446744073709551601,18446744073709551600]
> movdqa xmm15, xmmword ptr [rip + .LCPI0_7] # xmm15 = [18446744073709551615,18446744073709551614]
> movdqa xmm0, xmmword ptr [rip + .LCPI0_8] # xmm0 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
> movdqa xmm1, xmmword ptr [rip + .LCPI0_9] # xmm1 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
> pcmpeqd xmm2, xmm2
> pxor xmm3, xmm3
> mov rsi, r8
> mov rax, r9
> .LBB0_6: # =>This Inner Loop Header: Depth=1
> movq xmm4, rax
> pshufd xmm4, xmm4, 68 # xmm4 = xmm4[0,1,0,1]
> movdqa xmm5, xmm4
> paddq xmm5, xmm11
> movdqa xmm6, xmm4
> paddq xmm6, xmm13
> movdqa xmm7, xmm4
> paddq xmm7, xmm14
> pand xmm7, xmm0
> pand xmm6, xmm0
> packuswb xmm6, xmm7
> movdqa xmm7, xmm4
> paddq xmm7, xmm12
> pand xmm7, xmm0
> pand xmm5, xmm0
> packuswb xmm5, xmm7
> movdqa xmm7, xmm4
> paddq xmm7, xmm9
> packuswb xmm5, xmm6
> movdqa xmm6, xmm4
> paddq xmm6, xmm10
> pand xmm6, xmm0
> pand xmm7, xmm0
> packuswb xmm7, xmm6
> movdqa xmm6, xmm4
> paddq xmm6, xmm8
> paddq xmm4, xmm15
> movq rdx, xmm4
> pand xmm6, xmm0
> pand xmm4, xmm0
> packuswb xmm4, xmm6
> packuswb xmm4, xmm7
> mov edx, edx
> packuswb xmm4, xmm5
> psllw xmm4, 2
> pand xmm4, xmm1
> pxor xmm4, xmm2
> movdqa xmm5, xmm4
> punpcklbw xmm5, xmm3 # xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
> pshufd xmm5, xmm5, 78 # xmm5 = xmm5[2,3,0,1]
> pshuflw xmm5, xmm5, 27 # xmm5 = xmm5[3,2,1,0,4,5,6,7]
> pshufhw xmm5, xmm5, 27 # xmm5 = xmm5[0,1,2,3,7,6,5,4]
> punpckhbw xmm4, xmm3 # xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
> pshufd xmm4, xmm4, 78 # xmm4 = xmm4[2,3,0,1]
> pshuflw xmm4, xmm4, 27 # xmm4 = xmm4[3,2,1,0,4,5,6,7]
> pshufhw xmm4, xmm4, 27 # xmm4 = xmm4[0,1,2,3,7,6,5,4]
> packuswb xmm4, xmm5
> movdqu xmm5, xmmword ptr [rdi + rdx - 15]
> pxor xmm4, xmm5
> movdqu xmmword ptr [rdi + rdx - 15], xmm4
> add rax, -16
> add rsi, -16
> jne .LBB0_6
> cmp r8, r9
> jne .LBB0_10
> jmp .LBB0_12
>
> _______________________________________________
> cfe-dev mailing list
> cfe-dev at lists.llvm.org
> https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-dev
>
More information about the cfe-dev
mailing list