[cfe-dev] clang generates way more code than - Optimizer bug?

Dennis Luehring via cfe-dev cfe-dev at lists.llvm.org
Fri Dec 3 07:28:03 PST 2021


>Yes, it's a lot of code, but that is not the best metric for fast code.
>I guess the only way to tell is to run and measure it.

About 90% of the generated code is nearly the same as gcc's, and then comes a huge block of extra code.

It's a trivial for loop with an XOR and a shift.


Am 03.12.2021 um 14:58 schrieb Sjoerd Meijer:
> Hi Dennis,
>
> I believe GCC at optimisation level O2 doesn't enable the vectoriser, that's why you only get a scalar loop. With Clang at O2 vectorisation is enabled and you'll get a scalar and vector loop. Yes, it's a lot of code, but that is not the best metric for fast code. I guess the only way to tell is to run and measure it.
>
> Cheers,
> Sjoerd.
> ________________________________
> From: cfe-dev <cfe-dev-bounces at lists.llvm.org> on behalf of Dennis Luehring via cfe-dev <cfe-dev at lists.llvm.org>
> Sent: 03 December 2021 10:00
> To: cfe-dev <cfe-dev at lists.llvm.org>
> Subject: [cfe-dev] clang generates way more code than - Optimizer bug?
>
> void decipher(char* text_, int text_len_)
> {
>     for (int i = text_len_ - 1; i >= 0; --i)
>     {
>       text_[i] ^= 0xff - (i << 2);
>     }
> }
>
> gcc.godbolt.org/gcc-trunk
>
> https://gcc.godbolt.org/z/z84d9xdxE
>
> generates this code
>
> decipher(char*, int):
>     sub     esi, 1
>     js      .L1
>     movsx   rax, esi
>     add     rdi, rax
> .L3:
>     lea     eax, [0+rsi*4]
>     xor     al, BYTE PTR [rdi]
>     sub     esi, 1
>     sub     rdi, 1
>     not     eax
>     mov     BYTE PTR [rdi+1], al
>     cmp     esi, -1
>     jne     .L3
> .L1:
>     ret
>
> gcc.godbolt.org/Clang-trunk
>
> https://gcc.godbolt.org/z/1GddGeGhh
>
> generates this huge code - maybe it's better, but I don't think so
>
> .LCPI0_0:
>     .quad   -3                              # 0xfffffffffffffffd
>     .quad   -4                              # 0xfffffffffffffffc
> .LCPI0_1:
>     .quad   -5                              # 0xfffffffffffffffb
>     .quad   -6                              # 0xfffffffffffffffa
> .LCPI0_2:
>     .quad   -7                              # 0xfffffffffffffff9
>     .quad   -8                              # 0xfffffffffffffff8
> .LCPI0_3:
>     .quad   -9                              # 0xfffffffffffffff7
>     .quad   -10                             # 0xfffffffffffffff6
> .LCPI0_4:
>     .quad   -11                             # 0xfffffffffffffff5
>     .quad   -12                             # 0xfffffffffffffff4
> .LCPI0_5:
>     .quad   -13                             # 0xfffffffffffffff3
>     .quad   -14                             # 0xfffffffffffffff2
> .LCPI0_6:
>     .quad   -15                             # 0xfffffffffffffff1
>     .quad   -16                             # 0xfffffffffffffff0
> .LCPI0_7:
>     .quad   -1                              # 0xffffffffffffffff
>     .quad   -2                              # 0xfffffffffffffffe
> .LCPI0_8:
>     .byte   255                             # 0xff
>     .byte   0                               # 0x0
>     .byte   0                               # 0x0
>     .byte   0                               # 0x0
>     .byte   0                               # 0x0
>     .byte   0                               # 0x0
>     .byte   0                               # 0x0
>     .byte   0                               # 0x0
>     .byte   255                             # 0xff
>     .byte   0                               # 0x0
>     .byte   0                               # 0x0
>     .byte   0                               # 0x0
>     .byte   0                               # 0x0
>     .byte   0                               # 0x0
>     .byte   0                               # 0x0
>     .byte   0                               # 0x0
> .LCPI0_9:
>     .zero   16,252
> decipher(char*, int):                         # @decipher(char*, int)
>     test    esi, esi
>     jle     .LBB0_12
>     mov     r9d, esi
>     cmp     esi, 16
>     jae     .LBB0_3
> .LBB0_9:
>     mov     rcx, r9
> .LBB0_10:
>     lea     rax, [rcx + 1]
>     shl     cl, 2
>     mov     dl, 3
>     sub     dl, cl
> .LBB0_11:                               # =>This Inner Loop Header: Depth=1
>     lea     ecx, [rax - 2]
>     xor     byte ptr [rdi + rcx], dl
>     add     rax, -1
>     add     dl, 4
>     cmp     rax, 1
>     ja      .LBB0_11
> .LBB0_12:
>     ret
> .LBB0_3:
>     lea     rax, [r9 - 1]
>     add     esi, -1
>     cmp     esi, eax
>     jb      .LBB0_9
>     shr     rax, 32
>     jne     .LBB0_9
>     mov     r8d, r9d
>     and     r8d, -16
>     mov     ecx, r9d
>     and     ecx, 15
>     movdqa  xmm8, xmmword ptr [rip + .LCPI0_0] # xmm8 = [18446744073709551613,18446744073709551612]
>     movdqa  xmm9, xmmword ptr [rip + .LCPI0_1] # xmm9 = [18446744073709551611,18446744073709551610]
>     movdqa  xmm10, xmmword ptr [rip + .LCPI0_2] # xmm10 = [18446744073709551609,18446744073709551608]
>     movdqa  xmm11, xmmword ptr [rip + .LCPI0_3] # xmm11 = [18446744073709551607,18446744073709551606]
>     movdqa  xmm12, xmmword ptr [rip + .LCPI0_4] # xmm12 = [18446744073709551605,18446744073709551604]
>     movdqa  xmm13, xmmword ptr [rip + .LCPI0_5] # xmm13 = [18446744073709551603,18446744073709551602]
>     movdqa  xmm14, xmmword ptr [rip + .LCPI0_6] # xmm14 = [18446744073709551601,18446744073709551600]
>     movdqa  xmm15, xmmword ptr [rip + .LCPI0_7] # xmm15 = [18446744073709551615,18446744073709551614]
>     movdqa  xmm0, xmmword ptr [rip + .LCPI0_8] # xmm0 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
>     movdqa  xmm1, xmmword ptr [rip + .LCPI0_9] # xmm1 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
>     pcmpeqd xmm2, xmm2
>     pxor    xmm3, xmm3
>     mov     rsi, r8
>     mov     rax, r9
> .LBB0_6:                                # =>This Inner Loop Header: Depth=1
>     movq    xmm4, rax
>     pshufd  xmm4, xmm4, 68                  # xmm4 = xmm4[0,1,0,1]
>     movdqa  xmm5, xmm4
>     paddq   xmm5, xmm11
>     movdqa  xmm6, xmm4
>     paddq   xmm6, xmm13
>     movdqa  xmm7, xmm4
>     paddq   xmm7, xmm14
>     pand    xmm7, xmm0
>     pand    xmm6, xmm0
>     packuswb        xmm6, xmm7
>     movdqa  xmm7, xmm4
>     paddq   xmm7, xmm12
>     pand    xmm7, xmm0
>     pand    xmm5, xmm0
>     packuswb        xmm5, xmm7
>     movdqa  xmm7, xmm4
>     paddq   xmm7, xmm9
>     packuswb        xmm5, xmm6
>     movdqa  xmm6, xmm4
>     paddq   xmm6, xmm10
>     pand    xmm6, xmm0
>     pand    xmm7, xmm0
>     packuswb        xmm7, xmm6
>     movdqa  xmm6, xmm4
>     paddq   xmm6, xmm8
>     paddq   xmm4, xmm15
>     movq    rdx, xmm4
>     pand    xmm6, xmm0
>     pand    xmm4, xmm0
>     packuswb        xmm4, xmm6
>     packuswb        xmm4, xmm7
>     mov     edx, edx
>     packuswb        xmm4, xmm5
>     psllw   xmm4, 2
>     pand    xmm4, xmm1
>     pxor    xmm4, xmm2
>     movdqa  xmm5, xmm4
>     punpcklbw       xmm5, xmm3              # xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
>     pshufd  xmm5, xmm5, 78                  # xmm5 = xmm5[2,3,0,1]
>     pshuflw xmm5, xmm5, 27                  # xmm5 = xmm5[3,2,1,0,4,5,6,7]
>     pshufhw xmm5, xmm5, 27                  # xmm5 = xmm5[0,1,2,3,7,6,5,4]
>     punpckhbw       xmm4, xmm3              # xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
>     pshufd  xmm4, xmm4, 78                  # xmm4 = xmm4[2,3,0,1]
>     pshuflw xmm4, xmm4, 27                  # xmm4 = xmm4[3,2,1,0,4,5,6,7]
>     pshufhw xmm4, xmm4, 27                  # xmm4 = xmm4[0,1,2,3,7,6,5,4]
>     packuswb        xmm4, xmm5
>     movdqu  xmm5, xmmword ptr [rdi + rdx - 15]
>     pxor    xmm4, xmm5
>     movdqu  xmmword ptr [rdi + rdx - 15], xmm4
>     add     rax, -16
>     add     rsi, -16
>     jne     .LBB0_6
>     cmp     r8, r9
>     jne     .LBB0_10
>     jmp     .LBB0_12
>
> _______________________________________________
> cfe-dev mailing list
> cfe-dev at lists.llvm.org
> https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-dev
>



More information about the cfe-dev mailing list