[cfe-dev] clang generates way more code than - Optimizer bug?

Sjoerd Meijer via cfe-dev cfe-dev at lists.llvm.org
Fri Dec 3 05:58:37 PST 2021


Hi Dennis,

I believe GCC does not enable the vectoriser at optimisation level -O2, which is why you only get a scalar loop. Clang does enable vectorisation at -O2, so you get both a vector loop and a scalar loop. Yes, it's a lot of code, but code size is not the best metric for how fast the code is. I guess the only way to tell is to run it and measure.

Cheers,
Sjoerd.
________________________________
From: cfe-dev <cfe-dev-bounces at lists.llvm.org> on behalf of Dennis Luehring via cfe-dev <cfe-dev at lists.llvm.org>
Sent: 03 December 2021 10:00
To: cfe-dev <cfe-dev at lists.llvm.org>
Subject: [cfe-dev] clang generates way more code than - Optimizer bug?

/*
 * XOR every byte of text_ in place with a key derived from its index,
 * walking from the last byte (index text_len_ - 1) down to index 0.
 * If text_len_ <= 0 the loop body never runs and text_ is not touched.
 */
void decipher(char* text_, int text_len_)
{
   for (int i = text_len_ - 1; i >= 0; --i)
   {
     /* The key 0xff - (i << 2) is computed as an int; the compound
        assignment converts the XOR result back to char for the store. */
     text_[i] ^= 0xff - (i << 2);
   }
}

gcc.godbolt.org/gcc-trunk

https://gcc.godbolt.org/z/z84d9xdxE

generates this code

# GCC output for decipher(): a single scalar loop, one byte per iteration.
# Register use as read from the code: rdi = text_, esi = text_len_ on entry
# (consistent with the SysV AMD64 argument registers); inside the loop
# esi holds the index i and rdi points at text_[i].
decipher(char*, int):
   sub     esi, 1                          # i = text_len_ - 1
   js      .L1                             # i < 0: nothing to do
   movsx   rax, esi                        # sign-extend i to 64 bits
   add     rdi, rax                        # rdi = &text_[i]
.L3:
   lea     eax, [0+rsi*4]                  # eax = i << 2
   xor     al, BYTE PTR [rdi]              # al = text_[i] ^ low byte of (i << 2)
   sub     esi, 1                          # --i
   sub     rdi, 1                          # step the pointer down together with i
   not     eax                             # in the low byte ~x == 0xff - x, so al = old byte ^ (0xff - (i << 2))
   mov     BYTE PTR [rdi+1], al            # store to the byte just processed (rdi was already decremented)
   cmp     esi, -1
   jne     .L3                             # loop until i reaches -1
.L1:
   ret

gcc.godbolt.org/Clang-trunk

https://gcc.godbolt.org/z/1GddGeGhh

generates this huge amount of code - maybe it's better, but I don't think so

# Constant pool read by the vector loop of decipher().
# .LCPI0_0 - .LCPI0_7 each hold a pair of 64-bit offsets; together they cover
# -1 .. -16. The function loads them into xmm8 - xmm15 and adds them to a
# broadcast counter with paddq.
.LCPI0_0:
   .quad   -3                              # 0xfffffffffffffffd
   .quad   -4                              # 0xfffffffffffffffc
.LCPI0_1:
   .quad   -5                              # 0xfffffffffffffffb
   .quad   -6                              # 0xfffffffffffffffa
.LCPI0_2:
   .quad   -7                              # 0xfffffffffffffff9
   .quad   -8                              # 0xfffffffffffffff8
.LCPI0_3:
   .quad   -9                              # 0xfffffffffffffff7
   .quad   -10                             # 0xfffffffffffffff6
.LCPI0_4:
   .quad   -11                             # 0xfffffffffffffff5
   .quad   -12                             # 0xfffffffffffffff4
.LCPI0_5:
   .quad   -13                             # 0xfffffffffffffff3
   .quad   -14                             # 0xfffffffffffffff2
.LCPI0_6:
   .quad   -15                             # 0xfffffffffffffff1
   .quad   -16                             # 0xfffffffffffffff0
.LCPI0_7:
   .quad   -1                              # 0xffffffffffffffff
   .quad   -2                              # 0xfffffffffffffffe
# Mask with 0xff in the lowest byte of each 64-bit half (loaded into xmm0);
# pand with it keeps only the low byte of each 64-bit lane.
.LCPI0_8:
   .byte   255                             # 0xff
   .byte   0                               # 0x0
   .byte   0                               # 0x0
   .byte   0                               # 0x0
   .byte   0                               # 0x0
   .byte   0                               # 0x0
   .byte   0                               # 0x0
   .byte   0                               # 0x0
   .byte   255                             # 0xff
   .byte   0                               # 0x0
   .byte   0                               # 0x0
   .byte   0                               # 0x0
   .byte   0                               # 0x0
   .byte   0                               # 0x0
   .byte   0                               # 0x0
   .byte   0                               # 0x0
# Sixteen bytes of 252 (0xfc), loaded into xmm1; applied with pand after the
# 16-bit shift left by 2 in the vector loop.
.LCPI0_9:
   .zero   16,252
# Clang output for decipher(): an SSE loop that processes 16 bytes per
# iteration, plus a scalar loop used for short inputs, for the fallback path
# and for the leftover bytes after the vector loop.
# Register use as read from the code: rdi = text_, esi = text_len_ on entry.
# End-of-line comments that the mail client had wrapped onto separate lines
# are rejoined to their instructions below; no instruction is changed.
decipher(char*, int):                         # @decipher(char*, int)
   test    esi, esi
   jle     .LBB0_12                        # text_len_ <= 0: return immediately
   mov     r9d, esi                        # r9 = text_len_, zero-extended to 64 bits
   cmp     esi, 16
   jae     .LBB0_3                         # 16 or more bytes: try the vector path
.LBB0_9:
   mov     rcx, r9                         # scalar loop handles all text_len_ bytes
.LBB0_10:
   # Scalar loop setup: rcx = number of bytes still to process (indices rcx-1 .. 0).
   lea     rax, [rcx + 1]                  # rax = i + 2 for the first index i = rcx - 1
   shl     cl, 2
   mov     dl, 3
   sub     dl, cl                          # dl = 3 - 4*rcx = 0xff - ((rcx - 1) << 2) (mod 256)
.LBB0_11:                               # =>This Inner Loop Header: Depth=1
   lea     ecx, [rax - 2]                  # ecx = i
   xor     byte ptr [rdi + rcx], dl        # text_[i] ^= key
   add     rax, -1
   add     dl, 4                           # the key for index i - 1 is 4 larger (mod 256)
   cmp     rax, 1
   ja      .LBB0_11                        # continue while the next index is >= 0
.LBB0_12:
   ret
.LBB0_3:
   # Runtime guards before the vector loop; if either check fails, control
   # falls back to the scalar loop over the whole buffer.
   # NOTE(review): these look like the vectoriser's index-overflow checks - confirm.
   lea     rax, [r9 - 1]
   add     esi, -1
   cmp     esi, eax
   jb      .LBB0_9
   shr     rax, 32
   jne     .LBB0_9
   mov     r8d, r9d
   and     r8d, -16                        # r8 = text_len_ rounded down to a multiple of 16
   mov     ecx, r9d
   and     ecx, 15                         # rcx = text_len_ % 16, the leftover bytes for the scalar loop
   movdqa  xmm8, xmmword ptr [rip + .LCPI0_0] # xmm8 = [18446744073709551613,18446744073709551612]
   movdqa  xmm9, xmmword ptr [rip + .LCPI0_1] # xmm9 = [18446744073709551611,18446744073709551610]
   movdqa  xmm10, xmmword ptr [rip + .LCPI0_2] # xmm10 = [18446744073709551609,18446744073709551608]
   movdqa  xmm11, xmmword ptr [rip + .LCPI0_3] # xmm11 = [18446744073709551607,18446744073709551606]
   movdqa  xmm12, xmmword ptr [rip + .LCPI0_4] # xmm12 = [18446744073709551605,18446744073709551604]
   movdqa  xmm13, xmmword ptr [rip + .LCPI0_5] # xmm13 = [18446744073709551603,18446744073709551602]
   movdqa  xmm14, xmmword ptr [rip + .LCPI0_6] # xmm14 = [18446744073709551601,18446744073709551600]
   movdqa  xmm15, xmmword ptr [rip + .LCPI0_7] # xmm15 = [18446744073709551615,18446744073709551614]
   movdqa  xmm0, xmmword ptr [rip + .LCPI0_8] # xmm0 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
   movdqa  xmm1, xmmword ptr [rip + .LCPI0_9] # xmm1 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
   pcmpeqd xmm2, xmm2                      # xmm2 = all ones (XOR with it below is a bitwise NOT)
   pxor    xmm3, xmm3                      # xmm3 = zero (used to widen bytes to words)
   mov     rsi, r8                         # rsi = bytes left for the vector loop
   mov     rax, r9                         # rax = one past the highest unprocessed index
.LBB0_6:                                # =>This Inner Loop Header: Depth=1
   # Build the indices rax-1 .. rax-16 as sixteen 64-bit lanes (broadcast rax,
   # add the offsets in xmm8 - xmm15), keep each lane's low byte (pand with xmm0)
   # and pack those bytes into the single vector xmm4; byte k ends up holding
   # the low byte of rax-1-k.
   movq    xmm4, rax
   pshufd  xmm4, xmm4, 68                  # xmm4 = xmm4[0,1,0,1]
   movdqa  xmm5, xmm4
   paddq   xmm5, xmm11
   movdqa  xmm6, xmm4
   paddq   xmm6, xmm13
   movdqa  xmm7, xmm4
   paddq   xmm7, xmm14
   pand    xmm7, xmm0
   pand    xmm6, xmm0
   packuswb        xmm6, xmm7
   movdqa  xmm7, xmm4
   paddq   xmm7, xmm12
   pand    xmm7, xmm0
   pand    xmm5, xmm0
   packuswb        xmm5, xmm7
   movdqa  xmm7, xmm4
   paddq   xmm7, xmm9
   packuswb        xmm5, xmm6
   movdqa  xmm6, xmm4
   paddq   xmm6, xmm10
   pand    xmm6, xmm0
   pand    xmm7, xmm0
   packuswb        xmm7, xmm6
   movdqa  xmm6, xmm4
   paddq   xmm6, xmm8
   paddq   xmm4, xmm15
   movq    rdx, xmm4                       # rdx = rax - 1, the highest index of this 16-byte chunk
   pand    xmm6, xmm0
   pand    xmm4, xmm0
   packuswb        xmm4, xmm6
   packuswb        xmm4, xmm7
   mov     edx, edx                        # keep only the low 32 bits of the index
   packuswb        xmm4, xmm5
   psllw   xmm4, 2
   pand    xmm4, xmm1                      # with the 252 mask this is a per-byte shift left by 2
   pxor    xmm4, xmm2                      # per-byte NOT: 0xff - (i << 2) (mod 256)
   # The keys are in descending index order but memory is ascending, so reverse
   # the 16 bytes: widen each half to words, reverse the eight words with the
   # three shuffles, then re-pack with the two halves swapped.
   movdqa  xmm5, xmm4
   punpcklbw       xmm5, xmm3              # xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
   pshufd  xmm5, xmm5, 78                  # xmm5 = xmm5[2,3,0,1]
   pshuflw xmm5, xmm5, 27                  # xmm5 = xmm5[3,2,1,0,4,5,6,7]
   pshufhw xmm5, xmm5, 27                  # xmm5 = xmm5[0,1,2,3,7,6,5,4]
   punpckhbw       xmm4, xmm3              # xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
   pshufd  xmm4, xmm4, 78                  # xmm4 = xmm4[2,3,0,1]
   pshuflw xmm4, xmm4, 27                  # xmm4 = xmm4[3,2,1,0,4,5,6,7]
   pshufhw xmm4, xmm4, 27                  # xmm4 = xmm4[0,1,2,3,7,6,5,4]
   packuswb        xmm4, xmm5
   movdqu  xmm5, xmmword ptr [rdi + rdx - 15] # load text_[rdx-15 .. rdx]
   pxor    xmm4, xmm5
   movdqu  xmmword ptr [rdi + rdx - 15], xmm4 # store the 16 XORed bytes back
   add     rax, -16
   add     rsi, -16
   jne     .LBB0_6
   cmp     r8, r9
   jne     .LBB0_10                        # leftover bytes remain: scalar loop with rcx = text_len_ % 16
   jmp     .LBB0_12

_______________________________________________
cfe-dev mailing list
cfe-dev at lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-dev
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/cfe-dev/attachments/20211203/4aa3306b/attachment-0001.html>


More information about the cfe-dev mailing list