[llvm-bugs] [Bug 24924] New: 27% performance deficiency vs gcc when compiling zlib (compress function)
via llvm-bugs
llvm-bugs at lists.llvm.org
Thu Sep 24 07:16:08 PDT 2015
https://llvm.org/bugs/show_bug.cgi?id=24924
Bug ID: 24924
Summary: 27% performance deficiency vs gcc when compiling zlib
(compress function)
Product: clang
Version: trunk
Hardware: PC
OS: Linux
Status: NEW
Severity: normal
Priority: P
Component: LLVM Codegen
Assignee: unassignedclangbugs at nondot.org
Reporter: egor.kochetov at intel.com
CC: llvm-bugs at lists.llvm.org
Classification: Unclassified
Created attachment 14930
--> https://llvm.org/bugs/attachment.cgi?id=14930&action=edit
Archive with a C++ source file that generates relevant data and calls zlib's
compress, a Makefile, and two annotated assembly listings.
I have compared gcc 6.0 trunk with clang 3.8 trunk and found that clang
produces inefficient binary code for the 'compress' function from the zlib
library when compiling 32-bit code for a Silvermont CPU. The binary compiled
by gcc runs 27% faster than the one compiled by clang.
Specifically, here is the C source, part of the function 'deflate_slow' from
the file 'zlib/deflate.c', lines near 1806 (with macros expanded):
    do {
        if (++s->strstart <= max_insert) {
            s->ins_h = ((s->ins_h << s->hash_shift)
                        ^ (s->window[(s->strstart) + (MIN_MATCH-1)])
                       ) & s->hash_mask;
            hash_head = s->prev[(s->strstart) & s->w_mask] =
                s->head[s->ins_h];
            s->head[s->ins_h] = (Pos)(s->strstart);
        }
    } while (--s->prev_length != 0);
The original source, with the macros unexpanded, is:
    do {
        if (++s->strstart <= max_insert) {
            INSERT_STRING(s, s->strstart, hash_head);
        }
    } while (--s->prev_length != 0);
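For context, the loop performs one hash-chain insertion per remaining input
byte. A minimal self-contained sketch of the same logic (the field names
follow the report, but the array sizes and types here are made up for
illustration; zlib's real deflate_state differs) is:

```c
#include <assert.h>
#include <string.h>

/* Simplified stand-in for zlib's deflate_state; sizes are assumptions. */
#define MIN_MATCH 3
#define HASH_BITS 5
#define W_BITS 6

typedef unsigned short Pos;

struct state {
    unsigned strstart, ins_h, hash_shift, hash_mask, w_mask, prev_length;
    unsigned char window[1 << W_BITS];
    Pos prev[1 << W_BITS];
    Pos head[1 << HASH_BITS];
};

/* The expanded INSERT_STRING loop from the report: for each remaining
 * byte, update the rolling hash and link the position into prev/head. */
static void insert_strings(struct state *s, unsigned max_insert,
                           Pos *hash_head)
{
    do {
        if (++s->strstart <= max_insert) {
            s->ins_h = ((s->ins_h << s->hash_shift)
                        ^ s->window[s->strstart + (MIN_MATCH - 1)])
                       & s->hash_mask;
            *hash_head = s->prev[s->strstart & s->w_mask]
                       = s->head[s->ins_h];
            s->head[s->ins_h] = (Pos)s->strstart;
        }
    } while (--s->prev_length != 0);
}
```

The body is short and touches six fields of the state plus three arrays,
which is exactly the kind of code where register allocation quality on
32-bit x86 (eight GPRs) dominates.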
Here is the gcc output (the fast one, 30 ms per function call):
### do { //source line 1806
804af10: 8d 41 01 lea 0x1(%ecx),%eax
804af13: 89 45 6c mov %eax,0x6c(%ebp)
804af16: 39 44 24 1c cmp %eax,0x1c(%esp)
### if (++s->strstart <= max_insert) {
804af1a: 72 32 jb 804af4e <deflate_slow+0x1be>
804af1c: 8b 75 38 mov 0x38(%ebp),%esi
804af1f: 8b 7d 48 mov 0x48(%ebp),%edi
804af22: 0f b6 74 0e 03 movzbl 0x3(%esi,%ecx,1),%esi
804af27: 8b 4d 58 mov 0x58(%ebp),%ecx
804af2a: d3 e7 shl %cl,%edi ### 1.5294 ms
804af2c: 89 f9 mov %edi,%ecx
804af2e: 8b 7d 34 mov 0x34(%ebp),%edi
804af31: 31 f1 xor %esi,%ecx
804af33: 8b 75 44 mov 0x44(%ebp),%esi
804af36: 23 4d 54 and 0x54(%ebp),%ecx ### 1.5 ms
804af39: 21 c7 and %eax,%edi
804af3b: 89 4d 48 mov %ecx,0x48(%ebp)
804af3e: 8d 0c 4e lea (%esi,%ecx,2),%ecx
804af41: 8b 75 40 mov 0x40(%ebp),%esi
804af44: 0f b7 19 movzwl (%ecx),%ebx ### 1.8 ms
804af47: 66 89 1c 7e mov %bx,(%esi,%edi,2)
804af4b: 66 89 01 mov %ax,(%ecx)
### }
804af4e: 83 ea 01 sub $0x1,%edx
804af51: 89 c1 mov %eax,%ecx
804af53: 89 55 78 mov %edx,0x78(%ebp)
804af56: 85 d2 test %edx,%edx
804af58: 75 b6 jne 804af10 <deflate_slow+0x180>
### } while (--s->prev_length != 0);
Here is the clang output (27% slower, 41 ms per function call):
### do { //source line 1806
804cd90: 89 c3 mov %eax,%ebx
804cd92: 8d 04 1a lea (%edx,%ebx,1),%eax
804cd95: 3b 44 24 44 cmp 0x44(%esp),%eax
804cd99: 89 45 6c mov %eax,0x6c(%ebp)
### if (++s->strstart <= max_insert) {
804cd9c: 77 5c ja 804cdfa <deflate_slow+0x3aa>
804cd9e: 8b 75 48 mov 0x48(%ebp),%esi
804cda1: 8a 4d 58 mov 0x58(%ebp),%cl
804cda4: 89 5c 24 34 mov %ebx,0x34(%esp)
804cda8: d3 e6 shl %cl,%esi ### 1.5294 ms
804cdaa: 8b 4d 38 mov 0x38(%ebp),%ecx
804cdad: 03 4c 24 28 add 0x28(%esp),%ecx
804cdb1: 0f b6 0c 0b movzbl (%ebx,%ecx,1),%ecx ### 3.7 ms
804cdb5: 31 f1 xor %esi,%ecx
804cdb7: 23 4d 54 and 0x54(%ebp),%ecx
804cdba: 89 4d 48 mov %ecx,0x48(%ebp)
804cdbd: 8b 55 40 mov 0x40(%ebp),%edx
804cdc0: 8b 75 34 mov 0x34(%ebp),%esi
804cdc3: 89 54 24 40 mov %edx,0x40(%esp)
804cdc7: 8b 55 44 mov 0x44(%ebp),%edx
804cdca: 21 c6 and %eax,%esi
804cdcc: 89 54 24 3c mov %edx,0x3c(%esp)
804cdd0: 66 8b 14 4a mov (%edx,%ecx,2),%dx ### 1.6 ms
804cdd4: 66 89 54 24 3a mov %dx,0x3a(%esp)
804cdd9: 8b 54 24 40 mov 0x40(%esp),%edx
804cddd: 66 8b 5c 24 3a mov 0x3a(%esp),%bx ### 4.5 ms
804cde2: 66 89 1c 72 mov %bx,(%edx,%esi,2)
804cde6: 8b 54 24 3c mov 0x3c(%esp),%edx
804cdea: 8b 5c 24 34 mov 0x34(%esp),%ebx
804cdee: 66 89 04 4a mov %ax,(%edx,%ecx,2)
804cdf2: 8b 54 24 2c mov 0x2c(%esp),%edx
804cdf6: 8b 4c 24 30 mov 0x30(%esp),%ecx
804cdfa: 89 7d 78 mov %edi,0x78(%ebp)
804cdfd: 8d 43 01 lea 0x1(%ebx),%eax
### }
804ce00: 83 c7 ff add $0xffffffff,%edi
804ce03: 39 c1 cmp %eax,%ecx
804ce05: 75 89 jne 804cd90 <deflate_slow+0x340>
### } while (--s->prev_length != 0);
The source was compiled with -Ofast -m32 -march=slm -mfpmath=sse -fPIE.
The problem seems to be in the register allocator, which causes clang to
generate more memory-accessing mov instructions than gcc does.
Attached are the file 'zlib_test.cpp' with the source to reproduce the
performance difference, a 'Makefile' for it, and two annotated assembly
listings.
You also need to download zlib 1.2.8 from http://zlib.net/zlib-1.2.8.tar.gz
and place the folder, named 'zlib', in the same directory as 'Makefile' and
'zlib_test.cpp'. Then fix the compiler paths in the Makefile, run `make`, and
use `time` to see the difference. Using the pre-compiled system-installed zlib
will not demonstrate the issue; zlib needs to be recompiled by the compiler
under examination.
To build the reproducer, invoke `make` or `make COMPILER=gcc`. You'll get the
file ./main.gcc or ./main.clang and a corresponding .s file with disassembly of
the function deflate_slow. The relevant code in the assembly is around the
second 'shl' instruction for both gcc and clang.
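The steps above can be sketched as a script; the Makefile targets and the
./main.* output names are taken from the report, while the layout check and
messages are assumptions:

```shell
# Flags the report says the source was compiled with.
CFLAGS="-Ofast -m32 -march=slm -mfpmath=sse -fPIE"

# The reproducer expects zlib 1.2.8 sources unpacked into ./zlib,
# next to the attached Makefile and zlib_test.cpp.
if [ -d zlib ] && [ -f Makefile ]; then
    make COMPILER=clang   # produces ./main.clang and a .s of deflate_slow
    time ./main.clang
    make COMPILER=gcc     # produces ./main.gcc and a .s of deflate_slow
    time ./main.gcc
else
    echo "unpack zlib-1.2.8 into ./zlib next to Makefile and zlib_test.cpp"
fi
```

Comparing the two `time` outputs should show the ~27% gap; the two .s files
can then be diffed around the second 'shl'.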
--
You are receiving this mail because:
You are on the CC list for the bug.