[llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)

Wed Nov 28 07:17:02 PST 2018

On Wed, Nov 28, 2018 at 7:11 AM Sanjay Patel via llvm-dev <
llvm-dev at lists.llvm.org> wrote:

> Thanks for reporting this and other perf opportunities. As I mentioned
> before, if you could file bug reports for these, that's probably the only
> way they're ever going to get fixed (unless you're planning to fix them
> yourself). It's not an ideal situation, but that's how this project works
> in my experience.
>
> Someone filed 1 of your examples on your behalf here:
> https://bugs.llvm.org/show_bug.cgi?id=39793
>
> Please do consider following that model for your other problems. Without
> that kind of report, your code examples are likely to be forgotten.
>
> Finally, sending mail to llvm-bugs is practically useless. I don't know of
> anyone that is tracking mails to that list rather than actual bug reports.
>

The list is setup to not accept email, basically - I'm one of the
moderators there & any email sent there that doesn't come from the bug
database will get automatically or manually rejected with the
suggestion/request that the bug database is used instead.

>
> On Tue, Nov 27, 2018 at 4:37 PM Stefan Kanthak <stefan.kanthak at nexgo.de>
> wrote:
>
>> "Sanjay Patel" <spatel at rotateright.com> wrote:
>>
>> > IIUC, you want to use x86-specific bit-hacks (sbb masking) in cases like
>> > this:
>> > unsigned int foo(unsigned int crc) {
>> >    if (crc & 0x80000000)
>> >      crc <<= 1, crc ^= 0xEDB88320;
>> >    else
>> >      crc <<= 1;
>> >    return crc;
>> > }
>>
>> To document this for x86 too: rewrite the function slightly
>>
>> unsigned int foo(unsigned int crc, unsigned int poly) {
>>     if (crc & 0x80000000)
>>         crc <<= 1, crc ^= poly;
>>     else
>>         crc <<= 1;
>>     return crc;
>> }
>>
>> unsigned int bar(unsigned int crc, unsigned int poly) {
>>     if (crc & 1)
>>         crc >>= 1, crc ^= poly;
>>     else
>>         crc >>= 1;
>>     return crc;
>> }
>>
>> and you get the perfect code for the left-shifting case!
>>
>> foo: # @foo
>>     lea eax, [rdi + rdi]
>>     sar edi, 31
>>     and edi, esi
>>     xor eax, edi
>>     ret
>>
>> The right-shifting case leaves but still room for improvement!
>>
>> bar: # @bar                | bar: # @bar
>>     mov eax, edi           |
>>     and eax, 1             |
>>     neg eax                |
>>     shr edi                |     shr edi
>>                            |     sbb eax, eax
>>     and eax, esi           |     and eax, esi
>>     xor eax, edi           |     xor eax, edi
>>     ret                    |     ret
>>
>> See <https://godbolt.org/z/aPKweG>
>>
>> regards
>> Stefan Kanthak
>>
>> > Which is this in LLVM IR:
>> > define i32 @foo(i32 %x) {
>> >  %t2 = icmp slt i32 %x, 0
>> >  %t3 = shl i32 %x, 1
>> >  %t4 = xor i32 %t3, -306674912
>> >  %t5 = select i1 %t2, i32 %t4, i32 %t3
>> >  ret i32 %t5
>> > }
>> >
>> > Please a file a bug report for the x86 backend (including performance
>> > numbers if you have that data).
>> >
>> > On Wed, Nov 7, 2018 at 5:24 PM Stefan Kanthak via llvm-dev <
>> > llvm-dev at lists.llvm.org> wrote:
>> >
>> >> Hi @ll,
>> >>
>> >> while clang/LLVM recognizes common bit-twiddling idioms/expressions
>> >> like
>> >>
>> >> unsigned int rotate(unsigned int x, unsigned int n)
>> >> {
>> >>     return (x << n) | (x >> (32 - n));
>> >> }
>> >>
>> >> and typically generates "rotate" machine instructions for this
>> >> expression, it fails to recognize other also common bit-twiddling
>> >> idioms/expressions.
>> >>
>> >> The standard IEEE CRC-32 for "big endian" alias "network" byte order
>> >> (see <https://tools.ietf.org/html/rfc1952#section-8> for example):
>> >>
>> >> unsigned int crc32be(unsigned char const *octets, unsigned int count)
>> >> {
>> >>     unsigned int crc = 0L;
>> >>     unsigned int i;
>> >>
>> >>     while (count--) {
>> >>         crc ^= *octets++ << 24;
>> >>         for (i = 8; i > 0; i--)
>> >>             if (crc & 0x80000000L)             // the code generated
>> >>                 crc <<= 1, crc ^= 0xEDB88320L; // for Intel x86 from
>> >>             else                               // these 4 lines is
>> >>                 crc <<= 1;                     // rather poor!
>> >>     }
>> >>     return crc;
>> >> }
>> >>
>> >> The same function for "little endian" byte order, using the "inverse"
>> >> or "mirrored" polynom:
>> >>
>> >> unsigned int crc32le(unsigned char const *octets, unsigned int count)
>> >> {
>> >>     unsigned int crc = ~0L;
>> >>     unsigned int i;
>> >>
>> >>     while (count--) {
>> >>         crc ^= *octets++;
>> >>         for (i = 8; i > 0; i--)
>> >>             if (crc & 1L)                      // the code generated
>> >>                 crc >>= 1, crc ^= 0x04C11DB7L; // for Intel x86 from
>> >>             else                               // these 4 lines is
>> >>                 crc >>= 1;                     // rather poor!
>> >>     }
>> >>     return ~crc;
>> >> }
>> >>
>> >> See <https://godbolt.org/z/eYJeWt> (-O1) and <
>> https://godbolt.org/z/zeExHm>
>> >> (-O2)
>> >>
>> >> crc32be: # @crc32be
>> >>         xor    eax, eax
>> >>         test   esi, esi
>> >>         jne    .LBB0_2
>> >>         jmp    .LBB0_5
>> >> .LBB0_4: # in Loop: Header=BB0_2 Depth=1
>> >>         add    rdi, 1
>> >>         test   esi, esi
>> >>         je    .LBB0_5
>> >> .LBB0_2: # =>This Loop Header: Depth=1
>> >>         add    esi, -1
>> >>         movzx  edx, byte ptr [rdi]
>> >>         shl    edx, 24
>> >>         xor    edx, eax
>> >>         mov    ecx, -8
>> >>         mov    eax, edx
>> >> .LBB0_3: # Parent Loop BB0_2 Depth=1   | # 4 instructions instead of
>> 6, r8
>> >> not clobbered!
>> >>         lea    r8d, [rax + rax]        |     add   eax, eax
>> >>         mov    edx, r8d                | # CF is set from the MSB of
>> EAX
>> >>         xor    edx, -306674912         |     sbb   edx, edx
>> >>         test   eax, eax                | # EDX is 0xFFFFFFFF if CF set,
>> >> else 0
>> >>         mov    eax, edx                |     and   edx, -306674912
>> >>         cmovns eax, r8d                |     xor   eax, edx
>> >>         add    ecx, 1
>> >>         jne    .LBB0_3
>> >>         jmp    .LBB0_4
>> >> .LBB0_5:
>> >>         ret
>> >> crc32le: # @crc32le
>> >>         test   esi, esi
>> >>         je    .LBB1_1
>> >>         mov    eax, -1
>> >> .LBB1_4: # =>This Loop Header: Depth=1
>> >>         add    esi, -1
>> >>         movzx  ecx, byte ptr [rdi]
>> >>         xor    eax, ecx
>> >>         mov    r8d, -8
>> >> .LBB1_5: # Parent Loop BB1_4 Depth=1   | # 4 instructions instead of
>> 7, and
>> >>         mov    edx, eax                | #  neither r8 nor rcx
>> clobbered!
>> >>         shr    edx                     |     shr   eax, 1
>> >>         mov    ecx, edx                | # CF is set from the LSB of
>> EAX
>> >>         xor    ecx, 79764919           |     sbb   edx, edx
>> >>         test   al, 1                   | # EDX is 0xFFFFFFFF if CF set,
>> >> else 0
>> >>         mov    eax, ecx                |     and   edx, 79764919
>> >>         cmove  eax, edx                |     xor   eax, edx
>> >>         add    r8d, 1
>> >>         jne    .LBB1_5
>> >>         add    rdi, 1
>> >>         test   esi, esi
>> >>         jne    .LBB1_4
>> >>         not    eax
>> >>         ret
>> >> .LBB1_1:
>> >>         xor    eax, eax
>> >>         ret
>> >>
>> >> JFTR: with -O2, the inner loop gets unrolled, using the same
>> non-optimal
>> >>       code sequence with 6 and 7 instructions; this accounts for a
>> total
>> >>       of 16 and 24 superfluous instructions respectively.
>> >> _______________________________________________
>> >> LLVM Developers mailing list
>> >> llvm-dev at lists.llvm.org
>> >> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
>> >>
>> >
>>
> _______________________________________________
> LLVM Developers mailing list
> llvm-dev at lists.llvm.org
> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20181128/30e5f318/attachment-0001.html>