[llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)

Tue Nov 6 07:33:38 PST 2018

Hi @ll,

while clang/LLVM recognizes common bit-twiddling idioms/expressions
like

unsigned int rotate(unsigned int x, unsigned int n)
{
    return (x << n) | (x >> (32 - n));
}

and typically generates "rotate" machine instructions for this
expression, it fails to recognize other also common bit-twiddling
idioms/expressions.

The standard IEEE CRC-32 for "big endian" alias "network" byte order
(see <https://tools.ietf.org/html/rfc1952#section-8> for example):

unsigned int crc32be(unsigned char const *octets, unsigned int count)
{
    unsigned int crc = 0L;
    unsigned int i;

    while (count--) {
        crc ^= *octets++ << 24;
        for (i = 8; i > 0; i--)
            if (crc & 0x80000000L)             // the code generated
                crc <<= 1, crc ^= 0xEDB88320L; // for Intel x86 from
            else                               // these 4 lines is
                crc <<= 1;                     // rather poor!
    }
    return crc;
}

The same function for "little endian" byte order, using the "inverse"
or "mirrored" polynom:

unsigned int crc32le(unsigned char const *octets, unsigned int count)
{
    unsigned int crc = ~0L;
    unsigned int i;

    while (count--) {
        crc ^= *octets++;
        for (i = 8; i > 0; i--)
            if (crc & 1L)                      // the code generated
                crc >>= 1, crc ^= 0x04C11DB7L; // for Intel x86 from
            else                               // these 4 lines is
                crc >>= 1;                     // rather poor!
    }
    return ~crc;
}

See <https://godbolt.org/z/eYJeWt> (-O1) and <https://godbolt.org/z/zeExHm> (-O2)

crc32be: # @crc32be
        xor    eax, eax
        test   esi, esi
        jne    .LBB0_2
        jmp    .LBB0_5
.LBB0_4: # in Loop: Header=BB0_2 Depth=1
        add    rdi, 1
        test   esi, esi
        je    .LBB0_5
.LBB0_2: # =>This Loop Header: Depth=1
        add    esi, -1
        movzx  edx, byte ptr [rdi]
        shl    edx, 24
        xor    edx, eax
        mov    ecx, -8
        mov    eax, edx
.LBB0_3: # Parent Loop BB0_2 Depth=1   | # 4 instructions instead of 6, r8 not clobbered!
        lea    r8d, [rax + rax]        |     add   eax, eax
        mov    edx, r8d                | # CF is set from the MSB of EAX
        xor    edx, -306674912         |     sbb   edx, edx
        test   eax, eax                | # EDX is 0xFFFFFFFF if CF set, else 0
        mov    eax, edx                |     and   edx, -306674912
        cmovns eax, r8d                |     xor   eax, edx
        add    ecx, 1
        jne    .LBB0_3
        jmp    .LBB0_4
.LBB0_5:
        ret
crc32le: # @crc32le
        test   esi, esi
        je    .LBB1_1
        mov    eax, -1
.LBB1_4: # =>This Loop Header: Depth=1
        add    esi, -1
        movzx  ecx, byte ptr [rdi]
        xor    eax, ecx
        mov    r8d, -8
.LBB1_5: # Parent Loop BB1_4 Depth=1   | # 4 instructions instead of 7, and
        mov    edx, eax                | #  neither r8 nor rcx clobbered!
        shr    edx                     |     shr   eax, 1
        mov    ecx, edx                | # CF is set from the LSB of EAX
        xor    ecx, 79764919           |     sbb   edx, edx
        test   al, 1                   | # EDX is 0xFFFFFFFF if CF set, else 0
        mov    eax, ecx                |     and   edx, 79764919
        cmove  eax, edx                |     xor   eax, edx
        add    r8d, 1
        jne    .LBB1_5
        add    rdi, 1
        test   esi, esi
        jne    .LBB1_4
        not    eax
        ret
.LBB1_1:
        xor    eax, eax
        ret

JFTR: with -O2, the inner loop gets unrolled, using the same non-optimal
      code sequence with 6 and 7 instructions; this accounts for a total
      of 16 and 24 superfluous instructions respectively.