[llvm-dev] Rather poor code optimisation of current clang/LLVM targeting Intel x86 (both -64 and -32)
Stefan Kanthak via llvm-dev
llvm-dev at lists.llvm.org
Tue Nov 6 07:33:38 PST 2018
Hi @ll,
while clang/LLVM recognizes common bit-twiddling idioms/expressions
like
unsigned int rotate(unsigned int x, unsigned int n)
{
return (x << n) | (x >> (32 - n));
}
and typically generates "rotate" machine instructions for this
expression, it fails to recognize other also common bit-twiddling
idioms/expressions.
The standard IEEE CRC-32 for "big endian" alias "network" byte order
(see <https://tools.ietf.org/html/rfc1952#section-8> for example):
unsigned int crc32be(unsigned char const *octets, unsigned int count)
{
unsigned int crc = 0L;
unsigned int i;
while (count--) {
crc ^= *octets++ << 24;
for (i = 8; i > 0; i--)
if (crc & 0x80000000L) // the code generated
crc <<= 1, crc ^= 0xEDB88320L; // for Intel x86 from
else // these 4 lines is
crc <<= 1; // rather poor!
}
return crc;
}
The same function for "little endian" byte order, using the "inverse"
or "mirrored" polynom:
unsigned int crc32le(unsigned char const *octets, unsigned int count)
{
unsigned int crc = ~0L;
unsigned int i;
while (count--) {
crc ^= *octets++;
for (i = 8; i > 0; i--)
if (crc & 1L) // the code generated
crc >>= 1, crc ^= 0x04C11DB7L; // for Intel x86 from
else // these 4 lines is
crc >>= 1; // rather poor!
}
return ~crc;
}
See <https://godbolt.org/z/eYJeWt> (-O1) and <https://godbolt.org/z/zeExHm> (-O2)
crc32be: # @crc32be
xor eax, eax
test esi, esi
jne .LBB0_2
jmp .LBB0_5
.LBB0_4: # in Loop: Header=BB0_2 Depth=1
add rdi, 1
test esi, esi
je .LBB0_5
.LBB0_2: # =>This Loop Header: Depth=1
add esi, -1
movzx edx, byte ptr [rdi]
shl edx, 24
xor edx, eax
mov ecx, -8
mov eax, edx
.LBB0_3: # Parent Loop BB0_2 Depth=1 | # 4 instructions instead of 6, r8 not clobbered!
lea r8d, [rax + rax] | add eax, eax
mov edx, r8d | # CF is set from the MSB of EAX
xor edx, -306674912 | sbb edx, edx
test eax, eax | # EDX is 0xFFFFFFFF if CF set, else 0
mov eax, edx | and edx, -306674912
cmovns eax, r8d | xor eax, edx
add ecx, 1
jne .LBB0_3
jmp .LBB0_4
.LBB0_5:
ret
crc32le: # @crc32le
test esi, esi
je .LBB1_1
mov eax, -1
.LBB1_4: # =>This Loop Header: Depth=1
add esi, -1
movzx ecx, byte ptr [rdi]
xor eax, ecx
mov r8d, -8
.LBB1_5: # Parent Loop BB1_4 Depth=1 | # 4 instructions instead of 7, and
mov edx, eax | # neither r8 nor rcx clobbered!
shr edx | shr eax, 1
mov ecx, edx | # CF is set from the LSB of EAX
xor ecx, 79764919 | sbb edx, edx
test al, 1 | # EDX is 0xFFFFFFFF if CF set, else 0
mov eax, ecx | and edx, 79764919
cmove eax, edx | xor eax, edx
add r8d, 1
jne .LBB1_5
add rdi, 1
test esi, esi
jne .LBB1_4
not eax
ret
.LBB1_1:
xor eax, eax
ret
JFTR: with -O2, the inner loop gets unrolled, using the same non-optimal
code sequence with 6 and 7 instructions; this accounts for a total
of 16 and 24 superfluous instructions respectively.
More information about the llvm-dev
mailing list