[llvm-bugs] [Bug 37112] New: Missed optimization: many "switch-like" IFs could use bit-array instead of jump table
via llvm-bugs
llvm-bugs at lists.llvm.org
Thu Apr 12 10:53:39 PDT 2018
https://bugs.llvm.org/show_bug.cgi?id=37112
Bug ID: 37112
Summary: Missed optimization: many "switch-like" IFs could use
bit-array instead of jump table
Product: clang
Version: trunk
Hardware: PC
OS: Linux
Status: NEW
Severity: enhancement
Priority: P
Component: -New Bugs
Assignee: unassignedclangbugs at nondot.org
Reporter: kobalicek.petr at gmail.com
CC: llvm-bugs at lists.llvm.org
Problem
-------
I'm comparing how various compilers optimize the following code:
bool testChar(unsigned x) noexcept {
return x == A ? true :
x == B ? true :
x == C ? true :
x == D ? true :
... (continues) ...
x == Z ? true : false
}
And I'm wondering why clang is using a jump table for such case, which turns
out to be suboptimal when the range for `true` or `false` cases is very
restricted, let's say to 0-255 (byte) or even smaller range.
Testing Code
------------
I wrote the following code to test a naive implementation and an implementation
that uses bit-array trick:
#include <stdint.h>
#include <stdlib.h>
#include <type_traits>
// BitArray builders, no idea how to do this better, this works...
template<typename Predicate, uint32_t Index>
struct BinaryLUT_ByteMask {
enum : uint32_t {
kValue = (uint32_t(Predicate::test(Index + 0)) << 0)
| (uint32_t(Predicate::test(Index + 1)) << 1)
| (uint32_t(Predicate::test(Index + 2)) << 2)
| (uint32_t(Predicate::test(Index + 3)) << 3)
| (uint32_t(Predicate::test(Index + 4)) << 4)
| (uint32_t(Predicate::test(Index + 5)) << 5)
| (uint32_t(Predicate::test(Index + 6)) << 6)
| (uint32_t(Predicate::test(Index + 7)) << 7)
};
};
template<typename Predicate, uint32_t Index>
struct BinaryLUT_UInt32Mask {
enum : uint32_t {
kValue = ((BinaryLUT_ByteMask<Predicate, Index + 0>::kValue) << 0)
| ((BinaryLUT_ByteMask<Predicate, Index + 8>::kValue) << 8)
| ((BinaryLUT_ByteMask<Predicate, Index + 16>::kValue) << 16)
| ((BinaryLUT_ByteMask<Predicate, Index + 24>::kValue) << 24)
};
};
template<typename Predicate>
struct BinaryLUT {
// Just to prevent having multiple tables emitted in
// case a different type for `Index` was used, not
// sure this is really required...
static inline bool testInternal(size_t index) noexcept {
static const uint32_t lut[256 / 32] = {
BinaryLUT_UInt32Mask<Predicate, 32 * 0>::kValue,
BinaryLUT_UInt32Mask<Predicate, 32 * 1>::kValue,
BinaryLUT_UInt32Mask<Predicate, 32 * 2>::kValue,
BinaryLUT_UInt32Mask<Predicate, 32 * 3>::kValue,
BinaryLUT_UInt32Mask<Predicate, 32 * 4>::kValue,
BinaryLUT_UInt32Mask<Predicate, 32 * 5>::kValue,
BinaryLUT_UInt32Mask<Predicate, 32 * 6>::kValue,
BinaryLUT_UInt32Mask<Predicate, 32 * 7>::kValue
};
return bool((lut[index / 32U] >> (index % 32U)) & 0x1);
}
template<typename T>
static inline bool test(const T& value) noexcept {
typedef typename std::make_unsigned<T>::type U;
return testInternal(size_t(U(value)));
}
};
struct AllowedPostScriptChars {
static constexpr bool test(uint32_t i) noexcept {
return i < 33 ? false :
i > 126 ? false :
i == '[' ? false :
i == ']' ? false :
i == '(' ? false :
i == ')' ? false :
i == '{' ? false :
i == '}' ? false :
i == '<' ? false :
i == '>' ? false :
i == '/' ? false :
i == '%' ? false : true;
}
};
bool testNaive(uint32_t uc) {
return AllowedPostScriptChars::test(uc);
}
bool testLUT(uint32_t uc) {
return BinaryLUT<AllowedPostScriptChars>::test(uc);
}
testNaive:
lea eax, [rdi - 33]
cmp eax, 93
jbe .LBB1_2
xor eax, eax
ret
.LBB1_2:
add edi, -37
cmp edi, 88
ja .LBB1_4
xor eax, eax
jmp qword ptr [8*rdi + .LJTI1_0]
.LBB1_5:
ret
.LBB1_4:
mov al, 1
ret
.LJTI1_0:
!!! (... skipped almost 90 entries in a jump-table ...) !!!
testLUT:
mov eax, edi
shr eax, 5
mov eax, dword ptr [4*rax +
BinaryLUT<AllowedPostScriptChars>::testInternal(unsigned long)::lut]
bt eax, edi
setb al
ret
BinaryLUT<AllowedPostScriptChars>::testInternal(unsigned long)::lut: # The
actual LUT generated.
.long 0 # 0x0
.long 2952756446 # 0xafff7cde
.long 3623878655 # 0xd7ffffff
.long 1476395007 # 0x57ffffff
.long 0 # 0x0
.long 0 # 0x0
.long 0 # 0x0
.long 0 # 0x0
Notes
-----
I would like to mention that I know that the two functions are not completely
identical. The `testLUT` function does not do range checks as I have a
restricted input from 0..255 so I didn't bother with that, but I think that
this fact should not really matter as I'm talking about the approach and not
range checking. In addition, I think that even my optimization is not really
optimal in cases when there would be a lot of zeros in the table (maybe range
check would be better, and also the mask could be 64-bit long on 64-bit
platforms, I didn't do this for simplicity).
Strangely enough, if I restrict the input to `uint8_t` clang doesn't generate
jump-table anymore, it would generate the following code instead:
testNaive:
mov eax, edi
add al, -33
cmp al, 94
setb al
cmp dil, 91
setne cl
cmp dil, 93
setne dl
and dl, cl
and dl, al
mov eax, edi
or al, 1
cmp al, 41
setne al
cmp dil, 123
setne cl
and cl, al
and cl, dl
cmp dil, 125
setne al
mov edx, edi
or dl, 2
cmp dl, 62
setne dl
and dl, al
cmp dil, 47
setne al
cmp dil, 37
setne sil
and al, dl
and al, cl
and al, sil
ret
Which is kinda strange as the first two conditions basically restrict the input
into the same range. I personally think that even this code is not really good
and bit-array trick would be much better.
For comparison, this is a GCC approach:
testNaive(unsigned int):
lea edx, [rdi-33]
xor eax, eax
cmp dl, 93
ja .L7
mov edx, edi
and edx, -33
sub edx, 91
and edx, 253
je .L7
mov eax, 1
cmp dil, 62
ja .L7
movabs rax, 5764751696496427008
shrx rax, rax, rdi
not rax
and eax, 1
.L7:
ret
GCC seems to figure out the pattern and uses a bit-mask trick as well, I would
say it's a bit better than what I can do with templates, but the code is of
course longer as it does range checks and the bit-array doesn't fit into a
single uint64_t so the extra cases are emitted as code.
--
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20180412/26675e2d/attachment-0001.html>
More information about the llvm-bugs
mailing list