[llvm-bugs] [Bug 37112] New: Missed optimization: many "switch-like" IFs could use bit-array instead of jump table

via llvm-bugs llvm-bugs at lists.llvm.org
Thu Apr 12 10:53:39 PDT 2018


https://bugs.llvm.org/show_bug.cgi?id=37112

            Bug ID: 37112
           Summary: Missed optimization: many "switch-like" IFs could use
                    bit-array instead of jump table
           Product: clang
           Version: trunk
          Hardware: PC
                OS: Linux
            Status: NEW
          Severity: enhancement
          Priority: P
         Component: -New Bugs
          Assignee: unassignedclangbugs at nondot.org
          Reporter: kobalicek.petr at gmail.com
                CC: llvm-bugs at lists.llvm.org

Problem
-------

I'm comparing how various compilers optimize the following code:

bool testChar(unsigned x) noexcept {
  return x == A ? true :
         x == B ? true :
         x == C ? true :
         x == D ? true :
         ... (continues) ...   
         x == Z ? true : false
}

And I'm wondering why clang is using a jump table for such case, which turns
out to be suboptimal when the range for `true` or `false` cases is very
restricted, let's say to 0-255 (byte) or even smaller range.


Testing Code
------------

I wrote the following code to test a naive implementation and an implementation
that uses bit-array trick:

#include <stdint.h>
#include <stdlib.h>
#include <type_traits>

// BitArray builders, no idea how to do this better, this works...
template<typename Predicate, uint32_t Index>
struct BinaryLUT_ByteMask { 
  enum : uint32_t {
    kValue = (uint32_t(Predicate::test(Index + 0)) << 0)
           | (uint32_t(Predicate::test(Index + 1)) << 1)
           | (uint32_t(Predicate::test(Index + 2)) << 2)
           | (uint32_t(Predicate::test(Index + 3)) << 3)
           | (uint32_t(Predicate::test(Index + 4)) << 4)
           | (uint32_t(Predicate::test(Index + 5)) << 5)
           | (uint32_t(Predicate::test(Index + 6)) << 6)
           | (uint32_t(Predicate::test(Index + 7)) << 7)
  };
};

template<typename Predicate, uint32_t Index>
struct BinaryLUT_UInt32Mask {
  enum : uint32_t {
    kValue = ((BinaryLUT_ByteMask<Predicate, Index +  0>::kValue) <<  0)
           | ((BinaryLUT_ByteMask<Predicate, Index +  8>::kValue) <<  8)
           | ((BinaryLUT_ByteMask<Predicate, Index + 16>::kValue) << 16)
           | ((BinaryLUT_ByteMask<Predicate, Index + 24>::kValue) << 24)
  };
};

template<typename Predicate>
struct BinaryLUT {
  // Just to prevent having multiple tables emitted in
  // case a different type for `Index` was used, not
  // sure this is really required...
  static inline bool testInternal(size_t index) noexcept {
    static const uint32_t lut[256 / 32] = {
      BinaryLUT_UInt32Mask<Predicate, 32 * 0>::kValue,
      BinaryLUT_UInt32Mask<Predicate, 32 * 1>::kValue,
      BinaryLUT_UInt32Mask<Predicate, 32 * 2>::kValue,
      BinaryLUT_UInt32Mask<Predicate, 32 * 3>::kValue,
      BinaryLUT_UInt32Mask<Predicate, 32 * 4>::kValue,
      BinaryLUT_UInt32Mask<Predicate, 32 * 5>::kValue,
      BinaryLUT_UInt32Mask<Predicate, 32 * 6>::kValue,
      BinaryLUT_UInt32Mask<Predicate, 32 * 7>::kValue
    };
    return bool((lut[index / 32U] >> (index % 32U)) & 0x1);
  }

  template<typename T>
  static inline bool test(const T& value) noexcept {
    typedef typename std::make_unsigned<T>::type U;    
    return testInternal(size_t(U(value)));
  }
};

struct AllowedPostScriptChars {
  static constexpr bool test(uint32_t i) noexcept {
    return i <  33  ? false :
           i >  126 ? false :
           i == '[' ? false :
           i == ']' ? false :
           i == '(' ? false :
           i == ')' ? false :
           i == '{' ? false :
           i == '}' ? false :
           i == '<' ? false :
           i == '>' ? false :
           i == '/' ? false :
           i == '%' ? false : true;
  }
};

bool testNaive(uint32_t uc) {
  return AllowedPostScriptChars::test(uc);
}

bool testLUT(uint32_t uc) {
  return BinaryLUT<AllowedPostScriptChars>::test(uc);
}


testNaive:
  lea eax, [rdi - 33]
  cmp eax, 93
  jbe .LBB1_2
  xor eax, eax
  ret
.LBB1_2:
  add edi, -37
  cmp edi, 88
  ja .LBB1_4
  xor eax, eax
  jmp qword ptr [8*rdi + .LJTI1_0]
.LBB1_5:
  ret
.LBB1_4:
  mov al, 1
  ret
.LJTI1_0:
  !!! (... skipped almost 90 entries in a jump-table ...) !!!


testLUT:
  mov eax, edi
  shr eax, 5
  mov eax, dword ptr [4*rax +
BinaryLUT<AllowedPostScriptChars>::testInternal(unsigned long)::lut]
  bt eax, edi
  setb al
  ret
BinaryLUT<AllowedPostScriptChars>::testInternal(unsigned long)::lut: # The
actual LUT generated.
  .long 0 # 0x0
  .long 2952756446 # 0xafff7cde
  .long 3623878655 # 0xd7ffffff
  .long 1476395007 # 0x57ffffff
  .long 0 # 0x0
  .long 0 # 0x0
  .long 0 # 0x0
  .long 0 # 0x0


Notes
-----

I would like to mention that I know that the two functions are not completely
identical. The `testLUT` function does not do range checks as I have a
restricted input from 0..255 so I didn't bother with that, but I think that
this fact should not really matter as I'm talking about the approach and not
range checking. In addition, I think that even my optimization is not really
optimal in cases when there would be a lot of zeros in the table (maybe range
check would be better, and also the mask could be 64-bit long on 64-bit
platforms, I didn't do this for simplicity).

Strangely enough, if I restrict the input to `uint8_t` clang doesn't generate
jump-table anymore, it would generate the following code instead:

testNaive:
  mov eax, edi
  add al, -33
  cmp al, 94
  setb al
  cmp dil, 91
  setne cl
  cmp dil, 93
  setne dl
  and dl, cl
  and dl, al
  mov eax, edi
  or al, 1
  cmp al, 41
  setne al
  cmp dil, 123
  setne cl
  and cl, al
  and cl, dl
  cmp dil, 125
  setne al
  mov edx, edi
  or dl, 2
  cmp dl, 62
  setne dl
  and dl, al
  cmp dil, 47
  setne al
  cmp dil, 37
  setne sil
  and al, dl
  and al, cl
  and al, sil
  ret

Which is kinda strange as the first two conditions basically restrict the input
into the same range. I personally think that even this code is not really good
and bit-array trick would be much better.

For comparison, this is a GCC approach:

testNaive(unsigned int):
        lea     edx, [rdi-33]
        xor     eax, eax
        cmp     dl, 93
        ja      .L7
        mov     edx, edi
        and     edx, -33
        sub     edx, 91
        and     edx, 253
        je      .L7
        mov     eax, 1
        cmp     dil, 62
        ja      .L7
        movabs  rax, 5764751696496427008
        shrx    rax, rax, rdi
        not     rax
        and     eax, 1
.L7:
        ret

GCC seems to figure out the pattern and uses a bit-mask trick as well, I would
say it's a bit better than what I can do with templates, but the code is of
course longer as it does range checks and the bit-array doesn't fit into a
single uint64_t so the extra cases are emitted as code.

-- 
You are receiving this mail because:
You are on the CC list for the bug.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-bugs/attachments/20180412/26675e2d/attachment-0001.html>


More information about the llvm-bugs mailing list