<div dir="ltr"><div>IIUC, you want to use x86-specific bit-hacks (sbb masking) in cases like this:</div>unsigned int foo(unsigned int crc) {<br>    if (crc & 0x80000000)<br>      crc <<= 1, crc ^= 0xEDB88320;<br>    else<br>      crc <<= 1;<br>    return crc;<br><div>}</div><div><br></div><div>Which is this in LLVM IR:</div><div><div style="color:rgb(0,0,0);background-color:rgb(255,255,254)">define i32 @foo(i32 %x) {<br>  %t2 = icmp slt i32 %x, 0 <br>  %t3 = shl i32 %x, 1<br>  %t4 = xor i32 %t3, -306674912<br>  %t5 = select i1 %t2, i32 %t4, i32 %t3<br>  ret i32 %t5<br>}<br></div></div><div><br></div><div>Please a file a bug report for the x86 backend (including performance numbers if you have that data).<br></div></div><br><div class="gmail_quote"><div dir="ltr">On Wed, Nov 7, 2018 at 5:24 PM Stefan Kanthak via llvm-dev <<a href="mailto:llvm-dev@lists.llvm.org">llvm-dev@lists.llvm.org</a>> wrote:<br></div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">Hi @ll,<br>


<br>


while clang/LLVM recognizes common bit-twiddling idioms/expressions<br>


like<br>


<br>


unsigned int rotate(unsigned int x, unsigned int n)<br>


{<br>


    return (x << n) | (x >> (32 - n));<br>


}<br>


<br>


and typically generates "rotate" machine instructions for this<br>


expression, it fails to recognize other also common bit-twiddling<br>


idioms/expressions.<br>


<br>


The standard IEEE CRC-32 for "big endian" alias "network" byte order<br>


(see <<a href="https://tools.ietf.org/html/rfc1952#section-8" rel="noreferrer" target="_blank">https://tools.ietf.org/html/rfc1952#section-8</a>> for example):<br>


<br>


unsigned int crc32be(unsigned char const *octets, unsigned int count)<br>


{<br>


    unsigned int crc = 0L;<br>


    unsigned int i;<br>


<br>


    while (count--) {<br>


        crc ^= *octets++ << 24;<br>


        for (i = 8; i > 0; i--)<br>


            if (crc & 0x80000000L)             // the code generated<br>


                crc <<= 1, crc ^= 0xEDB88320L; // for Intel x86 from<br>


            else                               // these 4 lines is<br>


                crc <<= 1;                     // rather poor!<br>


    }<br>


    return crc;<br>


}<br>


<br>


The same function for "little endian" byte order, using the "inverse"<br>


or "mirrored" polynom:<br>


<br>


unsigned int crc32le(unsigned char const *octets, unsigned int count)<br>


{<br>


    unsigned int crc = ~0L;<br>


    unsigned int i;<br>


<br>


    while (count--) {<br>


        crc ^= *octets++;<br>


        for (i = 8; i > 0; i--)<br>


            if (crc & 1L)                      // the code generated<br>


                crc >>= 1, crc ^= 0x04C11DB7L; // for Intel x86 from<br>


            else                               // these 4 lines is<br>


                crc >>= 1;                     // rather poor!<br>


    }<br>


    return ~crc;<br>


}<br>


<br>


See <<a href="https://godbolt.org/z/eYJeWt" rel="noreferrer" target="_blank">https://godbolt.org/z/eYJeWt</a>> (-O1) and <<a href="https://godbolt.org/z/zeExHm" rel="noreferrer" target="_blank">https://godbolt.org/z/zeExHm</a>> (-O2)<br>


<br>


crc32be: # @crc32be<br>


        xor    eax, eax<br>


        test   esi, esi<br>


        jne    .LBB0_2<br>


        jmp    .LBB0_5<br>


.LBB0_4: # in Loop: Header=BB0_2 Depth=1<br>


        add    rdi, 1<br>


        test   esi, esi<br>


        je    .LBB0_5<br>


.LBB0_2: # =>This Loop Header: Depth=1<br>


        add    esi, -1<br>


        movzx  edx, byte ptr [rdi]<br>


        shl    edx, 24<br>


        xor    edx, eax<br>


        mov    ecx, -8<br>


        mov    eax, edx<br>


.LBB0_3: # Parent Loop BB0_2 Depth=1   | # 4 instructions instead of 6, r8 not clobbered!<br>


        lea    r8d, [rax + rax]        |     add   eax, eax<br>


        mov    edx, r8d                | # CF is set from the MSB of EAX<br>


        xor    edx, -306674912         |     sbb   edx, edx<br>


        test   eax, eax                | # EDX is 0xFFFFFFFF if CF set, else 0<br>


        mov    eax, edx                |     and   edx, -306674912<br>


        cmovns eax, r8d                |     xor   eax, edx<br>


        add    ecx, 1<br>


        jne    .LBB0_3<br>


        jmp    .LBB0_4<br>


.LBB0_5:<br>


        ret<br>


crc32le: # @crc32le<br>


        test   esi, esi<br>


        je    .LBB1_1<br>


        mov    eax, -1<br>


.LBB1_4: # =>This Loop Header: Depth=1<br>


        add    esi, -1<br>


        movzx  ecx, byte ptr [rdi]<br>


        xor    eax, ecx<br>


        mov    r8d, -8<br>


.LBB1_5: # Parent Loop BB1_4 Depth=1   | # 4 instructions instead of 7, and<br>


        mov    edx, eax                | #  neither r8 nor rcx clobbered!<br>


        shr    edx                     |     shr   eax, 1<br>


        mov    ecx, edx                | # CF is set from the LSB of EAX<br>


        xor    ecx, 79764919           |     sbb   edx, edx<br>


        test   al, 1                   | # EDX is 0xFFFFFFFF if CF set, else 0<br>


        mov    eax, ecx                |     and   edx, 79764919<br>


        cmove  eax, edx                |     xor   eax, edx<br>


        add    r8d, 1<br>


        jne    .LBB1_5<br>


        add    rdi, 1<br>


        test   esi, esi<br>


        jne    .LBB1_4<br>


        not    eax<br>


        ret<br>


.LBB1_1:<br>


        xor    eax, eax<br>


        ret<br>


<br>


JFTR: with -O2, the inner loop gets unrolled, using the same non-optimal<br>


      code sequence with 6 and 7 instructions; this accounts for a total<br>


      of 16 and 24 superfluous instructions respectively.<br>


_______________________________________________<br>


LLVM Developers mailing list<br>


<a href="mailto:llvm-dev@lists.llvm.org" target="_blank">llvm-dev@lists.llvm.org</a><br>


<a href="http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev" rel="noreferrer" target="_blank">http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-dev</a><br>


</blockquote></div>