[cfe-dev] "Optimized implementations"?

Sun Sep 6 14:09:29 PDT 2020

__builtin_parity uses setnp on older x86 and popcnt when SSE4.2 is available.

On Sun, Sep 6, 2020 at 1:32 PM Stefan Kanthak <stefan.kanthak at nexgo.de>
wrote:

> "Craig Topper" <craig.topper at gmail.com> wrote:
>
>
>
> > Clang never generates calls to ___paritysi2, ___paritydi2, ___cmpdi2, or
>
> > ___ucmpdi2 on X86 so it's not clear the performance of this matters at all.
>
>
>
> So you can safely remove them for X86 and all the other targets where such
>
> unoptimized code is never called!
>
> But fix these routines for targets where they are called.
>
>
>
> The statement does NOT make any exceptions, and it does not say
>
> | ships unoptimized routines the compiler never calls
>
> but
>
> | optimized target-independent implementations
>
>
>
> Stefan
>
>
>
> BTW: do builtins like __builtin_*parity* exist?
>
>      If yes: do they generate the same bad code?
>
>
>
> > On Sun, Sep 6, 2020 at 12:31 PM Stefan Kanthak via cfe-dev <
>
> > cfe-dev at lists.llvm.org> wrote:
>
> >
>
> >> <https://compiler-rt.llvm.org/index.html> boasts:
>
> >>
>
> >> | The builtins library provides optimized implementations of this
>
> >> | and other low-level routines, either in target-independent C form,
>
> >> | or as a heavily-optimized assembly.
>
> >>
>
> >> Really?
>
> >>
>
> >> Left: inefficient code shipped in     # Right: slightly improved code,
>
> >>       clang_rt.builtins-*             #        which the optimiser REALLY
>
> >>                                       #        should have generated
>
> >>
>
> >> ___cmpdi2:
>
> >>         mov     ecx, [esp+16]         #       mov     ecx, [esp+16]
>
> >>         xor     eax, eax              #       xor     eax, eax
>
> >>         cmp     [esp+8], ecx          #       cmp     ecx, [esp+8]
>
> >>         jl      @f                    #       jg      @f
>
> >>         mov     eax, 2                #       mov     eax, 2
>
> >>         jg      @f                    #       jl      @f
>
> >>         mov     ecx, [esp+4]          #
>
> >>         mov     edx, [esp+12]         #       mov     ecx, [esp+12]
>
> >>         mov     eax, 0                #       xor     eax, eax
>
> >>         cmp     ecx, edx              #       cmp     ecx, [esp+4]
>
> >>         jb      @f                    #       ja      @f
>
> >>         cmp     edx, ecx              #
>
> >>         mov     eax, 1                #
>
> >>         adc     eax, 0                #       adc     eax, 1
>
> >> @@:                                   # @@:
>
> >>         ret                           #       ret
>
> >>
>
> >>                                       # 3 instructions fewer, 10 bytes saved
>
> >>
>
> >> ___ucmpdi2:
>
> >>         mov     ecx, [esp+16]         #       mov     ecx, [esp+16]
>
> >>         xor     eax, eax              #       xor     eax, eax
>
> >>         cmp     [esp+8], ecx          #       cmp     ecx, [esp+8]
>
> >>         jb      @f                    #       ja      @f
>
> >>         mov     eax, 2                #       mov     eax, 2
>
> >>         ja      @f                    #       jb      @f
>
> >>         mov     ecx, [esp+4]          #
>
> >>         mov     edx, [esp+12]         #       mov     ecx, [esp+12]
>
> >>         mov     eax, 0                #       xor     eax, eax
>
> >>         cmp     ecx, edx              #       cmp     ecx, [esp+4]
>
> >>         jb      @f                    #       ja      @f
>
> >>         cmp     edx, ecx              #
>
> >>         mov     eax, 1                #
>
> >>         adc     eax, 0                #       adc     eax, 1
>
> >> @@:                                   # @@:
>
> >>         ret                           #       ret
>
> >>
>
> >>                                       # 3 instructions fewer, 10 bytes saved
>
> >>
>
> >>
>
> >> Now properly written code, of course branch-free, faster and shorter:
>
> >>
>
> >> # Copyright (C) 2004-2020, Stefan Kanthak <stefan.kanthak at nexgo.de>
>
> >>
>
> >> ___cmpdi2:
>
> >>         mov     ecx, [esp+4]
>
> >>         mov     edx, [esp+12]
>
> >>         cmp     ecx, edx
>
> >>         mov     eax, [esp+8]
>
> >>         sbb     eax, [esp+16]
>
> >>         setl    ah
>
> >>         cmp     edx, ecx
>
> >>         mov     edx, [esp+16]
>
> >>         sbb     edx, [esp+8]
>
> >>         setl    al
>
> >>         sub     al, ah
>
> >>         movsx   eax, al
>
> >>         inc     eax
>
> >>         ret
>
> >>
>
> >> ___ucmpdi2:
>
> >>         mov     ecx, [esp+4]
>
> >>         mov     edx, [esp+12]
>
> >>         cmp     ecx, edx
>
> >>         mov     eax, [esp+8]
>
> >>         sbb     eax, [esp+16]
>
> >>         sbb     eax, eax
>
> >>         cmp     edx, ecx
>
> >>         mov     edx, [esp+16]
>
> >>         sbb     edx, [esp+8]
>
> >>         adc     eax, 1
>
> >>         ret
>
> >>
>
> >>
>
> >> AGAIN:
>
> >> Remove every occurrence of the word "optimized" on the above web page.
>
> >>
>
> >> 'nuff said
>
> >> Stefan
>
> >> _______________________________________________
>
> >> cfe-dev mailing list
>
> >> cfe-dev at lists.llvm.org
>
> >> https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-dev
>
> >>
>
>

-- 
~Craig
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/cfe-dev/attachments/20200906/6512e5cb/attachment-0001.html>