[cfe-dev] "Optimized implementations"?

Sun Sep 6 06:25:17 PDT 2020

<https://compiler-rt.llvm.org/index.html> boasts:

| The builtins library provides optimized implementations of this
| and other low-level routines, either in target-independent C form,
| or as a heavily-optimized assembly.

Really?

Left: inefficient code shipped in     # Right: slightly improved code,
      clang_rt.builtins-*             #        which the optimiser REALLY
                                      #        should have generated

# ___cmpdi2 -- signed three-way compare of two 64-bit stack arguments.
#   first operand:  [esp+8]:[esp+4]     (high:low dword)
#   second operand: [esp+16]:[esp+12]   (high:low dword)
#   returns eax = 0 if first < second, 1 if equal, 2 if first > second
#   Both columns decide on the high dwords with a signed compare; only on
#   a tie are the low dwords compared unsigned, and the carry flag of that
#   compare is folded into the result with adc.
#   Clobbers: left column ecx, edx, flags; right column ecx, flags.
#   NOTE(review): presumably int __cmpdi2(long long a, long long b) with
#   arguments passed on the stack (cdecl) -- confirm against the caller.
___cmpdi2:
        mov     ecx, [esp+16]         #       mov     ecx, [esp+16]
        xor     eax, eax              #       xor     eax, eax
        cmp     [esp+8], ecx          #       cmp     ecx, [esp+8]
        jl      @f                    #       jg      @f
        mov     eax, 2                #       mov     eax, 2
        jg      @f                    #       jl      @f
        mov     ecx, [esp+4]          #
        mov     edx, [esp+12]         #       mov     ecx, [esp+12]
        mov     eax, 0                #       xor     eax, eax
        cmp     ecx, edx              #       cmp     ecx, [esp+4]
        jb      @f                    #       ja      @f
        cmp     edx, ecx              #
        mov     eax, 1                #
        adc     eax, 0                #       adc     eax, 1
@@:                                   # @@:
        ret                           #       ret

                                      # 3 fewer instructions, 10 bytes saved

# ___ucmpdi2 -- unsigned three-way compare of two 64-bit stack arguments.
#   first operand:  [esp+8]:[esp+4]     (high:low dword)
#   second operand: [esp+16]:[esp+12]   (high:low dword)
#   returns eax = 0 if first < second, 1 if equal, 2 if first > second
#   Both columns decide on the high dwords with an unsigned compare
#   (jb/ja); only on a tie are the low dwords compared, also unsigned,
#   and the carry flag of that compare is folded into the result with adc.
#   Clobbers: left column ecx, edx, flags; right column ecx, flags.
#   NOTE(review): presumably int __ucmpdi2(unsigned long long a,
#   unsigned long long b) with arguments passed on the stack (cdecl) --
#   confirm against the caller.
___ucmpdi2:
        mov     ecx, [esp+16]         #       mov     ecx, [esp+16]
        xor     eax, eax              #       xor     eax, eax
        cmp     [esp+8], ecx          #       cmp     ecx, [esp+8]
        jb      @f                    #       ja      @f
        mov     eax, 2                #       mov     eax, 2
        ja      @f                    #       jb      @f
        mov     ecx, [esp+4]          #
        mov     edx, [esp+12]         #       mov     ecx, [esp+12]
        mov     eax, 0                #       xor     eax, eax
        cmp     ecx, edx              #       cmp     ecx, [esp+4]
        jb      @f                    #       ja      @f
        cmp     edx, ecx              #
        mov     eax, 1                #
        adc     eax, 0                #       adc     eax, 1
@@:                                   # @@:
        ret                           #       ret

                                      # 3 fewer instructions, 10 bytes saved

Now properly written code, of course branch-free, faster and shorter:

# Copyright (C) 2004-2020, Stefan Kanthak <stefan.kanthak at nexgo.de>

# ___cmpdi2 -- branch-free signed three-way compare of two 64-bit values.
#   first operand:  [esp+8]:[esp+4]     (high:low dword)
#   second operand: [esp+16]:[esp+12]   (high:low dword)
#   returns eax = 0 if first < second, 1 if equal, 2 if first > second
#   Method: perform both 64-bit subtractions (first - second and
#   second - first) as cmp/sbb pairs, capture the signed "less" condition
#   of each, and return (second < first) - (first < second) + 1.
#   Clobbers: ecx, edx, flags.
#   NOTE(review): presumably int __cmpdi2(long long a, long long b) with
#   arguments passed on the stack (cdecl) -- confirm against the caller.
___cmpdi2:
        mov     ecx, [esp+4]          # ecx = low dword of first operand
        mov     edx, [esp+12]         # edx = low dword of second operand
        cmp     ecx, edx              # CF = borrow of low(first) - low(second)
        mov     eax, [esp+8]          # mov leaves the flags untouched
        sbb     eax, [esp+16]         # high(first) - high(second) - borrow: SF/OF as for the full 64-bit subtraction
        setl    ah                    # ah = 1 if first < second (signed), else 0
        cmp     edx, ecx              # CF = borrow of low(second) - low(first)
        mov     edx, [esp+16]         # edx = high dword of second operand
        sbb     edx, [esp+8]          # high(second) - high(first) - borrow
        setl    al                    # al = 1 if second < first (signed), else 0
        sub     al, ah                # al = -1, 0 or +1
        movsx   eax, al               # sign-extend; also overwrites the stale ah and upper bits of eax
        inc     eax                   # map -1/0/+1 to 0/1/2
        ret

# ___ucmpdi2 -- branch-free unsigned three-way compare of two 64-bit values.
#   first operand:  [esp+8]:[esp+4]     (high:low dword)
#   second operand: [esp+16]:[esp+12]   (high:low dword)
#   returns eax = 0 if first < second, 1 if equal, 2 if first > second
#   Method: the final borrow (CF) of each 64-bit subtraction is the
#   unsigned "less" condition; the result is
#   1 - (first < second) + (second < first).
#   Clobbers: ecx, edx, flags.
#   NOTE(review): presumably int __ucmpdi2(unsigned long long a,
#   unsigned long long b) with arguments passed on the stack (cdecl) --
#   confirm against the caller.
___ucmpdi2:
        mov     ecx, [esp+4]          # ecx = low dword of first operand
        mov     edx, [esp+12]         # edx = low dword of second operand
        cmp     ecx, edx              # CF = borrow of low(first) - low(second)
        mov     eax, [esp+8]          # mov leaves the flags untouched
        sbb     eax, [esp+16]         # CF = 1 if first < second (unsigned 64-bit)
        sbb     eax, eax              # eax = -1 if first < second, else 0
        cmp     edx, ecx              # CF = borrow of low(second) - low(first)
        mov     edx, [esp+16]         # edx = high dword of second operand
        sbb     edx, [esp+8]          # CF = 1 if second < first (unsigned 64-bit)
        adc     eax, 1                # eax = eax + 1 + CF, giving 0, 1 or 2
        ret

AGAIN:
Remove every occurrence of the word "optimized" on the above web page.

'nuff said
Stefan