[cfe-dev] "Optimized implementations"?
    Stefan Kanthak via cfe-dev 
    cfe-dev at lists.llvm.org
       
    Sun Sep  6 06:25:17 PDT 2020
    
    
  
<https://compiler-rt.llvm.org/index.html> boasts:
| The builtins library provides optimized implementations of this
| and other low-level routines, either in target-independent C form,
| or as a heavily-optimized assembly.
Really?
Left: inefficient code shipped in     # Right: slightly improved code,
      clang_rt.builtins-*             #        which the optimiser REALLY
                                      #        should have generated
# Three-way compare of the signed 64-bit value held in [esp+8]:[esp+4]
# (called "a" below) against the one in [esp+16]:[esp+12] ("b").
# Result in eax: 0 if a < b, 1 if a == b, 2 if a > b.
# NOTE(review): a and b are presumably the two stack-passed arguments of
# __cmpdi2 under the 32-bit C calling convention -- confirm against the ABI.
# Both columns compare the high dwords signed first; mov does not modify
# the flags, so the second conditional jump still tests that first cmp.
# The right column swaps the cmp operands, hence the mirrored conditions.
___cmpdi2:
        mov     ecx, [esp+16]         #       mov     ecx, [esp+16]
        xor     eax, eax              #       xor     eax, eax
        cmp     [esp+8], ecx          #       cmp     ecx, [esp+8]
        jl      @f                    #       jg      @f
        mov     eax, 2                #       mov     eax, 2
        jg      @f                    #       jl      @f
# High dwords are equal here: the low dwords decide, compared unsigned.
        mov     ecx, [esp+4]          #
        mov     edx, [esp+12]         #       mov     ecx, [esp+12]
        mov     eax, 0                #       xor     eax, eax
        cmp     ecx, edx              #       cmp     ecx, [esp+4]
        jb      @f                    #       ja      @f
# Low dword of a is not below that of b: eax = 1 + CF, where CF is set
# only if the low dword of b is below the low dword of a.
        cmp     edx, ecx              #
        mov     eax, 1                #
        adc     eax, 0                #       adc     eax, 1
@@:                                   # @@:
        ret                           #       ret
                                      # 3 instructions fewer, 10 bytes saved
# Three-way compare of the unsigned 64-bit value held in [esp+8]:[esp+4]
# (called "a" below) against the one in [esp+16]:[esp+12] ("b").
# Result in eax: 0 if a < b, 1 if a == b, 2 if a > b.
# NOTE(review): a and b are presumably the two stack-passed arguments of
# __ucmpdi2 under the 32-bit C calling convention -- confirm against the ABI.
# Both columns compare the high dwords unsigned first; mov does not modify
# the flags, so the second conditional jump still tests that first cmp.
# The right column swaps the cmp operands, hence the mirrored conditions.
___ucmpdi2:
        mov     ecx, [esp+16]         #       mov     ecx, [esp+16]
        xor     eax, eax              #       xor     eax, eax
        cmp     [esp+8], ecx          #       cmp     ecx, [esp+8]
        jb      @f                    #       ja      @f
        mov     eax, 2                #       mov     eax, 2
        ja      @f                    #       jb      @f
# High dwords are equal here: the low dwords decide, compared unsigned.
        mov     ecx, [esp+4]          #
        mov     edx, [esp+12]         #       mov     ecx, [esp+12]
        mov     eax, 0                #       xor     eax, eax
        cmp     ecx, edx              #       cmp     ecx, [esp+4]
        jb      @f                    #       ja      @f
# Low dword of a is not below that of b: eax = 1 + CF, where CF is set
# only if the low dword of b is below the low dword of a.
        cmp     edx, ecx              #
        mov     eax, 1                #
        adc     eax, 0                #       adc     eax, 1
@@:                                   # @@:
        ret                           #       ret
                                      # 3 instructions fewer, 10 bytes saved
Now properly written code, of course branch-free, faster and shorter:
# Copyright (C) 2004-2020, Stefan Kanthak <stefan.kanthak at nexgo.de>
# Branch-free three-way compare of the signed 64-bit value in
# [esp+8]:[esp+4] ("a") against the one in [esp+16]:[esp+12] ("b").
# Result in eax: 0 if a < b, 1 if a == b, 2 if a > b.
# Writes eax, ecx, edx and the flags; reads only its stack operands.
# NOTE(review): a and b are presumably the two stack-passed arguments under
# the 32-bit C calling convention -- confirm against the ABI.
# Instruction order matters: each sbb consumes the carry of the preceding
# cmp, and the mov in between is safe only because mov leaves flags alone.
___cmpdi2:
        mov     ecx, [esp+4]          # ecx = low dword of a
        mov     edx, [esp+12]         # edx = low dword of b
        cmp     ecx, edx              # CF = borrow out of low(a) - low(b)
        mov     eax, [esp+8]          # eax = high dword of a (flags kept)
        sbb     eax, [esp+16]         # high(a) - high(b) - CF: flags of a - b
        setl    ah                    # ah = 1 if a < b (signed), else 0
        cmp     edx, ecx              # same subtraction, operands swapped
        mov     edx, [esp+16]         # edx = high dword of b (flags kept)
        sbb     edx, [esp+8]          # high(b) - high(a) - CF: flags of b - a
        setl    al                    # al = 1 if b < a (signed), else 0
        sub     al, ah                # al = (a > b) - (a < b): -1, 0 or +1
        movsx   eax, al               # sign-extend the byte to all of eax
        inc     eax                   # shift -1/0/+1 to 0/1/2
        ret
# Branch-free three-way compare of the unsigned 64-bit value in
# [esp+8]:[esp+4] ("a") against the one in [esp+16]:[esp+12] ("b").
# Result in eax: 0 if a < b, 1 if a == b, 2 if a > b.
# Writes eax, ecx, edx and the flags; reads only its stack operands.
# NOTE(review): a and b are presumably the two stack-passed arguments under
# the 32-bit C calling convention -- confirm against the ABI.
# Instruction order matters: each sbb/adc consumes the carry produced just
# before it, and the mov in between is safe only because mov leaves flags
# alone.
___ucmpdi2:
        mov     ecx, [esp+4]          # ecx = low dword of a
        mov     edx, [esp+12]         # edx = low dword of b
        cmp     ecx, edx              # CF = borrow out of low(a) - low(b)
        mov     eax, [esp+8]          # eax = high dword of a (flags kept)
        sbb     eax, [esp+16]         # CF = 1 if a < b (unsigned 64-bit)
        sbb     eax, eax              # eax = -CF: -1 if a < b, else 0
        cmp     edx, ecx              # same subtraction, operands swapped
        mov     edx, [esp+16]         # edx = high dword of b (flags kept)
        sbb     edx, [esp+8]          # CF = 1 if b < a (unsigned 64-bit)
        adc     eax, 1                # eax = 1 - (a < b) + (a > b)
        ret
AGAIN:
Remove every occurrence of the word "optimized" on the above web page.
'nuff said
Stefan
    
    
More information about the cfe-dev
mailing list