[compiler-rt] _udivdi3(), _umoddi3(), _moddi3() and _divdi3() routines not properly "tuned"

Wed Nov 8 01:29:17 PST 2017

Hi,

<http://llvm.org/> claims

| The compiler-rt project provides highly tuned implementations
| of the low-level code generator support routines like
| "__fixunsdfdi" and other calls generated when a target doesn't
| have a short sequence of native instructions to implement a
| core IR operation.

Bug #1: such "highly tuned" routines should be implemented
~~~~~~~ without superfluous instructions!

<https://llvm.org/svn/llvm-project/compiler-rt/tags/Apple/Libcompiler_rt-8/lib/i386/moddi3.S>

 9: /* High word of b is zero on this branch */

    movl  16(%esp), %eax // Find qhi and rhi such that
    movl  20(%esp), %ecx //
    xorl  %edx, %edx // ahi = qhi*b + rhi with 0 <= rhi < b
    divl  %ecx //
-   movl  %eax, %ebx //
    movl  12(%esp), %eax // Find rlo such that
    divl  %ecx //
    movl  %edx, %eax // rhi:alo = qlo*b + rlo  with 0 <= rlo < b
    popl  %ebx //
    xorl  %edx, %edx // and return 0:rlo

<https://llvm.org/svn/llvm-project/compiler-rt/tags/Apple/Libcompiler_rt-8/lib/i386/umoddi3.S>

 9: /* High word of b is zero on this branch */

    movl  12(%esp), %eax // Find qhi and rhi such that
    movl  16(%esp), %ecx //
    xorl  %edx, %edx // ahi = qhi*b + rhi with 0 <= rhi < b
    divl  %ecx   //
-   movl  %eax, %ebx //
    movl  8(%esp), %eax // Find rlo such that
    divl  %ecx   //
    movl  %edx, %eax // rhi:alo = qlo*b + rlo  with 0 <= rlo < b
    popl  %ebx   //
    xorl  %edx, %edx // and return 0:rlo
    retl       //

Fix #1: remove the marked superfluous instructions
~~~~~~~

Bug #2: such "highly tuned" routines should not reload arguments
~~~~~~~ from stack, but reuse register contents.

<https://llvm.org/svn/llvm-project/compiler-rt/tags/Apple/Libcompiler_rt-8/lib/i386/divdi3.S>

    pushl %esi
    movl  20(%esp), %edx // high word of b
    movl  16(%esp), %eax // low word of b
...
    pushl %ebx
    movl  24(%esp), %ebx // Find the index i of the leading bit in b.
    bsrl  %ebx, %ecx // If the high word of b is zero, jump to
    jz    9f // the code to handle that special case [9].
...
 9: /* High word of b is zero on this branch */

    movl  16(%esp), %eax // Find qhi and rhi such that
    movl  20(%esp), %ecx //

Fix #2: swap the code sequences for abs(a) and abs(b), then apply
~~~~~~~ the following diff.

    pushl %ebx
-   movl  24(%esp), %ebx // Find the index i of the leading bit in b.
+   movl  %edx, %ebx     // Find the index i of the leading bit in b.
    bsrl  %ebx, %ecx // If the high word of b is zero, jump to
    jz    9f // the code to handle that special case [9].
...
 9: /* High word of b is zero on this branch */

    movl  16(%esp), %eax // Find qhi and rhi such that
-   movl  20(%esp), %ecx //
+   movl  %edx, %ecx     //

JFTR: moddi3() has this bug too!

Bug #3: such "highly tuned" routines should come without large
~~~~~~~ duplicate code sequences.

In the 4 routines named in the subject, the code from label 1: to the
respective return is almost identical to the code preceding label 1:;
the only differences are the initial subtraction and the insertion of
a leading 1 into the quotient.

Fix #3: remove all lines between "jae 1f" (including the wrong
~~~~~~~ comment which follows "jae 1f") and the label 1:, then
        apply the following diff (yes, this adds one or two
        instructions to the overall execution path, but should
        typically cost no cycles, since they can execute in parallel).

+   pushl %edi
+   xorl  %edi, %edi // MSB of quotient
    cmpl  %ebx, %edx // to avoid overflowing the upcoming divide.
+   jb    0f
-   jae   1f

 1: /* High word of a is greater than or equal to (b >> (1 + i)) on this branch */

+   incl  %edi       // MSB of quotient
    subl  %ebx, %edx // subtract bhi from ahi so that divide will not
+
+0: /* High word of a is smaller than (b >> (1 + i)) on this branch */
+
    divl %ebx // overflow, and find q and r such that
    //
    // ahi:alo = (1:q)*bhi + r
    //
    // Note that q is a number in (31-i).(1+i)
    // fix point.
-   pushl %edi
    notl  %ecx
-   shrl  %eax
-   orl   $0x80000000, %eax
+   shrl  %edi
+   rcrl  %eax       // insert proper MSB into quotient

Bug #4: such "highly tuned" routines should use the fastest and/or
~~~~~~~ shortest code.

    shrl  %cl, %eax // Practically, this means that bhi is given by:
    shrl  %eax //
    notl  %ecx // bhi = (high word of b) << (31 - i) |
    shll  %cl, %ebx //   (low word of b) >> (1 + i)
    orl   %eax, %ebx //
...
    notl  %ecx

Fix #4: replace the above 6 instructions with the following 3
~~~~~~~ instructions.

    notl  %ecx
    shldl %cl, %eax, %ebx
    notl  %ecx

regards
Stefan Kanthak