[compiler-rt] _udivdi3(), _umoddi3(), _moddi3() and _divdi3() routines not properly "tuned"
Stefan Kanthak via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 8 01:29:17 PST 2017
Hi,
<http://llvm.org/> claims
| The compiler-rt project provides highly tuned implementations
| of the low-level code generator support routines like
| "__fixunsdfdi" and other calls generated when a target doesn't
| have a short sequence of native instructions to implement a
| core IR operation.
Bug #1: such "highly tuned" routines should be implemented
~~~~~~~ without superfluous instructions!
<https://llvm.org/svn/llvm-project/compiler-rt/tags/Apple/Libcompiler_rt-8/lib/i386/moddi3.S>
9: /* High word of b is zero on this branch */
movl 16(%esp), %eax // Find qhi and rhi such that
movl 20(%esp), %ecx //
xorl %edx, %edx // ahi = qhi*b + rhi with 0 <= rhi < b
divl %ecx //
- movl %eax, %ebx //
movl 12(%esp), %eax // Find rlo such that
divl %ecx //
movl %edx, %eax // rhi:alo = qlo*b + rlo with 0 <= rlo < b
popl %ebx //
xorl %edx, %edx // and return 0:rlo
<https://llvm.org/svn/llvm-project/compiler-rt/tags/Apple/Libcompiler_rt-8/lib/i386/umoddi3.S>
9: /* High word of b is zero on this branch */
movl 12(%esp), %eax // Find qhi and rhi such that
movl 16(%esp), %ecx //
xorl %edx, %edx // ahi = qhi*b + rhi with 0 <= rhi < b
divl %ecx //
- movl %eax, %ebx //
movl 8(%esp), %eax // Find rlo such that
divl %ecx //
movl %edx, %eax // rhi:alo = qlo*b + rlo with 0 <= rlo < b
popl %ebx //
xorl %edx, %edx // and return 0:rlo
retl //
Fix #1: remove the marked superfluous instructions
~~~~~~~
Bug #2: such "highly tuned" routines should not reload arguments
~~~~~~~ from stack, but reuse register contents.
<https://llvm.org/svn/llvm-project/compiler-rt/tags/Apple/Libcompiler_rt-8/lib/i386/divdi3.S>
pushl %esi
movl 20(%esp), %edx // high word of b
movl 16(%esp), %eax // low word of b
...
pushl %ebx
movl 24(%esp), %ebx // Find the index i of the leading bit in b.
bsrl %ebx, %ecx // If the high word of b is zero, jump to
jz 9f // the code to handle that special case [9].
...
9: /* High word of b is zero on this branch */
movl 16(%esp), %eax // Find qhi and rhi such that
movl 20(%esp), %ecx //
Fix #2: swap the code sequences for abs(a) and abs(b), then apply
~~~~~~~ the following diff.
pushl %ebx
- movl 24(%esp), %ebx // Find the index i of the leading bit in b.
+ movl %edx, %ebx // Find the index i of the leading bit in b.
bsrl %ebx, %ecx // If the high word of b is zero, jump to
jz 9f // the code to handle that special case [9].
...
9: /* High word of b is zero on this branch */
movl 16(%esp), %eax // Find qhi and rhi such that
- movl 20(%esp), %ecx //
+ movl %edx, %ecx //
JFTR: moddi3() has this bug too!
Bug #3: such "highly tuned" routines should come without large
~~~~~~~ duplicate code sequences.
In the 4 routines named in the subject, the code from label 1: to the
respective return is almost identical to the code preceding label 1:;
the only differences are the initial subtraction and the insertion of
a leading 1 into the quotient.
Fix #3: remove all lines between "jae 1f" (including the wrong
~~~~~~~ comment which follows "jae 1f") and the label 1:, then
apply the following diff (yes, this adds one or two
instructions to the overall execution path, but should
typically cost no cycles, since they can execute in parallel).
+ pushl %edi
+ xorl %edi, %edi // MSB of quotient
cmpl %ebx, %edx // to avoid overflowing the upcoming divide.
+ jb 0f
- jae 1f
1: /* High word of a is greater than or equal to (b >> (1 + i)) on this branch */
+ incl %edi // MSB of quotient
subl %ebx, %edx // subtract bhi from ahi so that divide will not
+
+0: /* High word of a is smaller than (b >> (1 + i)) on this branch */
+
divl %ebx // overflow, and find q and r such that
//
// ahi:alo = (1:q)*bhi + r
//
// Note that q is a number in (31-i).(1+i)
// fix point.
- pushl %edi
notl %ecx
- shrl %eax
- orl $0x80000000, %eax
+ shrl %edi
+ rcrl %eax // insert proper MSB into quotient
Bug #4: such "highly tuned" routines should use the fastest and/or
~~~~~~~ shortest code.
shrl %cl, %eax // Practically, this means that bhi is given by:
shrl %eax //
notl %ecx // bhi = (high word of b) << (31 - i) |
shll %cl, %ebx // (low word of b) >> (1 + i)
orl %eax, %ebx //
...
notl %ecx
Fix #4: replace the above 6 instructions with the following 3
~~~~~~~ instructions.
notl %ecx
shldl %cl, %eax, %ebx
notl %ecx
regards
Stefan Kanthak
More information about the llvm-commits
mailing list