[compiler-rt] r200001 - Provide support for ARMv4, lacking bx and clz. Unroll the
Bob Wilson
bob.wilson at apple.com
Fri Jan 24 08:22:49 PST 2014
This has broken all of our buildbots because we also build this code for armv7m, which is Thumb and where the ".arm" directive is not supported. Can you fix it right away and/or revert until it gets sorted out?
On Jan 24, 2014, at 5:43 AM, Joerg Sonnenberger <joerg at bec.de> wrote:
> Author: joerg
> Date: Fri Jan 24 07:43:35 2014
> New Revision: 200001
>
> URL: http://llvm.org/viewvc/llvm-project?rev=200001&view=rev
> Log:
> Provide support for ARMv4, lacking bx and clz. Unroll the
> test-and-subtract loop and compute the initial block as address,
> shaving off between 5% and 10% on Cortex A9 and 30%+ on a Raspberry Pi.
> Code written by Matt Thomas and Joerg Sonnenberger.
>
> Differential Revision: http://llvm-reviews.chandlerc.com/D2595
>
> Modified:
> compiler-rt/trunk/lib/arm/udivmodsi4.S
> compiler-rt/trunk/lib/arm/udivsi3.S
> compiler-rt/trunk/lib/arm/umodsi3.S
>
> Modified: compiler-rt/trunk/lib/arm/udivmodsi4.S
> URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/arm/udivmodsi4.S?rev=200001&r1=200000&r2=200001&view=diff
> ==============================================================================
> --- compiler-rt/trunk/lib/arm/udivmodsi4.S (original)
> +++ compiler-rt/trunk/lib/arm/udivmodsi4.S Fri Jan 24 07:43:35 2014
> @@ -8,89 +8,161 @@
> *===----------------------------------------------------------------------===//
> *
> * This file implements the __udivmodsi4 (32-bit unsigned integer divide and
> - * modulus) function for the ARM architecture. A naive digit-by-digit
> - * computation is employed for simplicity.
> + * modulus) function for the ARM 32-bit architecture.
> *
> *===----------------------------------------------------------------------===*/
>
> #include "../assembly.h"
>
> -#define ESTABLISH_FRAME \
> - push {r4, r7, lr} ;\
> - add r7, sp, #4
> -#define CLEAR_FRAME_AND_RETURN \
> - pop {r4, r7, pc}
> -
> -#define a r0
> -#define b r1
> -#define i r3
> -#define r r4
> -#define q ip
> -#define one lr
> + .syntax unified
>
> -.syntax unified
> -.align 3
> +#ifdef ARM_HAS_BX
> +#define JMP(r) bx r
> +#else
> +#define JMP(r) mov pc, r
> +#endif
> +
> + .text
> + .arm
> + .p2align 2
> DEFINE_COMPILERRT_FUNCTION(__udivmodsi4)
> #if __ARM_ARCH_EXT_IDIV__
> tst r1, r1
> - beq LOCAL_LABEL(divzero)
> + beq LOCAL_LABEL(divby0)
> mov r3, r0
> udiv r0, r3, r1
> mls r1, r0, r1, r3
> str r1, [r2]
> bx lr
> -LOCAL_LABEL(divzero):
> - mov r0, #0
> - bx lr
> #else
> -// We use a simple digit by digit algorithm; before we get into the actual
> -// divide loop, we must calculate the left-shift amount necessary to align
> -// the MSB of the divisor with that of the dividend (If this shift is
> -// negative, then the result is zero, and we early out). We also conjure a
> -// bit mask of 1 to use in constructing the quotient, and initialize the
> -// quotient to zero.
> - ESTABLISH_FRAME
> - clz r4, a
> - tst b, b // detect divide-by-zero
> - clz r3, b
> - mov q, #0
> - beq LOCAL_LABEL(return) // return 0 if b is zero.
> - mov one, #1
> - subs i, r3, r4
> - blt LOCAL_LABEL(return) // return 0 if MSB(a) < MSB(b)
> -
> -LOCAL_LABEL(mainLoop):
> -// This loop basically implements the following:
> -//
> -// do {
> -// if (a >= b << i) {
> -// a -= b << i;
> -// q |= 1 << i;
> -// if (a == 0) break;
> -// }
> -// } while (--i)
> -//
> -// Note that this does not perform the final iteration (i == 0); by doing it
> -// this way, we can merge the two branches which is a substantial win for
> -// such a tight loop on current ARM architectures.
> - subs r, a, b, lsl i
> - itt hs
> - orrhs q, q,one, lsl i
> - movhs a, r
> - it ne
> - subsne i, i, #1
> - bhi LOCAL_LABEL(mainLoop)
> -
> -// Do the final test subtraction and update of quotient (i == 0), as it is
> -// not performed in the main loop.
> - subs r, a, b
> - itt hs
> - orrhs q, #1
> - movhs a, r
> -
> -LOCAL_LABEL(return):
> -// Store the remainder, and move the quotient to r0, then return.
> - str a, [r2]
> - mov r0, q
> - CLEAR_FRAME_AND_RETURN
> + cmp r1, #1
> + bcc LOCAL_LABEL(divby0)
> + beq LOCAL_LABEL(divby1)
> + cmp r0, r1
> + bcc LOCAL_LABEL(quotient0)
> + /*
> + * Implement division using binary long division algorithm.
> + *
> + * r0 is the numerator, r1 the denominator.
> + *
> + * The code before JMP computes the correct shift I, so that
> + * r0 and (r1 << I) have the highest bit set in the same position.
> + * At the time of JMP, ip := .Ldiv0block - 12 * I.
> + * This depends on the fixed instruction size of block.
> + *
> + * block(shift) implements the test-and-update-quotient core.
> + * It assumes (r0 << shift) can be computed without overflow and
> + * that (r0 << shift) < 2 * r1. The quotient is stored in r3.
> + */
> +
> +# ifdef __ARM_FEATURE_CLZ
> + clz ip, r0
> + clz r3, r1
> + /* r0 >= r1 implies clz(r0) <= clz(r1), so ip <= r3. */
> + sub r3, r3, ip
> + adr ip, LOCAL_LABEL(div0block)
> + sub ip, ip, r3, lsl #2
> + sub ip, ip, r3, lsl #3
> + mov r3, #0
> + bx ip
> +# else
> + str r4, [sp, #-8]!
> +
> + mov r4, r0
> + adr ip, LOCAL_LABEL(div0block)
> +
> + lsr r3, r4, #16
> + cmp r3, r1
> + movhs r4, r3
> + subhs ip, ip, #(16 * 12)
> +
> + lsr r3, r4, #8
> + cmp r3, r1
> + movhs r4, r3
> + subhs ip, ip, #(8 * 12)
> +
> + lsr r3, r4, #4
> + cmp r3, r1
> + movhs r4, r3
> + subhs ip, #(4 * 12)
> +
> + lsr r3, r4, #2
> + cmp r3, r1
> + movhs r4, r3
> + subhs ip, ip, #(2 * 12)
> +
> + /* Last block, no need to update r3 or r4. */
> + cmp r1, r4, lsr #1
> + subls ip, ip, #(1 * 12)
> +
> + ldr r4, [sp], #8 /* restore r4, we are done with it. */
> + mov r3, #0
> +
> + JMP(ip)
> +# endif
> +
> +#define IMM #
> +
> +#define block(shift) \
> + cmp r0, r1, lsl IMM shift; \
> + addhs r3, r3, IMM (1 << shift); \
> + subhs r0, r0, r1, lsl IMM shift
> +
> + block(31)
> + block(30)
> + block(29)
> + block(28)
> + block(27)
> + block(26)
> + block(25)
> + block(24)
> + block(23)
> + block(22)
> + block(21)
> + block(20)
> + block(19)
> + block(18)
> + block(17)
> + block(16)
> + block(15)
> + block(14)
> + block(13)
> + block(12)
> + block(11)
> + block(10)
> + block(9)
> + block(8)
> + block(7)
> + block(6)
> + block(5)
> + block(4)
> + block(3)
> + block(2)
> + block(1)
> +LOCAL_LABEL(div0block):
> + block(0)
> +
> + str r0, [r2]
> + mov r0, r3
> + JMP(lr)
> +
> +LOCAL_LABEL(quotient0):
> + str r0, [r2]
> + mov r0, #0
> + JMP(lr)
> +
> +LOCAL_LABEL(divby1):
> + mov r3, #0
> + str r3, [r2]
> + JMP(lr)
> +#endif /* __ARM_ARCH_EXT_IDIV__ */
> +
> +LOCAL_LABEL(divby0):
> + mov r0, #0
> +#ifdef __ARM_EABI__
> + b __aeabi_idiv0
> +#else
> + JMP(lr)
> #endif
> +
> +END_COMPILERRT_FUNCTION(__udivmodsi4)
>
> Modified: compiler-rt/trunk/lib/arm/udivsi3.S
> URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/arm/udivsi3.S?rev=200001&r1=200000&r2=200001&view=diff
> ==============================================================================
> --- compiler-rt/trunk/lib/arm/udivsi3.S (original)
> +++ compiler-rt/trunk/lib/arm/udivsi3.S Fri Jan 24 07:43:35 2014
> @@ -1,4 +1,4 @@
> -/*===-- udivsi3.S - 32-bit unsigned integer divide ------------------------===//
> +/*===-- udivmodsi4.S - 32-bit unsigned integer divide ---------------------===//
> *
> * The LLVM Compiler Infrastructure
> *
> @@ -7,87 +7,151 @@
> *
> *===----------------------------------------------------------------------===//
> *
> - * This file implements the __udivsi3 (32-bit unsigned integer divide)
> - * function for the ARM architecture. A naive digit-by-digit computation is
> - * employed for simplicity.
> + * This file implements the __udivsi3 (32-bit unsigned integer divide)
> + * function for the ARM 32-bit architecture.
> *
> *===----------------------------------------------------------------------===*/
>
> #include "../assembly.h"
>
> -#define ESTABLISH_FRAME \
> - push {r7, lr} ;\
> - mov r7, sp
> -#define CLEAR_FRAME_AND_RETURN \
> - pop {r7, pc}
> -
> -#define a r0
> -#define b r1
> -#define r r2
> -#define i r3
> -#define q ip
> -#define one lr
> -
> -.syntax unified
> -.align 3
> -// Ok, APCS and AAPCS agree on 32 bit args, so it's safe to use the same routine.
> + .syntax unified
> +
> +#ifdef ARM_HAS_BX
> +#define JMP(r) bx r
> +#define JMPc(r,c) bx##c r
> +#else
> +#define JMP(r) mov pc, r
> +#define JMPc(r,c) mov##c pc, r
> +#endif
> +
> + .text
> + .arm
> + .p2align 2
> DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_uidiv, __udivsi3)
> DEFINE_COMPILERRT_FUNCTION(__udivsi3)
> #if __ARM_ARCH_EXT_IDIV__
> - tst r1,r1
> - beq LOCAL_LABEL(divzero)
> - udiv r0, r0, r1
> + tst r1, r1
> + beq LOCAL_LABEL(divby0)
> + mov r3, r0
> + udiv r0, r3, r1
> + mls r1, r0, r1, r3
> bx lr
> - LOCAL_LABEL(divzero):
> - mov r0,#0
> - bx lr
> #else
> -// We use a simple digit by digit algorithm; before we get into the actual
> -// divide loop, we must calculate the left-shift amount necessary to align
> -// the MSB of the divisor with that of the dividend (If this shift is
> -// negative, then the result is zero, and we early out). We also conjure a
> -// bit mask of 1 to use in constructing the quotient, and initialize the
> -// quotient to zero.
> - ESTABLISH_FRAME
> - clz r2, a
> - tst b, b // detect divide-by-zero
> - clz r3, b
> - mov q, #0
> - beq LOCAL_LABEL(return) // return 0 if b is zero.
> - mov one, #1
> - subs i, r3, r2
> - blt LOCAL_LABEL(return) // return 0 if MSB(a) < MSB(b)
> -
> -LOCAL_LABEL(mainLoop):
> -// This loop basically implements the following:
> -//
> -// do {
> -// if (a >= b << i) {
> -// a -= b << i;
> -// q |= 1 << i;
> -// if (a == 0) break;
> -// }
> -// } while (--i)
> -//
> -// Note that this does not perform the final iteration (i == 0); by doing it
> -// this way, we can merge the two branches which is a substantial win for
> -// such a tight loop on current ARM architectures.
> - subs r, a, b, lsl i
> - itt hs
> - orrhs q, q,one, lsl i
> - movhs a, r
> - it ne
> - subsne i, i, #1
> - bhi LOCAL_LABEL(mainLoop)
> -
> -// Do the final test subtraction and update of quotient (i == 0), as it is
> -// not performed in the main loop.
> - subs r, a, b
> - it hs
> - orrhs q, #1
> -
> -LOCAL_LABEL(return):
> -// Move the quotient to r0 and return.
> - mov r0, q
> - CLEAR_FRAME_AND_RETURN
> + cmp r1, #1
> + bcc LOCAL_LABEL(divby0)
> + JMPc(lr, eq)
> + cmp r0, r1
> + movcc r0, #0
> + JMPc(lr, cc)
> + /*
> + * Implement division using binary long division algorithm.
> + *
> + * r0 is the numerator, r1 the denominator.
> + *
> + * The code before JMP computes the correct shift I, so that
> + * r0 and (r1 << I) have the highest bit set in the same position.
> + * At the time of JMP, ip := .Ldiv0block - 12 * I.
> + * This depends on the fixed instruction size of block.
> + *
> + * block(shift) implements the test-and-update-quotient core.
> + * It assumes (r0 << shift) can be computed without overflow and
> + * that (r0 << shift) < 2 * r1. The quotient is stored in r3.
> + */
> +
> +# ifdef __ARM_FEATURE_CLZ
> + clz ip, r0
> + clz r3, r1
> + /* r0 >= r1 implies clz(r0) <= clz(r1), so ip <= r3. */
> + sub r3, r3, ip
> + adr ip, LOCAL_LABEL(div0block)
> + sub ip, ip, r3, lsl #2
> + sub ip, ip, r3, lsl #3
> + mov r3, #0
> + bx ip
> +# else
> + mov r2, r0
> + adr ip, LOCAL_LABEL(div0block)
> +
> + lsr r3, r2, #16
> + cmp r3, r1
> + movhs r2, r3
> + subhs ip, ip, #(16 * 12)
> +
> + lsr r3, r2, #8
> + cmp r3, r1
> + movhs r2, r3
> + subhs ip, ip, #(8 * 12)
> +
> + lsr r3, r2, #4
> + cmp r3, r1
> + movhs r2, r3
> + subhs ip, #(4 * 12)
> +
> + lsr r3, r2, #2
> + cmp r3, r1
> + movhs r2, r3
> + subhs ip, ip, #(2 * 12)
> +
> + /* Last block, no need to update r2 or r3. */
> + cmp r1, r2, lsr #1
> + subls ip, ip, #(1 * 12)
> +
> + mov r3, #0
> +
> + JMP(ip)
> +# endif
> +
> +#define IMM #
> +
> +#define block(shift) \
> + cmp r0, r1, lsl IMM shift; \
> + addhs r3, r3, IMM (1 << shift); \
> + subhs r0, r0, r1, lsl IMM shift
> +
> + block(31)
> + block(30)
> + block(29)
> + block(28)
> + block(27)
> + block(26)
> + block(25)
> + block(24)
> + block(23)
> + block(22)
> + block(21)
> + block(20)
> + block(19)
> + block(18)
> + block(17)
> + block(16)
> + block(15)
> + block(14)
> + block(13)
> + block(12)
> + block(11)
> + block(10)
> + block(9)
> + block(8)
> + block(7)
> + block(6)
> + block(5)
> + block(4)
> + block(3)
> + block(2)
> + block(1)
> +LOCAL_LABEL(div0block):
> + block(0)
> +
> + mov r0, r3
> + JMP(lr)
> +#endif /* __ARM_ARCH_EXT_IDIV__ */
> +
> +LOCAL_LABEL(divby0):
> + mov r0, #0
> +#ifdef __ARM_EABI__
> + b __aeabi_idiv0
> +#else
> + JMP(lr)
> #endif
> +
> +END_COMPILERRT_FUNCTION(__udivsi3)
>
> Modified: compiler-rt/trunk/lib/arm/umodsi3.S
> URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/arm/umodsi3.S?rev=200001&r1=200000&r2=200001&view=diff
> ==============================================================================
> --- compiler-rt/trunk/lib/arm/umodsi3.S (original)
> +++ compiler-rt/trunk/lib/arm/umodsi3.S Fri Jan 24 07:43:35 2014
> @@ -1,4 +1,4 @@
> -/*===-- umodsi3.S - 32-bit unsigned integer modulus -----------------------===//
> +/*===-- udivmodsi4.S - 32-bit unsigned integer modulus --------------------===//
> *
> * The LLVM Compiler Infrastructure
> *
> @@ -7,66 +7,144 @@
> *
> *===----------------------------------------------------------------------===//
> *
> - * This file implements the __umodsi3 (32-bit unsigned integer modulus)
> - * function for the ARM architecture. A naive digit-by-digit computation is
> - * employed for simplicity.
> + * This file implements the __udivmodsi4 (32-bit unsigned integer divide and
> + * modulus) function for the ARM 32-bit architecture.
> *
> *===----------------------------------------------------------------------===*/
>
> #include "../assembly.h"
>
> -#define a r0
> -#define b r1
> -#define r r2
> -#define i r3
> + .syntax unified
>
> -.syntax unified
> -.align 3
> +#ifdef ARM_HAS_BX
> +#define JMP(r) bx r
> +#define JMPc(r,c) bx##c r
> +#else
> +#define JMP(r) mov pc, r
> +#define JMPc(r,c) mov##c pc, r
> +#endif
> +
> + .text
> + .arm
> + .p2align 2
> DEFINE_COMPILERRT_FUNCTION(__umodsi3)
> #if __ARM_ARCH_EXT_IDIV__
> tst r1, r1
> - beq LOCAL_LABEL(divzero)
> - udiv r2, r0, r1
> - mls r0, r2, r1, r0
> - bx lr
> -LOCAL_LABEL(divzero):
> - mov r0, #0
> - bx lr
> + beq LOCAL_LABEL(divby0)
> + mov r3, r0
> + udiv r0, r3, r1
> + mls r1, r0, r1, r3
> + str r1, [r2]
> + bx lr
> +#else
> + cmp r1, #1
> + bcc LOCAL_LABEL(divby0)
> + moveq r0, #0
> + JMPc(lr, eq)
> + cmp r0, r1
> + JMPc(lr, cc)
> + /*
> + * Implement division using binary long division algorithm.
> + *
> + * r0 is the numerator, r1 the denominator.
> + *
> + * The code before JMP computes the correct shift I, so that
> + * r0 and (r1 << I) have the highest bit set in the same position.
> + * At the time of JMP, ip := .Ldiv0block - 8 * I.
> + * This depends on the fixed instruction size of block.
> + *
> + * block(shift) implements the test-and-update-quotient core.
> + * It assumes (r0 << shift) can be computed without overflow and
> + * that (r0 << shift) < 2 * r1. The quotient is stored in r3.
> + */
> +
> +# ifdef __ARM_FEATURE_CLZ
> + clz ip, r0
> + clz r3, r1
> + /* r0 >= r1 implies clz(r0) <= clz(r1), so ip <= r3. */
> + sub r3, r3, ip
> + adr ip, LOCAL_LABEL(div0block)
> + sub ip, ip, r3, lsl #3
> + bx ip
> +# else
> + mov r2, r0
> + adr ip, LOCAL_LABEL(div0block)
> +
> + lsr r3, r2, #16
> + cmp r3, r1
> + movhs r2, r3
> + subhs ip, ip, #(16 * 8)
> +
> + lsr r3, r2, #8
> + cmp r3, r1
> + movhs r2, r3
> + subhs ip, ip, #(8 * 8)
> +
> + lsr r3, r2, #4
> + cmp r3, r1
> + movhs r2, r3
> + subhs ip, #(4 * 8)
> +
> + lsr r3, r2, #2
> + cmp r3, r1
> + movhs r2, r3
> + subhs ip, ip, #(2 * 8)
> +
> + /* Last block, no need to update r2 or r3. */
> + cmp r1, r2, lsr #1
> + subls ip, ip, #(1 * 8)
> +
> + JMP(ip)
> +# endif
> +
> +#define IMM #
> +
> +#define block(shift) \
> + cmp r0, r1, lsl IMM shift; \
> + subhs r0, r0, r1, lsl IMM shift
> +
> + block(31)
> + block(30)
> + block(29)
> + block(28)
> + block(27)
> + block(26)
> + block(25)
> + block(24)
> + block(23)
> + block(22)
> + block(21)
> + block(20)
> + block(19)
> + block(18)
> + block(17)
> + block(16)
> + block(15)
> + block(14)
> + block(13)
> + block(12)
> + block(11)
> + block(10)
> + block(9)
> + block(8)
> + block(7)
> + block(6)
> + block(5)
> + block(4)
> + block(3)
> + block(2)
> + block(1)
> +LOCAL_LABEL(div0block):
> + block(0)
> + JMP(lr)
> +#endif /* __ARM_ARCH_EXT_IDIV__ */
> +
> +LOCAL_LABEL(divby0):
> + mov r0, #0
> +#ifdef __ARM_EABI__
> + b __aeabi_idiv0
> #else
> -// We use a simple digit by digit algorithm; before we get into the actual
> -// divide loop, we must calculate the left-shift amount necessary to align
> -// the MSB of the divisor with that of the dividend.
> - clz r2, a
> - tst b, b // detect b == 0
> - clz r3, b
> - bxeq lr // return a if b == 0
> - subs i, r3, r2
> - bxlt lr // return a if MSB(a) < MSB(b)
> -
> -LOCAL_LABEL(mainLoop):
> -// This loop basically implements the following:
> -//
> -// do {
> -// if (a >= b << i) {
> -// a -= b << i;
> -// if (a == 0) break;
> -// }
> -// } while (--i)
> -//
> -// Note that this does not perform the final iteration (i == 0); by doing it
> -// this way, we can merge the two branches which is a substantial win for
> -// such a tight loop on current ARM architectures.
> - subs r, a, b, lsl i
> - it hs
> - movhs a, r
> - it ne
> - subsne i, i, #1
> - bhi LOCAL_LABEL(mainLoop)
> -
> -// Do the final test subtraction and update of remainder (i == 0), as it is
> -// not performed in the main loop.
> - subs r, a, b
> - it hs
> - movhs a, r
> - bx lr
> + JMP(lr)
> #endif
> +
> +END_COMPILERRT_FUNCTION(__umodsi3)
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
More information about the llvm-commits
mailing list