[compiler-rt] r200001 - Provide support for ARMv4, lacking bx and clz. Unroll the
Joerg Sonnenberger
joerg at bec.de
Fri Jan 24 05:43:35 PST 2014
Author: joerg
Date: Fri Jan 24 07:43:35 2014
New Revision: 200001
URL: http://llvm.org/viewvc/llvm-project?rev=200001&view=rev
Log:
Provide support for ARMv4, lacking bx and clz. Unroll the
test-and-subtract loop and compute the initial block as address,
shaving off between 5% and 10% on Cortex A9 and 30%+ on a Raspberry Pi.
Code written by Matt Thomas and Joerg Sonnenberger.
Differential Revision: http://llvm-reviews.chandlerc.com/D2595
Modified:
compiler-rt/trunk/lib/arm/udivmodsi4.S
compiler-rt/trunk/lib/arm/udivsi3.S
compiler-rt/trunk/lib/arm/umodsi3.S
Modified: compiler-rt/trunk/lib/arm/udivmodsi4.S
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/arm/udivmodsi4.S?rev=200001&r1=200000&r2=200001&view=diff
==============================================================================
--- compiler-rt/trunk/lib/arm/udivmodsi4.S (original)
+++ compiler-rt/trunk/lib/arm/udivmodsi4.S Fri Jan 24 07:43:35 2014
@@ -8,89 +8,161 @@
*===----------------------------------------------------------------------===//
*
* This file implements the __udivmodsi4 (32-bit unsigned integer divide and
- * modulus) function for the ARM architecture. A naive digit-by-digit
- * computation is employed for simplicity.
+ * modulus) function for the ARM 32-bit architecture.
*
*===----------------------------------------------------------------------===*/
#include "../assembly.h"
-#define ESTABLISH_FRAME \
- push {r4, r7, lr} ;\
- add r7, sp, #4
-#define CLEAR_FRAME_AND_RETURN \
- pop {r4, r7, pc}
-
-#define a r0
-#define b r1
-#define i r3
-#define r r4
-#define q ip
-#define one lr
+ .syntax unified
-.syntax unified
-.align 3
+#ifdef ARM_HAS_BX
+#define JMP(r) bx r
+#else
+#define JMP(r) mov pc, r
+#endif
+
+ .text
+ .arm
+ .p2align 2
DEFINE_COMPILERRT_FUNCTION(__udivmodsi4)
#if __ARM_ARCH_EXT_IDIV__
tst r1, r1
- beq LOCAL_LABEL(divzero)
+ beq LOCAL_LABEL(divby0)
mov r3, r0
udiv r0, r3, r1
mls r1, r0, r1, r3
str r1, [r2]
bx lr
-LOCAL_LABEL(divzero):
- mov r0, #0
- bx lr
#else
-// We use a simple digit by digit algorithm; before we get into the actual
-// divide loop, we must calculate the left-shift amount necessary to align
-// the MSB of the divisor with that of the dividend (If this shift is
-// negative, then the result is zero, and we early out). We also conjure a
-// bit mask of 1 to use in constructing the quotient, and initialize the
-// quotient to zero.
- ESTABLISH_FRAME
- clz r4, a
- tst b, b // detect divide-by-zero
- clz r3, b
- mov q, #0
- beq LOCAL_LABEL(return) // return 0 if b is zero.
- mov one, #1
- subs i, r3, r4
- blt LOCAL_LABEL(return) // return 0 if MSB(a) < MSB(b)
-
-LOCAL_LABEL(mainLoop):
-// This loop basically implements the following:
-//
-// do {
-// if (a >= b << i) {
-// a -= b << i;
-// q |= 1 << i;
-// if (a == 0) break;
-// }
-// } while (--i)
-//
-// Note that this does not perform the final iteration (i == 0); by doing it
-// this way, we can merge the two branches which is a substantial win for
-// such a tight loop on current ARM architectures.
- subs r, a, b, lsl i
- itt hs
- orrhs q, q,one, lsl i
- movhs a, r
- it ne
- subsne i, i, #1
- bhi LOCAL_LABEL(mainLoop)
-
-// Do the final test subtraction and update of quotient (i == 0), as it is
-// not performed in the main loop.
- subs r, a, b
- itt hs
- orrhs q, #1
- movhs a, r
-
-LOCAL_LABEL(return):
-// Store the remainder, and move the quotient to r0, then return.
- str a, [r2]
- mov r0, q
- CLEAR_FRAME_AND_RETURN
+ cmp r1, #1
+ bcc LOCAL_LABEL(divby0)
+ beq LOCAL_LABEL(divby1)
+ cmp r0, r1
+ bcc LOCAL_LABEL(quotient0)
+ /*
+ * Implement division using binary long division algorithm.
+ *
+ * r0 is the numerator, r1 the denominator.
+ *
+ * The code before JMP computes the correct shift I, so that
+ * r0 and (r1 << I) have the highest bit set in the same position.
+ * At the time of JMP, ip := .Ldiv0block - 12 * I.
+ * This depends on the fixed instruction size of block.
+ *
+ * block(shift) implements the test-and-update-quotient core.
+ * It assumes (r0 << shift) can be computed without overflow and
+ * that (r0 << shift) < 2 * r1. The quotient is stored in r3.
+ */
+
+# ifdef __ARM_FEATURE_CLZ
+ clz ip, r0
+ clz r3, r1
+ /* r0 >= r1 implies clz(r0) <= clz(r1), so ip <= r3. */
+ sub r3, r3, ip
+ adr ip, LOCAL_LABEL(div0block)
+ sub ip, ip, r3, lsl #2
+ sub ip, ip, r3, lsl #3
+ mov r3, #0
+ bx ip
+# else
+ str r4, [sp, #-8]!
+
+ mov r4, r0
+ adr ip, LOCAL_LABEL(div0block)
+
+ lsr r3, r4, #16
+ cmp r3, r1
+ movhs r4, r3
+ subhs ip, ip, #(16 * 12)
+
+ lsr r3, r4, #8
+ cmp r3, r1
+ movhs r4, r3
+ subhs ip, ip, #(8 * 12)
+
+ lsr r3, r4, #4
+ cmp r3, r1
+ movhs r4, r3
+ subhs ip, #(4 * 12)
+
+ lsr r3, r4, #2
+ cmp r3, r1
+ movhs r4, r3
+ subhs ip, ip, #(2 * 12)
+
+ /* Last block, no need to update r3 or r4. */
+ cmp r1, r4, lsr #1
+ subls ip, ip, #(1 * 12)
+
+ ldr r4, [sp], #8 /* restore r4, we are done with it. */
+ mov r3, #0
+
+ JMP(ip)
+# endif
+
+#define IMM #
+
+#define block(shift) \
+ cmp r0, r1, lsl IMM shift; \
+ addhs r3, r3, IMM (1 << shift); \
+ subhs r0, r0, r1, lsl IMM shift
+
+ block(31)
+ block(30)
+ block(29)
+ block(28)
+ block(27)
+ block(26)
+ block(25)
+ block(24)
+ block(23)
+ block(22)
+ block(21)
+ block(20)
+ block(19)
+ block(18)
+ block(17)
+ block(16)
+ block(15)
+ block(14)
+ block(13)
+ block(12)
+ block(11)
+ block(10)
+ block(9)
+ block(8)
+ block(7)
+ block(6)
+ block(5)
+ block(4)
+ block(3)
+ block(2)
+ block(1)
+LOCAL_LABEL(div0block):
+ block(0)
+
+ str r0, [r2]
+ mov r0, r3
+ JMP(lr)
+
+LOCAL_LABEL(quotient0):
+ str r0, [r2]
+ mov r0, #0
+ JMP(lr)
+
+LOCAL_LABEL(divby1):
+ mov r3, #0
+ str r3, [r2]
+ JMP(lr)
+#endif /* __ARM_ARCH_EXT_IDIV__ */
+
+LOCAL_LABEL(divby0):
+ mov r0, #0
+#ifdef __ARM_EABI__
+ b __aeabi_idiv0
+#else
+ JMP(lr)
#endif
+
+END_COMPILERRT_FUNCTION(__udivmodsi4)
Modified: compiler-rt/trunk/lib/arm/udivsi3.S
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/arm/udivsi3.S?rev=200001&r1=200000&r2=200001&view=diff
==============================================================================
--- compiler-rt/trunk/lib/arm/udivsi3.S (original)
+++ compiler-rt/trunk/lib/arm/udivsi3.S Fri Jan 24 07:43:35 2014
@@ -1,4 +1,4 @@
-/*===-- udivsi3.S - 32-bit unsigned integer divide ------------------------===//
+/*===-- udivsi3.S - 32-bit unsigned integer divide ------------------------===//
*
* The LLVM Compiler Infrastructure
*
@@ -7,87 +7,151 @@
*
*===----------------------------------------------------------------------===//
*
- * This file implements the __udivsi3 (32-bit unsigned integer divide)
- * function for the ARM architecture. A naive digit-by-digit computation is
- * employed for simplicity.
+ * This file implements the __udivsi3 (32-bit unsigned integer divide)
+ * function for the ARM 32-bit architecture.
*
*===----------------------------------------------------------------------===*/
#include "../assembly.h"
-#define ESTABLISH_FRAME \
- push {r7, lr} ;\
- mov r7, sp
-#define CLEAR_FRAME_AND_RETURN \
- pop {r7, pc}
-
-#define a r0
-#define b r1
-#define r r2
-#define i r3
-#define q ip
-#define one lr
-
-.syntax unified
-.align 3
-// Ok, APCS and AAPCS agree on 32 bit args, so it's safe to use the same routine.
+ .syntax unified
+
+#ifdef ARM_HAS_BX
+#define JMP(r) bx r
+#define JMPc(r,c) bx##c r
+#else
+#define JMP(r) mov pc, r
+#define JMPc(r,c) mov##c pc, r
+#endif
+
+ .text
+ .arm
+ .p2align 2
DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_uidiv, __udivsi3)
DEFINE_COMPILERRT_FUNCTION(__udivsi3)
#if __ARM_ARCH_EXT_IDIV__
- tst r1,r1
- beq LOCAL_LABEL(divzero)
- udiv r0, r0, r1
+ tst r1, r1
+ beq LOCAL_LABEL(divby0)
+ mov r3, r0
+ udiv r0, r3, r1
+ mls r1, r0, r1, r3
bx lr
- LOCAL_LABEL(divzero):
- mov r0,#0
- bx lr
#else
-// We use a simple digit by digit algorithm; before we get into the actual
-// divide loop, we must calculate the left-shift amount necessary to align
-// the MSB of the divisor with that of the dividend (If this shift is
-// negative, then the result is zero, and we early out). We also conjure a
-// bit mask of 1 to use in constructing the quotient, and initialize the
-// quotient to zero.
- ESTABLISH_FRAME
- clz r2, a
- tst b, b // detect divide-by-zero
- clz r3, b
- mov q, #0
- beq LOCAL_LABEL(return) // return 0 if b is zero.
- mov one, #1
- subs i, r3, r2
- blt LOCAL_LABEL(return) // return 0 if MSB(a) < MSB(b)
-
-LOCAL_LABEL(mainLoop):
-// This loop basically implements the following:
-//
-// do {
-// if (a >= b << i) {
-// a -= b << i;
-// q |= 1 << i;
-// if (a == 0) break;
-// }
-// } while (--i)
-//
-// Note that this does not perform the final iteration (i == 0); by doing it
-// this way, we can merge the two branches which is a substantial win for
-// such a tight loop on current ARM architectures.
- subs r, a, b, lsl i
- itt hs
- orrhs q, q,one, lsl i
- movhs a, r
- it ne
- subsne i, i, #1
- bhi LOCAL_LABEL(mainLoop)
-
-// Do the final test subtraction and update of quotient (i == 0), as it is
-// not performed in the main loop.
- subs r, a, b
- it hs
- orrhs q, #1
-
-LOCAL_LABEL(return):
-// Move the quotient to r0 and return.
- mov r0, q
- CLEAR_FRAME_AND_RETURN
+ cmp r1, #1
+ bcc LOCAL_LABEL(divby0)
+ JMPc(lr, eq)
+ cmp r0, r1
+ movcc r0, #0
+ JMPc(lr, cc)
+ /*
+ * Implement division using binary long division algorithm.
+ *
+ * r0 is the numerator, r1 the denominator.
+ *
+ * The code before JMP computes the correct shift I, so that
+ * r0 and (r1 << I) have the highest bit set in the same position.
+ * At the time of JMP, ip := .Ldiv0block - 12 * I.
+ * This depends on the fixed instruction size of block.
+ *
+ * block(shift) implements the test-and-update-quotient core.
+ * It assumes (r0 << shift) can be computed without overflow and
+ * that (r0 << shift) < 2 * r1. The quotient is stored in r3.
+ */
+
+# ifdef __ARM_FEATURE_CLZ
+ clz ip, r0
+ clz r3, r1
+ /* r0 >= r1 implies clz(r0) <= clz(r1), so ip <= r3. */
+ sub r3, r3, ip
+ adr ip, LOCAL_LABEL(div0block)
+ sub ip, ip, r3, lsl #2
+ sub ip, ip, r3, lsl #3
+ mov r3, #0
+ bx ip
+# else
+ mov r2, r0
+ adr ip, LOCAL_LABEL(div0block)
+
+ lsr r3, r2, #16
+ cmp r3, r1
+ movhs r2, r3
+ subhs ip, ip, #(16 * 12)
+
+ lsr r3, r2, #8
+ cmp r3, r1
+ movhs r2, r3
+ subhs ip, ip, #(8 * 12)
+
+ lsr r3, r2, #4
+ cmp r3, r1
+ movhs r2, r3
+ subhs ip, #(4 * 12)
+
+ lsr r3, r2, #2
+ cmp r3, r1
+ movhs r2, r3
+ subhs ip, ip, #(2 * 12)
+
+ /* Last block, no need to update r2 or r3. */
+ cmp r1, r2, lsr #1
+ subls ip, ip, #(1 * 12)
+
+ mov r3, #0
+
+ JMP(ip)
+# endif
+
+#define IMM #
+
+#define block(shift) \
+ cmp r0, r1, lsl IMM shift; \
+ addhs r3, r3, IMM (1 << shift); \
+ subhs r0, r0, r1, lsl IMM shift
+
+ block(31)
+ block(30)
+ block(29)
+ block(28)
+ block(27)
+ block(26)
+ block(25)
+ block(24)
+ block(23)
+ block(22)
+ block(21)
+ block(20)
+ block(19)
+ block(18)
+ block(17)
+ block(16)
+ block(15)
+ block(14)
+ block(13)
+ block(12)
+ block(11)
+ block(10)
+ block(9)
+ block(8)
+ block(7)
+ block(6)
+ block(5)
+ block(4)
+ block(3)
+ block(2)
+ block(1)
+LOCAL_LABEL(div0block):
+ block(0)
+
+ mov r0, r3
+ JMP(lr)
+#endif /* __ARM_ARCH_EXT_IDIV__ */
+
+LOCAL_LABEL(divby0):
+ mov r0, #0
+#ifdef __ARM_EABI__
+ b __aeabi_idiv0
+#else
+ JMP(lr)
#endif
+
+END_COMPILERRT_FUNCTION(__udivsi3)
Modified: compiler-rt/trunk/lib/arm/umodsi3.S
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/arm/umodsi3.S?rev=200001&r1=200000&r2=200001&view=diff
==============================================================================
--- compiler-rt/trunk/lib/arm/umodsi3.S (original)
+++ compiler-rt/trunk/lib/arm/umodsi3.S Fri Jan 24 07:43:35 2014
@@ -1,4 +1,4 @@
-/*===-- umodsi3.S - 32-bit unsigned integer modulus -----------------------===//
+/*===-- umodsi3.S - 32-bit unsigned integer modulus -----------------------===//
*
* The LLVM Compiler Infrastructure
*
@@ -7,66 +7,144 @@
*
*===----------------------------------------------------------------------===//
*
- * This file implements the __umodsi3 (32-bit unsigned integer modulus)
- * function for the ARM architecture. A naive digit-by-digit computation is
- * employed for simplicity.
+ * This file implements the __umodsi3 (32-bit unsigned integer modulus)
+ * function for the ARM 32-bit architecture.
*
*===----------------------------------------------------------------------===*/
#include "../assembly.h"
-#define a r0
-#define b r1
-#define r r2
-#define i r3
+ .syntax unified
-.syntax unified
-.align 3
+#ifdef ARM_HAS_BX
+#define JMP(r) bx r
+#define JMPc(r,c) bx##c r
+#else
+#define JMP(r) mov pc, r
+#define JMPc(r,c) mov##c pc, r
+#endif
+
+ .text
+ .arm
+ .p2align 2
DEFINE_COMPILERRT_FUNCTION(__umodsi3)
#if __ARM_ARCH_EXT_IDIV__
tst r1, r1
- beq LOCAL_LABEL(divzero)
- udiv r2, r0, r1
- mls r0, r2, r1, r0
- bx lr
-LOCAL_LABEL(divzero):
- mov r0, #0
- bx lr
+ beq LOCAL_LABEL(divby0)
+ mov r3, r0
+ udiv r0, r3, r1
+ mls r1, r0, r1, r3
+ str r1, [r2]
+ bx lr
+#else
+ cmp r1, #1
+ bcc LOCAL_LABEL(divby0)
+ moveq r0, #0
+ JMPc(lr, eq)
+ cmp r0, r1
+ JMPc(lr, cc)
+ /*
+ * Implement division using binary long division algorithm.
+ *
+ * r0 is the numerator, r1 the denominator.
+ *
+ * The code before JMP computes the correct shift I, so that
+ * r0 and (r1 << I) have the highest bit set in the same position.
+ * At the time of JMP, ip := .Ldiv0block - 8 * I.
+ * This depends on the fixed instruction size of block.
+ *
+ * block(shift) implements the test-and-subtract core.
+ * It assumes (r0 << shift) can be computed without overflow and
+ * that (r0 << shift) < 2 * r1. The remainder is kept in r0;
+ * no quotient is accumulated.
+ */
+
+# ifdef __ARM_FEATURE_CLZ
+ clz ip, r0
+ clz r3, r1
+ /* r0 >= r1 implies clz(r0) <= clz(r1), so ip <= r3. */
+ sub r3, r3, ip
+ adr ip, LOCAL_LABEL(div0block)
+ sub ip, ip, r3, lsl #3
+ bx ip
+# else
+ mov r2, r0
+ adr ip, LOCAL_LABEL(div0block)
+
+ lsr r3, r2, #16
+ cmp r3, r1
+ movhs r2, r3
+ subhs ip, ip, #(16 * 8)
+
+ lsr r3, r2, #8
+ cmp r3, r1
+ movhs r2, r3
+ subhs ip, ip, #(8 * 8)
+
+ lsr r3, r2, #4
+ cmp r3, r1
+ movhs r2, r3
+ subhs ip, #(4 * 8)
+
+ lsr r3, r2, #2
+ cmp r3, r1
+ movhs r2, r3
+ subhs ip, ip, #(2 * 8)
+
+ /* Last block, no need to update r2 or r3. */
+ cmp r1, r2, lsr #1
+ subls ip, ip, #(1 * 8)
+
+ JMP(ip)
+# endif
+
+#define IMM #
+
+#define block(shift) \
+ cmp r0, r1, lsl IMM shift; \
+ subhs r0, r0, r1, lsl IMM shift
+
+ block(31)
+ block(30)
+ block(29)
+ block(28)
+ block(27)
+ block(26)
+ block(25)
+ block(24)
+ block(23)
+ block(22)
+ block(21)
+ block(20)
+ block(19)
+ block(18)
+ block(17)
+ block(16)
+ block(15)
+ block(14)
+ block(13)
+ block(12)
+ block(11)
+ block(10)
+ block(9)
+ block(8)
+ block(7)
+ block(6)
+ block(5)
+ block(4)
+ block(3)
+ block(2)
+ block(1)
+LOCAL_LABEL(div0block):
+ block(0)
+ JMP(lr)
+#endif /* __ARM_ARCH_EXT_IDIV__ */
+
+LOCAL_LABEL(divby0):
+ mov r0, #0
+#ifdef __ARM_EABI__
+ b __aeabi_idiv0
#else
-// We use a simple digit by digit algorithm; before we get into the actual
-// divide loop, we must calculate the left-shift amount necessary to align
-// the MSB of the divisor with that of the dividend.
- clz r2, a
- tst b, b // detect b == 0
- clz r3, b
- bxeq lr // return a if b == 0
- subs i, r3, r2
- bxlt lr // return a if MSB(a) < MSB(b)
-
-LOCAL_LABEL(mainLoop):
-// This loop basically implements the following:
-//
-// do {
-// if (a >= b << i) {
-// a -= b << i;
-// if (a == 0) break;
-// }
-// } while (--i)
-//
-// Note that this does not perform the final iteration (i == 0); by doing it
-// this way, we can merge the two branches which is a substantial win for
-// such a tight loop on current ARM architectures.
- subs r, a, b, lsl i
- it hs
- movhs a, r
- it ne
- subsne i, i, #1
- bhi LOCAL_LABEL(mainLoop)
-
-// Do the final test subtraction and update of remainder (i == 0), as it is
-// not performed in the main loop.
- subs r, a, b
- it hs
- movhs a, r
- bx lr
+ JMP(lr)
#endif
+
+END_COMPILERRT_FUNCTION(__umodsi3)
More information about the llvm-commits
mailing list