[compiler-rt] r200001 - Provide support for ARMv4, lacking bx and clz. Unroll the

Joerg Sonnenberger joerg at bec.de
Fri Jan 24 05:43:35 PST 2014


Author: joerg
Date: Fri Jan 24 07:43:35 2014
New Revision: 200001

URL: http://llvm.org/viewvc/llvm-project?rev=200001&view=rev
Log:
Provide support for ARMv4, lacking bx and clz. Unroll the
test-and-subtract loop and compute the address of the initial block,
shaving off between 5% and 10% on a Cortex-A9 and 30%+ on a Raspberry Pi.
Code written by Matt Thomas and Joerg Sonnenberger.

Differential Revision: http://llvm-reviews.chandlerc.com/D2595
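
For readers who want the idea behind the numbers: these routines perform
binary long division, trying to subtract (denominator << i) from the numerator
for decreasing i and setting quotient bit i whenever the subtraction fits.
Before this change the test-and-subtract step ran in a loop; now the 32 steps
are fully unrolled and the code jumps straight to the first step that can
matter. The following is only a rough C model of the technique, not the
committed code, and the names in it are made up for illustration:

    /* Rough C model of the unrolled binary long division used below.  The
     * assembly enters the unrolled chain at a computed address; the closest
     * C analogue is a switch whose cases fall through.  __builtin_clz is the
     * GCC/Clang builtin; the ARMv4 path in the assembly avoids it. */
    #include <stdint.h>

    uint32_t udivmod_model(uint32_t a, uint32_t b, uint32_t *rem)
    {
        uint32_t q = 0;

        if (b == 0) { *rem = a; return 0; }  /* divide by zero: unspecified, as in the runtime */
        if (a < b)  { *rem = a; return 0; }

        /* Align the top set bit of (b << i) with the top set bit of a. */
        int i = __builtin_clz(b) - __builtin_clz(a);

    /* One test-and-subtract step; sets quotient bit s when (b << s) fits. */
    #define BLOCK(s) case s:                     \
            if (a >= (b << (s))) {               \
                a -= b << (s);                   \
                q |= UINT32_C(1) << (s);         \
            }

        switch (i) {  /* enter at step i and fall through down to step 0 */
        BLOCK(31) BLOCK(30) BLOCK(29) BLOCK(28) BLOCK(27) BLOCK(26) BLOCK(25) BLOCK(24)
        BLOCK(23) BLOCK(22) BLOCK(21) BLOCK(20) BLOCK(19) BLOCK(18) BLOCK(17) BLOCK(16)
        BLOCK(15) BLOCK(14) BLOCK(13) BLOCK(12) BLOCK(11) BLOCK(10) BLOCK(9)  BLOCK(8)
        BLOCK(7)  BLOCK(6)  BLOCK(5)  BLOCK(4)  BLOCK(3)  BLOCK(2)  BLOCK(1)  BLOCK(0)
        }
    #undef BLOCK

        *rem = a;
        return q;
    }

The assembly gets the same effect without the switch by materializing the
address of the entry block directly, as the diffs below show.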

Modified:
    compiler-rt/trunk/lib/arm/udivmodsi4.S
    compiler-rt/trunk/lib/arm/udivsi3.S
    compiler-rt/trunk/lib/arm/umodsi3.S

Modified: compiler-rt/trunk/lib/arm/udivmodsi4.S
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/arm/udivmodsi4.S?rev=200001&r1=200000&r2=200001&view=diff
==============================================================================
--- compiler-rt/trunk/lib/arm/udivmodsi4.S (original)
+++ compiler-rt/trunk/lib/arm/udivmodsi4.S Fri Jan 24 07:43:35 2014
@@ -8,89 +8,161 @@
  *===----------------------------------------------------------------------===//
  *
  * This file implements the __udivmodsi4 (32-bit unsigned integer divide and
- * modulus) function for the ARM architecture.  A naive digit-by-digit
- * computation is employed for simplicity.
+ * modulus) function for the ARM 32-bit architecture.
  *
  *===----------------------------------------------------------------------===*/
 
 #include "../assembly.h"
 
-#define ESTABLISH_FRAME    \
-    push   {r4, r7, lr}   ;\
-    add     r7,     sp, #4
-#define CLEAR_FRAME_AND_RETURN \
-    pop    {r4, r7, pc}
-    
-#define a r0
-#define b r1
-#define i r3
-#define r r4
-#define q ip
-#define one lr
+	.syntax unified
 
-.syntax unified
-.align 3
+#ifdef ARM_HAS_BX
+#define	JMP(r)	bx	r
+#else
+#define	JMP(r)	mov	pc, r
+#endif
+
+	.text
+	.arm
+	.p2align 2
 DEFINE_COMPILERRT_FUNCTION(__udivmodsi4)
 #if __ARM_ARCH_EXT_IDIV__
 	tst     r1, r1
-	beq     LOCAL_LABEL(divzero)
+	beq     LOCAL_LABEL(divby0)
 	mov 	r3, r0
 	udiv	r0, r3, r1
 	mls 	r1, r0, r1, r3
 	str 	r1, [r2]
 	bx  	lr
-LOCAL_LABEL(divzero):
-	mov     r0, #0
-	bx      lr
 #else
-//  We use a simple digit by digit algorithm; before we get into the actual 
-//  divide loop, we must calculate the left-shift amount necessary to align
-//  the MSB of the divisor with that of the dividend (If this shift is
-//  negative, then the result is zero, and we early out). We also conjure a
-//  bit mask of 1 to use in constructing the quotient, and initialize the
-//  quotient to zero.
-    ESTABLISH_FRAME
-    clz     r4,     a
-    tst     b,      b   // detect divide-by-zero
-    clz     r3,     b
-    mov     q,      #0
-    beq     LOCAL_LABEL(return)    // return 0 if b is zero.
-    mov     one,    #1
-    subs    i,      r3, r4
-    blt     LOCAL_LABEL(return)    // return 0 if MSB(a) < MSB(b)
-
-LOCAL_LABEL(mainLoop):
-//  This loop basically implements the following:
-//
-//  do {
-//      if (a >= b << i) {
-//          a -= b << i;
-//          q |= 1 << i;
-//          if (a == 0) break;
-//      }
-//  } while (--i)
-//
-//  Note that this does not perform the final iteration (i == 0); by doing it
-//  this way, we can merge the two branches which is a substantial win for
-//  such a tight loop on current ARM architectures.
-    subs    r,      a,  b, lsl i
-    itt hs
-    orrhs   q,      q,one, lsl i
-    movhs   a,      r
-    it ne
-    subsne  i,      i, #1
-    bhi     LOCAL_LABEL(mainLoop)
-
-//  Do the final test subtraction and update of quotient (i == 0), as it is
-//  not performed in the main loop.
-    subs    r,      a,  b
-    itt hs
-    orrhs   q,      #1
-    movhs   a,      r
-
-LOCAL_LABEL(return):
-//  Store the remainder, and move the quotient to r0, then return.
-    str     a,     [r2]
-    mov     r0,     q
-    CLEAR_FRAME_AND_RETURN
+	cmp	r1, #1
+	bcc	LOCAL_LABEL(divby0)
+	beq	LOCAL_LABEL(divby1)
+	cmp	r0, r1
+	bcc	LOCAL_LABEL(quotient0)
+	/*
+	 * Implement division using binary long division algorithm.
+	 *
+	 * r0 is the numerator, r1 the denominator.
+	 *
+	 * The code before JMP computes the correct shift I, so that
+	 * r0 and (r1 << I) have the highest bit set in the same position.
+	 * At the time of JMP, ip := .Ldiv0block - 12 * I.
+	 * This depends on the fixed instruction size of block.
+	 *
+	 * block(shift) implements the test-and-update-quotient core.
+	 * It assumes that (r1 << shift) can be computed without overflow
+	 * and that r0 < 2 * (r1 << shift). The quotient is stored in r3.
+	 */
+
+#  ifdef __ARM_FEATURE_CLZ
+	clz	ip, r0
+	clz	r3, r1
+	/* r0 >= r1 implies clz(r0) <= clz(r1), so ip <= r3. */
+	sub	r3, r3, ip
+	adr	ip, LOCAL_LABEL(div0block)
+	sub	ip, ip, r3, lsl #2
+	sub	ip, ip, r3, lsl #3
+	mov	r3, #0
+	bx	ip
+#  else
+	str	r4, [sp, #-8]!
+
+	mov	r4, r0
+	adr	ip, LOCAL_LABEL(div0block)
+
+	lsr	r3, r4, #16
+	cmp	r3, r1
+	movhs	r4, r3
+	subhs	ip, ip, #(16 * 12)
+
+	lsr	r3, r4, #8
+	cmp	r3, r1
+	movhs	r4, r3
+	subhs	ip, ip, #(8 * 12)
+
+	lsr	r3, r4, #4
+	cmp	r3, r1
+	movhs	r4, r3
+	subhs	ip, ip, #(4 * 12)
+
+	lsr	r3, r4, #2
+	cmp	r3, r1
+	movhs	r4, r3
+	subhs	ip, ip, #(2 * 12)
+
+	/* Last block, no need to update r3 or r4. */
+	cmp	r1, r4, lsr #1
+	subls	ip, ip, #(1 * 12)
+
+	ldr	r4, [sp], #8	/* restore r4, we are done with it. */
+	mov	r3, #0
+
+	JMP(ip)
+#  endif
+
+#define	IMM	#
+
+#define block(shift) \
+	cmp	r0, r1, lsl IMM shift; \
+	addhs	r3, r3, IMM (1 << shift); \
+	subhs	r0, r0, r1, lsl IMM shift
+
+	block(31)
+	block(30)
+	block(29)
+	block(28)
+	block(27)
+	block(26)
+	block(25)
+	block(24)
+	block(23)
+	block(22)
+	block(21)
+	block(20)
+	block(19)
+	block(18)
+	block(17)
+	block(16)
+	block(15)
+	block(14)
+	block(13)
+	block(12)
+	block(11)
+	block(10)
+	block(9)
+	block(8)
+	block(7)
+	block(6)
+	block(5)
+	block(4)
+	block(3)
+	block(2)
+	block(1)
+LOCAL_LABEL(div0block):
+	block(0)
+
+	str	r0, [r2]
+	mov	r0, r3
+	JMP(lr)
+
+LOCAL_LABEL(quotient0):
+	str	r0, [r2]
+	mov	r0, #0
+	JMP(lr)
+
+LOCAL_LABEL(divby1):
+	mov	r3, #0
+	str	r3, [r2]
+	JMP(lr)
+#endif /* __ARM_ARCH_EXT_IDIV__ */
+
+LOCAL_LABEL(divby0):
+	mov	r0, #0
+#ifdef __ARM_EABI__
+	b	__aeabi_idiv0
+#else
+	JMP(lr)
 #endif
+
+END_COMPILERRT_FUNCTION(__udivmodsi4)
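
Two details of the ARMv4 path in __udivmodsi4 above are easy to miss.  Each
block() expands to exactly three 4-byte ARM instructions, so entering the
chain at block(I) is plain address arithmetic: ip = div0block - 12 * I, formed
by subtracting (I << 2) and (I << 3) from the label address.  And when
__ARM_FEATURE_CLZ is unavailable, I is found by probing the numerator with
shifts of 16, 8, 4, 2 and 1, a small binary search done with conditional
execution rather than branches.  Below is a hedged C sketch of that search;
the function name is mine, not the source's:

    #include <stdint.h>

    /* Returns the largest i such that (a >> i) >= b, assuming a >= b > 0.
     * This is the entry step the non-CLZ path selects; the assembly then
     * jumps to div0block - 12 * i. */
    static int entry_shift(uint32_t a, uint32_t b)
    {
        int i = 0;
        if ((a >> 16) >= b) { a >>= 16; i += 16; }
        if ((a >>  8) >= b) { a >>=  8; i +=  8; }
        if ((a >>  4) >= b) { a >>=  4; i +=  4; }
        if ((a >>  2) >= b) { a >>=  2; i +=  2; }
        if ((a >>  1) >= b) {           i +=  1; }
        return i;
    }

The clz-based entry, clz(r1) - clz(r0), can exceed this i by at most one; the
extra block is harmless because its compare simply fails.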

Modified: compiler-rt/trunk/lib/arm/udivsi3.S
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/arm/udivsi3.S?rev=200001&r1=200000&r2=200001&view=diff
==============================================================================
--- compiler-rt/trunk/lib/arm/udivsi3.S (original)
+++ compiler-rt/trunk/lib/arm/udivsi3.S Fri Jan 24 07:43:35 2014
@@ -1,4 +1,4 @@
-/*===-- udivsi3.S - 32-bit unsigned integer divide ------------------------===//
+/*===-- udivsi3.S - 32-bit unsigned integer divide ------------------------===//
  *
  *                     The LLVM Compiler Infrastructure
  *
@@ -7,87 +7,151 @@
  *
  *===----------------------------------------------------------------------===//
  *
- * This file implements the __udivsi3 (32-bit unsigned integer divide) 
- * function for the ARM architecture.  A naive digit-by-digit computation is
- * employed for simplicity.
+ * This file implements the __udivsi3 (32-bit unsigned integer divide)
+ * function for the ARM 32-bit architecture.
  *
  *===----------------------------------------------------------------------===*/
 
 #include "../assembly.h"
 
-#define ESTABLISH_FRAME \
-    push   {r7, lr}    ;\
-    mov     r7,     sp
-#define CLEAR_FRAME_AND_RETURN \
-    pop    {r7, pc}
-
-#define a r0
-#define b r1
-#define r r2
-#define i r3
-#define q ip
-#define one lr
-
-.syntax unified
-.align 3
-// Ok, APCS and AAPCS agree on 32 bit args, so it's safe to use the same routine.
+	.syntax unified
+
+#ifdef ARM_HAS_BX
+#define	JMP(r)		bx	r
+#define	JMPc(r,c)	bx##c	r
+#else
+#define	JMP(r)		mov	pc, r
+#define	JMPc(r,c)	mov##c	pc, r
+#endif
+
+	.text
+	.arm
+	.p2align 2
 DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_uidiv, __udivsi3)
 DEFINE_COMPILERRT_FUNCTION(__udivsi3)
 #if __ARM_ARCH_EXT_IDIV__
-	tst	r1,r1
-	beq	LOCAL_LABEL(divzero)
-	udiv	r0, r0, r1
+	tst     r1, r1
+	beq     LOCAL_LABEL(divby0)
+	mov 	r3, r0
+	udiv	r0, r3, r1
+	mls 	r1, r0, r1, r3
 	bx  	lr
-	LOCAL_LABEL(divzero):
-	mov	r0,#0
-	bx	lr
 #else
-//  We use a simple digit by digit algorithm; before we get into the actual 
-//  divide loop, we must calculate the left-shift amount necessary to align
-//  the MSB of the divisor with that of the dividend (If this shift is
-//  negative, then the result is zero, and we early out). We also conjure a
-//  bit mask of 1 to use in constructing the quotient, and initialize the
-//  quotient to zero.
-    ESTABLISH_FRAME
-    clz     r2,     a
-    tst     b,      b   // detect divide-by-zero
-    clz     r3,     b
-    mov     q,      #0
-    beq     LOCAL_LABEL(return)    // return 0 if b is zero.
-    mov     one,    #1
-    subs    i,      r3, r2
-    blt     LOCAL_LABEL(return)    // return 0 if MSB(a) < MSB(b)
-
-LOCAL_LABEL(mainLoop):
-//  This loop basically implements the following:
-//
-//  do {
-//      if (a >= b << i) {
-//          a -= b << i;
-//          q |= 1 << i;
-//          if (a == 0) break;
-//      }
-//  } while (--i)
-//
-//  Note that this does not perform the final iteration (i == 0); by doing it
-//  this way, we can merge the two branches which is a substantial win for
-//  such a tight loop on current ARM architectures.
-    subs    r,      a,  b, lsl i
-    itt hs
-    orrhs   q,      q,one, lsl i
-    movhs   a,      r
-    it ne
-    subsne  i,      i, #1
-    bhi     LOCAL_LABEL(mainLoop)
-
-//  Do the final test subtraction and update of quotient (i == 0), as it is
-//  not performed in the main loop.
-    subs    r,      a,  b
-    it hs
-    orrhs   q,      #1
-
-LOCAL_LABEL(return):
-//  Move the quotient to r0 and return.
-    mov     r0,     q
-    CLEAR_FRAME_AND_RETURN
+	cmp	r1, #1
+	bcc	LOCAL_LABEL(divby0)
+	JMPc(lr, eq)
+	cmp	r0, r1
+	movcc	r0, #0
+	JMPc(lr, cc)
+	/*
+	 * Implement division using binary long division algorithm.
+	 *
+	 * r0 is the numerator, r1 the denominator.
+	 *
+	 * The code before JMP computes the correct shift I, so that
+	 * r0 and (r1 << I) have the highest bit set in the same position.
+	 * At the time of JMP, ip := .Ldiv0block - 12 * I.
+	 * This depends on the fixed instruction size of block.
+	 *
+	 * block(shift) implements the test-and-update-quotient core.
+	 * It assumes that (r1 << shift) can be computed without overflow
+	 * and that r0 < 2 * (r1 << shift). The quotient is stored in r3.
+	 */
+
+#  ifdef __ARM_FEATURE_CLZ
+	clz	ip, r0
+	clz	r3, r1
+	/* r0 >= r1 implies clz(r0) <= clz(r1), so ip <= r3. */
+	sub	r3, r3, ip
+	adr	ip, LOCAL_LABEL(div0block)
+	sub	ip, ip, r3, lsl #2
+	sub	ip, ip, r3, lsl #3
+	mov	r3, #0
+	bx	ip
+#  else
+	mov	r2, r0
+	adr	ip, LOCAL_LABEL(div0block)
+
+	lsr	r3, r2, #16
+	cmp	r3, r1
+	movhs	r2, r3
+	subhs	ip, ip, #(16 * 12)
+
+	lsr	r3, r2, #8
+	cmp	r3, r1
+	movhs	r2, r3
+	subhs	ip, ip, #(8 * 12)
+
+	lsr	r3, r2, #4
+	cmp	r3, r1
+	movhs	r2, r3
+	subhs	ip, ip, #(4 * 12)
+
+	lsr	r3, r2, #2
+	cmp	r3, r1
+	movhs	r2, r3
+	subhs	ip, ip, #(2 * 12)
+
+	/* Last block, no need to update r2 or r3. */
+	cmp	r1, r2, lsr #1
+	subls	ip, ip, #(1 * 12)
+
+	mov	r3, #0
+
+	JMP(ip)
+#  endif
+
+#define	IMM	#
+
+#define block(shift) \
+	cmp	r0, r1, lsl IMM shift; \
+	addhs	r3, r3, IMM (1 << shift); \
+	subhs	r0, r0, r1, lsl IMM shift
+
+	block(31)
+	block(30)
+	block(29)
+	block(28)
+	block(27)
+	block(26)
+	block(25)
+	block(24)
+	block(23)
+	block(22)
+	block(21)
+	block(20)
+	block(19)
+	block(18)
+	block(17)
+	block(16)
+	block(15)
+	block(14)
+	block(13)
+	block(12)
+	block(11)
+	block(10)
+	block(9)
+	block(8)
+	block(7)
+	block(6)
+	block(5)
+	block(4)
+	block(3)
+	block(2)
+	block(1)
+LOCAL_LABEL(div0block):
+	block(0)
+
+	mov	r0, r3
+	JMP(lr)
+#endif /* __ARM_ARCH_EXT_IDIV__ */
+
+LOCAL_LABEL(divby0):
+	mov	r0, #0
+#ifdef __ARM_EABI__
+	b	__aeabi_idiv0
+#else
+	JMP(lr)
 #endif
+
+END_COMPILERRT_FUNCTION(__udivsi3)
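
__udivsi3 is rarely called by name.  On ARM EABI targets without a hardware
divider the compiler lowers unsigned division to a call to __aeabi_uidiv,
which is why the file defines that alias, while the traditional libgcc-style
name __udivsi3 remains for other configurations.  A minimal illustration
follows; the exact symbol emitted depends on the target and ABI, so treat the
mapping in the comment as the usual case rather than a guarantee:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* volatile keeps the compiler from folding the division at build
         * time, so on a soft-divide ARM target the generated code contains
         * a runtime library call (typically __aeabi_uidiv / __udivsi3). */
        volatile uint32_t a = 100000u, b = 7u;
        printf("%u / %u = %u\n", (unsigned)a, (unsigned)b, (unsigned)(a / b));
        return 0;
    }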

Modified: compiler-rt/trunk/lib/arm/umodsi3.S
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/arm/umodsi3.S?rev=200001&r1=200000&r2=200001&view=diff
==============================================================================
--- compiler-rt/trunk/lib/arm/umodsi3.S (original)
+++ compiler-rt/trunk/lib/arm/umodsi3.S Fri Jan 24 07:43:35 2014
@@ -1,4 +1,4 @@
-/*===-- umodsi3.S - 32-bit unsigned integer modulus -----------------------===//
+/*===-- umodsi3.S - 32-bit unsigned integer modulus -----------------------===//
  *
  *                     The LLVM Compiler Infrastructure
  *
@@ -7,66 +7,144 @@
  *
  *===----------------------------------------------------------------------===//
  *
- * This file implements the __umodsi3 (32-bit unsigned integer modulus) 
- * function for the ARM architecture.  A naive digit-by-digit computation is
- * employed for simplicity.
+ * This file implements the __umodsi3 (32-bit unsigned integer modulus)
+ * function for the ARM 32-bit architecture.
  *
  *===----------------------------------------------------------------------===*/
 
 #include "../assembly.h"
 
-#define a r0
-#define b r1
-#define r r2
-#define i r3
+	.syntax unified
 
-.syntax unified
-.align 3
+#ifdef ARM_HAS_BX
+#define	JMP(r)		bx	r
+#define	JMPc(r,c)	bx##c	r
+#else
+#define	JMP(r)		mov	pc, r
+#define	JMPc(r,c)	mov##c	pc, r
+#endif
+
+	.text
+	.arm
+	.p2align 2
 DEFINE_COMPILERRT_FUNCTION(__umodsi3)
 #if __ARM_ARCH_EXT_IDIV__
 	tst     r1, r1
-	beq     LOCAL_LABEL(divzero)
-	udiv	r2, r0, r1
-	mls     r0, r2, r1, r0
-	bx      lr
-LOCAL_LABEL(divzero):
-	mov     r0, #0
-	bx      lr
+	beq     LOCAL_LABEL(divby0)
+	udiv	r2, r0, r1
+	mls 	r0, r2, r1, r0
+	bx  	lr
+#else
+	cmp	r1, #1
+	bcc	LOCAL_LABEL(divby0)
+	moveq	r0, #0
+	JMPc(lr, eq)
+	cmp	r0, r1
+	JMPc(lr, cc)
+	/*
+	 * Implement division using binary long division algorithm.
+	 *
+	 * r0 is the numerator, r1 the denominator.
+	 *
+	 * The code before JMP computes the correct shift I, so that
+	 * r0 and (r1 << I) have the highest bit set in the same position.
+	 * At the time of JMP, ip := .Ldiv0block - 8 * I.
+	 * This depends on the fixed instruction size of block.
+	 *
+	 * block(shift) implements the test-and-subtract core; unlike the
+	 * divide variants, no quotient is accumulated.  It assumes that
+	 * (r1 << shift) can be computed without overflow and that
+	 * r0 < 2 * (r1 << shift).
+	 */
+
+#  ifdef __ARM_FEATURE_CLZ
+	clz	ip, r0
+	clz	r3, r1
+	/* r0 >= r1 implies clz(r0) <= clz(r1), so ip <= r3. */
+	sub	r3, r3, ip
+	adr	ip, LOCAL_LABEL(div0block)
+	sub	ip, ip, r3, lsl #3
+	bx	ip
+#  else
+	mov	r2, r0
+	adr	ip, LOCAL_LABEL(div0block)
+
+	lsr	r3, r2, #16
+	cmp	r3, r1
+	movhs	r2, r3
+	subhs	ip, ip, #(16 * 8)
+
+	lsr	r3, r2, #8
+	cmp	r3, r1
+	movhs	r2, r3
+	subhs	ip, ip, #(8 * 8)
+
+	lsr	r3, r2, #4
+	cmp	r3, r1
+	movhs	r2, r3
+	subhs	ip, ip, #(4 * 8)
+
+	lsr	r3, r2, #2
+	cmp	r3, r1
+	movhs	r2, r3
+	subhs	ip, ip, #(2 * 8)
+
+	/* Last block, no need to update r2 or r3. */
+	cmp	r1, r2, lsr #1
+	subls	ip, ip, #(1 * 8)
+
+	JMP(ip)
+#  endif
+
+#define	IMM	#
+
+#define block(shift) \
+	cmp	r0, r1, lsl IMM shift; \
+	subhs	r0, r0, r1, lsl IMM shift
+
+	block(31)
+	block(30)
+	block(29)
+	block(28)
+	block(27)
+	block(26)
+	block(25)
+	block(24)
+	block(23)
+	block(22)
+	block(21)
+	block(20)
+	block(19)
+	block(18)
+	block(17)
+	block(16)
+	block(15)
+	block(14)
+	block(13)
+	block(12)
+	block(11)
+	block(10)
+	block(9)
+	block(8)
+	block(7)
+	block(6)
+	block(5)
+	block(4)
+	block(3)
+	block(2)
+	block(1)
+LOCAL_LABEL(div0block):
+	block(0)
+	JMP(lr)
+#endif /* __ARM_ARCH_EXT_IDIV__ */
+
+LOCAL_LABEL(divby0):
+	mov	r0, #0
+#ifdef __ARM_EABI__
+	b	__aeabi_idiv0
 #else
-//  We use a simple digit by digit algorithm; before we get into the actual 
-//  divide loop, we must calculate the left-shift amount necessary to align
-//  the MSB of the divisor with that of the dividend.
-    clz     r2,     a
-    tst     b,      b       // detect b == 0
-    clz     r3,     b
-    bxeq    lr              // return a if b == 0
-    subs    i,      r3, r2
-    bxlt    lr              // return a if MSB(a) < MSB(b)
-
-LOCAL_LABEL(mainLoop):
-//  This loop basically implements the following:
-//
-//  do {
-//      if (a >= b << i) {
-//          a -= b << i;
-//          if (a == 0) break;
-//      }
-//  } while (--i)
-//
-//  Note that this does not perform the final iteration (i == 0); by doing it
-//  this way, we can merge the two branches which is a substantial win for
-//  such a tight loop on current ARM architectures.
-    subs    r,      a,  b, lsl i
-    it hs
-    movhs   a,      r
-    it ne
-    subsne  i,      i, #1
-    bhi     LOCAL_LABEL(mainLoop)
-
-//  Do the final test subtraction and update of remainder (i == 0), as it is
-//  not performed in the main loop.
-    subs    r,      a,  b
-    it hs
-    movhs   a,      r
-    bx      lr
+	JMP(lr)
 #endif
+
+END_COMPILERRT_FUNCTION(__umodsi3)
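
__umodsi3 is the same long division with the quotient bookkeeping removed:
its block() is only two instructions (8 bytes), which is why the entry-address
arithmetic above uses 8 * I instead of 12 * I.  A looped C sketch of the
remainder-only core follows; the assembly unrolls this, and the function name
is mine rather than the source's:

    #include <stdint.h>

    /* Subtract (b << i) whenever it fits, for decreasing i.  Assumes b != 0;
     * division by zero is handled separately, as in the assembly. */
    static uint32_t umod_model(uint32_t a, uint32_t b)
    {
        if (a < b)
            return a;
        for (int i = __builtin_clz(b) - __builtin_clz(a); i >= 0; i--)
            if (a >= (b << i))
                a -= b << i;
        return a;
    }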
