[compiler-rt] r200001 - Provide support for ARMv4, lacking bx and clz. Unroll the
Bob Wilson
bob.wilson at apple.com
Fri Jan 24 08:22:49 PST 2014
This has broken all of our buildbots because we also build this code for armv7m, which is Thumb and where the ".arm" directive is not supported. Can you fix it right away and/or revert until it gets sorted out?
On Jan 24, 2014, at 5:43 AM, Joerg Sonnenberger <joerg at bec.de> wrote:
> Author: joerg
> Date: Fri Jan 24 07:43:35 2014
> New Revision: 200001
>
> URL: http://llvm.org/viewvc/llvm-project?rev=200001&view=rev
> Log:
> Provide support for ARMv4, lacking bx and clz. Unroll the
> test-and-subtract loop and compute the initial block as address,
> shaving off between 5% and 10% on Cortex A9 and 30%+ on a Raspberry Pi.
> Code written by Matt Thomas and Joerg Sonnenberger.
>
> Differential Revision: http://llvm-reviews.chandlerc.com/D2595
>
> Modified:
> compiler-rt/trunk/lib/arm/udivmodsi4.S
> compiler-rt/trunk/lib/arm/udivsi3.S
> compiler-rt/trunk/lib/arm/umodsi3.S
>
> Modified: compiler-rt/trunk/lib/arm/udivmodsi4.S
> URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/arm/udivmodsi4.S?rev=200001&r1=200000&r2=200001&view=diff
> ==============================================================================
> --- compiler-rt/trunk/lib/arm/udivmodsi4.S (original)
> +++ compiler-rt/trunk/lib/arm/udivmodsi4.S Fri Jan 24 07:43:35 2014
> @@ -8,89 +8,161 @@
> *===----------------------------------------------------------------------===//
> *
> * This file implements the __udivmodsi4 (32-bit unsigned integer divide and
> - * modulus) function for the ARM architecture. A naive digit-by-digit
> - * computation is employed for simplicity.
> + * modulus) function for the ARM 32-bit architecture.
> *
> *===----------------------------------------------------------------------===*/
>
> #include "../assembly.h"
>
> -#define ESTABLISH_FRAME \
> - push {r4, r7, lr} ;\
> - add r7, sp, #4
> -#define CLEAR_FRAME_AND_RETURN \
> - pop {r4, r7, pc}
> -
> -#define a r0
> -#define b r1
> -#define i r3
> -#define r r4
> -#define q ip
> -#define one lr
> + .syntax unified
>
> -.syntax unified
> -.align 3
> +#ifdef ARM_HAS_BX
> +#define JMP(r) bx r
> +#else
> +#define JMP(r) mov pc, r
> +#endif
> +
> + .text
> + .arm
> + .p2align 2
> DEFINE_COMPILERRT_FUNCTION(__udivmodsi4)
> #if __ARM_ARCH_EXT_IDIV__
> tst r1, r1
> - beq LOCAL_LABEL(divzero)
> + beq LOCAL_LABEL(divby0)
> mov r3, r0
> udiv r0, r3, r1
> mls r1, r0, r1, r3
> str r1, [r2]
> bx lr
> -LOCAL_LABEL(divzero):
> - mov r0, #0
> - bx lr
> #else
> -// We use a simple digit by digit algorithm; before we get into the actual
> -// divide loop, we must calculate the left-shift amount necessary to align
> -// the MSB of the divisor with that of the dividend (If this shift is
> -// negative, then the result is zero, and we early out). We also conjure a
> -// bit mask of 1 to use in constructing the quotient, and initialize the
> -// quotient to zero.
> - ESTABLISH_FRAME
> - clz r4, a
> - tst b, b // detect divide-by-zero
> - clz r3, b
> - mov q, #0
> - beq LOCAL_LABEL(return) // return 0 if b is zero.
> - mov one, #1
> - subs i, r3, r4
> - blt LOCAL_LABEL(return) // return 0 if MSB(a) < MSB(b)
> -
> -LOCAL_LABEL(mainLoop):
> -// This loop basically implements the following:
> -//
> -// do {
> -// if (a >= b << i) {
> -// a -= b << i;
> -// q |= 1 << i;
> -// if (a == 0) break;
> -// }
> -// } while (--i)
> -//
> -// Note that this does not perform the final iteration (i == 0); by doing it
> -// this way, we can merge the two branches which is a substantial win for
> -// such a tight loop on current ARM architectures.
> - subs r, a, b, lsl i
> - itt hs
> - orrhs q, q,one, lsl i
> - movhs a, r
> - it ne
> - subsne i, i, #1
> - bhi LOCAL_LABEL(mainLoop)
> -
> -// Do the final test subtraction and update of quotient (i == 0), as it is
> -// not performed in the main loop.
> - subs r, a, b
> - itt hs
> - orrhs q, #1
> - movhs a, r
> -
> -LOCAL_LABEL(return):
> -// Store the remainder, and move the quotient to r0, then return.
> - str a, [r2]
> - mov r0, q
> - CLEAR_FRAME_AND_RETURN
> + cmp r1, #1
> + bcc LOCAL_LABEL(divby0)
> + beq LOCAL_LABEL(divby1)
> + cmp r0, r1
> + bcc LOCAL_LABEL(quotient0)
> + /*
> + * Implement division using binary long division algorithm.
> + *
> + * r0 is the numerator, r1 the denominator.
> + *
> + * The code before JMP computes the correct shift I, so that
> + * r0 and (r1 << I) have the highest bit set in the same position.
> + * At the time of JMP, ip := .Ldiv0block - 12 * I.
> + * This depends on the fixed instruction size of block.
> + *
> + * block(shift) implements the test-and-update-quotient core.
> + * It assumes (r0 << shift) can be computed without overflow and
> + * that (r0 << shift) < 2 * r1. The quotient is stored in r3.
> + */
> +
> +# ifdef __ARM_FEATURE_CLZ
> + clz ip, r0
> + clz r3, r1
> + /* r0 >= r1 implies clz(r0) <= clz(r1), so ip <= r3. */
> + sub r3, r3, ip
> + adr ip, LOCAL_LABEL(div0block)
> + sub ip, ip, r3, lsl #2
> + sub ip, ip, r3, lsl #3
> + mov r3, #0
> + bx ip
> +# else
> + str r4, [sp, #-8]!
> +
> + mov r4, r0
> + adr ip, LOCAL_LABEL(div0block)
> +
> + lsr r3, r4, #16
> + cmp r3, r1
> + movhs r4, r3
> + subhs ip, ip, #(16 * 12)
> +
> + lsr r3, r4, #8
> + cmp r3, r1
> + movhs r4, r3
> + subhs ip, ip, #(8 * 12)
> +
> + lsr r3, r4, #4
> + cmp r3, r1
> + movhs r4, r3
> + subhs ip, #(4 * 12)
> +
> + lsr r3, r4, #2
> + cmp r3, r1
> + movhs r4, r3
> + subhs ip, ip, #(2 * 12)
> +
> + /* Last block, no need to update r3 or r4. */
> + cmp r1, r4, lsr #1
> + subls ip, ip, #(1 * 12)
> +
> + ldr r4, [sp], #8 /* restore r4, we are done with it. */
> + mov r3, #0
> +
> + JMP(ip)
> +# endif
> +
> +#define IMM #
> +
> +#define block(shift) \
> + cmp r0, r1, lsl IMM shift; \
> + addhs r3, r3, IMM (1 << shift); \
> + subhs r0, r0, r1, lsl IMM shift
> +
> + block(31)
> + block(30)
> + block(29)
> + block(28)
> + block(27)
> + block(26)
> + block(25)
> + block(24)
> + block(23)
> + block(22)
> + block(21)
> + block(20)
> + block(19)
> + block(18)
> + block(17)
> + block(16)
> + block(15)
> + block(14)
> + block(13)
> + block(12)
> + block(11)
> + block(10)
> + block(9)
> + block(8)
> + block(7)
> + block(6)
> + block(5)
> + block(4)
> + block(3)
> + block(2)
> + block(1)
> +LOCAL_LABEL(div0block):
> + block(0)
> +
> + str r0, [r2]
> + mov r0, r3
> + JMP(lr)
> +
> +LOCAL_LABEL(quotient0):
> + str r0, [r2]
> + mov r0, #0
> + JMP(lr)
> +
> +LOCAL_LABEL(divby1):
> + mov r3, #0
> + str r3, [r2]
> + JMP(lr)
> +#endif /* __ARM_ARCH_EXT_IDIV__ */
> +
> +LOCAL_LABEL(divby0):
> + mov r0, #0
> +#ifdef __ARM_EABI__
> + b __aeabi_idiv0
> +#else
> + JMP(lr)
> #endif
> +
> +END_COMPILERRT_FUNCTION(__udivmodsi4)
>
> Modified: compiler-rt/trunk/lib/arm/udivsi3.S
> URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/arm/udivsi3.S?rev=200001&r1=200000&r2=200001&view=diff
> ==============================================================================
> --- compiler-rt/trunk/lib/arm/udivsi3.S (original)
> +++ compiler-rt/trunk/lib/arm/udivsi3.S Fri Jan 24 07:43:35 2014
> @@ -1,4 +1,4 @@
> -/*===-- udivsi3.S - 32-bit unsigned integer divide ------------------------===//
> +/*===-- udivmodsi4.S - 32-bit unsigned integer divide ---------------------===//
> *
> * The LLVM Compiler Infrastructure
> *
> @@ -7,87 +7,151 @@
> *
> *===----------------------------------------------------------------------===//
> *
> - * This file implements the __udivsi3 (32-bit unsigned integer divide)
> - * function for the ARM architecture. A naive digit-by-digit computation is
> - * employed for simplicity.
> + * This file implements the __udivsi3 (32-bit unsigned integer divide)
> + * function for the ARM 32-bit architecture.
> *
> *===----------------------------------------------------------------------===*/
>
> #include "../assembly.h"
>
> -#define ESTABLISH_FRAME \
> - push {r7, lr} ;\
> - mov r7, sp
> -#define CLEAR_FRAME_AND_RETURN \
> - pop {r7, pc}
> -
> -#define a r0
> -#define b r1
> -#define r r2
> -#define i r3
> -#define q ip
> -#define one lr
> -
> -.syntax unified
> -.align 3
> -// Ok, APCS and AAPCS agree on 32 bit args, so it's safe to use the same routine.
> + .syntax unified
> +
> +#ifdef ARM_HAS_BX
> +#define JMP(r) bx r
> +#define JMPc(r,c) bx##c r
> +#else
> +#define JMP(r) mov pc, r
> +#define JMPc(r,c) mov##c pc, r
> +#endif
> +
> + .text
> + .arm
> + .p2align 2
> DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_uidiv, __udivsi3)
> DEFINE_COMPILERRT_FUNCTION(__udivsi3)
> #if __ARM_ARCH_EXT_IDIV__
> - tst r1,r1
> - beq LOCAL_LABEL(divzero)
> - udiv r0, r0, r1
> + tst r1, r1
> + beq LOCAL_LABEL(divby0)
> + mov r3, r0
> + udiv r0, r3, r1
> + mls r1, r0, r1, r3
> bx lr
> - LOCAL_LABEL(divzero):
> - mov r0,#0
> - bx lr
> #else
> -// We use a simple digit by digit algorithm; before we get into the actual
> -// divide loop, we must calculate the left-shift amount necessary to align
> -// the MSB of the divisor with that of the dividend (If this shift is
> -// negative, then the result is zero, and we early out). We also conjure a
> -// bit mask of 1 to use in constructing the quotient, and initialize the
> -// quotient to zero.
> - ESTABLISH_FRAME
> - clz r2, a
> - tst b, b // detect divide-by-zero
> - clz r3, b
> - mov q, #0
> - beq LOCAL_LABEL(return) // return 0 if b is zero.
> - mov one, #1
> - subs i, r3, r2
> - blt LOCAL_LABEL(return) // return 0 if MSB(a) < MSB(b)
> -
> -LOCAL_LABEL(mainLoop):
> -// This loop basically implements the following:
> -//
> -// do {
> -// if (a >= b << i) {
> -// a -= b << i;
> -// q |= 1 << i;
> -// if (a == 0) break;
> -// }
> -// } while (--i)
> -//
> -// Note that this does not perform the final iteration (i == 0); by doing it
> -// this way, we can merge the two branches which is a substantial win for
> -// such a tight loop on current ARM architectures.
> - subs r, a, b, lsl i
> - itt hs
> - orrhs q, q,one, lsl i
> - movhs a, r
> - it ne
> - subsne i, i, #1
> - bhi LOCAL_LABEL(mainLoop)
> -
> -// Do the final test subtraction and update of quotient (i == 0), as it is
> -// not performed in the main loop.
> - subs r, a, b
> - it hs
> - orrhs q, #1
> -
> -LOCAL_LABEL(return):
> -// Move the quotient to r0 and return.
> - mov r0, q
> - CLEAR_FRAME_AND_RETURN
> + cmp r1, #1
> + bcc LOCAL_LABEL(divby0)
> + JMPc(lr, eq)
> + cmp r0, r1
> + movcc r0, #0
> + JMPc(lr, cc)
> + /*
> + * Implement division using binary long division algorithm.
> + *
> + * r0 is the numerator, r1 the denominator.
> + *
> + * The code before JMP computes the correct shift I, so that
> + * r0 and (r1 << I) have the highest bit set in the same position.
> + * At the time of JMP, ip := .Ldiv0block - 12 * I.
> + * This depends on the fixed instruction size of block.
> + *
> + * block(shift) implements the test-and-update-quotient core.
> + * It assumes (r0 << shift) can be computed without overflow and
> + * that (r0 << shift) < 2 * r1. The quotient is stored in r3.
> + */
> +
> +# ifdef __ARM_FEATURE_CLZ
> + clz ip, r0
> + clz r3, r1
> + /* r0 >= r1 implies clz(r0) <= clz(r1), so ip <= r3. */
> + sub r3, r3, ip
> + adr ip, LOCAL_LABEL(div0block)
> + sub ip, ip, r3, lsl #2
> + sub ip, ip, r3, lsl #3
> + mov r3, #0
> + bx ip
> +# else
> + mov r2, r0
> + adr ip, LOCAL_LABEL(div0block)
> +
> + lsr r3, r2, #16
> + cmp r3, r1
> + movhs r2, r3
> + subhs ip, ip, #(16 * 12)
> +
> + lsr r3, r2, #8
> + cmp r3, r1
> + movhs r2, r3
> + subhs ip, ip, #(8 * 12)
> +
> + lsr r3, r2, #4
> + cmp r3, r1
> + movhs r2, r3
> + subhs ip, #(4 * 12)
> +
> + lsr r3, r2, #2
> + cmp r3, r1
> + movhs r2, r3
> + subhs ip, ip, #(2 * 12)
> +
> + /* Last block, no need to update r2 or r3. */
> + cmp r1, r2, lsr #1
> + subls ip, ip, #(1 * 12)
> +
> + mov r3, #0
> +
> + JMP(ip)
> +# endif
> +
> +#define IMM #
> +
> +#define block(shift) \
> + cmp r0, r1, lsl IMM shift; \
> + addhs r3, r3, IMM (1 << shift); \
> + subhs r0, r0, r1, lsl IMM shift
> +
> + block(31)
> + block(30)
> + block(29)
> + block(28)
> + block(27)
> + block(26)
> + block(25)
> + block(24)
> + block(23)
> + block(22)
> + block(21)
> + block(20)
> + block(19)
> + block(18)
> + block(17)
> + block(16)
> + block(15)
> + block(14)
> + block(13)
> + block(12)
> + block(11)
> + block(10)
> + block(9)
> + block(8)
> + block(7)
> + block(6)
> + block(5)
> + block(4)
> + block(3)
> + block(2)
> + block(1)
> +LOCAL_LABEL(div0block):
> + block(0)
> +
> + mov r0, r3
> + JMP(lr)
> +#endif /* __ARM_ARCH_EXT_IDIV__ */
> +
> +LOCAL_LABEL(divby0):
> + mov r0, #0
> +#ifdef __ARM_EABI__
> + b __aeabi_idiv0
> +#else
> + JMP(lr)
> #endif
> +
> +END_COMPILERRT_FUNCTION(__udivsi3)
>
> Modified: compiler-rt/trunk/lib/arm/umodsi3.S
> URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/arm/umodsi3.S?rev=200001&r1=200000&r2=200001&view=diff
> ==============================================================================
> --- compiler-rt/trunk/lib/arm/umodsi3.S (original)
> +++ compiler-rt/trunk/lib/arm/umodsi3.S Fri Jan 24 07:43:35 2014
> @@ -1,4 +1,4 @@
> -/*===-- umodsi3.S - 32-bit unsigned integer modulus -----------------------===//
> +/*===-- udivmodsi4.S - 32-bit unsigned integer modulus --------------------===//
> *
> * The LLVM Compiler Infrastructure
> *
> @@ -7,66 +7,144 @@
> *
> *===----------------------------------------------------------------------===//
> *
> - * This file implements the __umodsi3 (32-bit unsigned integer modulus)
> - * function for the ARM architecture. A naive digit-by-digit computation is
> - * employed for simplicity.
> + * This file implements the __udivmodsi4 (32-bit unsigned integer divide and
> + * modulus) function for the ARM 32-bit architecture.
> *
> *===----------------------------------------------------------------------===*/
>
> #include "../assembly.h"
>
> -#define a r0
> -#define b r1
> -#define r r2
> -#define i r3
> + .syntax unified
>
> -.syntax unified
> -.align 3
> +#ifdef ARM_HAS_BX
> +#define JMP(r) bx r
> +#define JMPc(r,c) bx##c r
> +#else
> +#define JMP(r) mov pc, r
> +#define JMPc(r,c) mov##c pc, r
> +#endif
> +
> + .text
> + .arm
> + .p2align 2
> DEFINE_COMPILERRT_FUNCTION(__umodsi3)
> #if __ARM_ARCH_EXT_IDIV__
> tst r1, r1
> - beq LOCAL_LABEL(divzero)
> - udiv r2, r0, r1
> - mls r0, r2, r1, r0
> - bx lr
> -LOCAL_LABEL(divzero):
> - mov r0, #0
> - bx lr
> + beq LOCAL_LABEL(divby0)
> + mov r3, r0
> + udiv r0, r3, r1
> + mls r1, r0, r1, r3
> + str r1, [r2]
> + bx lr
> +#else
> + cmp r1, #1
> + bcc LOCAL_LABEL(divby0)
> + moveq r0, #0
> + JMPc(lr, eq)
> + cmp r0, r1
> + JMPc(lr, cc)
> + /*
> + * Implement division using binary long division algorithm.
> + *
> + * r0 is the numerator, r1 the denominator.
> + *
> + * The code before JMP computes the correct shift I, so that
> + * r0 and (r1 << I) have the highest bit set in the same position.
> + * At the time of JMP, ip := .Ldiv0block - 8 * I.
> + * This depends on the fixed instruction size of block.
> + *
> + * block(shift) implements the test-and-update-quotient core.
> + * It assumes (r0 << shift) can be computed without overflow and
> + * that (r0 << shift) < 2 * r1. The quotient is stored in r3.
> + */
> +
> +# ifdef __ARM_FEATURE_CLZ
> + clz ip, r0
> + clz r3, r1
> + /* r0 >= r1 implies clz(r0) <= clz(r1), so ip <= r3. */
> + sub r3, r3, ip
> + adr ip, LOCAL_LABEL(div0block)
> + sub ip, ip, r3, lsl #3
> + bx ip
> +# else
> + mov r2, r0
> + adr ip, LOCAL_LABEL(div0block)
> +
> + lsr r3, r2, #16
> + cmp r3, r1
> + movhs r2, r3
> + subhs ip, ip, #(16 * 8)
> +
> + lsr r3, r2, #8
> + cmp r3, r1
> + movhs r2, r3
> + subhs ip, ip, #(8 * 8)
> +
> + lsr r3, r2, #4
> + cmp r3, r1
> + movhs r2, r3
> + subhs ip, #(4 * 8)
> +
> + lsr r3, r2, #2
> + cmp r3, r1
> + movhs r2, r3
> + subhs ip, ip, #(2 * 8)
> +
> + /* Last block, no need to update r2 or r3. */
> + cmp r1, r2, lsr #1
> + subls ip, ip, #(1 * 8)
> +
> + JMP(ip)
> +# endif
> +
> +#define IMM #
> +
> +#define block(shift) \
> + cmp r0, r1, lsl IMM shift; \
> + subhs r0, r0, r1, lsl IMM shift
> +
> + block(31)
> + block(30)
> + block(29)
> + block(28)
> + block(27)
> + block(26)
> + block(25)
> + block(24)
> + block(23)
> + block(22)
> + block(21)
> + block(20)
> + block(19)
> + block(18)
> + block(17)
> + block(16)
> + block(15)
> + block(14)
> + block(13)
> + block(12)
> + block(11)
> + block(10)
> + block(9)
> + block(8)
> + block(7)
> + block(6)
> + block(5)
> + block(4)
> + block(3)
> + block(2)
> + block(1)
> +LOCAL_LABEL(div0block):
> + block(0)
> + JMP(lr)
> +#endif /* __ARM_ARCH_EXT_IDIV__ */
> +
> +LOCAL_LABEL(divby0):
> + mov r0, #0
> +#ifdef __ARM_EABI__
> + b __aeabi_idiv0
> #else
> -// We use a simple digit by digit algorithm; before we get into the actual
> -// divide loop, we must calculate the left-shift amount necessary to align
> -// the MSB of the divisor with that of the dividend.
> - clz r2, a
> - tst b, b // detect b == 0
> - clz r3, b
> - bxeq lr // return a if b == 0
> - subs i, r3, r2
> - bxlt lr // return a if MSB(a) < MSB(b)
> -
> -LOCAL_LABEL(mainLoop):
> -// This loop basically implements the following:
> -//
> -// do {
> -// if (a >= b << i) {
> -// a -= b << i;
> -// if (a == 0) break;
> -// }
> -// } while (--i)
> -//
> -// Note that this does not perform the final iteration (i == 0); by doing it
> -// this way, we can merge the two branches which is a substantial win for
> -// such a tight loop on current ARM architectures.
> - subs r, a, b, lsl i
> - it hs
> - movhs a, r
> - it ne
> - subsne i, i, #1
> - bhi LOCAL_LABEL(mainLoop)
> -
> -// Do the final test subtraction and update of remainder (i == 0), as it is
> -// not performed in the main loop.
> - subs r, a, b
> - it hs
> - movhs a, r
> - bx lr
> + JMP(lr)
> #endif
> +
> +END_COMPILERRT_FUNCTION(__umodsi3)
>
>
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
More information about the llvm-commits
mailing list