[compiler-rt] r213467 - ARM: fix division in some cases

Saleem Abdulrasool compnerd at compnerd.org
Sat Jul 19 21:44:21 PDT 2014


Author: compnerd
Date: Sat Jul 19 23:44:21 2014
New Revision: 213467

URL: http://llvm.org/viewvc/llvm-project?rev=213467&view=rev
Log:
ARM: fix division in some cases

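For ARM cores that are ARMv6T2+ but not ARMv7ve or ARMv7-r and not an updated
ARMv7-a that has the idiv extension (chips with clz but not idiv), an incorrect
jump target would be calculated due to the preference for Thumb instructions
over ARM: the computed offset depends on the fixed instruction size of each
block, which only holds for the ARM encoding.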

Rather than computing the target at runtime, use a jumptable instead.  This
trades a bit of storage for performance.  The overhead is 32 bytes for each of
the three routines, but it avoids the calculation of the offset.

Because clz was introduced in ARMv6T2 and idiv in certain versions of ARMv7,
the non-clz, non-idiv case implies a target which does not support Thumb-2, and
thus we cannot use Thumb on those targets (as it is unlikely that the assembly
will assemble).

Take the opportunity to refactor the IT block macros into assembly.h rather than
redefining them in the TUs where they are used.

Existing tests cover the full change already, so no new tests are added.

This effectively reverts SVN r213309.

Modified:
    compiler-rt/trunk/lib/builtins/arm/udivmodsi4.S
    compiler-rt/trunk/lib/builtins/arm/udivsi3.S
    compiler-rt/trunk/lib/builtins/arm/umodsi3.S
    compiler-rt/trunk/lib/builtins/assembly.h

Modified: compiler-rt/trunk/lib/builtins/arm/udivmodsi4.S
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/builtins/arm/udivmodsi4.S?rev=213467&r1=213466&r2=213467&view=diff
==============================================================================
--- compiler-rt/trunk/lib/builtins/arm/udivmodsi4.S (original)
+++ compiler-rt/trunk/lib/builtins/arm/udivmodsi4.S Sat Jul 19 23:44:21 2014
@@ -16,6 +16,9 @@
 
 	.syntax unified
 	.text
+#if __ARM_ARCH_ISA_THUMB == 2
+	.thumb
+#endif
 
 	.p2align 2
 DEFINE_COMPILERRT_FUNCTION(__udivmodsi4)
@@ -38,11 +41,15 @@ DEFINE_COMPILERRT_FUNCTION(__udivmodsi4)
 	 *
 	 * r0 is the numerator, r1 the denominator.
 	 *
+	 * ARM:
 	 * The code before JMP computes the correct shift I, so that
 	 * r0 and (r1 << I) have the highest bit set in the same position.
 	 * At the time of JMP, ip := .Ldiv0block - 12 * I.
 	 * This depends on the fixed instruction size of block.
 	 *
+	 * Thumb 2:
+	 * Uses a jumptable to jump to the appropriate block.
+	 *
 	 * block(shift) implements the test-and-update-quotient core.
 	 * It assumes (r0 << shift) can be computed without overflow and
 	 * that (r0 << shift) < 2 * r1. The quotient is stored in r3.
@@ -52,17 +59,59 @@ DEFINE_COMPILERRT_FUNCTION(__udivmodsi4)
 	clz	ip, r0
 	clz	r3, r1
 	/* r0 >= r1 implies clz(r0) <= clz(r1), so ip <= r3. */
+#if __ARM_ARCH_ISA_THUMB == 2
+	sub	ip, r3, ip
+	mov	r3, #0
+	tbb	[pc, ip]
+LOCAL_LABEL(JT):
+	.byte (LOCAL_LABEL( 0) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL( 1) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL( 2) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL( 3) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL( 4) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL( 5) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL( 6) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL( 7) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL( 8) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL( 9) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(10) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(11) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(12) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(13) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(14) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(15) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(16) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(17) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(18) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(19) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(20) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(21) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(22) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(23) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(24) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(25) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(26) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(27) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(28) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(29) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(30) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(31) - LOCAL_LABEL(JT)) / 2
+#else
 	sub	r3, r3, ip
-	adr	ip, LOCAL_LABEL(div0block)
+	adr	ip, LOCAL_LABEL(0)
 	sub	ip, ip, r3, lsl #2
 	sub	ip, ip, r3, lsl #3
 	mov	r3, #0
 	bx	ip
+#endif
 #  else
+#if __ARM_ARCH_ISA_THUMB == 2
+#error unsupported configuration
+#endif
 	str	r4, [sp, #-8]!
 
 	mov	r4, r0
-	adr	ip, LOCAL_LABEL(div0block)
+	adr	ip, LOCAL_LABEL(0)
 
 	lsr	r3, r4, #16
 	cmp	r3, r1
@@ -96,9 +145,11 @@ DEFINE_COMPILERRT_FUNCTION(__udivmodsi4)
 
 #define	IMM	#
 
-#define block(shift) \
-	cmp	r0, r1, lsl IMM shift; \
-	addhs	r3, r3, IMM (1 << shift); \
+#define block(shift)                                                           \
+LOCAL_LABEL(shift):                                                            \
+	cmp	r0, r1, lsl IMM shift;                                         \
+	ITT hs;                                                                \
+	addhs	r3, r3, IMM (1 << shift);                                      \
 	subhs	r0, r0, r1, lsl IMM shift
 
 	block(31)
@@ -132,7 +183,6 @@ DEFINE_COMPILERRT_FUNCTION(__udivmodsi4)
 	block(3)
 	block(2)
 	block(1)
-LOCAL_LABEL(div0block):
 	block(0)
 
 	str	r0, [r2]

Modified: compiler-rt/trunk/lib/builtins/arm/udivsi3.S
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/builtins/arm/udivsi3.S?rev=213467&r1=213466&r2=213467&view=diff
==============================================================================
--- compiler-rt/trunk/lib/builtins/arm/udivsi3.S (original)
+++ compiler-rt/trunk/lib/builtins/arm/udivsi3.S Sat Jul 19 23:44:21 2014
@@ -16,6 +16,9 @@
 
 	.syntax unified
 	.text
+#if __ARM_ARCH_ISA_THUMB == 2
+	.thumb
+#endif
 
 	.p2align 2
 DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_uidiv, __udivsi3)
@@ -32,6 +35,7 @@ DEFINE_COMPILERRT_FUNCTION(__udivsi3)
 	bcc	LOCAL_LABEL(divby0)
 	JMPc(lr, eq)
 	cmp	r0, r1
+	IT cc
 	movcc	r0, #0
 	JMPc(lr, cc)
 	/*
@@ -39,11 +43,15 @@ DEFINE_COMPILERRT_FUNCTION(__udivsi3)
 	 *
 	 * r0 is the numerator, r1 the denominator.
 	 *
+	 * ARM:
 	 * The code before JMP computes the correct shift I, so that
 	 * r0 and (r1 << I) have the highest bit set in the same position.
 	 * At the time of JMP, ip := .Ldiv0block - 12 * I.
 	 * This depends on the fixed instruction size of block.
 	 *
+	 * Thumb 2:
+	 * Uses a jumptable to jump to the appropriate block.
+	 *
 	 * block(shift) implements the test-and-update-quotient core.
 	 * It assumes (r0 << shift) can be computed without overflow and
 	 * that (r0 << shift) < 2 * r1. The quotient is stored in r3.
@@ -53,15 +61,57 @@ DEFINE_COMPILERRT_FUNCTION(__udivsi3)
 	clz	ip, r0
 	clz	r3, r1
 	/* r0 >= r1 implies clz(r0) <= clz(r1), so ip <= r3. */
+#if __ARM_ARCH_ISA_THUMB == 2
+	sub	ip, r3, ip
+	mov	r3, #0
+	tbb	[pc, ip]
+LOCAL_LABEL(JT):
+	.byte (LOCAL_LABEL( 0) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL( 1) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL( 2) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL( 3) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL( 4) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL( 5) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL( 6) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL( 7) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL( 8) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL( 9) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(10) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(11) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(12) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(13) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(14) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(15) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(16) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(17) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(18) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(19) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(20) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(21) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(22) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(23) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(24) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(25) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(26) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(27) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(28) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(29) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(30) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(31) - LOCAL_LABEL(JT)) / 2
+#else
 	sub	r3, r3, ip
-	adr	ip, LOCAL_LABEL(div0block)
+	adr	ip, LOCAL_LABEL(0)
 	sub	ip, ip, r3, lsl #2
 	sub	ip, ip, r3, lsl #3
 	mov	r3, #0
 	bx	ip
+#endif
 #  else
+#if __ARM_ARCH_ISA_THUMB == 2
+#error unsupported configuration
+#endif
 	mov	r2, r0
-	adr	ip, LOCAL_LABEL(div0block)
+	adr	ip, LOCAL_LABEL(0)
 
 	lsr	r3, r2, #16
 	cmp	r3, r1
@@ -94,10 +144,12 @@ DEFINE_COMPILERRT_FUNCTION(__udivsi3)
 
 #define	IMM	#
 
-#define block(shift) \
-	cmp	r0, r1, lsl IMM shift; \
-	addhs	r3, r3, IMM (1 << shift); \
-	subhs	r0, r0, r1, lsl IMM shift
+#define block(shift)                                                           \
+LOCAL_LABEL(shift):                                                            \
+	cmp r0, r1, lsl IMM shift;                                             \
+	ITT hs;                                                                \
+	addhs r3, r3, IMM(1 << shift);                                         \
+	subhs r0, r0, r1, lsl IMM shift
 
 	block(31)
 	block(30)
@@ -130,7 +182,6 @@ DEFINE_COMPILERRT_FUNCTION(__udivsi3)
 	block(3)
 	block(2)
 	block(1)
-LOCAL_LABEL(div0block):
 	block(0)
 
 	mov	r0, r3

Modified: compiler-rt/trunk/lib/builtins/arm/umodsi3.S
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/builtins/arm/umodsi3.S?rev=213467&r1=213466&r2=213467&view=diff
==============================================================================
--- compiler-rt/trunk/lib/builtins/arm/umodsi3.S (original)
+++ compiler-rt/trunk/lib/builtins/arm/umodsi3.S Sat Jul 19 23:44:21 2014
@@ -16,6 +16,9 @@
 
 	.syntax unified
 	.text
+#if __ARM_ARCH_ISA_THUMB == 2
+	.thumb
+#endif
 
 	.p2align 2
 DEFINE_COMPILERRT_FUNCTION(__umodsi3)
@@ -30,6 +33,7 @@ DEFINE_COMPILERRT_FUNCTION(__umodsi3)
 #else
 	cmp	r1, #1
 	bcc	LOCAL_LABEL(divby0)
+	IT eq
 	moveq	r0, #0
 	JMPc(lr, eq)
 	cmp	r0, r1
@@ -39,11 +43,15 @@ DEFINE_COMPILERRT_FUNCTION(__umodsi3)
 	 *
 	 * r0 is the numerator, r1 the denominator.
 	 *
+	 * For ARM:
 	 * The code before JMP computes the correct shift I, so that
 	 * r0 and (r1 << I) have the highest bit set in the same position.
 	 * At the time of JMP, ip := .Ldiv0block - 8 * I.
 	 * This depends on the fixed instruction size of block.
 	 *
+	 * For Thumb:
+	 * Uses a jumptable to jump to the appropriate block.
+	 *
 	 * block(shift) implements the test-and-update-quotient core.
 	 * It assumes (r0 << shift) can be computed without overflow and
 	 * that (r0 << shift) < 2 * r1. The quotient is stored in r3.
@@ -54,12 +62,52 @@ DEFINE_COMPILERRT_FUNCTION(__umodsi3)
 	clz	r3, r1
 	/* r0 >= r1 implies clz(r0) <= clz(r1), so ip <= r3. */
 	sub	r3, r3, ip
-	adr	ip, LOCAL_LABEL(div0block)
+#if __ARM_ARCH_ISA_THUMB == 2
+	tbb	[pc, r3]
+LOCAL_LABEL(JT):
+	.byte (LOCAL_LABEL( 0) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL( 1) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL( 2) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL( 3) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL( 4) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL( 5) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL( 6) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL( 7) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL( 8) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL( 9) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(10) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(11) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(12) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(13) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(14) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(15) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(16) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(17) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(18) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(19) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(20) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(21) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(22) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(23) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(24) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(25) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(26) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(27) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(28) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(29) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(30) - LOCAL_LABEL(JT)) / 2
+	.byte (LOCAL_LABEL(31) - LOCAL_LABEL(JT)) / 2
+#else
+	adr	ip, LOCAL_LABEL(0)
 	sub	ip, ip, r3, lsl #3
 	bx	ip
+#endif
 #  else
+#if __ARM_ARCH_ISA_THUMB == 2
+#error unsupported configuration
+#endif
 	mov	r2, r0
-	adr	ip, LOCAL_LABEL(div0block)
+	adr	ip, LOCAL_LABEL(0)
 
 	lsr	r3, r2, #16
 	cmp	r3, r1
@@ -90,9 +138,11 @@ DEFINE_COMPILERRT_FUNCTION(__umodsi3)
 
 #define	IMM	#
 
-#define block(shift) \
-	cmp	r0, r1, lsl IMM shift; \
-	subhs	r0, r0, r1, lsl IMM shift
+#define block(shift)                                                           \
+LOCAL_LABEL(shift):                                                            \
+	cmp r0, r1, lsl IMM shift;                                             \
+	IT hs;                                                                 \
+	subhs r0, r0, r1, lsl IMM shift
 
 	block(31)
 	block(30)
@@ -125,7 +175,6 @@ DEFINE_COMPILERRT_FUNCTION(__umodsi3)
 	block(3)
 	block(2)
 	block(1)
-LOCAL_LABEL(div0block):
 	block(0)
 	JMP(lr)
 #endif /* __ARM_ARCH_EXT_IDIV__ */

Modified: compiler-rt/trunk/lib/builtins/assembly.h
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/builtins/assembly.h?rev=213467&r1=213466&r2=213467&view=diff
==============================================================================
--- compiler-rt/trunk/lib/builtins/assembly.h (original)
+++ compiler-rt/trunk/lib/builtins/assembly.h Sat Jul 19 23:44:21 2014
@@ -22,6 +22,16 @@
 #define SEPARATOR ;
 #endif
 
+#if defined(__arm__)
+#if __ARM_ARCH_ISA_THUMB == 2
+#define IT  it
+#define ITT itt
+#else
+#define IT  @
+#define ITT @
+#endif
+#endif
+
 #if defined(__APPLE__)
 #define HIDDEN(name) .private_extern name
 #define LOCAL_LABEL(name) L_##name
@@ -86,7 +96,9 @@
 
 #ifdef ARM_HAS_BX
 #define JMP(r) bx r
-#define JMPc(r, c) bx##c r
+#define JMPc(r, c)                                                             \
+  IT c;                                                                        \
+  bx##c r
 #else
 #define JMP(r) mov pc, r
 #define JMPc(r, c) mov##c pc, r





More information about the llvm-commits mailing list