[llvm-branch-commits] [compiler-rt] [compiler-rt][ARM] Optimized single precision FP add/sub (PR #179929)
Simon Tatham via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri Feb 6 06:09:52 PST 2026
https://github.com/statham-arm updated https://github.com/llvm/llvm-project/pull/179929
>From d71c66686332436d87c857be79d0bedafe3cf206 Mon Sep 17 00:00:00 2001
From: Simon Tatham <simon.tatham at arm.com>
Date: Thu, 29 Jan 2026 16:20:54 +0000
Subject: [PATCH 1/3] [compiler-rt][ARM] Optimized single precision FP add/sub
This replaces the previous Thumb1-specific addsf3 with both Thumb1 and
Arm/Thumb2 add/sub.
I've removed the old Thumb1 addsf3 completely, partly because this
implementation is expected to be faster, and partly because the new
tests exposed a bug in the old implementation. However the new
implementation does consume more code, so perhaps putting the old
implementation back as an alternative with the bug fixed might be a
useful option.
---
compiler-rt/lib/builtins/CMakeLists.txt | 8 +-
compiler-rt/lib/builtins/arm/addsf3.S | 967 ++++++++++++++-----
compiler-rt/lib/builtins/arm/thumb1/addsf3.S | 888 +++++++++++++++++
compiler-rt/test/builtins/Unit/addsf3_test.c | 384 ++++++++
compiler-rt/test/builtins/Unit/subsf3_test.c | 382 ++++++++
5 files changed, 2368 insertions(+), 261 deletions(-)
create mode 100644 compiler-rt/lib/builtins/arm/thumb1/addsf3.S
create mode 100644 compiler-rt/test/builtins/Unit/addsf3_test.c
create mode 100644 compiler-rt/test/builtins/Unit/subsf3_test.c
diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt
index ed6a9d00db069..ac7396f6ba481 100644
--- a/compiler-rt/lib/builtins/CMakeLists.txt
+++ b/compiler-rt/lib/builtins/CMakeLists.txt
@@ -445,6 +445,7 @@ if(COMPILER_RT_ARM_OPTIMIZED_FP AND BUILTIN_SUPPORTED_ARCH MATCHES "arm")
if(implicit_it_flag)
set(assembly_files
+ arm/addsf3.S
arm/mulsf3.S
arm/divsf3.S
arm/adddf3.S
@@ -518,15 +519,18 @@ set(thumb1_base_SOURCES
arm/divsi3.S
arm/udivsi3.S
arm/comparesf2.S
- arm/addsf3.S
${GENERIC_SOURCES}
)
+# arm/addsf3.S implements both addition and subtraction via cross-branching
+set_property(SOURCE arm/addsf3.S PROPERTY crt_supersedes subsf3.c)
+set_property(SOURCE arm/addsf3.S DIRECTORY ${COMPILER_RT_SOURCE_DIR} PROPERTY crt_provides subsf3)
set_property(SOURCE arm/adddf3.S PROPERTY crt_supersedes subdf3.c)
set_property(SOURCE arm/adddf3.S DIRECTORY ${COMPILER_RT_SOURCE_DIR} PROPERTY crt_provides subdf3)
if(COMPILER_RT_ARM_OPTIMIZED_FP)
set(thumb1_base_SOURCES
arm/thumb1/mulsf3.S
+ arm/thumb1/addsf3.S
arm/thumb1/cmpdf2.S
arm/thumb1/cmpsf2.S
arm/thumb1/gedf2.S
@@ -538,6 +542,8 @@ if(COMPILER_RT_ARM_OPTIMIZED_FP)
arm/funder.c
${thumb1_base_SOURCES}
)
+ set_property(SOURCE arm/thumb1/addsf3.S PROPERTY crt_supersedes subsf3.c)
+ set_property(SOURCE arm/thumb1/addsf3.S DIRECTORY ${COMPILER_RT_SOURCE_DIR} PROPERTY crt_provides subsf3)
set_property(SOURCE arm/thumb1/cmpdf2.S PROPERTY crt_supersedes comparedf2.c)
set_property(SOURCE arm/thumb1/cmpdf2.S DIRECTORY ${COMPILER_RT_SOURCE_DIR} PROPERTY crt_provides comparedf2)
set_property(SOURCE arm/thumb1/cmpsf2.S PROPERTY crt_supersedes comparesf2.S)
diff --git a/compiler-rt/lib/builtins/arm/addsf3.S b/compiler-rt/lib/builtins/arm/addsf3.S
index 7b7cf85922753..7e2daff571a31 100644
--- a/compiler-rt/lib/builtins/arm/addsf3.S
+++ b/compiler-rt/lib/builtins/arm/addsf3.S
@@ -1,4 +1,4 @@
-//===-- addsf3.S - Adds two single precision floating pointer numbers-----===//
+//===-- addsf3.S - Add/subtract single precision floating point numbers ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -6,271 +6,718 @@
//
//===----------------------------------------------------------------------===//
//
-// This file implements the __addsf3 (single precision floating pointer number
-// addition with the IEEE-754 default rounding (to nearest, ties to even)
-// function for the ARM Thumb1 ISA.
+// This file implements the __addsf3 and __subsf3 functions (single precision
+// floating point number addition and subtraction), with the IEEE-754 default
+// rounding (to nearest, ties to even), for the Arm and Thumb2 ISAs.
//
//===----------------------------------------------------------------------===//
#include "../assembly.h"
-#define significandBits 23
-#define typeWidth 32
- .syntax unified
- .text
- .thumb
+ .syntax unified
+ .text
.p2align 2
-DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_fadd, __addsf3)
-
-DEFINE_COMPILERRT_THUMB_FUNCTION(__addsf3)
- push {r4, r5, r6, r7, lr}
- // Get the absolute value of a and b.
- lsls r2, r0, #1
- lsls r3, r1, #1
- lsrs r2, r2, #1 // aAbs
- beq LOCAL_LABEL(a_zero_nan_inf)
- lsrs r3, r3, #1 // bAbs
- beq LOCAL_LABEL(zero_nan_inf)
-
- // Detect if a or b is infinity or Nan.
- lsrs r6, r2, #(significandBits)
- lsrs r7, r3, #(significandBits)
- cmp r6, #0xFF
- beq LOCAL_LABEL(zero_nan_inf)
- cmp r7, #0xFF
- beq LOCAL_LABEL(zero_nan_inf)
-
- // Swap Rep and Abs so that a and aAbs has the larger absolute value.
- cmp r2, r3
- bhs LOCAL_LABEL(no_swap)
- movs r4, r0
- movs r5, r2
- movs r0, r1
- movs r2, r3
- movs r1, r4
- movs r3, r5
-LOCAL_LABEL(no_swap):
-
- // Get the significands and shift them to give us round, guard and sticky.
- lsls r4, r0, #(typeWidth - significandBits)
- lsrs r4, r4, #(typeWidth - significandBits - 3) // aSignificand << 3
- lsls r5, r1, #(typeWidth - significandBits)
- lsrs r5, r5, #(typeWidth - significandBits - 3) // bSignificand << 3
-
- // Get the implicitBit.
- movs r6, #1
- lsls r6, r6, #(significandBits + 3)
-
- // Get aExponent and set implicit bit if necessary.
- lsrs r2, r2, #(significandBits)
- beq LOCAL_LABEL(a_done_implicit_bit)
- orrs r4, r6
-LOCAL_LABEL(a_done_implicit_bit):
-
- // Get bExponent and set implicit bit if necessary.
- lsrs r3, r3, #(significandBits)
- beq LOCAL_LABEL(b_done_implicit_bit)
- orrs r5, r6
-LOCAL_LABEL(b_done_implicit_bit):
-
- // Get the difference in exponents.
- subs r6, r2, r3
- beq LOCAL_LABEL(done_align)
-
- // If b is denormal, then a must be normal as align > 0, and we only need to
- // right shift bSignificand by (align - 1) bits.
- cmp r3, #0
- bne 1f
- subs r6, r6, #1
-1:
-
- // No longer needs bExponent. r3 is dead here.
- // Set sticky bits of b: sticky = bSignificand << (typeWidth - align).
- movs r3, #(typeWidth)
- subs r3, r3, r6
- movs r7, r5
- lsls r7, r3
- beq 1f
- movs r7, #1
-1:
-
- // bSignificand = bSignificand >> align | sticky;
- lsrs r5, r6
- orrs r5, r7
- bne LOCAL_LABEL(done_align)
- movs r5, #1 // sticky; b is known to be non-zero.
-
-LOCAL_LABEL(done_align):
- // isSubtraction = (aRep ^ bRep) >> 31;
- movs r7, r0
- eors r7, r1
- lsrs r7, #31
- bne LOCAL_LABEL(do_substraction)
-
- // Same sign, do Addition.
-
- // aSignificand += bSignificand;
- adds r4, r4, r5
-
- // Check carry bit.
- movs r6, #1
- lsls r6, r6, #(significandBits + 3 + 1)
- movs r7, r4
- ands r7, r6
- beq LOCAL_LABEL(form_result)
- // If the addition carried up, we need to right-shift the result and
- // adjust the exponent.
- movs r7, r4
- movs r6, #1
- ands r7, r6 // sticky = aSignificand & 1;
- lsrs r4, #1
- orrs r4, r7 // result Significand
- adds r2, #1 // result Exponent
- // If we have overflowed the type, return +/- infinity.
- cmp r2, 0xFF
- beq LOCAL_LABEL(ret_inf)
-
-LOCAL_LABEL(form_result):
- // Shift the sign, exponent and significand into place.
- lsrs r0, #(typeWidth - 1)
- lsls r0, #(typeWidth - 1) // Get Sign.
- lsls r2, #(significandBits)
- orrs r0, r2
- movs r1, r4
- lsls r4, #(typeWidth - significandBits - 3)
- lsrs r4, #(typeWidth - significandBits)
- orrs r0, r4
-
- // Final rounding. The result may overflow to infinity, but that is the
- // correct result in that case.
- // roundGuardSticky = aSignificand & 0x7;
- movs r2, #0x7
- ands r1, r2
- // if (roundGuardSticky > 0x4) result++;
-
- cmp r1, #0x4
- blt LOCAL_LABEL(done_round)
- beq 1f
- adds r0, #1
- pop {r4, r5, r6, r7, pc}
-1:
-
- // if (roundGuardSticky == 0x4) result += result & 1;
- movs r1, r0
- lsrs r1, #1
- bcc LOCAL_LABEL(done_round)
- adds r0, r0, #1
-LOCAL_LABEL(done_round):
- pop {r4, r5, r6, r7, pc}
-
-LOCAL_LABEL(do_substraction):
- subs r4, r4, r5 // aSignificand -= bSignificand;
- beq LOCAL_LABEL(ret_zero)
- movs r6, r4
- cmp r2, 0
- beq LOCAL_LABEL(form_result) // if a's exp is 0, no need to normalize.
- // If partial cancellation occurred, we need to left-shift the result
- // and adjust the exponent:
- lsrs r6, r6, #(significandBits + 3)
- bne LOCAL_LABEL(form_result)
-
- push {r0, r1, r2, r3}
- movs r0, r4
- bl SYMBOL_NAME(__clzsi2)
- movs r5, r0
- pop {r0, r1, r2, r3}
- // shift = rep_clz(aSignificand) - rep_clz(implicitBit << 3);
- subs r5, r5, #(typeWidth - significandBits - 3 - 1)
- // aSignificand <<= shift; aExponent -= shift;
- lsls r4, r5
- subs r2, r2, r5
- bgt LOCAL_LABEL(form_result)
-
- // Do normalization if aExponent <= 0.
- movs r6, #1
- subs r6, r6, r2 // 1 - aExponent;
- movs r2, #0 // aExponent = 0;
- movs r3, #(typeWidth) // bExponent is dead.
- subs r3, r3, r6
- movs r7, r4
- lsls r7, r3 // stickyBit = (bool)(aSignificant << (typeWidth - align))
- beq 1f
- movs r7, #1
-1:
- lsrs r4, r6 // aSignificand >> shift
- orrs r4, r7
- b LOCAL_LABEL(form_result)
-
-LOCAL_LABEL(ret_zero):
- movs r0, #0
- pop {r4, r5, r6, r7, pc}
-
-
-LOCAL_LABEL(a_zero_nan_inf):
- lsrs r3, r3, #1
-
-LOCAL_LABEL(zero_nan_inf):
- // Here r2 has aAbs, r3 has bAbs
- movs r4, #0xFF
- lsls r4, r4, #(significandBits) // Make +inf.
-
- cmp r2, r4
- bhi LOCAL_LABEL(a_is_nan)
- cmp r3, r4
- bhi LOCAL_LABEL(b_is_nan)
-
- cmp r2, r4
- bne LOCAL_LABEL(a_is_rational)
- // aAbs is INF.
- eors r1, r0 // aRep ^ bRep.
- movs r6, #1
- lsls r6, r6, #(typeWidth - 1) // get sign mask.
- cmp r1, r6 // if they only differ on sign bit, it's -INF + INF
- beq LOCAL_LABEL(a_is_nan)
- pop {r4, r5, r6, r7, pc}
-
-LOCAL_LABEL(a_is_rational):
- cmp r3, r4
- bne LOCAL_LABEL(b_is_rational)
- movs r0, r1
- pop {r4, r5, r6, r7, pc}
-
-LOCAL_LABEL(b_is_rational):
- // either a or b or both are zero.
- adds r4, r2, r3
- beq LOCAL_LABEL(both_zero)
- cmp r2, #0 // is absA 0 ?
- beq LOCAL_LABEL(ret_b)
- pop {r4, r5, r6, r7, pc}
-
-LOCAL_LABEL(both_zero):
- ands r0, r1 // +0 + -0 = +0
- pop {r4, r5, r6, r7, pc}
-
-LOCAL_LABEL(ret_b):
- movs r0, r1
-
-LOCAL_LABEL(ret):
- pop {r4, r5, r6, r7, pc}
-
-LOCAL_LABEL(b_is_nan):
- movs r0, r1
-LOCAL_LABEL(a_is_nan):
- movs r1, #1
- lsls r1, r1, #(significandBits -1) // r1 is quiet bit.
- orrs r0, r1
- pop {r4, r5, r6, r7, pc}
-
-LOCAL_LABEL(ret_inf):
- movs r4, #0xFF
- lsls r4, r4, #(significandBits)
- orrs r0, r4
- lsrs r0, r0, #(significandBits)
- lsls r0, r0, #(significandBits)
- pop {r4, r5, r6, r7, pc}
-
-
-END_COMPILERRT_FUNCTION(__addsf3)
+// General structure of this code:
+//
+// There are three actual entry points here, for addition, subtraction and
+// reversed subtraction (just taking the operands the other way round, so that
+// it returns y-x instead of x-y). But the first thing the functions do (after
+// checking for NaNs) is to sort out whether the magnitudes of the two inputs
+// are being added (x+y with like signs, or x-y with different signs), or
+// subtracted. So fadd jumps across into the middle of fsub if it sees that the
+// signs are different, and vice versa. Then the main code path in fadd handles
+// magnitude addition, and the one in fsub handles magnitude subtraction.
+//
+// NaNs are checked first, so that an input NaN can be propagated exactly,
+// including its sign bit. After ruling out that case, it's safe to flip the
+// sign of one of the inputs, so that during the cross-calls, x - y can be
+// rewritten as x + (-y) and vice versa.
+
+#if __ARM_PCS_VFP
+DEFINE_COMPILERRT_FUNCTION(__addsf3)
+ push {r4, lr}
+ vmov r0, s0
+ vmov r1, s1
+ bl __aeabi_fadd
+ vmov s0, r0
+ pop {r4, pc}
+#else
+DEFINE_COMPILERRT_FUNCTION_ALIAS(__addsf3, __aeabi_fadd)
+#endif
+
+DEFINE_COMPILERRT_FUNCTION(__aeabi_fadd)
+ // Test for all uncommon values at once: infinities, NaNs, denormals and
+ // zeroes. Branch out of line if any are found. We do this by XORing each
+ // input with itself shifted left by a bit, which means that exponents 00 and
+ // FF will both end up with seven zero bits at the top.
+ eor r2, r0, r0, lsl #1 // combine x with itself shifted
+ eor r3, r1, r1, lsl #1 // same for y
+ tst r2, #0x7F000000 // is x uncommon?
+ tstne r3, #0x7F000000 // if not, is y uncommon?
+ beq LOCAL_LABEL(add_uncommon) // if either, branch out of line
+
+ // Now we have two normalised numbers. If their signs are opposite, we should
+ // be subtracting their magnitudes rather than adding, so cross-jump to fsub.
+ teq r0, r1 // set N if signs are unequal
+ eormi r1, r1, #1 << 31 // if so, flip the sign of y
+ bmi LOCAL_LABEL(sub_magnitude) // and go to magnitude subtraction
+LOCAL_LABEL(add_magnitude):
+ // If we get here, we're adding operands with equal signs (i.e. a magnitude
+ // addition). First thing to do is put the operands in magnitude order, so
+ // that x >= y.
+ subs r2, r0, r1 // compare inputs, also keeping x-y
+ sublo r0, r0, r2 // if x<y then turn x into y, using value in r2
+ addlo r1, r1, r2 // and similarly turn y into x
+
+ // Keep the sign and exponent of the larger input, to use as the sign and
+ // exponent of the output (up to carries and overflows). Also calculate the
+ // exponent difference, which tells us how far we'll need to shift y's
+ // mantissa right to add it to x's.
+ //
+ // The shifted-right values will include the sign bits as well as the
+ // exponents, but that's OK, in this branch the two sign bits are the same,
+ // so they'll cancel when subtracted.
+ //
+ // The exponent difference can be as large as 0xFE (maximum exponent minus
+ // minimum), which still fits in 8 bits, so shifting right by that amount is
+ // well defined in AArch32.
+ mov r2, r0, lsr #23
+ sub r3, r2, r1, lsr #23
+
+ // Extract both mantissas, moved up to the top of the word, with the leading
+ // 1 made explicit.
+ mov r12, #1 << 31 // the leading 1 by itself
+ orr r0, r12, r0, lsl #8
+ orr r1, r12, r1, lsl #8
+
+LOCAL_LABEL(add_doadd):
+ // Here we perform the actual addition. We either fell through from the code
+ // above, or jumped back to here after handling an input denormal.
+ //
+ // We get here with:
+ // Operands known to be numeric rather than zero/infinity/NaN;
+ // r0 = mantissa of larger operand (in high 24 bits);
+ // r1 = mantissa of smaller operand (in high 24 bits);
+ // r2 = result sign and exponent (in low 9 bits);
+ // r3 = exponent difference.
+ //
+ // For normal inputs, the mantissa registers (r0,r1) will have the top bit
+ // set. Denormals will leave that bit clear, treating the number as
+ // 0.[mantissa] x 2^(fixed exponent) instead of renormalising to 1.[mantissa]
+ // x 2^(variable exponent) as a multiplication would want.
+
+ // Actually shift the smaller mantissa downwards and add them together.
+#if !__thumb__
+ adds r12, r0, r1, lsr r3 // CS if x >= 2.0
+#else
+ // Thumb can't fold a register-controlled shift into an add, so we must use
+ // two separate instructions.
+ lsr r12, r1, r3
+ adds r12, r0, r12
+#endif
+
+ // If that addition carried off the top of r12, then the number has increased
+ // its exponent. Diverge into a completely separate code path for that case,
+ // because there we must check for overflow.
+ bcs LOCAL_LABEL(add_carry)
+
+ // Here, on the non-carrying path, we don't need to check for overflow at
+ // all. If there is an overflow it can only be due to rounding up, so the
+ // overflowed mantissa will be all zeroes, so the naively generated output
+ // will look like the correct infinity anyway.
+ //
+ // We shift the mantissa down to its final position, and recombine it with
+ // the sign + exponent (in r2) via addition. We keep the bit shifted off the
+ // bottom of the mantissa in C, and then use ADC for the recombination, which
+ // causes us to round up if that bit was set without needing an extra
+ // instruction. But the leading bit of the mantissa increments the exponent
+ // field unwantedly, so we must decrement r2 first to compensate for that.
+ sub r2, r2, #1
+ movs r0, r12, lsr #8
+ adc r0, r0, r2, lsl #23
+
+ // If we _didn't_ round up, then we're done.
+ bxcc lr
+
+ // But if we did round up, then we must also check if we need to round to
+ // even. This occurs if all the bits of y's mantissa shifted off the bottom
+ // are zero except for the round bit.
+ //
+ // Some of those bits are in r12 (the 32-bit version of the sum's mantissa).
+ // It's cheap to check those, and should exclude _most_ cases where
+ // round-to-even isn't needed.
+ tst r12, #127
+ bxne lr
+
+ // Failing that, we have to go back to the original mantissa of y (still in
+ // r1) and work out exactly how many bits of it to check.
+ rsb r3, r3, #32 // opposite of the amount we shifted y right by
+ lsls r1, r1, r3 // shift y left by that amount instead
+
+ // Now if Z is set, we do round to even, which works by just clearing the low
+ // bit of the output mantissa. This undoes the round-up if we rounded up to
+ // an odd mantissa, and otherwise, makes no difference.
+ biceq r0, r0, #1
+
+ // And now we're done.
+ bx lr
+
+LOCAL_LABEL(add_carry):
+ // This is the separate code path in which adding the mantissas together
+ // caused a carry off the top of the word, so that the exponent of the output
+ // incremented (even before rounding). Start by shifting the carry bit back
+ // in.
+ rrx r0, r12
+
+ // Now recombine the sign and exponent, and do the basic rounding (apart from
+ // round to even), in the same way as the non-carrying code path above.
+ // However this time we don't decrement r2, because we want our exponent to
+ // come out bigger by 1 than in the other code path.
+ movs r0, r0, lsr #8 // shift mantissa down to the right position
+ adc r0, r0, r2, lsl #23 // recombine with sign+exponent, and round
+
+ // Note that the mantissa cannot have overflowed during rounding: if it has
+ // all bits 1 before rounding, both operands must also have had all mantissa
+ // bits 1, and the same exponent - which implies the round bit was 0.
+ //
+ // So we definitely have the correct output exponent. There are two problems
+ // left: we might need to round to even, and we might have overflowed.
+
+ // First, do the cheap check that _usually_ rules out round-to-even. We only
+ // do this if C is set (i.e. if we rounded up), and we end up with Z=0 if no
+ // RTE. This relies on also having Z=0 already, in the case where we _didn't_
+ // round up - and that must be true because the last time we set the flags it
+ // was by shifting down the output mantissa, and that will always have had
+ // its leading bit set.
+ tstcs r12, #255 // test one more bit than on the no-carry path
+
+ // Now if Z=1 then we need to do the full check for RTE. But first, prepare a
+ // version of the output value shifted left by 1 where it's convenient to
+ // check its exponent for overflow. (We couldn't do that until we'd finished
+ // with r12 by testing it in the previous instruction.)
+ mov r12, r0, lsl #1
+
+ // Now, if we need to check for RTE, go off and do it.
+ beq LOCAL_LABEL(add_roundeven_ovf)
+
+ // Otherwise, we still need to check for overflow.
+ cmp r12, #0xff000000 // if r12 >= this, the exponent has overflowed
+ bxlo lr // so if not, we can leave
+ b LOCAL_LABEL(add_ovf) // but if so, go and handle overflow
+
+LOCAL_LABEL(add_roundeven_ovf):
+ // We came here if we detected a need to do the full check for RTE. But we
+ // may _also_ have overflowed, and just not have noticed yet.
+
+ // Same round-to-even check as in the non-carry case above.
+ rsb r3, r3, #32 // opposite of the amount we shifted y right by
+ lsls r1, r1, r3 // shift y left by that amount instead
+ biceq r0, r0, #1 // and if the remaining bits are all 0, round to even
+
+ // Now check for overflow, and if none, we're done.
+ cmp r12, #0xff000000 // if r12 >= this, the exponent has overflowed
+ bxlo lr // so if not, we can leave
+
+ // If we get here, we have definitely overflowed. Moreover, the exponent
+ // field of the number is exactly 0xff. So all we have to do is clear the
+ // mantissa, to make it into an infinity of the output sign.
+LOCAL_LABEL(add_ovf):
+ bfc r0, #0, #23
+ bx lr
+
+LOCAL_LABEL(add_uncommon):
+ // We come here if the entry-point check says that at least one of x and y
+ // has an uncommon (FF or 00) exponent. So we have at least one NaN,
+ // infinity, denormal or zero, but we don't know which, or which operand it's
+ // in. And we could have any combination of those types of input, in _both_
+ // operands.
+
+ // Detect FF exponents (NaNs or infinities) and branch again for those.
+ mov r12, #0xFF000000
+ bics r2, r12, r0, lsl #1
+ bicsne r2, r12, r1, lsl #1
+ beq LOCAL_LABEL(add_naninf)
+
+ // Now we know both inputs are finite, but there may be denormals or zeroes.
+ // So it's safe to do the same sign check and cross-jump as we did on the
+ // fast path.
+ teq r0, r1 // opposite signs?
+ eormi r1, r1, #1 << 31 // if so, negate the second operand
+ bmi LOCAL_LABEL(sub_zerodenorm) // and cross-jump to the fsub version of this code
+LOCAL_LABEL(add_zerodenorm):
+ // Now we know x and y have the same sign, and at least one of them is zero
+ // or denormal. If there aren't any zeroes, we'll end up rejoining the fast
+ // path, so we must set up all the same registers, and do our checks for zero
+ // in line with that.
+ //
+ // Start by exactly repeating the initial fast-path setup code: sort into
+ // magnitude order, get the output sign+exponent and the exponent shift.
+ subs r2, r0, r1 // compare inputs, also keeping x-y
+ sublo r0, r0, r2 // if x<y then turn x into y, using value in r2
+ addlo r1, r1, r2 // and similarly turn y into x
+ mov r2, r0, lsr #23 // get exponent of x (the sign bit will cancel)
+ sub r3, r2, r1, lsr #23 // subtract exponent of y to get shift count
+
+ // Shift y's mantissa up to the top of r1. We know y has exponent 0 (at least
+ // one of the inputs does, and we've sorted them by now). So we definitely
+ // don't need to set the leading bit on y's mantissa; also, if r1 becomes
+ // zero, then we know we have an addition to 0, and otherwise, we know both
+ // inputs are nonzero.
+ movs r1, r1, lsl #8 // is y zero?
+ bxeq lr // if so, just return x
+
+ // Now we know there aren't any zeroes, and that y is a denormal. x might or
+ // might not be a denormal, so we must check that and decide whether to set
+ // its top mantissa bit.
+ mov r0, r0, lsl #8 // shift mantissa of x to the top of r0
+ tst r2, #255 // is x's exponent 0? If so, it's denormal
+ orrne r0, r0, #1 << 31 // if not, set leading bit of x,
+ subne r3, r3, #1 // adjust exponent difference,
+ bne LOCAL_LABEL(add_doadd) // and go back to mainstream
+
+ // If both operands are denormals, addition becomes trivial: denormals and
+ // the smallest exponent of normalised numbers both multiply the mantissa by
+ // the same power of 2, so we can just add the mantissas together and put the
+ // output sign back on.
+ add r0, r0, r1 // make the output mantissa
+ mov r0, r0, lsr #8 // shift it into position
+ orr r0, r0, r2, lsl #23 // put the sign back at the top
+ bx lr // done!
+
+LOCAL_LABEL(add_naninf):
+ // We come here if at least one input is a NaN or infinity. If either or both
+ // inputs are NaN then we hand off to __fnan2 which will propagate a NaN from
+ // the input.
+ mov r12, #0xFF000000
+ cmp r12, r0, lsl #1 // if (r0 << 1) > 0xFF000000, r0 is a NaN
+ blo SYMBOL_NAME(__compiler_rt_fnan2)
+ cmp r12, r1, lsl #1
+ blo SYMBOL_NAME(__compiler_rt_fnan2)
+
+LOCAL_LABEL(add_inf):
+  // No NaNs, so we have at least one infinity. Almost all additions involving
+  // an infinity return the input infinity unchanged. The only exception is if
+  // there are two infinities that have opposite signs (which can happen even
+  // in fadd, since on this code path we haven't cross-jumped into fsub),
+  // where we return NaN.
+ eor r2, r0, r1 // see how the two inputs differ
+ cmp r2, #0x80000000 // +inf + -inf?
+ subeq r0, r2, #0x00400000 // if so, make the default output QNaN
+ bxeq lr // and return it
+ cmp r12, r0, lsl #1 // otherwise, is r0 the infinity?
+ movne r0, r1 // no, so it's r1
+ bx lr // return the infinite input unchanged
+
+END_COMPILERRT_FUNCTION(__aeabi_fadd)
+
+DEFINE_COMPILERRT_FUNCTION(__aeabi_frsub)
+ // Reversed subtraction, that is, compute y-x, where x is in r0 and y in r1.
+ //
+ // We could implement this by simply swapping r0 with r1. But the point of
+ // having a reversed-subtract in the first place is to avoid the caller
+ // having to do that, so if we do it ourselves, it wastes all the time they
+ // saved. So instead, on the fast path, we redo the sign check our own way
+ // and branch to fadd_magnitude or fsub_magnitude.
+
+ // First rule out denormals and zeroes, using the same test as fadd and fsub.
+ eor r2, r0, r0, lsl #1
+ eor r3, r1, r1, lsl #1
+ tst r2, #0x7F000000
+ tstne r3, #0x7F000000
+ beq LOCAL_LABEL(rsb_uncommon)
+
+ // Now we know we only have finite inputs, it's safe to implement the
+ // reversal of the operand order by flipping signs. (Preserving the sign of
+ // an input NaN was the only case where that wasn't right.)
+
+ eor r0, r0, #1 << 31 // flip sign of the operand we're subtracting
+ teq r0, r1 // are the signs now the same?
+ bpl LOCAL_LABEL(add_magnitude) // if so, we're doing magnitude addition
+ eor r1, r1, #1 << 31 // otherwise, flip the other sign too
+ b LOCAL_LABEL(sub_magnitude) // and we're doing magnitude subtraction
+
+LOCAL_LABEL(rsb_uncommon):
+ // Any uncommon operands to frsub are handled by just swapping the two
+ // operands and going to fsub's handler. We're off the main fast path now, so
+ // there's no need to try to optimise it any harder.
+ eor r0, r0, r1
+ eor r1, r1, r0
+ eor r0, r0, r1
+ b LOCAL_LABEL(sub_uncommon)
+
+END_COMPILERRT_FUNCTION(__aeabi_frsub)
+
+#if __ARM_PCS_VFP
+DEFINE_COMPILERRT_FUNCTION(__subsf3)
+ push {r4, lr}
+ vmov r0, s0
+ vmov r1, s1
+ bl __aeabi_fsub
+ vmov s0, r0
+ pop {r4, pc}
+#else
+DEFINE_COMPILERRT_FUNCTION_ALIAS(__subsf3, __aeabi_fsub)
+#endif
+
+DEFINE_COMPILERRT_FUNCTION(__aeabi_fsub)
+ // Main entry point for subtraction.
+ //
+ // Start by testing for uncommon operands in the usual way.
+ eor r2, r0, r0, lsl #1
+ eor r3, r1, r1, lsl #1
+ tst r2, #0x7F000000
+ tstne r3, #0x7F000000
+ beq LOCAL_LABEL(sub_uncommon)
+
+ // Check the signs, and if they're unequal, cross-jump into fadd to do
+ // magnitude addition. (Now we've excluded NaNs, it's safe to flip the sign
+ // of y.)
+ teq r0, r1
+ eormi r1, r1, #1 << 31
+ bmi LOCAL_LABEL(add_magnitude)
+
+LOCAL_LABEL(sub_magnitude):
+ // If we get here, we're subtracting operands with equal signs (i.e. a
+ // magnitude subtraction). First thing to do is put operands in magnitude
+ // order, so that x >= y. However, if they are swapped, we must also negate
+ // both of them, since x - y = (-y) - (-x).
+ subs r2, r0, r1 // LO if we must swap the operands
+#if !__thumb__
+ // Conditional on LO, swap the operands, by adding/subtracting the difference
+ // between them that we just wrote into r2. Negate them both in the process
+ // by flipping the high bit of r2 first.
+ eorlo r2, r2, #1 << 31
+ sublo r0, r0, r2
+ addlo r1, r1, r2
+#else
+ // In Thumb, conditionally branch round these three instructions, instead of
+ // conditionally executing them with an ITTT LO. Rationale: on the simpler
+ // Thumb-only cores such as Cortex-M3, a branch only takes two cycles and an
+ // IT costs one, so this saves two cycles in the untaken case and doesn't
+ // impact the taken case at all.
+ bhs 0f
+ eor r2, r2, #1 << 31
+ sub r0, r0, r2
+ add r1, r1, r2
+0:
+#endif
+
+ // Save the sign and exponent of the larger operand to use for the result (up
+ // to renormalisation), and calculate the exponent difference for shifting
+ // one mantissa relative to the other.
+ mov r2, r0, lsr #23 // r2 = sign<<8 + exponent
+ sub r3, r2, r1, lsr #23 // shift = 0..254 (sign bits cancel)
+
+ // Shift the mantissas up to the top of the words, and OR in the leading 1
+ // for each.
+ mov r12, #1 << 31
+ orr r0, r12, r0, lsl #8
+ orr r1, r12, r1, lsl #8
+
+LOCAL_LABEL(sub_dosub):
+ // Here we perform the actual subtraction. We either fell through from the
+ // code above, or jumped back to here after handling an input denormal.
+ //
+ // We get here with:
+ // Operands known to be numeric rather than zero/infinity/NaN;
+ // r0 = mantissa of larger operand (in high 24 bits);
+ // r1 = mantissa of smaller operand (in high 24 bits);
+ // r2 = result sign/exponent (in low 9 bits)
+ // r3 = exponent difference.
+ //
+ // Begin calculating the output mantissa by shifting y's mantissa right and
+ // subtracting. This may leave the mantissa too large by one, if the bits
+ // shifted out of y are nonzero. We correct this during rounding if
+ // necessary.
+#if !__thumb__
+ subs r12, r0, r1, lsr r3 // MI if high bit set
+#else
+ // Thumb can't fold a register-controlled shift into a sub, so we must use
+ // two separate instructions.
+ lsr r12, r1, r3
+ subs r12, r0, r12
+#endif
+
+ // This may have cleared the high bit of the output mantissa, in which case
+ // we must renormalise. Our strategy is to split into three code paths, on
+ // two of which an awkward case is known not to arise:
+ // * no need to renormalise at all => underflow can't happen
+ // * shift up by exactly 1 bit
+ // * shift up by more than 1 bit => rounding can't happen (result is exact)
+ //
+ // First branch out of line for the first case, which we can detect because
+ // the N flag tells us whether the top mantissa bit is still set.
+ bmi LOCAL_LABEL(sub_renorm_0)
+
+ // Now we know we're renormalising by at least one bit, which also means
+ // underflow is a risk.
+ //
+ // If we're shifting by only one bit, then underflow can only occur if the
+ // exponent was originally 1. So test both those conditions together, and if
+ // the shift is only one bit _and_ the exponent is > 1, we know we can
+ // renormalise by one bit and not worry about underflow.
+ tst r2, #254 // test all but low bit of exponent; also clears N
+#if !__thumb__
+ movsne r0, r12, lsl #1 // set N if non-underflowing _and_ top bit now set
+#else
+ // In Thumb, there's no advantage in combining the two tests, since the IT
+ // between them costs a cycle. Do the explicit branch now to fsub_underflow
+ // (because now we _know_ we have underflow).
+ beq LOCAL_LABEL(sub_underflow)
+ // And then unconditionally do the shift.
+ movs r0, r12, lsl #1 // check whether 2nd bit is cleared (PL)
+#endif
+ // After all that, N is clear if we still haven't set the top mantissa bit,
+ // either because we shifted up by a bit and it didn't help, or (in Arm state
+ // only) because we detected underflow and didn't do the shift at all.
+ //
+ // The case of 'haven't yet done the shift' is reliably indicated by the Z
+ // flag being set, because if we did do the shift, it will always have
+ // cleared Z.
+ bpl LOCAL_LABEL(sub_renorm_orunder)
+
+ // If we get here, we've renormalised by one bit (and have already shifted
+ // the mantissa up), and we also know there's no underflow.
+ //
+ // Recombine the sign+exponent with the fraction. We must also decrement the
+ // exponent, to account for the one-bit renormalisation. We do that by using
+ // ASR to shift the mantissa right: its top bit is currently set, so the ASR
+ // effectively puts -1 in the bits that are being added to the exponent.
+ movs r0, r0, asr #8 // also sets C if we need to round up
+ adc r0, r0, r2, lsl #23 // recombine, and also do basic rounding
+
+ // If C was not set, then we've rounded down. Therefore, no need to round to
+ // even, and also, no need to compensate for having shifted nonzero bits out
+ // of the subtrahend. We can just return.
+ bxcc lr
+
+ // If any bit shifted out of the 32-bit output mantissa is nonzero, then we
+ // can also return, because we know we're rounding _up_ (and not to even),
+ // and again, bits shifted out of the subtrahend don't matter because their
+ // combined loss can't exceed the gain from one of these guard bits.
+ tst r12, #0x3F
+ bxne lr
+
+ // Otherwise, we must do the full check for round to even.
+ b LOCAL_LABEL(sub_roundeven)
+
+LOCAL_LABEL(sub_renorm_0):
+ // We come here if no renormalisation is necessary, and therefore also no
+ // underflow can happen.
+ //
+ // Since the leading bit is set, we need to decrement the exponent, to
+ // account for the leading bit adding 1 to it when we recombine.
+ movs r0, r12, lsr #8 // also sets C if we need to round up
+ sub r2, r2, #1 // adjust exponent
+ adc r0, r0, r2, lsl #23 // recombine, and also do basic rounding
+
+ // As in the 1-bit case above, if we didn't round up just now then we're
+ // done, and if any bit shifted out of r12 just now was nonzero then we're
+ // also done.
+ bxcc lr // rounding down, done
+ tst r12, #0x7F
+ bxne lr // nonzero guard bit, rounding up, done
+
+ // Otherwise, fall through to the full check for round to even.
+LOCAL_LABEL(sub_roundeven):
+ // Same round-to-even check as in the fadd cases: find all the bits we
+ // shifted out of y's mantissa and see if any are zero.
+ rsb r3, r3, #32
+ lsls r1, r1, r3 // set Z if we're rounding to even
+
+ // Unlike the addition case, if we aren't rounding to even then the result is
+ // currently too _big_: the top 32 bits of the output mantissa looked as if
+ // they were on a rounding boundary, but those nonzero bits shifted off the
+ // bottom of the mantissa make the true value slightly smaller than it
+ // looked, so in fact we're just _below_ a rounding boundary. But we've
+ // already rounded it up! So in the non-RTE case we must decrement the
+ // output value.
+ subne r0, r0, #1 // no RTE, so undo round up
+ biceq r0, r0, #1 // yes RTE, so clear low bit of output
+ bx lr
+
+LOCAL_LABEL(sub_renorm_orunder):
+ // We come here if _either_ of these is true:
+ //
+ // 1. we've shifted the output mantissa left by one bit already but its top
+ // bit is still 0, so we must renormalise by more than 1 bit (and this
+ // may cause an underflow that we haven't detected yet)
+ //
+ // 2. (Arm only) we have detected an underflow already, not yet shifted the
+ // output mantissa at all, and haven't yet branched to sub_underflow.
+
+ // Get the output sign bit by itself in r3. This is needed by the code below,
+ // and also used by sub_underflow, so if we compute it before the (Arm-only)
+ // branch to sub_underflow then it doesn't have to be duplicated there.
+ mov r3, r2, lsr #8 // r3 now has just the output sign, in bit 0
+
+#if !__thumb__
+ // Arm state: we did a combined check for cases 1 and 2 above, so this is
+ // where we separate them and go off to handle underflow in case 2. As stated
+ // above, the Z flag indicates an already-detected underflow.
+ beq LOCAL_LABEL(sub_underflow)
+#endif
+
+ // Now we know that we must renormalise by at least 2 bits, which may also
+ // give a denormal or zero result.
+ //
+ // This means no rounding can possibly be needed: if the subtraction cleared
+ // the top two bits of the mantissa, it means we computed A-B and found it
+ // was less than A/2, so B > A/2, so the exponent difference was at most 1.
+ // Hence the result mantissa fits in 24 bits even before renormalisation, and
+ // the top bit is clear, so it fits in 23 bits, i.e. it is exact.
+ //
+ // (That argument applies to the result before denormalisation. But any
+ // subtraction delivering a denormal result must also be exact: the inputs to
+ // subtraction are integer multiples of the smallest denormal, hence so is
+ // the result.)
+
+ // Start by shifting up by two bits (we already know the top 2 bits are
+ // clear). In the process, test if the entire mantissa is actually zero.
+ //
+ // If the mantissa is zero, we can safely return +0. (In default IEEE
+ // round-to-nearest mode, the only case of addition/subtraction that delivers
+ // -0 is if you add two zeroes _both_ of which are -0, or the equivalent
+ // subtraction. And those cases won't have come here, because they were
+ // additions of like-signed inputs or subtraction of opposite-signed inputs,
+ // so they go to fadd instead of fsub.)
+ movs r0, r0, lsr #2
+ bxeq lr // result is zero, which r0 already contains
+
+ // Determine how many more bits we need to shift the mantissa up, by counting
+ // its leading zeroes. Adjust the exponent, and shift the mantissa into its
+ // final position (assuming the output is still a normalised number).
+ clz r12, r0 // compute the shift / exponent adjustment
+ sub r2, r2, r12 // adjust exponent
+ lsl r0, r0, r12 // shift mantissa up to the top of the word
+ lsr r0, r0, #8 // and then down to its final position
+
+ // Check for underflow. This occurs precisely when the adjustment to the
+ // exponent in the bottom 8 bits of r2 carried into its sign bit (because at
+ // the moment the value in r2 is one lower than the true output exponent, so
+ // that adding the leading 1 bit in the mantissa will increment it back to
+ // the correct value). So we can check the sign bit in r2 against the copy of
+ // it we saved in r3 earlier. If no underflow, then we can just recombine the
+ // sign and exponent with the mantissa (no rounding is needed on this branch)
+ // and return.
+ teq r3, r2, lsr #8 // Exponent underflow?
+ addeq r0, r0, r2, lsl #23 // if so, trivially put the output back together
+ bxeq lr // and return
+
+ // Now we _have_ underflowed, and the out-of-range exponent stored in the low
+ // 8 bits of r2 tell us by how much: if it's -n, then we need to shift the
+ // normalised mantissa down by n bits. So to make the output denormal, all we
+ // have to do is to shift the mantissa down and recombine it with the
+ // original sign in r3.
+ //
+ // Bit 8 of r2 contains a corrupted version of the sign bit, but we can
+ // safely ignore that, because the semantics of AArch32 register-controlled
+ // shift instructions are that only the low 8 bits of the shift-count
+ // register are examined. So that sign bit is too high up to affect what
+ // happens.
+
+ rsb r2, r2, #0 // r2 is now the shift count
+LOCAL_LABEL(sub_do_underflow): // we can also come here from below
+ mov r0, r0, lsr r2 // shift the mantissa down
+ orr r0, r0, r3, lsl #31 // put the sign back on
+ bx lr // and return
+
+LOCAL_LABEL(sub_underflow):
+ // We come here if we detected underflow in the 'renormalise by 1 bit' case.
+ // So the input exponent must have been 1, and we shift the mantissa by only
+ // one bit. The only question is whether we put the output sign on: if the
+ // result is actually zero, we don't need to, because a subtraction giving a
+ // zero output always gives +0 (as mentioned above).
+ movs r0, r12, lsr #8 // Denormalise and check if result is zero
+ bxeq lr // Return +0 if result is zero
+#if __thumb__
+ // Get the output sign in r3. In Arm this was already done just after start
+ // of sub_renorm_orunder, which all underflows went through. But in Thumb we
+ // might have come straight here without setting up r3.
+ mov r3, r2, lsr #8
+#endif
+ orr r0, r0, r3, lsl #31 // put the sign back on
+ bx lr // and return
+
+LOCAL_LABEL(sub_uncommon):
+ // We come here if the entry-point check says that at least one of x and y
+ // has an uncommon (FF or 00) exponent. So we have at least one NaN,
+ // infinity, denormal or zero, but we don't know which, or which operand it's
+ // in. And we could have any combination of those types of input, in _both_
+ // operands.
+
+ // Detect FF exponents (NaNs or infinities) and branch again for those.
+ mov r12, #0xFF000000
+ bics r2, r12, r0, lsl #1
+ bicsne r2, r12, r1, lsl #1
+ beq LOCAL_LABEL(sub_naninf)
+
+ // Now we know both inputs are finite, but there may be denormals or zeroes.
+ // So it's safe to do the same sign check and cross-jump as we did on the
+ // fast path.
+ teq r0, r1 // opposite signs?
+ eormi r1, r1, #1 << 31 // if so, negate the second operand
+ bmi LOCAL_LABEL(add_zerodenorm) // and cross-jump to the fadd version of this code
+
+LOCAL_LABEL(sub_zerodenorm):
+ // Now we know x and y have the same sign, and at least one of them is zero
+ // or denormal. If there aren't any zeroes, we'll end up rejoining the fast
+ // path, so we must set up all the same registers, and do our checks for zero
+ // in line with that.
+ //
+ // Start by exactly repeating the initial fast-path setup code: sort into
+ // magnitude order, get the output sign+exponent and the exponent shift.
+ subs r2, r0, r1 // compare inputs, also keeping x-y
+ eorlo r2, r2, #1 << 31 // if misordered, flip high bit of difference
+ sublo r0, r0, r2 // and use that to swap and sign-flip
+ addlo r1, r1, r2 // the two inputs
+ mov r2, r0, lsr #23 // r2 = sign<<8 + exponent
+ sub r3, r2, r1, lsr #23 // shift = 0..254 (sign bits cancel)
+
+ // Shift y's mantissa up to the top of r1. We know y has exponent 0 (at least
+ // one of the inputs does, and we've sorted them by now). So we definitely
+ // don't need to set the leading bit on y's mantissa; also, if r1 becomes
+ // zero, then we know we're subtracting 0 from x.
+ movs r1, r1, lsl #8
+ beq LOCAL_LABEL(sub_yzero)
+
+ // Now we know there aren't any zeroes, and that y is a denormal. x might or
+ // might not be a denormal, so we must check that and decide whether to set
+ // its top mantissa bit.
+ mov r0, r0, lsl #8 // shift mantissa of x to the top of r0
+ tst r2, #255 // is x's exponent 0? If so, it's denormal
+ orrne r0, r0, #1 << 31 // if not, set leading bit of x,
+ subne r3, r3, #1 // adjust exponent difference,
+
+ b LOCAL_LABEL(sub_dosub)
+
+LOCAL_LABEL(sub_yzero):
+ // Here, we know y = 0, so we're subtracting 0 from x. For most values of x,
+ // we return x unchanged: subtracting 0 makes no difference. But if x is
+ // _also_ 0 then we must return +0, rather than whatever x's sign of zero is.
+ // (Because +0 is always the sign of zero you return when subtracting a
+ // number from itself).
+ movs r12, r0, lsl #1 // test if x = 0 (bottom 31 bits all zero)
+ moveq r0, #0 // if so, replace x with +0
+ bx lr
+
+LOCAL_LABEL(sub_naninf):
+ // We come here if at least one input is a NaN or infinity. If either or both
+ // inputs are NaN then we hand off to __compiler_rt_fnan2 which will propagate a NaN from
+ // the input.
+ mov r12, #0xFF000000
+ cmp r12, r0, lsl #1 // if (r0 << 1) > 0xFF000000, r0 is a NaN
+ blo SYMBOL_NAME(__compiler_rt_fnan2)
+ cmp r12, r1, lsl #1
+ blo SYMBOL_NAME(__compiler_rt_fnan2)
+
+ // Otherwise, we have no NaNs and at least one infinity, so we're returning
+ // either infinity, or NaN for an (inf-inf) subtraction. We can safely handle
+ // all these cases by flipping the sign of y and going to add_inf.
+ eor r1, r1, #0x80000000
+ b LOCAL_LABEL(add_inf)
+
+END_COMPILERRT_FUNCTION(__aeabi_fsub)
NO_EXEC_STACK_DIRECTIVE
diff --git a/compiler-rt/lib/builtins/arm/thumb1/addsf3.S b/compiler-rt/lib/builtins/arm/thumb1/addsf3.S
new file mode 100644
index 0000000000000..808f154884980
--- /dev/null
+++ b/compiler-rt/lib/builtins/arm/thumb1/addsf3.S
@@ -0,0 +1,888 @@
+//===-- addsf3.S - Add/subtract single precision floating point numbers ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the __addsf3 and __subsf3 functions (single precision
+// floating point number addition and subtraction), with the IEEE-754 default
+// rounding (to nearest, ties to even), for the Thumb1 ISA.
+//
+//===----------------------------------------------------------------------===//
+
+#include "../../assembly.h"
+
+ .syntax unified
+ .text
+ .p2align 2
+
+// General structure of this code:
+//
+// There are three actual entry points here, for addition, subtraction and
+// reversed subtraction (just taking the operands the other way round, so that
+// it returns y-x instead of x-y). But the first thing the functions do (after
+// checking for NaNs) is to sort out whether the magnitudes of the two inputs
+// are being added (x+y with like signs, or x-y with different signs), or
+// subtracted. So fadd jumps across into the middle of fsub if it sees that the
+// signs are different, and vice versa. Then the main code path in fadd handles
+// magnitude addition, and the one in fsub handles magnitude subtraction.
+//
+// NaNs are checked first, so that an input NaN can be propagated exactly,
+// including its sign bit. After ruling out that case, it's safe to flip the
+// sign of one of the inputs, so that during the cross-calls, x - y can be
+// rewritten as x + (-y) and vice versa.
+
+#if __ARM_PCS_VFP
+DEFINE_COMPILERRT_FUNCTION(__addsf3)
+ push {r4, lr} // r4 appears to be saved only as padding to keep the stack 8-byte aligned (AAPCS) -- TODO confirm
+ vmov r0, s0 // hard-float ABI: move the float arguments from VFP into core registers
+ vmov r1, s1
+ bl __aeabi_fadd // the soft-float entry point below does the real work
+ vmov s0, r0 // move the integer-register result back into s0 for the hard-float caller
+ pop {r4, pc}
+#else
+DEFINE_COMPILERRT_FUNCTION_ALIAS(__addsf3, __aeabi_fadd)
+#endif
+
+DEFINE_COMPILERRT_THUMB_FUNCTION(__aeabi_fadd)
+ push {r4,r5,r6,lr}
+
+ movs r5, #1
+ lsls r5, r5, #31 // all cross-branches will expect to have r5==0x80000000
+
+ // Extract the exponents into r2 and r3. In the process, test for all
+ // uncommon values (infinities, NaNs, denormals and zeroes) and branch out of
+ // line if any are found.
+ //
+ // Uncommon operands with exponent 0xFF (NaNs and infinities) "win" over
+ // those with exponent 0 (zeroes and denormals), in the sense that if there's
+ // one of each, the 0xFF one determines the result. But we check for exponent
+ // 0 first, because that way we get it as a by-product of extracting the
+ // exponents in the first place without needing a separate compare
+ // instruction. So the zero/denorm handler will have to finish up the NaN
+ // check as its first task.
+ lsls r2, r0, #1
+ lsls r3, r1, #1
+ lsrs r2, r2, #24
+ beq LOCAL_LABEL(add_zerodenorm_x)
+ lsrs r3, r3, #24
+ beq LOCAL_LABEL(add_zerodenorm_y)
+ cmp r2, #255
+ beq LOCAL_LABEL(add_naninf)
+ cmp r3, #255
+ beq LOCAL_LABEL(add_naninf)
+
+ // Now we have two normalised numbers. If their signs are opposite, we should
+ // be subtracting their magnitudes rather than adding, so cross-jump to fsub
+ // (via a trampoline that negates y).
+ movs r4, r0
+ eors r4, r4, r1 // set N if signs are unequal
+ bmi LOCAL_LABEL(add_sub)
+LOCAL_LABEL(add_magnitude):
+ // If we get here, we're adding operands with equal signs (i.e. a magnitude
+ // addition). First thing to do is put the operands in magnitude order, so
+ // that x >= y.
+ subs r4, r0, r1
+ bhs LOCAL_LABEL(add_swapped)
+ subs r0, r0, r4
+ adds r1, r1, r4
+ // We must also swap the pre-extracted exponents here.
+ eors r2, r2, r3
+ eors r3, r3, r2
+ eors r2, r2, r3
+LOCAL_LABEL(add_swapped):
+ // Keep the sign and exponent of the larger input, to use as the sign and
+ // exponent of the output (up to carries and overflows). Also calculate the
+ // exponent difference, which tells us how far we'll need to shift y's
+ // mantissa right to add it to x's.
+ lsrs r6, r0, #23
+ subs r3, r2, r3
+
+ // Extract both mantissas, moved up to the top of the word, with the leading
+ // 1 made explicit. We put y's extracted mantissa in a different register
+ // (r4), because we'll want to keep the original y for use in add_check_rte.
+ lsls r0, r0, #8
+ lsls r4, r1, #8
+ orrs r0, r0, r5
+ orrs r4, r4, r5
+
+LOCAL_LABEL(add_doadd):
+ // Here we perform the actual addition. We either fell through from the code
+ // above, or jumped back to here after handling an input denormal.
+ //
+ // We get here with:
+ // Operands known to be numeric rather than zero/infinity/NaN;
+ // r0 = mantissa of larger operand (in high 24 bits);
+ // r4 = mantissa of smaller operand (in high 24 bits);
+ // r1 = original (or nearly so) smaller operand;
+ // r6 = result sign and exponent (in low 9 bits);
+ // r2 = exponent of x
+ // r3 = exponent difference.
+ //
+ // For normal inputs, the mantissa registers (r0,r4) will have the top bit
+ // set. Denormals will leave that bit clear, treating the number as
+ // 0.[mantissa] x 2^(fixed exponent) instead of renormalising to 1.[mantissa]
+ // x 2^(variable exponent) as a multiplication would want.
+
+ // Actually shift the smaller mantissa downwards and add them together.
+ lsrs r4, r4, r3
+ adds r5, r0, r4
+
+ // If that addition carried off the top of r5, then the number has increased
+ // its exponent. Diverge into a completely separate code path for that case,
+ // because there we must check for overflow. We'll return to the label below
+ // if no overflow.
+ bcs LOCAL_LABEL(add_carry)
+LOCAL_LABEL(add_renormed):
+ // Now we have the output mantissa in r5, with the leading bit at position
+ // 31. The precise sum may be slightly more than that, if r4 != (y << r3).
+ //
+ // Shift the mantissa down to its final position, and use the carry flag (bit
+ // shifted off the bottom) to see if we need to round.
+ lsrs r0, r5, #8
+ bcc LOCAL_LABEL(add_rounded)
+
+ // If we fall through to here, then we need to round up, and also check if we
+ // need to round to even. This occurs if all the bits of y's mantissa shifted
+ // off the bottom are zero except for the round bit.
+ //
+ // Some of those bits are in r5 (the 32-bit version of the sum's mantissa).
+ // It's cheap to check those, and should exclude _most_ cases where
+ // round-to-even isn't needed.
+ adds r0, r0, #1 // simple round up
+ lsls r5, r5, #(32-7) // check top 7 bits
+ beq LOCAL_LABEL(add_check_rte) // if those are zero, go to full RTE check
+LOCAL_LABEL(add_rounded):
+ // Put the sign+exponent back on. The leading bit of the mantissa increments
+ // the exponent field unwantedly, so we must decrement r6 first to compensate
+ // for that.
+ subs r6, r6, #1
+ lsls r6, r6, #23
+ adds r0, r0, r6
+ // If we haven't overflowed, it's now safe to return.
+ cmp r2, #255
+ bge LOCAL_LABEL(add_overflow)
+ pop {r4,r5,r6,pc}
+
+LOCAL_LABEL(add_overflow):
+ // We have overflow, so we need to return an infinity of the correct sign. r0
+ // already has the correct sign and exponent, so all we need to do is clear
+ // its mantissa.
+ lsrs r0, r0, #23
+ lsls r0, r0, #23
+ pop {r4,r5,r6,pc}
+
+LOCAL_LABEL(add_sub):
+ // We come here when fadd discovered it needed to subtract. Negate the second
+ // operand and cross-jump into fsub.
+ //
+ // The cross-jump is done using BL, for greater branch range. That clobbers
+ // lr, but that's OK, we weren't keeping anything in it at this point.
+ eors r1, r1, r5
+ bl LOCAL_LABEL(sub_magnitude)
+
+LOCAL_LABEL(add_carry):
+ // We come here if we carried a 1 bit off the top of r5 where we computed the
+ // sum's mantissa. Shift back down by one and put a 1 bit in at the top.
+ //
+ // That would be easy with the RRX instruction from general AArch32, but we
+ // don't have that here. Instead we OR in a 1 at the bottom, and move it to
+ // the top by rotating right.
+ //
+ // A danger of shifting r5 down by a bit is that we lose the bit at the very
+ // bottom, which might be important if it's the only nonzero bit below the
+ // output mantissa, because then it determines whether we do RTE or not.
+ // Fortunately, another copy of the same bit is still at the bottom of r4
+ // (the shifted version of y's mantissa which we added to x's to make the
+ // version of r5 _before_ we shifted it down). So the full RTE check will
+ // have to remember to check that bit.
+ movs r0, #1
+ orrs r5, r5, r0 // set low bit of r5
+ rors r5, r5, r0 // and rotate right so that's now the high bit
+
+ // Carrying off the top of the mantissa means that the output exponent must
+ // be increased by 1. Increment both copies: the exponent by itself in r2
+ // (used for overflow checking) and the exponent + sign in r6.
+ adds r2, r2, #1
+ adds r6, r6, #1
+
+ // Now go back to the common code path for rounding and overflow checking.
+ b LOCAL_LABEL(add_renormed)
+
+LOCAL_LABEL(add_check_rte):
+ // We come here to do the full (and therefore expensive) check for round-to-
+ // even: is our output number exactly on a rounding boundary, half way
+ // between two representable numbers? That is, of the bits _not_ included in
+ // the output mantissa, is the topmost bit 1 and all the rest 0?
+ //
+ // We only come here at all if we have already rounded the number up. So we
+ // already know the topmost one of the lost bits is 1, and all we have to
+ // check is whether the rest are 0.
+ //
+ // Also, we've already checked all the bits that were still in the 32-bit
+ // version of the output mantissa, so we don't need to check those again ...
+ //
+ // ... well, _nearly_ all, because in the add_carry case, we shifted r5 down
+ // by a bit _before_ that check. So we do need to re-check that one bit.
+ //
+ // The basic strategy is: r4 still contains the version of y's mantissa that
+ // we shifted down before adding it to x. And r1 contains more or less the
+ // original version of all of y, including the same mantissa. So if we shift
+ // r4 back up again and XOR it with r1, we clear all the bits that we've
+ // already checked, and leave only the ones we haven't.
+
+ // Start by deliberately throwing away the low bit of r4, in case that
+ // corresponded to the bit we lost off the bottom of r5 in add_carry. This
+ // means we won't clear it in the XOR, and therefore, _will_ check it.
+ lsrs r4, r4, #1
+
+ // Shift r4 back up by the same amount we shifted it down, and shift r1 to
+ // the corresponding position, so that we can XOR them. The most convenient
+ // way to do this is not to modify the variable shift count in r3, and
+ // compensate for it by selecting the shift of r1 appropriately.
+ //
+ // As it happens, we end up with the implicit leading 1 bit of the mantissa
+ // in bit 30 of the result - or rather, it would be if we'd set it, which in
+ // r1 we haven't, because that's still the whole original input float.
+ lsls r4, r4, r3
+ lsls r1, r1, #7
+ eors r1, r1, r4
+
+ // But r1 wasn't just the mantissa of y; it also had the exponent, and its
+ // leading bit was implicit. So the topmost two bits of r1 are useless: in r1
+ // they're part of the exponent field. Exclude them from consideration.
+ //
+ // This doesn't lead to dropping any bit we really care about, because we're
+ // never interested in the actual leading 1 bit of y's mantissa for round-to-
+ // even purposes. Why not? Because we already know the round bit (the one
+ // just off the bottom of the output mantissa) is a 1, which must have come
+ // from y (it's too low down to come from x), and we only care about checking
+ // all the bits below _that_. So y's leading 1 must be at least as high up as
+ // the round bit, and therefore, isn't one of the bits we currently need to
+ // check.
+ lsls r1, r1, #2
+
+ // Now if all those bits are zero, we're rounding to even. If _not_, we're
+ // finished rounding, so go back to add_rounded to continue the main code
+ // path.
+ bne LOCAL_LABEL(add_rounded)
+
+ // Clear the low bit of the output (rounding to even) and go back to the main
+ // code path.
+ movs r4, #1
+ bics r0, r0, r4
+ b LOCAL_LABEL(add_rounded)
+
+LOCAL_LABEL(add_naninf):
+ // We come here if at least one input is a NaN or infinity. If either or both
+ // inputs are NaN then we hand off to __compiler_rt_fnan2 which will propagate a NaN from
+ // the input.
+ //
+ // On entry, we know r5 = 0x80000000 from the initial uncommon check. Also,
+ // we already extracted the exponents of x and y into r2 and r3.
+ asrs r4, r5, #7 // so r4 = 0xFF000000
+ lsls r6, r0, #1 // r6 > r4 iff x is NaN
+ cmp r6, r4
+ bhi LOCAL_LABEL(add_nan)
+ lsls r6, r1, #1 // r6 > r4 iff y is NaN
+ cmp r6, r4
+ bhi LOCAL_LABEL(add_nan)
+
+ // No NaNs, so we have at least one infinity. Almost all additions involving
+ // an infinity return the input infinity unchanged. The only exception is if
+ // there are two infinities that have opposite signs (which can happen even
+ // in fadd, since on this code path we haven't cross-jumped into fsub),
+ // where we return NaN.
+ cmp r2, r3 // at least one exponent is 0xFF, so if EQ, both are
+ beq LOCAL_LABEL(add_infinf) // and therefore we're adding infinity to infinity
+
+ // With one infinity, we just find which register it's in, and return it.
+ cmp r2, #255
+ beq LOCAL_LABEL(add_ret_exact) // just return x
+LOCAL_LABEL(add_retb): // we reuse this code in the denormal handler
+ movs r0, r1 // otherwise, return y
+LOCAL_LABEL(add_ret_exact):
+ pop {r4,r5,r6,pc}
+
+LOCAL_LABEL(add_infinf):
+ // With two infinities, we must check their relative sign. If they're the
+ // same sign, we have no problem.
+ movs r4, r0
+ eors r4, r4, r1
+ bpl LOCAL_LABEL(add_ret_exact) // identical infinities, so just return one
+
+ // But if we're adding two infinities of opposite sign, make a default quiet
+ // NaN and return that.
+ ldr r0, =0x7fc00000
+ pop {r4,r5,r6,pc}
+
+LOCAL_LABEL(add_nan):
+ bl SYMBOL_NAME(__compiler_rt_fnan2)
+ pop {r4,r5,r6,pc}
+
+LOCAL_LABEL(add_zerodenorm_x):
+ // We come here if we found x was 0 or a denormal. We haven't set up r3 as
+ // the exponent of y yet.
+ lsrs r3, r3, #24
+
+ // Also, we checked for zero/denorm before checking for infinities and NaNs.
+ // We know x isn't an infinity or NaN, but we must check y.
+ cmp r3, #255
+ beq LOCAL_LABEL(add_naninf)
+
+ // Fall through to the next section. This repeats a pointless check for x
+ // being NaN or infinity, but it would cost more cycles to branch round it.
+
+LOCAL_LABEL(add_zerodenorm_y):
+ // We come here if we found y was 0 or a denormal, but also by falling
+ // through from above. So we may not yet have checked x for infinity/NaN. But
+ // we have checked that y isn't.
+ cmp r2, #255
+ beq LOCAL_LABEL(add_naninf)
+
+ // Now at least one of x,y is zero or denormal, and neither is infinite or
+ // NaN. We haven't yet checked the signs and cross-jumped to fsub, but we can
+ // handle all the zero cases without having to:
+ //
+ // - if x = -y (including both being zero), return 0 of the appropriate sign
+ // - if x = 0, return y (including the case of same-signed zeroes)
+ // - if y = 0, return x
+ subs r6, r0, r1 // are x and y equal
+ cmp r6, r5 // except for opposite sign bits? (r5 = 0x80000000)
+ beq LOCAL_LABEL(add_diffsame)
+ lsls r6, r1, #1 // is y zero?
+ beq LOCAL_LABEL(add_ret_exact) // if so, return x
+ lsls r6, r0, #1 // is x zero?
+ beq LOCAL_LABEL(add_retb) // if so, return y
+
+ // Now we've dealt with all the possibilities involving zeroes, so we have
+ // either one denormal or two denormals. These cases are harder, and we don't
+ // want to handle both signs at once, so check the signs and cross-branch
+ // into fsub if they're different.
+ movs r6, r1
+ eors r6, r6, r0
+ bpl LOCAL_LABEL(add_denorm)
+ eors r1, r1, r5
+ bl LOCAL_LABEL(sub_denorm)
+LOCAL_LABEL(add_denorm):
+ // Sort the operands into magnitude order. Now we know they have the same
+ // sign, unsigned comparison is good enough for that.
+ subs r6, r0, r1
+ bhs 0f
+ subs r0, r0, r6
+ adds r1, r1, r6
+0:
+
+ // We know one exponent is 0, so check if the other is too. We do this by
+ // adding the two exponents together, achieving two things in one
+ // instruction: it gets the nonzero exponent (if any) into r2 (saving us
+ // swapping r2 with r3 in the sorting step above), and it sets Z if both were
+ // zero.
+ adds r2, r2, r3
+ beq LOCAL_LABEL(add_denorm2)
+
+ // Now exactly one operand is denormal, and it's y. We must go back to
+ // add_doadd with all the registers appropriately set up.
+ lsrs r6, r0, #23 // r6 == sign and exponent of x
+ lsls r4, r1, #8 // r4 == mantissa of y, with leading bit clear
+ lsls r0, r0, #8
+ orrs r0, r0, r5 // set high bit on mantissa of x
+ subs r3, r2, #1 // denormals are shifted as if they had exponent 1
+ b LOCAL_LABEL(add_doadd)
+
+LOCAL_LABEL(add_diffsame):
+ // Here we only support round-to-nearest mode, so the difference of two
+ // identical things always returns +0.
+ movs r0, #0
+ pop {r4,r5,r6,pc}
+
+LOCAL_LABEL(add_denorm2):
+ // Here, x,y are both denormal, and we know we're doing magnitude addition.
+ // So we can add the mantissas like ordinary integers, and if they carry into
+ // the exponent, that's still the correct answer. But we have to avoid adding
+ // two copies of the sign bit, so we clear that from y first.
+ bics r1, r1, r5 // clear sign bit of y
+ adds r0, r0, r1 // add mantissas
+ pop {r4,r5,r6,pc}
+
+END_COMPILERRT_FUNCTION(__aeabi_fadd)
+
+DEFINE_COMPILERRT_THUMB_FUNCTION(__aeabi_frsub)
+ // Reversed subtraction, that is, compute y-x, where x is in r0 and y in r1.
+ //
+ // We could implement this by simply swapping r0 with r1. But the point of
+ // having a reversed-subtract in the first place is to avoid the caller
+ // having to do that, so if we do it ourselves, it wastes all the time they
+ // saved. So instead, on the fast path, we redo the sign check our own way
+ // and branch to add_magnitude or sub_magnitude.
+
+ push {r4,r5,r6,lr}
+
+ movs r5, #1
+ lsls r5, r5, #31 // all cross-branches will expect to have r5 = 0x80000000
+
+ // Extract the exponents and test for uncommon values. Note that we do the
+ // zero/denormal tests the opposite way round from fsub, because we swap the
+ // operands before branching to the corresponding fsub code, so this way our
+ // first branch will enter fsub with the first of _its_ operands checked.
+ lsls r2, r0, #1
+ lsls r3, r1, #1
+ lsrs r3, r3, #24
+ beq LOCAL_LABEL(rsb_zerodenorm_y)
+ lsrs r2, r2, #24
+ beq LOCAL_LABEL(rsb_zerodenorm_x)
+ cmp r2, #255
+ beq LOCAL_LABEL(rsb_naninf)
+ cmp r3, #255
+ beq LOCAL_LABEL(rsb_naninf)
+
+ // Decide which of add_magnitude and sub_magnitude to branch to, and do so.
+ eors r0, r0, r5
+ movs r4, r0
+ eors r4, r4, r1
+ bpl LOCAL_LABEL(rsb_add)
+ eors r1, r1, r5
+ bl LOCAL_LABEL(sub_magnitude)
+LOCAL_LABEL(rsb_add):
+ bl LOCAL_LABEL(add_magnitude)
+
+ // Any uncommon operands to frsub are handled by just swapping the two
+ // operands and going to fsub's handler. We're off the main fast path now, so
+ // there's no need to try to optimise it any harder.
+LOCAL_LABEL(rsb_zerodenorm_y):
+ push {r0,r2} // this four-instruction push/pop sequence swaps (r0,r2) with (r1,r3)
+ push {r1,r3}
+ pop {r0,r2}
+ pop {r1,r3}
+ bl LOCAL_LABEL(sub_zerodenorm_x) // we just swapped x and y, so now x is 0/denorm
+LOCAL_LABEL(rsb_zerodenorm_x):
+ push {r0,r2} // swap (r0,r2) with (r1,r3), as above
+ push {r1,r3}
+ pop {r0,r2}
+ pop {r1,r3}
+ bl LOCAL_LABEL(sub_zerodenorm_y) // similarly, now we know y is
+LOCAL_LABEL(rsb_naninf):
+ push {r0,r2} // swap (r0,r2) with (r1,r3), as above
+ push {r1,r3}
+ pop {r0,r2}
+ pop {r1,r3}
+ bl LOCAL_LABEL(sub_naninf)
+
+END_COMPILERRT_FUNCTION(__aeabi_frsub)
+
+#if __ARM_PCS_VFP
+DEFINE_COMPILERRT_FUNCTION(__subsf3)
+ push {r4, lr} // r4 appears to be saved only as padding to keep the stack 8-byte aligned (AAPCS) -- TODO confirm
+ vmov r0, s0 // hard-float ABI: move the float arguments from VFP into core registers
+ vmov r1, s1
+ bl __aeabi_fsub // the soft-float entry point does the real work
+ vmov s0, r0 // move the integer-register result back into s0 for the hard-float caller
+ pop {r4, pc}
+#else
+DEFINE_COMPILERRT_FUNCTION_ALIAS(__subsf3, __aeabi_fsub)
+#endif
+
+DEFINE_COMPILERRT_THUMB_FUNCTION(__aeabi_fsub)
+ // Main entry point for subtraction.
+ push {r4,r5,r6,lr}
+
+ // Keep the sign-bit constant 0x80000000 live in r5; several later code
+ // paths rely on it still being there.
+ movs r5, #1
+ lsls r5, r5, #31
+
+ // Extract the exponents into r2 and r3 and test for all uncommon values,
+ // similarly to fadd.
+ lsls r2, r0, #1
+ lsls r3, r1, #1
+ lsrs r2, r2, #24
+ beq LOCAL_LABEL(sub_zerodenorm_x)
+ lsrs r3, r3, #24
+ beq LOCAL_LABEL(sub_zerodenorm_y)
+ cmp r2, #255
+ beq LOCAL_LABEL(sub_naninf)
+ cmp r3, #255
+ beq LOCAL_LABEL(sub_naninf)
+
+ // Check the signs, and if they're unequal, cross-jump into fadd to do
+ // magnitude addition. (Now we've excluded NaNs, it's safe to flip the sign
+ // of y.)
+ movs r4, r0
+ eors r4, r4, r1
+ bmi LOCAL_LABEL(sub_add)
+LOCAL_LABEL(sub_magnitude):
+ // If we get here, we're subtracting operands with equal signs (i.e. a
+ // magnitude subtraction). First thing to do is put operands in magnitude
+ // order, so that x >= y. However, if they are swapped, we must also negate
+ // both of them, since A - B = (-B) - (-A).
+ subs r4, r0, r1
+ bhs LOCAL_LABEL(sub_swapped)
+ // r4 = x - y; flipping its sign bit and then subtracting/adding it below
+ // exchanges r0 and r1 while also negating both, in three instructions.
+ eors r4, r4, r5
+ subs r0, r0, r4
+ adds r1, r1, r4
+ // We must also swap the pre-extracted exponents here.
+ eors r2, r2, r3
+ eors r3, r3, r2
+ eors r2, r2, r3
+LOCAL_LABEL(sub_swapped):
+ // Save the sign and exponent of the larger operand to use for the result (up
+ // to renormalisation), and calculate the exponent difference for shifting
+ // one mantissa relative to the other.
+ lsrs r6, r0, #23
+ subs r3, r2, r3
+
+ // Shift the mantissas up to the top of the words. In the process we put y's
+ // shifted mantissa into a separate register, keeping the original for later
+ // reference. Also, although we set the leading bit of y, we _clear_ the
+ // leading bit of x, which is just as quick and saves us having to decrement
+ // the output exponent later to compensate.
+ lsls r0, r0, #8
+ lsls r4, r1, #8
+ bics r0, r0, r5
+ orrs r4, r4, r5
+
+LOCAL_LABEL(sub_dosub): // we may come back here after sorting out denorms
+
+ // We get here with:
+ // Operands known to be numeric rather than zero/infinity/NaN;
+ // r0 = mantissa of larger operand (in top 24 bits, with high bit clear)
+ // r4 = mantissa of smaller operand (in top 24 bits, with high bit set)
+ // r1 = original smaller operand (up to maybe a sign flip)
+ // r6 = result sign/exponent (in low 9 bits)
+ // r2 = plain result exponent (in low 8 bits, i.e. r6 & 0xFF)
+ // r3 = exponent difference.
+ //
+ // Begin calculating the output mantissa by shifting y's mantissa right and
+ // subtracting. This may leave the mantissa too large by one, if the bits
+ // shifted out of y are nonzero. We correct this during rounding if
+ // necessary. (Note this subtraction overwrites the 0x80000000 constant in
+ // r5 with the output mantissa.)
+ lsrs r4, r4, r3
+ subs r5, r0, r4
+
+ // This may have cleared the high bit of the output mantissa, in which case
+ // we must renormalise. Our strategy is to split into three code paths, on
+ // two of which an awkward case is known not to arise:
+ // * no need to renormalise at all => underflow can't happen
+ // * shift up by exactly 1 bit
+ // * shift up by more than 1 bit => rounding can't happen (result is exact)
+ //
+ // First branch out of line for the first case, which we can detect because
+ // the N flag tells us whether the top mantissa bit is still set.
+ bpl LOCAL_LABEL(sub_renormed)
+
+ // Renormalise by one bit, and check the new top bit to see if we need to
+ // renormalise by more than that.
+ lsls r5, r5, #1
+ bpl LOCAL_LABEL(sub_renorm_big) // if new top bit still clear, renormalise by more
+ // Decrement both exponent registers (r6 with the sign, r2 without). We
+ // decrement r6 by 2 instead of 1, because now the output mantissa has the
+ // top bit set, so we must compensate when we put the sign and exponent back
+ // on.
+ //
+ // The extra decrement of r6 might carry into the sign bit. This doesn't
+ // matter on the fast path, because the leading bit in the mantissa will undo
+ // it. But we need to account for it in the underflow handler for this path.
+ subs r6, r6, #2
+ subs r2, r2, #1
+ // The decrement of the pure exponent value also doubles as a check for
+ // underflow, because we underflowed precisely if the exponent went to 0.
+ beq LOCAL_LABEL(sub_underflow_1)
+LOCAL_LABEL(sub_renormed):
+ // Now we have the output mantissa in r5. It may or may not have the high bit
+ // set, depending on which branch of the code we've come through. But r6 has
+ // been adjusted appropriately, so that we can make a basically right output
+ // value (before rounding) by adding r6 << 23 to r5 >> 8.
+ //
+ // If any nonzero bits were shifted off the bottom of y, then the true value
+ // of the output mantissa might be slightly _less_ than the value in r5.
+ // However the maximum difference is about 2^{-7} ULP relative to the final
+ // result (because it's at most one ULP of the 32-bit output mantissa in r5).
+ // So it doesn't affect the result in round-to-nearest mode unless it puts us
+ // just below a rounding boundary, which means we can ignore it until the
+ // full round-to-even check.
+ lsls r6, r6, #23 // prepare sign and exponent
+ lsrs r0, r5, #8 // shift down, and put the round bit into C
+ bcs LOCAL_LABEL(sub_round) // diverge based on round bit
+ // If the round bit shifted off the bottom of r5 was clear, then we're not
+ // rounding up, so we can make the output value and finish immediately.
+ adds r0, r0, r6 // reconstitute output value without rounding
+ pop {r4,r5,r6,pc}
+LOCAL_LABEL(sub_round):
+ // Otherwise, we're rounding, in three stages. First round up; then cheaply
+ // check the low bits of r5 (the 32-bit version of the mantissa) so that we
+ // can rule out round-to-even if any of those is nonzero; finally, in as few
+ // cases as possible, check the rest of y's mantissa to check for RTE fully.
+ // (C is still set here from the lsrs above, so adcs adds in the round bit.)
+ adcs r0, r0, r6 // reconstitute output value while rounding up
+ lsls r5, r5, #(32-7) // check first 7 guard bits
+ beq LOCAL_LABEL(sub_check_rte) // if they're all 0, do the full check for RTE
+ pop {r4,r5,r6,pc} // otherwise we're done
+
+LOCAL_LABEL(sub_add):
+ // Trampoline to cross-jump to fadd, because a 16-bit branch won't reach that
+ // far. Also a convenient place to flip y's sign, so we only have to do it
+ // once.
+ eors r1, r1, r5 // we know r5 = 0x80000000
+ bl LOCAL_LABEL(add_magnitude) // clobbers lr, which doesn't matter
+
+LOCAL_LABEL(sub_check_rte):
+ // Full check for round-to-even, in the same style as fadd_check_rte: r4
+ // still contains the version of y's mantissa that we shifted down before
+ // subtracting from x, and r1 contains the original version of that mantissa.
+ // (r3 also still holds the exponent difference computed at sub_swapped.)
+ // So if we shift r4 back up again and XOR it with r1, we clear all the bits
+ // that we've already checked, and leave only the ones we haven't. The only
+ // exception is the leading mantissa bit, which is implicit in r1, but this
+ // can never affect round-to-even, because if we rounded at all then the
+ // round bit must have come from y, so the leading bit of y is at the round
+ // bit or above, hence not one of the bits we're checking for RTE.
+ lsls r4, r4, r3 // undo the shift of y's mantissa
+ lsls r1, r1, #8 // shift y's original mantissa back to the same place
+ eors r1, r1, r4 // find any differences
+ lsls r1, r1, #1 // but ignore the leading mantissa bit
+ beq LOCAL_LABEL(sub_rte) // if all bits now clear, we're rounding to even
+
+ // If we're not RTEing, we must undo the simplistic rounding we've already
+ // done. (We incremented the result based on the belief that the shifted-off
+ // data started 0x80xxx, but it turns out that xxx is slightly negative, so
+ // actually we had 0x7Fyyy.)
+ subs r0, r0, #1
+ pop {r4,r5,r6,pc}
+LOCAL_LABEL(sub_rte):
+ // Actually round to even, by clearing the low bit of the output.
+ movs r4, #1
+ bics r0, r0, r4
+ pop {r4,r5,r6,pc}
+
+LOCAL_LABEL(sub_renorm_big):
+ // Now we know that we must renormalise by at least 2 bits, which may also
+ // give a denormal or zero result.
+ //
+ // This means no rounding can possibly be needed: if the subtraction cleared
+ // the top two bits of the mantissa, it means we computed A-B and found it
+ // was less than A/2, so B > A/2, so the exponent difference was at most 1.
+ // Hence the result mantissa fits in 24 bits even before renormalisation, and
+ // the top bit is clear, so it fits in 23 bits, i.e. it is exact.
+
+ // Detect an actual zero result, and go and return it. (Z is still set from
+ // the one-bit lsls of r5 that brought us here.)
+ beq LOCAL_LABEL(sub_diffsame)
+
+ // Renormalise by binary search. (16-bit Thumb has no CLZ instruction.) We'll
+ // accumulate the total exponent adjustment in r0. It starts at 1 rather than
+ // 0, because we've shifted the mantissa left by one bit already.
+ movs r0, #1
+
+ // If the top 16 bits of r5 are clear, shift up by 16 and adjust r0 to match.
+ lsrs r3, r5, #(32-16)
+ bne 0f
+ lsls r5, r5, #16
+ adds r0, r0, #16
+0:
+ // Same for 8 bits
+ lsrs r3, r5, #(32-8)
+ bne 0f
+ lsls r5, r5, #8
+ adds r0, r0, #8
+0:
+ // 4 bits
+ lsrs r3, r5, #(32-4)
+ bne 0f
+ lsls r5, r5, #4
+ adds r0, r0, #4
+0:
+ // 2 bits
+ lsrs r3, r5, #(32-2)
+ bne 0f
+ lsls r5, r5, #2
+ adds r0, r0, #2
+0:
+ // 1 bit
+ lsrs r3, r5, #(32-1)
+ bne 0f
+ lsls r5, r5, #1
+ adds r0, r0, #1
+0:
+ // r0 now holds the total normalisation distance (including the initial
+ // one-bit shift).
+
+ // Update our two copies of the exponent (with sign in r6, without in r2).
+ subs r6, r6, r0
+ subs r2, r2, r0
+ // Shift the mantissa and exponent into the right places to combine them.
+ lsls r4, r5, #1 // clear leading bit of mantissa
+ lsrs r0, r4, #9 // and shift it down
+ lsls r4, r6, #23 // shift sign and exponent up
+ adds r0, r0, r4 // put them together
+ // Check for underflow, which occurs if the output exponent is less than 1
+ // (including having gone negative).
+ cmp r2, #1
+ blt LOCAL_LABEL(sub_underflow_2)
+ pop {r4,r5,r6,pc}
+
+LOCAL_LABEL(sub_diffsame):
+ // Here we only support round-to-nearest mode, so the difference of two
+ // identical things always returns +0.
+ movs r0, #0
+ pop {r4,r5,r6,pc}
+
+LOCAL_LABEL(sub_underflow_1):
+ // We come here if renormalising by one bit reduced the output exponent to
+ // zero. In other words, the output value in x is denormal (hence exact) and
+ // wants shifting down by exactly 9 bits (8 bits of exponent plus the bit we
+ // already shifted it by), and then the sign bit putting back on.
+ //
+ // Also, before we get the sign bit from r6, we must add 1 to it, because of
+ // the possibility that decrementing it carried into the sign bit.
+ adds r6, r6, #1 // undo potential sign-flipping carry
+ lsrs r6, r6, #8 // isolate the sign bit
+ lsls r6, r6, #31 // and shift it up to the top
+ lsrs r0, r5, #9 // construct the output mantissa
+ orrs r0, r0, r6 // and combine with the sign bit
+ pop {r4,r5,r6,pc}
+
+LOCAL_LABEL(sub_underflow_2):
+ // We come here if multi-bit renormalisation found a denormal. The mantissa
+ // has its leading bit set at the top of r5, so it needs shifting down 8 bits
+ // to where it would be in a normalised number, and then further: if the
+ // output exponent is 0 (meaning the exponent just below a normalised number)
+ // then we shift one extra bit, if it's -1 then we shift two extra bits, and
+ // so on. So in total we shift down by 8 + (1 - exp) = 9 - exp.
+ rsbs r4, r6, #0
+ adds r4, r4, #9
+ // Any sign bit in r6 at bit 8 and above is harmless here, because a
+ // register-controlled shift only uses the bottom byte of r4.
+ lsrs r5, r5, r4 // shift mantissa into place
+
+ // Extract the sign bit from r6 and combine it with that denormal. r6 could
+ // be 0 or could be negative, so we must add enough to it to make it reliably
+ // positive. Any offset that works is fine; we'll use 0xc0, which is the
+ // offset used by IEEE 754:1985 underflow intermediate values.
+ adds r6, r6, #0xc0 // rebias to correct sign bit
+ lsrs r6, r6, #8 // isolate the sign bit
+ lsls r0, r6, #31 // and shift it up to the top
+ adds r0, r0, r5 // combine with the denormalised mantissa
+ pop {r4,r5,r6,pc}
+
+LOCAL_LABEL(sub_naninf):
+ // We come here if at least one of x,y is a NaN or infinity. Their exponent
+ // fields are reliably in r2 and r3 respectively by now, and r5 still holds
+ // the constant 0x80000000 from function entry. If either or both inputs are
+ // NaN then we hand off to __compiler_rt_fnan2, which will propagate a NaN
+ // from the input.
+ asrs r4, r5, #7 // so r4 = 0xFF000000
+ lsls r6, r0, #1 // r6 > r4 iff x is NaN
+ cmp r6, r4
+ bhi LOCAL_LABEL(sub_nan)
+ lsls r6, r1, #1 // r6 > r4 iff y is NaN
+ cmp r6, r4
+ bhi LOCAL_LABEL(sub_nan)
+
+ // No NaNs, so we have at least one infinity. Almost all subtractions
+ // involving an infinity return the infinite operand (negated, if it is y).
+ // The only exception is subtracting two infinities that have the same sign,
+ // where we return NaN.
+ cmp r2, r3 // at least one exponent is 0xFF, so if EQ, both are
+ beq LOCAL_LABEL(sub_infinf)
+
+ // If x is infinite and y is finite, return x.
+ cmp r2, #255
+ beq LOCAL_LABEL(sub_ret_exact)
+LOCAL_LABEL(sub_retminusy):
+ // If x is finite and y is infinite, return -y.
+ movs r0, r1
+ eors r0, r0, r5 // negate y
+LOCAL_LABEL(sub_retx):
+LOCAL_LABEL(sub_ret_exact):
+ pop {r4,r5,r6,pc}
+LOCAL_LABEL(sub_infinf):
+ // With two infinities, we must check their relative sign. If they have
+ // opposite sign, we just return x (which is the one with the same sign as
+ // the output).
+ movs r4, r0
+ eors r4, r4, r1
+ bmi LOCAL_LABEL(sub_ret_exact)
+
+ // But if we're subtracting two infinities of the same sign, make a default
+ // quiet NaN and return that.
+ ldr r0, =0x7fc00000
+ pop {r4,r5,r6,pc}
+
+LOCAL_LABEL(sub_nan):
+ bl SYMBOL_NAME(__compiler_rt_fnan2)
+ pop {r4,r5,r6,pc}
+
+LOCAL_LABEL(sub_zerodenorm_x):
+ // We come here if we found x was 0 or a denormal. We haven't set up r3 as
+ // the exponent of y yet.
+ lsrs r3, r3, #24
+
+ // Also, we checked for zero/denorm before checking for infinities and NaNs.
+ // We know x isn't an infinity or NaN, but we must check y.
+ cmp r3, #255
+ beq LOCAL_LABEL(sub_naninf)
+
+ // Fall through to the next section. This repeats a pointless check for x
+ // being NaN or infinity, but it would cost more cycles to branch round it.
+
+LOCAL_LABEL(sub_zerodenorm_y):
+ // We come here if we found y was 0 or a denormal, but also by falling
+ // through from above. So we may not yet have checked x for infinity/NaN. But
+ // we have checked that y isn't.
+ cmp r2, #255
+ beq LOCAL_LABEL(sub_naninf)
+
+ // Now at least one of x,y is zero or denormal, and neither is infinite or
+ // NaN. We haven't yet checked the signs and cross-jumped to fadd, but we can
+ // handle all the zero cases without having to:
+ //
+ // - if x = y bit-for-bit (including two equal zeroes), return +0
+ // - if y = 0, return x (including the case of oppositely signed zeroes)
+ // - if x = 0 and y != 0, return -y
+ cmp r0, r1 // are x and y equal?
+ beq LOCAL_LABEL(sub_diffsame)
+ lsls r6, r1, #1 // is y zero?
+ beq LOCAL_LABEL(sub_retx) // if so, return x
+ lsls r6, r0, #1 // is x zero?
+ beq LOCAL_LABEL(sub_retminusy) // if so, return -y
+
+ // Now we've dealt with all the possibilities involving zeroes, so we have
+ // either one denormal or two denormals. These cases are harder, and we don't
+ // want to handle both signs at once, so check the signs and cross-branch
+ // into fadd if they're different.
+ movs r6, r1
+ eors r6, r6, r0
+ bpl LOCAL_LABEL(sub_denorm)
+ eors r1, r1, r5
+ bl LOCAL_LABEL(add_denorm) // bl for range; the lr clobber is harmless
+LOCAL_LABEL(sub_denorm):
+ // Sort the operands into magnitude order. Now we know they have the same
+ // sign, unsigned comparison is good enough for that.
+ subs r6, r0, r1
+ bhs 0f
+ eors r6, r6, r5 // flip the signs in the process
+ subs r0, r0, r6
+ adds r1, r1, r6
+0:
+
+ // We know one exponent is 0, so check if the other is too. We do this by
+ // adding the two exponents together, achieving two things in one
+ // instruction: it gets the nonzero exponent (if any) into r2 (saving us
+ // swapping r2 with r3 in the sorting step above), and it sets Z if both were
+ // zero.
+ adds r2, r2, r3
+ beq LOCAL_LABEL(sub_denorm2)
+
+ // Now exactly one operand is denormal, and it's y. We must go back to
+ // fsub_dosub with all the registers appropriately set up.
+ lsrs r6, r0, #23 // r6 == sign and exponent of x
+ lsls r4, r1, #8 // r4 == mantissa of y, with leading bit clear
+ lsls r0, r0, #8
+ bics r0, r0, r5 // clear high bit on mantissa of x
+ subs r3, r2, #1 // denormals are shifted as if they had exponent 1
+ b LOCAL_LABEL(sub_dosub)
+
+LOCAL_LABEL(sub_denorm2):
+ // Here, x,y are both denormal, and we know we're doing magnitude
+ // subtraction. So we can subtract the mantissas like ordinary integers. But
+ // we have to avoid subtracting y's sign bit from x's.
+ bics r1, r1, r5 // clear sign bit of y
+ subs r0, r0, r1 // subtract mantissas
+ pop {r4,r5,r6,pc}
+
+END_COMPILERRT_FUNCTION(__aeabi_fsub)
+
+NO_EXEC_STACK_DIRECTIVE
diff --git a/compiler-rt/test/builtins/Unit/addsf3_test.c b/compiler-rt/test/builtins/Unit/addsf3_test.c
new file mode 100644
index 0000000000000..a08ba8b91056a
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/addsf3_test.c
@@ -0,0 +1,384 @@
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_addsf3
+
+#include "int_lib.h"
+#include <inttypes.h>
+#include <stdio.h>
+
+#include "fp_test.h"
+
+// By default this test uses compareResultF to check the returned floats, which
+// accepts any returned NaN if the expected result is the canonical NaN value
+// 0x7fc00000. For the Arm assembler FP implementations, which commit to a
+// more detailed handling of NaNs, we tighten up the check and include some
+// extra test cases specific to that NaN policy.
+#if __arm__ || __thumb__
+# define EXPECT_EXACT_RESULTS
+# define ARM_NAN_HANDLING
+#endif
+
+// Returns: a + b
+COMPILER_RT_ABI float __addsf3(float a, float b);
+
+// Run one test case: compute __addsf3 on the operands given as raw IEEE 754
+// bit patterns and compare the result against the expected bit pattern.
+// Under EXPECT_EXACT_RESULTS the comparison is bit-exact; otherwise
+// compareResultF is used, which accepts any NaN encoding when the expected
+// value is the canonical NaN 0x7fc00000. Returns nonzero (and prints a
+// diagnostic naming the caller's line) on mismatch, zero on success.
+int test__addsf3(int line, uint32_t a_rep, uint32_t b_rep, uint32_t expected_rep) {
+  float a = fromRep32(a_rep), b = fromRep32(b_rep);
+  float x = __addsf3(a, b);
+#ifdef EXPECT_EXACT_RESULTS
+  int ret = toRep32(x) != expected_rep;
+#else
+  int ret = compareResultF(x, expected_rep);
+#endif
+
+  if (ret) {
+    printf("error at line %d: __addsf3(%08" PRIx32 ", %08" PRIx32 ") = %08" PRIx32
+           ", expected %08" PRIx32 "\n",
+           line, a_rep, b_rep, toRep32(x), expected_rep);
+  }
+  return ret;
+}
+
+#define test__addsf3(a,b,x) (test__addsf3)(__LINE__,a,b,x)
+
+int main() {
+ int status = 0;
+
+ status |= test__addsf3(0x00000000, 0x00000000, 0x00000000);
+ status |= test__addsf3(0x00000000, 0x007fffff, 0x007fffff);
+ status |= test__addsf3(0x00000000, 0x3f800000, 0x3f800000);
+ status |= test__addsf3(0x00000000, 0x7f000000, 0x7f000000);
+ status |= test__addsf3(0x00000000, 0x7f800000, 0x7f800000);
+ status |= test__addsf3(0x00000000, 0x80000000, 0x00000000);
+ status |= test__addsf3(0x00000000, 0x807fffff, 0x807fffff);
+ status |= test__addsf3(0x00000000, 0x80800000, 0x80800000);
+ status |= test__addsf3(0x00000000, 0xff800000, 0xff800000);
+ status |= test__addsf3(0x00000001, 0x00000001, 0x00000002);
+ status |= test__addsf3(0x00000001, 0x3f7fffff, 0x3f7fffff);
+ status |= test__addsf3(0x00000001, 0x3f800000, 0x3f800000);
+ status |= test__addsf3(0x00000001, 0x3ffffffe, 0x3ffffffe);
+ status |= test__addsf3(0x00000001, 0x3fffffff, 0x3fffffff);
+ status |= test__addsf3(0x00000001, 0x7effffff, 0x7effffff);
+ status |= test__addsf3(0x00000001, 0x7f000000, 0x7f000000);
+ status |= test__addsf3(0x00000001, 0x7f7ffffe, 0x7f7ffffe);
+ status |= test__addsf3(0x00000001, 0x7f7fffff, 0x7f7fffff);
+ status |= test__addsf3(0x00000001, 0x80000001, 0x00000000);
+ status |= test__addsf3(0x00000002, 0x80000001, 0x00000001);
+ status |= test__addsf3(0x00000003, 0x00000000, 0x00000003);
+ status |= test__addsf3(0x00000003, 0x7f800000, 0x7f800000);
+ status |= test__addsf3(0x00000003, 0x80000000, 0x00000003);
+ status |= test__addsf3(0x00000003, 0x80000002, 0x00000001);
+ status |= test__addsf3(0x00000003, 0xc0a00000, 0xc0a00000);
+ status |= test__addsf3(0x00000003, 0xff000000, 0xff000000);
+ status |= test__addsf3(0x00000003, 0xff800000, 0xff800000);
+ status |= test__addsf3(0x00000004, 0x00000004, 0x00000008);
+ status |= test__addsf3(0x007ffffc, 0x807ffffc, 0x00000000);
+ status |= test__addsf3(0x007ffffd, 0x807ffffe, 0x80000001);
+ status |= test__addsf3(0x007fffff, 0x007fffff, 0x00fffffe);
+ status |= test__addsf3(0x007fffff, 0x807ffffe, 0x00000001);
+ status |= test__addsf3(0x007fffff, 0x80800000, 0x80000001);
+ status |= test__addsf3(0x00800000, 0x00000000, 0x00800000);
+ status |= test__addsf3(0x00800000, 0x00800000, 0x01000000);
+ status |= test__addsf3(0x00800000, 0x80800000, 0x00000000);
+ status |= test__addsf3(0x00800001, 0x80800000, 0x00000001);
+ status |= test__addsf3(0x00800001, 0x80800002, 0x80000001);
+ status |= test__addsf3(0x00ffffff, 0x81000000, 0x80000001);
+ status |= test__addsf3(0x00ffffff, 0x81000002, 0x80000005);
+ status |= test__addsf3(0x00ffffff, 0x81000004, 0x80000009);
+ status |= test__addsf3(0x01000000, 0x80ffffff, 0x00000001);
+ status |= test__addsf3(0x01000001, 0x80800001, 0x00800001);
+ status |= test__addsf3(0x01000001, 0x80ffffff, 0x00000003);
+ status |= test__addsf3(0x01000002, 0x80800001, 0x00800003);
+ status |= test__addsf3(0x017fffff, 0x81800000, 0x80000002);
+ status |= test__addsf3(0x01800000, 0x817fffff, 0x00000002);
+ status |= test__addsf3(0x01800001, 0x817fffff, 0x00000006);
+ status |= test__addsf3(0x01800002, 0x81000003, 0x01000001);
+ status |= test__addsf3(0x3f7fffff, 0x80000001, 0x3f7fffff);
+ status |= test__addsf3(0x3f800000, 0x3f800000, 0x40000000);
+ status |= test__addsf3(0x3f800000, 0x3f800003, 0x40000002);
+ status |= test__addsf3(0x3f800000, 0x40000000, 0x40400000);
+ status |= test__addsf3(0x3f800000, 0x40e00000, 0x41000000);
+ status |= test__addsf3(0x3f800000, 0x80000000, 0x3f800000);
+ status |= test__addsf3(0x3f800000, 0xbf800000, 0x00000000);
+ status |= test__addsf3(0x3f800001, 0x3f800000, 0x40000000);
+ status |= test__addsf3(0x3f800001, 0xbf800000, 0x34000000);
+ status |= test__addsf3(0x3f800001, 0xbf800002, 0xb4000000);
+ status |= test__addsf3(0x3ffffffc, 0xbffffffd, 0xb4000000);
+ status |= test__addsf3(0x3fffffff, 0xc0000000, 0xb4000000);
+ status |= test__addsf3(0x40000000, 0x34000000, 0x40000000);
+ status |= test__addsf3(0x40000000, 0x3f800000, 0x40400000);
+ status |= test__addsf3(0x40000000, 0x40000000, 0x40800000);
+ status |= test__addsf3(0x40000000, 0x40000001, 0x40800000);
+ status |= test__addsf3(0x40000000, 0xbfffffff, 0x34000000);
+ status |= test__addsf3(0x40000000, 0xc0000000, 0x00000000);
+ status |= test__addsf3(0x40000000, 0xc0000001, 0xb4800000);
+ status |= test__addsf3(0x40000000, 0xc0a00000, 0xc0400000);
+ status |= test__addsf3(0x40000001, 0x34000000, 0x40000002);
+ status |= test__addsf3(0x40000001, 0x40000002, 0x40800002);
+ status |= test__addsf3(0x40000001, 0xbf800001, 0x3f800001);
+ status |= test__addsf3(0x40000002, 0xbf800001, 0x3f800003);
+ status |= test__addsf3(0x40000002, 0xbf800003, 0x3f800001);
+ status |= test__addsf3(0x40000004, 0xc0000003, 0x34800000);
+ status |= test__addsf3(0x40400000, 0x40400000, 0x40c00000);
+ status |= test__addsf3(0x407fffff, 0x33ffffff, 0x407fffff);
+ status |= test__addsf3(0x407fffff, 0x34000000, 0x40800000);
+ status |= test__addsf3(0x407fffff, 0xc07ffffe, 0x34800000);
+ status |= test__addsf3(0x407fffff, 0xc0800002, 0xb5a00000);
+ status |= test__addsf3(0x40800001, 0xc07fffff, 0x35400000);
+ status |= test__addsf3(0x40a00000, 0x00000000, 0x40a00000);
+ status |= test__addsf3(0x40a00000, 0x80000000, 0x40a00000);
+ status |= test__addsf3(0x40a00000, 0xbf800000, 0x40800000);
+ status |= test__addsf3(0x40a00000, 0xc0a00000, 0x00000000);
+ status |= test__addsf3(0x7d800001, 0xfd7fffff, 0x72400000);
+ status |= test__addsf3(0x7e7fffff, 0xfe7ffffe, 0x72800000);
+ status |= test__addsf3(0x7e7fffff, 0xfe800002, 0xf3a00000);
+ status |= test__addsf3(0x7e800000, 0x7e800000, 0x7f000000);
+ status |= test__addsf3(0x7e800000, 0xfe7fffff, 0x72800000);
+ status |= test__addsf3(0x7e800000, 0xfe800001, 0xf3000000);
+ status |= test__addsf3(0x7e800001, 0x7e800000, 0x7f000000);
+ status |= test__addsf3(0x7e800001, 0xff000001, 0xfe800001);
+ status |= test__addsf3(0x7e800002, 0xfe000003, 0x7e000001);
+ status |= test__addsf3(0x7e800004, 0xfe800003, 0x73000000);
+ status |= test__addsf3(0x7efffffe, 0x7efffffe, 0x7f7ffffe);
+ status |= test__addsf3(0x7efffffe, 0x7effffff, 0x7f7ffffe);
+ status |= test__addsf3(0x7effffff, 0x3f800000, 0x7effffff);
+ status |= test__addsf3(0x7effffff, 0x7f000000, 0x7f800000);
+ status |= test__addsf3(0x7effffff, 0xbf800000, 0x7effffff);
+ status |= test__addsf3(0x7effffff, 0xff000000, 0xf3000000);
+ status |= test__addsf3(0x7f000000, 0x3f800000, 0x7f000000);
+ status |= test__addsf3(0x7f000000, 0x7f000000, 0x7f800000);
+ status |= test__addsf3(0x7f000000, 0x7f800000, 0x7f800000);
+ status |= test__addsf3(0x7f000000, 0xbf800000, 0x7f000000);
+ status |= test__addsf3(0x7f000000, 0xff000000, 0x00000000);
+ status |= test__addsf3(0x7f000000, 0xff800000, 0xff800000);
+ status |= test__addsf3(0x7f000001, 0x7f000000, 0x7f800000);
+ status |= test__addsf3(0x7f000001, 0xff000000, 0x73800000);
+ status |= test__addsf3(0x7f000001, 0xff000002, 0xf3800000);
+ status |= test__addsf3(0x7f000002, 0xfe800001, 0x7e800003);
+ status |= test__addsf3(0x7f7ffffe, 0x3f800000, 0x7f7ffffe);
+ status |= test__addsf3(0x7f7ffffe, 0x7f7ffffe, 0x7f800000);
+ status |= test__addsf3(0x7f7ffffe, 0x7f7fffff, 0x7f800000);
+ status |= test__addsf3(0x7f7ffffe, 0xbf800000, 0x7f7ffffe);
+ status |= test__addsf3(0x7f7ffffe, 0xff7fffff, 0xf3800000);
+ status |= test__addsf3(0x7f7fffff, 0x3f800000, 0x7f7fffff);
+ status |= test__addsf3(0x7f7fffff, 0x80000001, 0x7f7fffff);
+ status |= test__addsf3(0x7f7fffff, 0xbf800000, 0x7f7fffff);
+ status |= test__addsf3(0x7f7fffff, 0xff7fffff, 0x00000000);
+ status |= test__addsf3(0x7f800000, 0x00000000, 0x7f800000);
+ status |= test__addsf3(0x7f800000, 0x007fffff, 0x7f800000);
+ status |= test__addsf3(0x7f800000, 0x7f000000, 0x7f800000);
+ status |= test__addsf3(0x7f800000, 0x7f800000, 0x7f800000);
+ status |= test__addsf3(0x7f800000, 0x80000000, 0x7f800000);
+ status |= test__addsf3(0x7f800000, 0x807fffff, 0x7f800000);
+ status |= test__addsf3(0x7f800000, 0xff000000, 0x7f800000);
+ status |= test__addsf3(0x80000000, 0x00000000, 0x00000000);
+ status |= test__addsf3(0x80000000, 0x007fffff, 0x007fffff);
+ status |= test__addsf3(0x80000000, 0x7f000000, 0x7f000000);
+ status |= test__addsf3(0x80000000, 0x7f800000, 0x7f800000);
+ status |= test__addsf3(0x80000000, 0x80000000, 0x80000000);
+ status |= test__addsf3(0x80000000, 0x807fffff, 0x807fffff);
+ status |= test__addsf3(0x80000000, 0x80800000, 0x80800000);
+ status |= test__addsf3(0x80000000, 0xbf800000, 0xbf800000);
+ status |= test__addsf3(0x80000000, 0xff800000, 0xff800000);
+ status |= test__addsf3(0x80000001, 0x00000001, 0x00000000);
+ status |= test__addsf3(0x80000001, 0x80000001, 0x80000002);
+ status |= test__addsf3(0x80000001, 0xbf7fffff, 0xbf7fffff);
+ status |= test__addsf3(0x80000001, 0xbf800000, 0xbf800000);
+ status |= test__addsf3(0x80000001, 0xbffffffe, 0xbffffffe);
+ status |= test__addsf3(0x80000001, 0xbfffffff, 0xbfffffff);
+ status |= test__addsf3(0x80000001, 0xfeffffff, 0xfeffffff);
+ status |= test__addsf3(0x80000001, 0xff000000, 0xff000000);
+ status |= test__addsf3(0x80000001, 0xff7ffffe, 0xff7ffffe);
+ status |= test__addsf3(0x80000001, 0xff7fffff, 0xff7fffff);
+ status |= test__addsf3(0x80000002, 0x00000001, 0x80000001);
+ status |= test__addsf3(0x80000003, 0x00000000, 0x80000003);
+ status |= test__addsf3(0x80000003, 0x00000002, 0x80000001);
+ status |= test__addsf3(0x80000003, 0x40400000, 0x40400000);
+ status |= test__addsf3(0x80000003, 0x7f000000, 0x7f000000);
+ status |= test__addsf3(0x80000003, 0x7f800000, 0x7f800000);
+ status |= test__addsf3(0x80000003, 0x80000000, 0x80000003);
+ status |= test__addsf3(0x80000003, 0xff800000, 0xff800000);
+ status |= test__addsf3(0x80000004, 0x80000004, 0x80000008);
+ status |= test__addsf3(0x807ffffd, 0x007ffffe, 0x00000001);
+ status |= test__addsf3(0x807fffff, 0x007ffffe, 0x80000001);
+ status |= test__addsf3(0x807fffff, 0x007fffff, 0x00000000);
+ status |= test__addsf3(0x807fffff, 0x00800000, 0x00000001);
+ status |= test__addsf3(0x807fffff, 0x807fffff, 0x80fffffe);
+ status |= test__addsf3(0x80800000, 0x00000000, 0x80800000);
+ status |= test__addsf3(0x80800000, 0x00800000, 0x00000000);
+ status |= test__addsf3(0x80800001, 0x00800000, 0x80000001);
+ status |= test__addsf3(0x80800001, 0x00800002, 0x00000001);
+ status |= test__addsf3(0x80ffffff, 0x01000000, 0x00000001);
+ status |= test__addsf3(0x80ffffff, 0x01000002, 0x00000005);
+ status |= test__addsf3(0x80ffffff, 0x01000004, 0x00000009);
+ status |= test__addsf3(0x81000000, 0x00ffffff, 0x80000001);
+ status |= test__addsf3(0x81000001, 0x00800001, 0x80800001);
+ status |= test__addsf3(0x81000001, 0x00ffffff, 0x80000003);
+ status |= test__addsf3(0x81000002, 0x00800001, 0x80800003);
+ status |= test__addsf3(0x817fffff, 0x01800000, 0x00000002);
+ status |= test__addsf3(0x81800000, 0x017fffff, 0x80000002);
+ status |= test__addsf3(0x81800001, 0x017fffff, 0x80000006);
+ status |= test__addsf3(0x81800002, 0x01000003, 0x81000001);
+ status |= test__addsf3(0xbf800000, 0x80000000, 0xbf800000);
+ status |= test__addsf3(0xbf800000, 0xbf800003, 0xc0000002);
+ status |= test__addsf3(0xbf800001, 0x3f800000, 0xb4000000);
+ status |= test__addsf3(0xbf800001, 0x3f800002, 0x34000000);
+ status |= test__addsf3(0xbf800001, 0xbf800000, 0xc0000000);
+ status |= test__addsf3(0xbffffffc, 0x3ffffffd, 0x34000000);
+ status |= test__addsf3(0xbfffffff, 0x00000001, 0xbfffffff);
+ status |= test__addsf3(0xbfffffff, 0x40000000, 0x34000000);
+ status |= test__addsf3(0xc0000000, 0x3fffffff, 0xb4000000);
+ status |= test__addsf3(0xc0000000, 0x40000001, 0x34800000);
+ status |= test__addsf3(0xc0000000, 0xc0000001, 0xc0800000);
+ status |= test__addsf3(0xc0000001, 0x3f800001, 0xbf800001);
+ status |= test__addsf3(0xc0000001, 0xc0000002, 0xc0800002);
+ status |= test__addsf3(0xc0000002, 0x3f800001, 0xbf800003);
+ status |= test__addsf3(0xc0000002, 0x3f800003, 0xbf800001);
+ status |= test__addsf3(0xc0000004, 0x40000003, 0xb4800000);
+ status |= test__addsf3(0xc0400000, 0x40400000, 0x00000000);
+ status |= test__addsf3(0xc07fffff, 0x407ffffe, 0xb4800000);
+ status |= test__addsf3(0xc07fffff, 0x40800002, 0x35a00000);
+ status |= test__addsf3(0xc07fffff, 0xb3ffffff, 0xc07fffff);
+ status |= test__addsf3(0xc07fffff, 0xb4000000, 0xc0800000);
+ status |= test__addsf3(0xc0800001, 0x407fffff, 0xb5400000);
+ status |= test__addsf3(0xfd800001, 0x7d7fffff, 0xf2400000);
+ status |= test__addsf3(0xfe7fffff, 0x7e7ffffe, 0xf2800000);
+ status |= test__addsf3(0xfe7fffff, 0x7e800002, 0x73a00000);
+ status |= test__addsf3(0xfe800000, 0x7e7fffff, 0xf2800000);
+ status |= test__addsf3(0xfe800000, 0x7e800001, 0x73000000);
+ status |= test__addsf3(0xfe800001, 0x7f000001, 0x7e800001);
+ status |= test__addsf3(0xfe800001, 0xfe800000, 0xff000000);
+ status |= test__addsf3(0xfe800002, 0x7e000003, 0xfe000001);
+ status |= test__addsf3(0xfe800004, 0x7e800003, 0xf3000000);
+ status |= test__addsf3(0xfefffffe, 0x7efffffe, 0x00000000);
+ status |= test__addsf3(0xfefffffe, 0xfefffffe, 0xff7ffffe);
+ status |= test__addsf3(0xfefffffe, 0xfeffffff, 0xff7ffffe);
+ status |= test__addsf3(0xfeffffff, 0x3f800000, 0xfeffffff);
+ status |= test__addsf3(0xfeffffff, 0x7f000000, 0x73000000);
+ status |= test__addsf3(0xfeffffff, 0xbf800000, 0xfeffffff);
+ status |= test__addsf3(0xfeffffff, 0xff000000, 0xff800000);
+ status |= test__addsf3(0xff000000, 0x00000000, 0xff000000);
+ status |= test__addsf3(0xff000000, 0x3f800000, 0xff000000);
+ status |= test__addsf3(0xff000000, 0x7f800000, 0x7f800000);
+ status |= test__addsf3(0xff000000, 0x80000000, 0xff000000);
+ status |= test__addsf3(0xff000000, 0xbf800000, 0xff000000);
+ status |= test__addsf3(0xff000000, 0xff000000, 0xff800000);
+ status |= test__addsf3(0xff000000, 0xff800000, 0xff800000);
+ status |= test__addsf3(0xff000001, 0x7f000000, 0xf3800000);
+ status |= test__addsf3(0xff000001, 0x7f000002, 0x73800000);
+ status |= test__addsf3(0xff000001, 0xff000000, 0xff800000);
+ status |= test__addsf3(0xff000002, 0x7e800001, 0xfe800003);
+ status |= test__addsf3(0xff7ffffe, 0x3f800000, 0xff7ffffe);
+ status |= test__addsf3(0xff7ffffe, 0x7f7fffff, 0x73800000);
+ status |= test__addsf3(0xff7ffffe, 0xbf800000, 0xff7ffffe);
+ status |= test__addsf3(0xff7ffffe, 0xff7ffffe, 0xff800000);
+ status |= test__addsf3(0xff7ffffe, 0xff7fffff, 0xff800000);
+ status |= test__addsf3(0xff7fffff, 0x00000001, 0xff7fffff);
+ status |= test__addsf3(0xff7fffff, 0x3f800000, 0xff7fffff);
+ status |= test__addsf3(0xff7fffff, 0xbf800000, 0xff7fffff);
+ status |= test__addsf3(0xff800000, 0x00000000, 0xff800000);
+ status |= test__addsf3(0xff800000, 0x007fffff, 0xff800000);
+ status |= test__addsf3(0xff800000, 0x7f000000, 0xff800000);
+ status |= test__addsf3(0xff800000, 0x80000000, 0xff800000);
+ status |= test__addsf3(0xff800000, 0x807fffff, 0xff800000);
+ status |= test__addsf3(0xff800000, 0xff000000, 0xff800000);
+ status |= test__addsf3(0xff800000, 0xff800000, 0xff800000);
+ status |= test__addsf3(0x7f7fffff, 0x74ffffff, 0x7f800000);
+ status |= test__addsf3(0x3f7fffff, 0x34004000, 0x3f800001);
+ status |= test__addsf3(0x3f800001, 0x23800000, 0x3f800001);
+ status |= test__addsf3(0xbbebe66d, 0x3b267c1f, 0xbb98a85e);
+ status |= test__addsf3(0x01f5b166, 0x81339a37, 0x019be44a);
+
+ // Test that the result of an operation is a NaN at all when it should be.
+ //
+ // In most configurations these tests' results are checked using
+ // compareResultF, so we set all the answers to the canonical NaN 0x7fc00000,
+ // which causes compareResultF to accept any NaN encoding. We also use the
+ // same value as the input NaN in tests that have one, so that even in
+ // EXPECT_EXACT_RESULTS mode these tests should pass, because 0x7fc00000 is
+ // still the exact expected NaN.
+ status |= test__addsf3(0x7f800000, 0xff800000, 0x7fc00000);
+ status |= test__addsf3(0xff800000, 0x7f800000, 0x7fc00000);
+ status |= test__addsf3(0x3f800000, 0x7fc00000, 0x7fc00000);
+ status |= test__addsf3(0x7fc00000, 0x3f800000, 0x7fc00000);
+ status |= test__addsf3(0x7fc00000, 0x7fc00000, 0x7fc00000);
+
+#ifdef ARM_NAN_HANDLING
+ // Tests specific to the NaN handling of Arm hardware, mimicked by
+ // arm/addsf3.S:
+ //
+ // - a quiet NaN is distinguished by the top mantissa bit being 1
+ //
+ // - if a signalling NaN appears in the input, the output quiet NaN is
+ // obtained by setting its top mantissa bit and leaving everything else
+ // unchanged
+ //
+ // - if both operands are signalling NaNs then the output NaN is derived
+ // from the first operand
+ //
+ // - if both operands are quiet NaNs then the output NaN is the first
+ // operand
+ //
+ // - invalid operations not involving an input NaN return the quiet
+ // NaN with fewest bits set, 0x7fc00000.
+
+ status |= test__addsf3(0x00000000, 0x7fad4be3, 0x7fed4be3);
+ status |= test__addsf3(0x00000000, 0x7fdf48c7, 0x7fdf48c7);
+ status |= test__addsf3(0x00000001, 0x7f970eba, 0x7fd70eba);
+ status |= test__addsf3(0x00000001, 0x7fc35716, 0x7fc35716);
+ status |= test__addsf3(0x007fffff, 0x7fbf52d6, 0x7fff52d6);
+ status |= test__addsf3(0x007fffff, 0x7fc7a2df, 0x7fc7a2df);
+ status |= test__addsf3(0x3f800000, 0x7f987a85, 0x7fd87a85);
+ status |= test__addsf3(0x3f800000, 0x7fc50124, 0x7fc50124);
+ status |= test__addsf3(0x7f7fffff, 0x7f95fd6f, 0x7fd5fd6f);
+ status |= test__addsf3(0x7f7fffff, 0x7ffc28dc, 0x7ffc28dc);
+ status |= test__addsf3(0x7f800000, 0x7f8dd790, 0x7fcdd790);
+ status |= test__addsf3(0x7f800000, 0x7fd2ef2b, 0x7fd2ef2b);
+ status |= test__addsf3(0x7f800000, 0xff800000, 0x7fc00000);
+ status |= test__addsf3(0x7f99b09d, 0x00000000, 0x7fd9b09d);
+ status |= test__addsf3(0x7f93541e, 0x00000001, 0x7fd3541e);
+ status |= test__addsf3(0x7f9fc002, 0x007fffff, 0x7fdfc002);
+ status |= test__addsf3(0x7fb5db77, 0x3f800000, 0x7ff5db77);
+ status |= test__addsf3(0x7f9f5d92, 0x7f7fffff, 0x7fdf5d92);
+ status |= test__addsf3(0x7fac7a36, 0x7f800000, 0x7fec7a36);
+ status |= test__addsf3(0x7fb42008, 0x7fb0ee07, 0x7ff42008);
+ status |= test__addsf3(0x7f8bd740, 0x7fc7aaf1, 0x7fcbd740);
+ status |= test__addsf3(0x7f9bb57b, 0x80000000, 0x7fdbb57b);
+ status |= test__addsf3(0x7f951a78, 0x80000001, 0x7fd51a78);
+ status |= test__addsf3(0x7f9ba63b, 0x807fffff, 0x7fdba63b);
+ status |= test__addsf3(0x7f89463c, 0xbf800000, 0x7fc9463c);
+ status |= test__addsf3(0x7fb63563, 0xff7fffff, 0x7ff63563);
+ status |= test__addsf3(0x7f90886e, 0xff800000, 0x7fd0886e);
+ status |= test__addsf3(0x7fe8c15e, 0x00000000, 0x7fe8c15e);
+ status |= test__addsf3(0x7fe915ae, 0x00000001, 0x7fe915ae);
+ status |= test__addsf3(0x7ffa9b42, 0x007fffff, 0x7ffa9b42);
+ status |= test__addsf3(0x7fdad0f5, 0x3f800000, 0x7fdad0f5);
+ status |= test__addsf3(0x7fd10dcb, 0x7f7fffff, 0x7fd10dcb);
+ status |= test__addsf3(0x7fd08e8a, 0x7f800000, 0x7fd08e8a);
+ status |= test__addsf3(0x7fc3a9e6, 0x7f91a816, 0x7fd1a816);
+ status |= test__addsf3(0x7fdb229c, 0x7fc26c68, 0x7fdb229c);
+ status |= test__addsf3(0x7fc9f6bb, 0x80000000, 0x7fc9f6bb);
+ status |= test__addsf3(0x7ffa178b, 0x80000001, 0x7ffa178b);
+ status |= test__addsf3(0x7fef2a0b, 0x807fffff, 0x7fef2a0b);
+ status |= test__addsf3(0x7ffc885b, 0xbf800000, 0x7ffc885b);
+ status |= test__addsf3(0x7fd26e8c, 0xff7fffff, 0x7fd26e8c);
+ status |= test__addsf3(0x7fc55329, 0xff800000, 0x7fc55329);
+ status |= test__addsf3(0x80000000, 0x7fa833ae, 0x7fe833ae);
+ status |= test__addsf3(0x80000000, 0x7fc4df63, 0x7fc4df63);
+ status |= test__addsf3(0x80000001, 0x7f98827d, 0x7fd8827d);
+ status |= test__addsf3(0x80000001, 0x7fd7acc5, 0x7fd7acc5);
+ status |= test__addsf3(0x807fffff, 0x7fad19c0, 0x7fed19c0);
+ status |= test__addsf3(0x807fffff, 0x7ffe1907, 0x7ffe1907);
+ status |= test__addsf3(0xbf800000, 0x7fa95487, 0x7fe95487);
+ status |= test__addsf3(0xbf800000, 0x7fd2bbee, 0x7fd2bbee);
+ status |= test__addsf3(0xff7fffff, 0x7f86ba21, 0x7fc6ba21);
+ status |= test__addsf3(0xff7fffff, 0x7feb00d7, 0x7feb00d7);
+ status |= test__addsf3(0xff800000, 0x7f800000, 0x7fc00000);
+ status |= test__addsf3(0xff800000, 0x7f857fdc, 0x7fc57fdc);
+ status |= test__addsf3(0xff800000, 0x7fde0397, 0x7fde0397);
+#endif // ARM_NAN_HANDLING
+
+ return status;
+}
diff --git a/compiler-rt/test/builtins/Unit/subsf3_test.c b/compiler-rt/test/builtins/Unit/subsf3_test.c
new file mode 100644
index 0000000000000..b9c1b2ac4362a
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/subsf3_test.c
@@ -0,0 +1,382 @@
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+ // REQUIRES: librt_has_subsf3
+
+#include "int_lib.h"
+#include <inttypes.h>
+#include <stdio.h>
+
+#include "fp_test.h"
+
+// By default this test uses compareResultF to check the returned floats, which
+// accepts any returned NaN if the expected result is the canonical NaN value
+// 0x7fc00000. For the Arm assembler FP implementations, which commit to a
+// more detailed handling of NaNs, we tighten up the check and include some
+// extra test cases specific to that NaN policy.
+#if __arm__ || __thumb__
+# define EXPECT_EXACT_RESULTS
+# define ARM_NAN_HANDLING
+#endif
+
+// Returns: a - b
+COMPILER_RT_ABI float __subsf3(float a, float b);
+
+int test__subsf3(int line, uint32_t a_rep, uint32_t b_rep, uint32_t expected_rep) {
+ float a = fromRep32(a_rep), b = fromRep32(b_rep);
+ float x = __subsf3(a, b);
+#ifdef EXPECT_EXACT_RESULTS
+ int ret = toRep32(x) != expected_rep;
+#else
+ int ret = compareResultF(x, expected_rep);
+#endif
+
+ if (ret) {
+ printf("error at line %d: __subsf3(%08" PRIx32 ", %08" PRIx32 ") = %08" PRIx32
+ ", expected %08" PRIx32 "\n",
+ line, a_rep, b_rep, toRep32(x), expected_rep);
+ }
+ return ret;
+}
+
+#define test__subsf3(a,b,x) test__subsf3(__LINE__,a,b,x)
+
+int main() {
+ int status = 0;
+
+ status |= test__subsf3(0x00000000, 0x00000000, 0x00000000);
+ status |= test__subsf3(0x00000000, 0x007fffff, 0x807fffff);
+ status |= test__subsf3(0x00000000, 0x00800000, 0x80800000);
+ status |= test__subsf3(0x00000000, 0x7f800000, 0xff800000);
+ status |= test__subsf3(0x00000000, 0x80000000, 0x00000000);
+ status |= test__subsf3(0x00000000, 0x807fffff, 0x007fffff);
+ status |= test__subsf3(0x00000000, 0xbf800000, 0x3f800000);
+ status |= test__subsf3(0x00000000, 0xff000000, 0x7f000000);
+ status |= test__subsf3(0x00000000, 0xff800000, 0x7f800000);
+ status |= test__subsf3(0x00000001, 0x00000001, 0x00000000);
+ status |= test__subsf3(0x00000001, 0x80000001, 0x00000002);
+ status |= test__subsf3(0x00000001, 0xbf7fffff, 0x3f7fffff);
+ status |= test__subsf3(0x00000001, 0xbf800000, 0x3f800000);
+ status |= test__subsf3(0x00000001, 0xbffffffe, 0x3ffffffe);
+ status |= test__subsf3(0x00000001, 0xbfffffff, 0x3fffffff);
+ status |= test__subsf3(0x00000001, 0xfeffffff, 0x7effffff);
+ status |= test__subsf3(0x00000001, 0xff000000, 0x7f000000);
+ status |= test__subsf3(0x00000001, 0xff7ffffe, 0x7f7ffffe);
+ status |= test__subsf3(0x00000001, 0xff7fffff, 0x7f7fffff);
+ status |= test__subsf3(0x00000002, 0x00000001, 0x00000001);
+ status |= test__subsf3(0x00000003, 0x00000000, 0x00000003);
+ status |= test__subsf3(0x00000003, 0x00000002, 0x00000001);
+ status |= test__subsf3(0x00000003, 0x40a00000, 0xc0a00000);
+ status |= test__subsf3(0x00000003, 0x7f000000, 0xff000000);
+ status |= test__subsf3(0x00000003, 0x7f800000, 0xff800000);
+ status |= test__subsf3(0x00000003, 0x80000000, 0x00000003);
+ status |= test__subsf3(0x00000003, 0xff800000, 0x7f800000);
+ status |= test__subsf3(0x00000004, 0x80000004, 0x00000008);
+ status |= test__subsf3(0x007ffffc, 0x007ffffc, 0x00000000);
+ status |= test__subsf3(0x007ffffd, 0x007ffffe, 0x80000001);
+ status |= test__subsf3(0x007fffff, 0x007ffffe, 0x00000001);
+ status |= test__subsf3(0x007fffff, 0x00800000, 0x80000001);
+ status |= test__subsf3(0x007fffff, 0x807fffff, 0x00fffffe);
+ status |= test__subsf3(0x00800000, 0x00800000, 0x00000000);
+ status |= test__subsf3(0x00800000, 0x80000000, 0x00800000);
+ status |= test__subsf3(0x00800000, 0x80800000, 0x01000000);
+ status |= test__subsf3(0x00800001, 0x00800000, 0x00000001);
+ status |= test__subsf3(0x00800001, 0x00800002, 0x80000001);
+ status |= test__subsf3(0x00ffffff, 0x01000000, 0x80000001);
+ status |= test__subsf3(0x00ffffff, 0x01000002, 0x80000005);
+ status |= test__subsf3(0x00ffffff, 0x01000004, 0x80000009);
+ status |= test__subsf3(0x01000000, 0x00ffffff, 0x00000001);
+ status |= test__subsf3(0x01000001, 0x00800001, 0x00800001);
+ status |= test__subsf3(0x01000001, 0x00ffffff, 0x00000003);
+ status |= test__subsf3(0x01000002, 0x00800001, 0x00800003);
+ status |= test__subsf3(0x017fffff, 0x01800000, 0x80000002);
+ status |= test__subsf3(0x01800000, 0x017fffff, 0x00000002);
+ status |= test__subsf3(0x01800001, 0x017fffff, 0x00000006);
+ status |= test__subsf3(0x01800002, 0x01000003, 0x01000001);
+ status |= test__subsf3(0x3f7fffff, 0x00000001, 0x3f7fffff);
+ status |= test__subsf3(0x3f800000, 0x00000000, 0x3f800000);
+ status |= test__subsf3(0x3f800000, 0x3f800000, 0x00000000);
+ status |= test__subsf3(0x3f800000, 0xbf800000, 0x40000000);
+ status |= test__subsf3(0x3f800000, 0xbf800003, 0x40000002);
+ status |= test__subsf3(0x3f800000, 0xc0000000, 0x40400000);
+ status |= test__subsf3(0x3f800000, 0xc0e00000, 0x41000000);
+ status |= test__subsf3(0x3f800001, 0x3f800000, 0x34000000);
+ status |= test__subsf3(0x3f800001, 0x3f800002, 0xb4000000);
+ status |= test__subsf3(0x3f800001, 0xbf800000, 0x40000000);
+ status |= test__subsf3(0x3ffffffc, 0x3ffffffd, 0xb4000000);
+ status |= test__subsf3(0x3fffffff, 0x40000000, 0xb4000000);
+ status |= test__subsf3(0x40000000, 0x3fffffff, 0x34000000);
+ status |= test__subsf3(0x40000000, 0x40000000, 0x00000000);
+ status |= test__subsf3(0x40000000, 0x40000001, 0xb4800000);
+ status |= test__subsf3(0x40000000, 0x40a00000, 0xc0400000);
+ status |= test__subsf3(0x40000000, 0xb4000000, 0x40000000);
+ status |= test__subsf3(0x40000000, 0xbf800000, 0x40400000);
+ status |= test__subsf3(0x40000000, 0xc0000000, 0x40800000);
+ status |= test__subsf3(0x40000000, 0xc0000001, 0x40800000);
+ status |= test__subsf3(0x40000001, 0x3f800001, 0x3f800001);
+ status |= test__subsf3(0x40000001, 0xb4000000, 0x40000002);
+ status |= test__subsf3(0x40000001, 0xc0000002, 0x40800002);
+ status |= test__subsf3(0x40000002, 0x3f800001, 0x3f800003);
+ status |= test__subsf3(0x40000002, 0x3f800003, 0x3f800001);
+ status |= test__subsf3(0x40000004, 0x40000003, 0x34800000);
+ status |= test__subsf3(0x40400000, 0xc0400000, 0x40c00000);
+ status |= test__subsf3(0x407fffff, 0x407ffffe, 0x34800000);
+ status |= test__subsf3(0x407fffff, 0x40800002, 0xb5a00000);
+ status |= test__subsf3(0x407fffff, 0xb3ffffff, 0x407fffff);
+ status |= test__subsf3(0x407fffff, 0xb4000000, 0x40800000);
+ status |= test__subsf3(0x40800001, 0x407fffff, 0x35400000);
+ status |= test__subsf3(0x40a00000, 0x00000000, 0x40a00000);
+ status |= test__subsf3(0x40a00000, 0x3f800000, 0x40800000);
+ status |= test__subsf3(0x40a00000, 0x40a00000, 0x00000000);
+ status |= test__subsf3(0x40a00000, 0x80000000, 0x40a00000);
+ status |= test__subsf3(0x7d800001, 0x7d7fffff, 0x72400000);
+ status |= test__subsf3(0x7e7fffff, 0x7e7ffffe, 0x72800000);
+ status |= test__subsf3(0x7e7fffff, 0x7e800002, 0xf3a00000);
+ status |= test__subsf3(0x7e800000, 0x7e7fffff, 0x72800000);
+ status |= test__subsf3(0x7e800000, 0x7e800001, 0xf3000000);
+ status |= test__subsf3(0x7e800000, 0xfe800000, 0x7f000000);
+ status |= test__subsf3(0x7e800001, 0x7f000001, 0xfe800001);
+ status |= test__subsf3(0x7e800001, 0xfe800000, 0x7f000000);
+ status |= test__subsf3(0x7e800002, 0x7e000003, 0x7e000001);
+ status |= test__subsf3(0x7e800004, 0x7e800003, 0x73000000);
+ status |= test__subsf3(0x7efffffe, 0xfefffffe, 0x7f7ffffe);
+ status |= test__subsf3(0x7efffffe, 0xfeffffff, 0x7f7ffffe);
+ status |= test__subsf3(0x7effffff, 0x3f800000, 0x7effffff);
+ status |= test__subsf3(0x7effffff, 0x7f000000, 0xf3000000);
+ status |= test__subsf3(0x7effffff, 0xbf800000, 0x7effffff);
+ status |= test__subsf3(0x7effffff, 0xff000000, 0x7f800000);
+ status |= test__subsf3(0x7f000000, 0x3f800000, 0x7f000000);
+ status |= test__subsf3(0x7f000000, 0x7f000000, 0x00000000);
+ status |= test__subsf3(0x7f000000, 0x7f800000, 0xff800000);
+ status |= test__subsf3(0x7f000000, 0xbf800000, 0x7f000000);
+ status |= test__subsf3(0x7f000000, 0xff000000, 0x7f800000);
+ status |= test__subsf3(0x7f000000, 0xff800000, 0x7f800000);
+ status |= test__subsf3(0x7f000001, 0x7f000000, 0x73800000);
+ status |= test__subsf3(0x7f000001, 0x7f000002, 0xf3800000);
+ status |= test__subsf3(0x7f000001, 0xff000000, 0x7f800000);
+ status |= test__subsf3(0x7f000002, 0x7e800001, 0x7e800003);
+ status |= test__subsf3(0x7f7ffffe, 0x3f800000, 0x7f7ffffe);
+ status |= test__subsf3(0x7f7ffffe, 0x7f7fffff, 0xf3800000);
+ status |= test__subsf3(0x7f7ffffe, 0xbf800000, 0x7f7ffffe);
+ status |= test__subsf3(0x7f7ffffe, 0xff7ffffe, 0x7f800000);
+ status |= test__subsf3(0x7f7ffffe, 0xff7fffff, 0x7f800000);
+ status |= test__subsf3(0x7f7fffff, 0x00000001, 0x7f7fffff);
+ status |= test__subsf3(0x7f7fffff, 0x3f800000, 0x7f7fffff);
+ status |= test__subsf3(0x7f7fffff, 0x7f7fffff, 0x00000000);
+ status |= test__subsf3(0x7f7fffff, 0xbf800000, 0x7f7fffff);
+ status |= test__subsf3(0x7f800000, 0x00000000, 0x7f800000);
+ status |= test__subsf3(0x7f800000, 0x007fffff, 0x7f800000);
+ status |= test__subsf3(0x7f800000, 0x7f000000, 0x7f800000);
+ status |= test__subsf3(0x7f800000, 0x80000000, 0x7f800000);
+ status |= test__subsf3(0x7f800000, 0x807fffff, 0x7f800000);
+ status |= test__subsf3(0x7f800000, 0xff000000, 0x7f800000);
+ status |= test__subsf3(0x7f800000, 0xff800000, 0x7f800000);
+ status |= test__subsf3(0x80000000, 0x00000000, 0x80000000);
+ status |= test__subsf3(0x80000000, 0x007fffff, 0x807fffff);
+ status |= test__subsf3(0x80000000, 0x00800000, 0x80800000);
+ status |= test__subsf3(0x80000000, 0x3f800000, 0xbf800000);
+ status |= test__subsf3(0x80000000, 0x7f800000, 0xff800000);
+ status |= test__subsf3(0x80000000, 0x80000000, 0x00000000);
+ status |= test__subsf3(0x80000000, 0x807fffff, 0x007fffff);
+ status |= test__subsf3(0x80000000, 0xff000000, 0x7f000000);
+ status |= test__subsf3(0x80000000, 0xff800000, 0x7f800000);
+ status |= test__subsf3(0x80000001, 0x00000001, 0x80000002);
+ status |= test__subsf3(0x80000001, 0x3f7fffff, 0xbf7fffff);
+ status |= test__subsf3(0x80000001, 0x3f800000, 0xbf800000);
+ status |= test__subsf3(0x80000001, 0x3ffffffe, 0xbffffffe);
+ status |= test__subsf3(0x80000001, 0x3fffffff, 0xbfffffff);
+ status |= test__subsf3(0x80000001, 0x7effffff, 0xfeffffff);
+ status |= test__subsf3(0x80000001, 0x7f000000, 0xff000000);
+ status |= test__subsf3(0x80000001, 0x7f7ffffe, 0xff7ffffe);
+ status |= test__subsf3(0x80000001, 0x7f7fffff, 0xff7fffff);
+ status |= test__subsf3(0x80000001, 0x80000001, 0x00000000);
+ status |= test__subsf3(0x80000002, 0x80000001, 0x80000001);
+ status |= test__subsf3(0x80000003, 0x00000000, 0x80000003);
+ status |= test__subsf3(0x80000003, 0x7f800000, 0xff800000);
+ status |= test__subsf3(0x80000003, 0x80000000, 0x80000003);
+ status |= test__subsf3(0x80000003, 0x80000002, 0x80000001);
+ status |= test__subsf3(0x80000003, 0xc0400000, 0x40400000);
+ status |= test__subsf3(0x80000003, 0xff000000, 0x7f000000);
+ status |= test__subsf3(0x80000003, 0xff800000, 0x7f800000);
+ status |= test__subsf3(0x80000004, 0x00000004, 0x80000008);
+ status |= test__subsf3(0x807ffffd, 0x807ffffe, 0x00000001);
+ status |= test__subsf3(0x807fffff, 0x007fffff, 0x80fffffe);
+ status |= test__subsf3(0x807fffff, 0x807ffffe, 0x80000001);
+ status |= test__subsf3(0x807fffff, 0x807fffff, 0x00000000);
+ status |= test__subsf3(0x807fffff, 0x80800000, 0x00000001);
+ status |= test__subsf3(0x80800000, 0x80000000, 0x80800000);
+ status |= test__subsf3(0x80800000, 0x80800000, 0x00000000);
+ status |= test__subsf3(0x80800001, 0x80800000, 0x80000001);
+ status |= test__subsf3(0x80800001, 0x80800002, 0x00000001);
+ status |= test__subsf3(0x80ffffff, 0x81000000, 0x00000001);
+ status |= test__subsf3(0x80ffffff, 0x81000002, 0x00000005);
+ status |= test__subsf3(0x80ffffff, 0x81000004, 0x00000009);
+ status |= test__subsf3(0x81000000, 0x80ffffff, 0x80000001);
+ status |= test__subsf3(0x81000001, 0x80800001, 0x80800001);
+ status |= test__subsf3(0x81000001, 0x80ffffff, 0x80000003);
+ status |= test__subsf3(0x81000002, 0x80800001, 0x80800003);
+ status |= test__subsf3(0x817fffff, 0x81800000, 0x00000002);
+ status |= test__subsf3(0x81800000, 0x817fffff, 0x80000002);
+ status |= test__subsf3(0x81800001, 0x817fffff, 0x80000006);
+ status |= test__subsf3(0x81800002, 0x81000003, 0x81000001);
+ status |= test__subsf3(0xbf800000, 0x00000000, 0xbf800000);
+ status |= test__subsf3(0xbf800000, 0x3f800003, 0xc0000002);
+ status |= test__subsf3(0xbf800001, 0x3f800000, 0xc0000000);
+ status |= test__subsf3(0xbf800001, 0xbf800000, 0xb4000000);
+ status |= test__subsf3(0xbf800001, 0xbf800002, 0x34000000);
+ status |= test__subsf3(0xbffffffc, 0xbffffffd, 0x34000000);
+ status |= test__subsf3(0xbfffffff, 0x80000001, 0xbfffffff);
+ status |= test__subsf3(0xbfffffff, 0xc0000000, 0x34000000);
+ status |= test__subsf3(0xc0000000, 0x40000001, 0xc0800000);
+ status |= test__subsf3(0xc0000000, 0xbfffffff, 0xb4000000);
+ status |= test__subsf3(0xc0000000, 0xc0000001, 0x34800000);
+ status |= test__subsf3(0xc0000001, 0x40000002, 0xc0800002);
+ status |= test__subsf3(0xc0000001, 0xbf800001, 0xbf800001);
+ status |= test__subsf3(0xc0000002, 0xbf800001, 0xbf800003);
+ status |= test__subsf3(0xc0000002, 0xbf800003, 0xbf800001);
+ status |= test__subsf3(0xc0000004, 0xc0000003, 0xb4800000);
+ status |= test__subsf3(0xc0400000, 0xc0400000, 0x00000000);
+ status |= test__subsf3(0xc07fffff, 0x33ffffff, 0xc07fffff);
+ status |= test__subsf3(0xc07fffff, 0x34000000, 0xc0800000);
+ status |= test__subsf3(0xc07fffff, 0xc07ffffe, 0xb4800000);
+ status |= test__subsf3(0xc07fffff, 0xc0800002, 0x35a00000);
+ status |= test__subsf3(0xc0800001, 0xc07fffff, 0xb5400000);
+ status |= test__subsf3(0xfd800001, 0xfd7fffff, 0xf2400000);
+ status |= test__subsf3(0xfe7fffff, 0xfe7ffffe, 0xf2800000);
+ status |= test__subsf3(0xfe7fffff, 0xfe800002, 0x73a00000);
+ status |= test__subsf3(0xfe800000, 0xfe7fffff, 0xf2800000);
+ status |= test__subsf3(0xfe800000, 0xfe800001, 0x73000000);
+ status |= test__subsf3(0xfe800001, 0x7e800000, 0xff000000);
+ status |= test__subsf3(0xfe800001, 0xff000001, 0x7e800001);
+ status |= test__subsf3(0xfe800002, 0xfe000003, 0xfe000001);
+ status |= test__subsf3(0xfe800004, 0xfe800003, 0xf3000000);
+ status |= test__subsf3(0xfefffffe, 0x7efffffe, 0xff7ffffe);
+ status |= test__subsf3(0xfefffffe, 0x7effffff, 0xff7ffffe);
+ status |= test__subsf3(0xfefffffe, 0xfefffffe, 0x00000000);
+ status |= test__subsf3(0xfeffffff, 0x3f800000, 0xfeffffff);
+ status |= test__subsf3(0xfeffffff, 0x7f000000, 0xff800000);
+ status |= test__subsf3(0xfeffffff, 0xbf800000, 0xfeffffff);
+ status |= test__subsf3(0xfeffffff, 0xff000000, 0x73000000);
+ status |= test__subsf3(0xff000000, 0x00000000, 0xff000000);
+ status |= test__subsf3(0xff000000, 0x3f800000, 0xff000000);
+ status |= test__subsf3(0xff000000, 0x7f000000, 0xff800000);
+ status |= test__subsf3(0xff000000, 0x7f800000, 0xff800000);
+ status |= test__subsf3(0xff000000, 0x80000000, 0xff000000);
+ status |= test__subsf3(0xff000000, 0xbf800000, 0xff000000);
+ status |= test__subsf3(0xff000000, 0xff800000, 0x7f800000);
+ status |= test__subsf3(0xff000001, 0x7f000000, 0xff800000);
+ status |= test__subsf3(0xff000001, 0xff000000, 0xf3800000);
+ status |= test__subsf3(0xff000001, 0xff000002, 0x73800000);
+ status |= test__subsf3(0xff000002, 0xfe800001, 0xfe800003);
+ status |= test__subsf3(0xff7ffffe, 0x3f800000, 0xff7ffffe);
+ status |= test__subsf3(0xff7ffffe, 0x7f7ffffe, 0xff800000);
+ status |= test__subsf3(0xff7ffffe, 0x7f7fffff, 0xff800000);
+ status |= test__subsf3(0xff7ffffe, 0xbf800000, 0xff7ffffe);
+ status |= test__subsf3(0xff7ffffe, 0xff7fffff, 0x73800000);
+ status |= test__subsf3(0xff7fffff, 0x3f800000, 0xff7fffff);
+ status |= test__subsf3(0xff7fffff, 0x80000001, 0xff7fffff);
+ status |= test__subsf3(0xff7fffff, 0xbf800000, 0xff7fffff);
+ status |= test__subsf3(0xff800000, 0x00000000, 0xff800000);
+ status |= test__subsf3(0xff800000, 0x007fffff, 0xff800000);
+ status |= test__subsf3(0xff800000, 0x7f000000, 0xff800000);
+ status |= test__subsf3(0xff800000, 0x7f800000, 0xff800000);
+ status |= test__subsf3(0xff800000, 0x80000000, 0xff800000);
+ status |= test__subsf3(0xff800000, 0x807fffff, 0xff800000);
+ status |= test__subsf3(0xff800000, 0xff000000, 0xff800000);
+ status |= test__subsf3(0x46f99cee, 0x4656466d, 0x468e79b8);
+ status |= test__subsf3(0x007ffff7, 0x00f7ffff, 0x80780008);
+ status |= test__subsf3(0x80ffffbf, 0x80800000, 0x807fffbf);
+
+ // Test that the result of an operation is a NaN at all when it should be.
+ //
+ // In most configurations these tests' results are checked using
+ // compareResultF, so we set all the answers to the canonical NaN 0x7fc00000,
+ // which causes compareResultF to accept any NaN encoding. We also use the
+ // same value as the input NaN in tests that have one, so that even in
+ // EXPECT_EXACT_RESULTS mode these tests should pass, because 0x7fc00000 is
+ // still the exact expected NaN.
+ status |= test__subsf3(0x7f800000, 0x7f800000, 0x7fc00000);
+ status |= test__subsf3(0xff800000, 0xff800000, 0x7fc00000);
+ status |= test__subsf3(0x3f800000, 0x7fc00000, 0x7fc00000);
+ status |= test__subsf3(0x7fc00000, 0x3f800000, 0x7fc00000);
+ status |= test__subsf3(0x7fc00000, 0x7fc00000, 0x7fc00000);
+
+#ifdef ARM_NAN_HANDLING
+ // Tests specific to the NaN handling of Arm hardware, mimicked by the
+ // subtraction function in arm/addsf3.S:
+ //
+ // - a quiet NaN is distinguished by the top mantissa bit being 1
+ //
+ // - if a signalling NaN appears in the input, the output quiet NaN is
+ // obtained by setting its top mantissa bit and leaving everything else
+ // unchanged
+ //
+ // - if both operands are signalling NaNs then the output NaN is derived
+ // from the first operand
+ //
+ // - if both operands are quiet NaNs then the output NaN is the first
+ // operand
+ //
+ // - invalid operations not involving an input NaN return the quiet
+ // NaN with fewest bits set, 0x7fc00000.
+
+ status |= test__subsf3(0x00000000, 0x7fad4be3, 0x7fed4be3);
+ status |= test__subsf3(0x00000000, 0x7fdf48c7, 0x7fdf48c7);
+ status |= test__subsf3(0x00000001, 0x7f970eba, 0x7fd70eba);
+ status |= test__subsf3(0x00000001, 0x7fc35716, 0x7fc35716);
+ status |= test__subsf3(0x007fffff, 0x7fbf52d6, 0x7fff52d6);
+ status |= test__subsf3(0x007fffff, 0x7fc7a2df, 0x7fc7a2df);
+ status |= test__subsf3(0x3f800000, 0x7f987a85, 0x7fd87a85);
+ status |= test__subsf3(0x3f800000, 0x7fc50124, 0x7fc50124);
+ status |= test__subsf3(0x7f7fffff, 0x7f95fd6f, 0x7fd5fd6f);
+ status |= test__subsf3(0x7f7fffff, 0x7ffc28dc, 0x7ffc28dc);
+ status |= test__subsf3(0x7f800000, 0x7f800000, 0x7fc00000);
+ status |= test__subsf3(0x7f800000, 0x7f8dd790, 0x7fcdd790);
+ status |= test__subsf3(0x7f800000, 0x7fd2ef2b, 0x7fd2ef2b);
+ status |= test__subsf3(0x7f99b09d, 0x00000000, 0x7fd9b09d);
+ status |= test__subsf3(0x7f93541e, 0x00000001, 0x7fd3541e);
+ status |= test__subsf3(0x7f9fc002, 0x007fffff, 0x7fdfc002);
+ status |= test__subsf3(0x7fb5db77, 0x3f800000, 0x7ff5db77);
+ status |= test__subsf3(0x7f9f5d92, 0x7f7fffff, 0x7fdf5d92);
+ status |= test__subsf3(0x7fac7a36, 0x7f800000, 0x7fec7a36);
+ status |= test__subsf3(0x7fb42008, 0x7fb0ee07, 0x7ff42008);
+ status |= test__subsf3(0x7f8bd740, 0x7fc7aaf1, 0x7fcbd740);
+ status |= test__subsf3(0x7f9bb57b, 0x80000000, 0x7fdbb57b);
+ status |= test__subsf3(0x7f951a78, 0x80000001, 0x7fd51a78);
+ status |= test__subsf3(0x7f9ba63b, 0x807fffff, 0x7fdba63b);
+ status |= test__subsf3(0x7f89463c, 0xbf800000, 0x7fc9463c);
+ status |= test__subsf3(0x7fb63563, 0xff7fffff, 0x7ff63563);
+ status |= test__subsf3(0x7f90886e, 0xff800000, 0x7fd0886e);
+ status |= test__subsf3(0x7fe8c15e, 0x00000000, 0x7fe8c15e);
+ status |= test__subsf3(0x7fe915ae, 0x00000001, 0x7fe915ae);
+ status |= test__subsf3(0x7ffa9b42, 0x007fffff, 0x7ffa9b42);
+ status |= test__subsf3(0x7fdad0f5, 0x3f800000, 0x7fdad0f5);
+ status |= test__subsf3(0x7fd10dcb, 0x7f7fffff, 0x7fd10dcb);
+ status |= test__subsf3(0x7fd08e8a, 0x7f800000, 0x7fd08e8a);
+ status |= test__subsf3(0x7fc3a9e6, 0x7f91a816, 0x7fd1a816);
+ status |= test__subsf3(0x7fdb229c, 0x7fc26c68, 0x7fdb229c);
+ status |= test__subsf3(0x7fc9f6bb, 0x80000000, 0x7fc9f6bb);
+ status |= test__subsf3(0x7ffa178b, 0x80000001, 0x7ffa178b);
+ status |= test__subsf3(0x7fef2a0b, 0x807fffff, 0x7fef2a0b);
+ status |= test__subsf3(0x7ffc885b, 0xbf800000, 0x7ffc885b);
+ status |= test__subsf3(0x7fd26e8c, 0xff7fffff, 0x7fd26e8c);
+ status |= test__subsf3(0x7fc55329, 0xff800000, 0x7fc55329);
+ status |= test__subsf3(0x80000000, 0x7fa833ae, 0x7fe833ae);
+ status |= test__subsf3(0x80000000, 0x7fc4df63, 0x7fc4df63);
+ status |= test__subsf3(0x80000001, 0x7f98827d, 0x7fd8827d);
+ status |= test__subsf3(0x80000001, 0x7fd7acc5, 0x7fd7acc5);
+ status |= test__subsf3(0x807fffff, 0x7fad19c0, 0x7fed19c0);
+ status |= test__subsf3(0x807fffff, 0x7ffe1907, 0x7ffe1907);
+ status |= test__subsf3(0xbf800000, 0x7fa95487, 0x7fe95487);
+ status |= test__subsf3(0xbf800000, 0x7fd2bbee, 0x7fd2bbee);
+ status |= test__subsf3(0xff7fffff, 0x7f86ba21, 0x7fc6ba21);
+ status |= test__subsf3(0xff7fffff, 0x7feb00d7, 0x7feb00d7);
+ status |= test__subsf3(0xff800000, 0x7f857fdc, 0x7fc57fdc);
+ status |= test__subsf3(0xff800000, 0x7fde0397, 0x7fde0397);
+ status |= test__subsf3(0xff800000, 0xff800000, 0x7fc00000);
+#endif // ARM_NAN_HANDLING
+
+ return status;
+}
>From 46e7005df1ec7b058114b7d864a597b48f2d2706 Mon Sep 17 00:00:00 2001
From: Simon Tatham <simon.tatham at arm.com>
Date: Thu, 5 Feb 2026 13:49:21 +0000
Subject: [PATCH 2/3] clang-format
---
compiler-rt/test/builtins/Unit/addsf3_test.c | 9 +++++----
compiler-rt/test/builtins/Unit/subsf3_test.c | 9 +++++----
2 files changed, 10 insertions(+), 8 deletions(-)
diff --git a/compiler-rt/test/builtins/Unit/addsf3_test.c b/compiler-rt/test/builtins/Unit/addsf3_test.c
index a08ba8b91056a..5d20970047d8a 100644
--- a/compiler-rt/test/builtins/Unit/addsf3_test.c
+++ b/compiler-rt/test/builtins/Unit/addsf3_test.c
@@ -24,7 +24,8 @@
// Returns: a + b
COMPILER_RT_ABI float __addsf3(float a, float b);
-int test__addsf3(int line, uint32_t a_rep, uint32_t b_rep, uint32_t expected_rep) {
+int test__addsf3(int line, uint32_t a_rep, uint32_t b_rep,
+ uint32_t expected_rep) {
float a = fromRep32(a_rep), b = fromRep32(b_rep);
float x = __addsf3(a, b);
#ifdef EXPECT_EXACT_RESULTS
@@ -34,14 +35,14 @@ int test__addsf3(int line, uint32_t a_rep, uint32_t b_rep, uint32_t expected_rep
#endif
if (ret) {
- printf("error at line %d: __addsf3(%08" PRIx32 ", %08" PRIx32 ") = %08" PRIx32
- ", expected %08" PRIx32 "\n",
+ printf("error at line %d: __addsf3(%08" PRIx32 ", %08" PRIx32
+ ") = %08" PRIx32 ", expected %08" PRIx32 "\n",
line, a_rep, b_rep, toRep32(x), expected_rep);
}
return ret;
}
-#define test__addsf3(a,b,x) (test__addsf3)(__LINE__,a,b,x)
+#define test__addsf3(a, b, x) (test__addsf3)(__LINE__, a, b, x)
int main() {
int status = 0;
diff --git a/compiler-rt/test/builtins/Unit/subsf3_test.c b/compiler-rt/test/builtins/Unit/subsf3_test.c
index b9c1b2ac4362a..11a87d1526785 100644
--- a/compiler-rt/test/builtins/Unit/subsf3_test.c
+++ b/compiler-rt/test/builtins/Unit/subsf3_test.c
@@ -24,7 +24,8 @@
// Returns: a - b
COMPILER_RT_ABI float __subsf3(float a, float b);
-int test__subsf3(int line, uint32_t a_rep, uint32_t b_rep, uint32_t expected_rep) {
+int test__subsf3(int line, uint32_t a_rep, uint32_t b_rep,
+ uint32_t expected_rep) {
float a = fromRep32(a_rep), b = fromRep32(b_rep);
float x = __subsf3(a, b);
#ifdef EXPECT_EXACT_RESULTS
@@ -34,14 +35,14 @@ int test__subsf3(int line, uint32_t a_rep, uint32_t b_rep, uint32_t expected_rep
#endif
if (ret) {
- printf("error at line %d: __subsf3(%08" PRIx32 ", %08" PRIx32 ") = %08" PRIx32
- ", expected %08" PRIx32 "\n",
+ printf("error at line %d: __subsf3(%08" PRIx32 ", %08" PRIx32
+ ") = %08" PRIx32 ", expected %08" PRIx32 "\n",
line, a_rep, b_rep, toRep32(x), expected_rep);
}
return ret;
}
-#define test__subsf3(a,b,x) test__subsf3(__LINE__,a,b,x)
+#define test__subsf3(a, b, x) (test__subsf3)(__LINE__, a, b, x)
int main() {
int status = 0;
>From 0c7ea5933fe27b3d6526f3cac2f20fdcc794dc0f Mon Sep 17 00:00:00 2001
From: Simon Tatham <simon.tatham at arm.com>
Date: Thu, 5 Feb 2026 17:09:40 +0000
Subject: [PATCH 3/3] Update to use set_special_properties
---
compiler-rt/lib/builtins/CMakeLists.txt | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt
index 1366d4aa75c03..c741f9bf9c3d9 100644
--- a/compiler-rt/lib/builtins/CMakeLists.txt
+++ b/compiler-rt/lib/builtins/CMakeLists.txt
@@ -535,8 +535,7 @@ set(thumb1_base_SOURCES
${GENERIC_SOURCES}
)
# arm/addsf3.S implements both addition and subtraction via cross-branching
-set_property(SOURCE arm/addsf3.S PROPERTY crt_supersedes subsf3.c)
-set_property(SOURCE arm/addsf3.S DIRECTORY ${COMPILER_RT_SOURCE_DIR} PROPERTY crt_provides subsf3)
+set_special_properties(arm/addsf3.S SUPERSEDES subsf3.c PROVIDES subsf3)
set_special_properties(arm/adddf3.S SUPERSEDES subdf3.c PROVIDES subdf3)
if(COMPILER_RT_ARM_OPTIMIZED_FP)
@@ -554,8 +553,8 @@ if(COMPILER_RT_ARM_OPTIMIZED_FP)
arm/funder.c
${thumb1_base_SOURCES}
)
- set_property(SOURCE arm/thumb1/addsf3.S PROPERTY crt_supersedes subsf3.c)
- set_property(SOURCE arm/thumb1/addsf3.S DIRECTORY ${COMPILER_RT_SOURCE_DIR} PROPERTY crt_provides subsf3)
+ set_special_properties(arm/thumb1/addsf3.S
+ SUPERSEDES subsf3.c PROVIDES subsf3)
set_special_properties(arm/thumb1/cmpdf2.S
SUPERSEDES comparedf2.c PROVIDES comparedf2)
set_special_properties(arm/thumb1/cmpsf2.S
More information about the llvm-branch-commits
mailing list