[llvm-branch-commits] [compiler-rt] [compiler-rt][ARM] Optimized single precision FP add/sub (PR #179929)

Simon Tatham via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Fri Feb 6 06:09:52 PST 2026


https://github.com/statham-arm updated https://github.com/llvm/llvm-project/pull/179929

>From d71c66686332436d87c857be79d0bedafe3cf206 Mon Sep 17 00:00:00 2001
From: Simon Tatham <simon.tatham at arm.com>
Date: Thu, 29 Jan 2026 16:20:54 +0000
Subject: [PATCH 1/3] [compiler-rt][ARM] Optimized single precision FP add/sub

This replaces the previous Thumb1-specific addsf3 with both Thumb1 and
Arm/Thumb2 add/sub.

I've removed the old Thumb1 addsf3 completely, partly because this
implementation is expected to be faster, and partly because the new
tests exposed a bug in the old implementation. However the new
implementation does consume more code, so perhaps putting the old
implementation back as an alternative with the bug fixed might be a
useful option.
---
 compiler-rt/lib/builtins/CMakeLists.txt      |   8 +-
 compiler-rt/lib/builtins/arm/addsf3.S        | 967 ++++++++++++++-----
 compiler-rt/lib/builtins/arm/thumb1/addsf3.S | 888 +++++++++++++++++
 compiler-rt/test/builtins/Unit/addsf3_test.c | 384 ++++++++
 compiler-rt/test/builtins/Unit/subsf3_test.c | 382 ++++++++
 5 files changed, 2368 insertions(+), 261 deletions(-)
 create mode 100644 compiler-rt/lib/builtins/arm/thumb1/addsf3.S
 create mode 100644 compiler-rt/test/builtins/Unit/addsf3_test.c
 create mode 100644 compiler-rt/test/builtins/Unit/subsf3_test.c

diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt
index ed6a9d00db069..ac7396f6ba481 100644
--- a/compiler-rt/lib/builtins/CMakeLists.txt
+++ b/compiler-rt/lib/builtins/CMakeLists.txt
@@ -445,6 +445,7 @@ if(COMPILER_RT_ARM_OPTIMIZED_FP AND BUILTIN_SUPPORTED_ARCH MATCHES "arm")
 
   if(implicit_it_flag)
     set(assembly_files
+      arm/addsf3.S
       arm/mulsf3.S
       arm/divsf3.S
       arm/adddf3.S
@@ -518,15 +519,18 @@ set(thumb1_base_SOURCES
   arm/divsi3.S
   arm/udivsi3.S
   arm/comparesf2.S
-  arm/addsf3.S
   ${GENERIC_SOURCES}
 )
+# arm/addsf3.S implements both addition and subtraction via cross-branching
+set_property(SOURCE arm/addsf3.S PROPERTY crt_supersedes subsf3.c)
+set_property(SOURCE arm/addsf3.S DIRECTORY ${COMPILER_RT_SOURCE_DIR} PROPERTY crt_provides subsf3)
 set_property(SOURCE arm/adddf3.S PROPERTY crt_supersedes subdf3.c)
 set_property(SOURCE arm/adddf3.S DIRECTORY ${COMPILER_RT_SOURCE_DIR} PROPERTY crt_provides subdf3)
 
 if(COMPILER_RT_ARM_OPTIMIZED_FP)
   set(thumb1_base_SOURCES
     arm/thumb1/mulsf3.S
+    arm/thumb1/addsf3.S
     arm/thumb1/cmpdf2.S
     arm/thumb1/cmpsf2.S
     arm/thumb1/gedf2.S
@@ -538,6 +542,8 @@ if(COMPILER_RT_ARM_OPTIMIZED_FP)
     arm/funder.c
     ${thumb1_base_SOURCES}
   )
+  set_property(SOURCE arm/thumb1/addsf3.S PROPERTY crt_supersedes subsf3.c)
+  set_property(SOURCE arm/thumb1/addsf3.S DIRECTORY ${COMPILER_RT_SOURCE_DIR} PROPERTY crt_provides subsf3)
   set_property(SOURCE arm/thumb1/cmpdf2.S PROPERTY crt_supersedes comparedf2.c)
   set_property(SOURCE arm/thumb1/cmpdf2.S DIRECTORY ${COMPILER_RT_SOURCE_DIR} PROPERTY crt_provides comparedf2)
   set_property(SOURCE arm/thumb1/cmpsf2.S PROPERTY crt_supersedes comparesf2.S)
diff --git a/compiler-rt/lib/builtins/arm/addsf3.S b/compiler-rt/lib/builtins/arm/addsf3.S
index 7b7cf85922753..7e2daff571a31 100644
--- a/compiler-rt/lib/builtins/arm/addsf3.S
+++ b/compiler-rt/lib/builtins/arm/addsf3.S
@@ -1,4 +1,4 @@
-//===-- addsf3.S - Adds two single precision floating pointer numbers-----===//
+//===-- addsf3.S - Add/subtract single precision floating point numbers ---===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,271 +6,718 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file implements the __addsf3 (single precision floating pointer number
-// addition with the IEEE-754 default rounding (to nearest, ties to even)
-// function for the ARM Thumb1 ISA.
+// This file implements the __addsf3 and __subsf3 functions (single precision
+// floating point number addition and subtraction), with the IEEE-754 default
+// rounding (to nearest, ties to even), for the Arm and Thumb2 ISAs.
 //
 //===----------------------------------------------------------------------===//
 
 #include "../assembly.h"
-#define significandBits 23
-#define typeWidth 32
 
-	.syntax unified
-	.text
-  .thumb
+  .syntax unified
+  .text
   .p2align 2
 
-DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_fadd, __addsf3)
-
-DEFINE_COMPILERRT_THUMB_FUNCTION(__addsf3)
-  push {r4, r5, r6, r7, lr}
-  // Get the absolute value of a and b.
-  lsls r2, r0, #1
-  lsls r3, r1, #1
-  lsrs r2, r2, #1  // aAbs
-  beq  LOCAL_LABEL(a_zero_nan_inf)
-  lsrs r3, r3, #1  // bAbs
-  beq  LOCAL_LABEL(zero_nan_inf)
-
-  // Detect if a or b is infinity or Nan.
-  lsrs r6, r2, #(significandBits)
-  lsrs r7, r3, #(significandBits)
-  cmp  r6, #0xFF
-  beq  LOCAL_LABEL(zero_nan_inf)
-  cmp  r7, #0xFF
-  beq  LOCAL_LABEL(zero_nan_inf)
-
-  // Swap Rep and Abs so that a and aAbs has the larger absolute value.
-  cmp r2, r3
-  bhs LOCAL_LABEL(no_swap)
-  movs r4, r0
-  movs r5, r2
-  movs r0, r1
-  movs r2, r3
-  movs r1, r4
-  movs r3, r5
-LOCAL_LABEL(no_swap):
-
-  // Get the significands and shift them to give us round, guard and sticky.
-  lsls r4, r0, #(typeWidth - significandBits)
-  lsrs r4, r4, #(typeWidth - significandBits - 3) // aSignificand << 3
-  lsls r5, r1, #(typeWidth - significandBits)
-  lsrs r5, r5, #(typeWidth - significandBits - 3) // bSignificand << 3
-
-  // Get the implicitBit.
-  movs r6, #1
-  lsls r6, r6, #(significandBits + 3)
-
-  // Get aExponent and set implicit bit if necessary.
-  lsrs r2, r2, #(significandBits)
-  beq LOCAL_LABEL(a_done_implicit_bit)
-  orrs r4, r6
-LOCAL_LABEL(a_done_implicit_bit):
-
-  // Get bExponent and set implicit bit if necessary.
-  lsrs r3, r3, #(significandBits)
-  beq LOCAL_LABEL(b_done_implicit_bit)
-  orrs r5, r6
-LOCAL_LABEL(b_done_implicit_bit):
-
-  // Get the difference in exponents.
-  subs r6, r2, r3
-  beq LOCAL_LABEL(done_align)
-
-  // If b is denormal, then a must be normal as align > 0, and we only need to
-  // right shift bSignificand by (align - 1) bits.
-  cmp  r3, #0
-  bne  1f
-  subs r6, r6, #1
-1:
-
-  // No longer needs bExponent. r3 is dead here.
-  // Set sticky bits of b: sticky = bSignificand << (typeWidth - align).
-  movs r3, #(typeWidth)
-  subs r3, r3, r6
-  movs r7, r5
-  lsls r7, r3
-  beq 1f
-  movs r7, #1
-1:
-
-  // bSignificand = bSignificand >> align | sticky;
-  lsrs r5, r6
-  orrs r5, r7
-  bne LOCAL_LABEL(done_align)
-  movs r5, #1 //  sticky; b is known to be non-zero.
-
-LOCAL_LABEL(done_align):
-  // isSubtraction = (aRep ^ bRep) >> 31;
-  movs r7, r0
-  eors r7, r1
-  lsrs r7, #31
-  bne LOCAL_LABEL(do_substraction)
-
-  // Same sign, do Addition.
-
-  // aSignificand += bSignificand;
-  adds r4, r4, r5
-
-  // Check carry bit.
-  movs r6, #1
-  lsls r6, r6, #(significandBits + 3 + 1)
-  movs r7, r4
-  ands r7, r6
-  beq LOCAL_LABEL(form_result)
-  // If the addition carried up, we need to right-shift the result and
-  // adjust the exponent.
-  movs r7, r4
-  movs r6, #1
-  ands r7, r6 // sticky = aSignificand & 1;
-  lsrs r4, #1
-  orrs r4, r7  // result Significand
-  adds r2, #1  // result Exponent
-  // If we have overflowed the type, return +/- infinity.
-  cmp  r2, 0xFF
-  beq  LOCAL_LABEL(ret_inf)
-
-LOCAL_LABEL(form_result):
-  // Shift the sign, exponent and significand into place.
-  lsrs r0, #(typeWidth - 1)
-  lsls r0, #(typeWidth - 1) // Get Sign.
-  lsls r2, #(significandBits)
-  orrs r0, r2
-  movs r1, r4
-  lsls r4, #(typeWidth - significandBits - 3)
-  lsrs r4, #(typeWidth - significandBits)
-  orrs r0, r4
-
-  // Final rounding.  The result may overflow to infinity, but that is the
-  // correct result in that case.
-  // roundGuardSticky = aSignificand & 0x7;
-  movs r2, #0x7
-  ands r1, r2
-  // if (roundGuardSticky > 0x4) result++;
-
-  cmp r1, #0x4
-  blt LOCAL_LABEL(done_round)
-  beq 1f
-  adds r0, #1
-  pop {r4, r5, r6, r7, pc}
-1:
-
-  // if (roundGuardSticky == 0x4) result += result & 1;
-  movs r1, r0
-  lsrs r1, #1
-  bcc  LOCAL_LABEL(done_round)
-  adds r0, r0, #1
-LOCAL_LABEL(done_round):
-  pop {r4, r5, r6, r7, pc}
-
-LOCAL_LABEL(do_substraction):
-  subs r4, r4, r5 // aSignificand -= bSignificand;
-  beq  LOCAL_LABEL(ret_zero)
-  movs r6, r4
-  cmp  r2, 0
-  beq  LOCAL_LABEL(form_result) // if a's exp is 0, no need to normalize.
-  // If partial cancellation occurred, we need to left-shift the result
-  // and adjust the exponent:
-  lsrs r6, r6, #(significandBits + 3)
-  bne LOCAL_LABEL(form_result)
-
-  push {r0, r1, r2, r3}
-  movs r0, r4
-  bl   SYMBOL_NAME(__clzsi2)
-  movs r5, r0
-  pop {r0, r1, r2, r3}
-  // shift = rep_clz(aSignificand) - rep_clz(implicitBit << 3);
-  subs r5, r5, #(typeWidth - significandBits - 3 - 1)
-  // aSignificand <<= shift; aExponent -= shift;
-  lsls r4, r5
-  subs  r2, r2, r5
-  bgt LOCAL_LABEL(form_result)
-
-  // Do normalization if aExponent <= 0.
-  movs r6, #1
-  subs r6, r6, r2 // 1 - aExponent;
-  movs r2, #0 // aExponent = 0;
-  movs r3, #(typeWidth) // bExponent is dead.
-  subs r3, r3, r6
-  movs r7, r4
-  lsls r7, r3  // stickyBit = (bool)(aSignificant << (typeWidth - align))
-  beq 1f
-  movs r7, #1
-1:
-  lsrs r4, r6 // aSignificand >> shift
-  orrs r4, r7
-  b LOCAL_LABEL(form_result)
-
-LOCAL_LABEL(ret_zero):
-  movs r0, #0
-  pop {r4, r5, r6, r7, pc}
-
-
-LOCAL_LABEL(a_zero_nan_inf):
-  lsrs r3, r3, #1
-
-LOCAL_LABEL(zero_nan_inf):
-  // Here  r2 has aAbs, r3 has bAbs
-  movs r4, #0xFF
-  lsls r4, r4, #(significandBits) // Make +inf.
-
-  cmp r2, r4
-  bhi LOCAL_LABEL(a_is_nan)
-  cmp r3, r4
-  bhi LOCAL_LABEL(b_is_nan)
-
-  cmp r2, r4
-  bne LOCAL_LABEL(a_is_rational)
-  // aAbs is INF.
-  eors r1, r0 // aRep ^ bRep.
-  movs r6, #1
-  lsls r6, r6, #(typeWidth - 1) // get sign mask.
-  cmp r1, r6 // if they only differ on sign bit, it's -INF + INF
-  beq LOCAL_LABEL(a_is_nan)
-  pop {r4, r5, r6, r7, pc}
-
-LOCAL_LABEL(a_is_rational):
-  cmp r3, r4
-  bne LOCAL_LABEL(b_is_rational)
-  movs r0, r1
-  pop {r4, r5, r6, r7, pc}
-
-LOCAL_LABEL(b_is_rational):
-  // either a or b or both are zero.
-  adds r4, r2, r3
-  beq  LOCAL_LABEL(both_zero)
-  cmp r2, #0 // is absA 0 ?
-  beq LOCAL_LABEL(ret_b)
-  pop {r4, r5, r6, r7, pc}
-
-LOCAL_LABEL(both_zero):
-  ands r0, r1 // +0 + -0 = +0
-  pop {r4, r5, r6, r7, pc}
-
-LOCAL_LABEL(ret_b):
-  movs r0, r1
-
-LOCAL_LABEL(ret):
-  pop {r4, r5, r6, r7, pc}
-
-LOCAL_LABEL(b_is_nan):
-  movs r0, r1
-LOCAL_LABEL(a_is_nan):
-  movs r1, #1
-  lsls r1, r1, #(significandBits -1) // r1 is quiet bit.
-  orrs r0, r1
-  pop {r4, r5, r6, r7, pc}
-
-LOCAL_LABEL(ret_inf):
-  movs r4, #0xFF
-  lsls r4, r4, #(significandBits)
-  orrs r0, r4
-  lsrs r0, r0, #(significandBits)
-  lsls r0, r0, #(significandBits)
-  pop {r4, r5, r6, r7, pc}
-
-
-END_COMPILERRT_FUNCTION(__addsf3)
+// General structure of this code:
+//
+// There are three actual entry points here, for addition, subtraction and
+// reversed subtraction (just taking the operands the other way round, so that
+// it returns y-x instead of x-y). But the first thing the functions do (after
+// checking for NaNs) is to sort out whether the magnitudes of the two inputs
+// are being added (x+y with like signs, or x-y with different signs), or
+// subtracted. So fadd jumps across into the middle of fsub if it sees that the
+// signs are different, and vice versa. Then the main code path in fadd handles
+// magnitude addition, and the one in fsub handles magnitude subtraction.
+//
+// NaNs are checked first, so that an input NaN can be propagated exactly,
+// including its sign bit. After ruling out that case, it's safe to flip the
+// sign of one of the inputs, so that during the cross-calls, x - y can be
+// rewritten as x + (-y) and vice versa.
+
+#if __ARM_PCS_VFP
+DEFINE_COMPILERRT_FUNCTION(__addsf3)
+  push {r4, lr}
+  vmov r0, s0
+  vmov r1, s1
+  bl __aeabi_fadd
+  vmov s0, r0
+  pop {r4, pc}
+#else
+DEFINE_COMPILERRT_FUNCTION_ALIAS(__addsf3, __aeabi_fadd)
+#endif
+
+DEFINE_COMPILERRT_FUNCTION(__aeabi_fadd)
+  // Test for all uncommon values at once: infinities, NaNs, denormals and
+  // zeroes. Branch out of line if any are found. We do this by XORing each
+  // input with itself shifted left by a bit, which means that exponents 00 and
+  // FF will both end up with seven zero bits at the top.
+  eor     r2, r0, r0, lsl #1   // combine x with itself shifted
+  eor     r3, r1, r1, lsl #1   // same for y
+  tst     r2, #0x7F000000      // is x uncommon?
+  tstne   r3, #0x7F000000      // if not, is y uncommon?
+  beq     LOCAL_LABEL(add_uncommon)        // if either, branch out of line
+
+  // Now we have two normalised numbers. If their signs are opposite, we should
+  // be subtracting their magnitudes rather than adding, so cross-jump to fsub.
+  teq     r0, r1               // set N if signs are unequal
+  eormi   r1, r1, #1 << 31     // if so, flip the sign of y
+  bmi     LOCAL_LABEL(sub_magnitude)       // and go to magnitude subtraction
+LOCAL_LABEL(add_magnitude):
+  // If we get here, we're adding operands with equal signs (i.e. a magnitude
+  // addition). First thing to do is put the operands in magnitude order, so
+  // that x >= y.
+  subs    r2, r0, r1           // compare inputs, also keeping x-y
+  sublo   r0, r0, r2           // if x<y then turn x into y, using value in r2
+  addlo   r1, r1, r2           // and similarly turn y into x
+
+  // Keep the sign and exponent of the larger input, to use as the sign and
+  // exponent of the output (up to carries and overflows). Also calculate the
+  // exponent difference, which tells us how far we'll need to shift y's
+  // mantissa right to add it to x's.
+  //
+  // The shifted-right values will include the sign bits as well as the
+  // exponents, but that's OK, in this branch the two sign bits are the same,
+  // so they'll cancel when subtracted.
+  //
+  // The exponent difference can be as large as 0xFE (maximum exponent minus
+  // minimum), which still fits in 8 bits, so shifting right by that amount is
+  // well defined in AArch32.
+  mov     r2, r0, lsr #23
+  sub     r3, r2, r1, lsr #23
+
+  // Extract both mantissas, moved up to the top of the word, with the leading
+  // 1 made explicit.
+  mov     r12, #1 << 31        // the leading 1 by itself
+  orr     r0, r12, r0, lsl #8
+  orr     r1, r12, r1, lsl #8
+
+LOCAL_LABEL(add_doadd):
+  // Here we perform the actual addition. We either fell through from the code
+  // above, or jumped back to here after handling an input denormal.
+  //
+  // We get here with:
+  //   Operands known to be numeric rather than zero/infinity/NaN;
+  //   r0 = mantissa of larger operand (in high 24 bits);
+  //   r1 = mantissa of smaller operand (in high 24 bits);
+  //   r2 = result sign and exponent (in low 9 bits);
+  //   r3 = exponent difference.
+  //
+  // For normal inputs, the mantissa registers (r0,r1) will have the top bit
+  // set. Denormals will leave that bit clear, treating the number as
+  // 0.[mantissa] x 2^(fixed exponent) instead of renormalising to 1.[mantissa]
+  // x 2^(variable exponent) as a multiplication would want.
+
+  // Actually shift the smaller mantissa downwards and add them together.
+#if !__thumb__
+  adds    r12, r0, r1, lsr r3  // CS if x >= 2.0
+#else
+  // Thumb can't fold a register-controlled shift into an add, so we must use
+  // two separate instructions.
+  lsr     r12, r1, r3
+  adds    r12, r0, r12
+#endif
+
+  // If that addition carried off the top of r12, then the number has increased
+  // its exponent. Diverge into a completely separate code path for that case,
+  // because there we must check for overflow.
+  bcs     LOCAL_LABEL(add_carry)
+
+  // Here, on the non-carrying path, we don't need to check for overflow at
+  // all. If there is an overflow it can only be due to rounding up, so the
+  // overflowed mantissa will be all zeroes, so the naively generated output
+  // will look like the correct infinity anyway.
+  //
+  // We shift the mantissa down to its final position, and recombine it with
+  // the sign + exponent (in r2) via addition. We keep the bit shifted off the
+  // bottom of the mantissa in C, and then use ADC for the recombination, which
+  // causes us to round up if that bit was set without needing an extra
+  // instruction. But the leading bit of the mantissa increments the exponent
+  // field unwantedly, so we must decrement r2 first to compensate for that.
+  sub     r2, r2, #1
+  movs    r0, r12, lsr #8
+  adc     r0, r0, r2, lsl #23
+
+  // If we _didn't_ round up, then we're done.
+  bxcc    lr
+
+  // But if we did round up, then we must also check if we need to round to
+  // even. This occurs if all the bits of y's mantissa shifted off the bottom
+  // are zero except for the round bit.
+  //
+  // Some of those bits are in r12 (the 32-bit version of the sum's mantissa).
+  // It's cheap to check those, and should exclude _most_ cases where
+  // round-to-even isn't needed.
+  tst     r12, #127
+  bxne    lr
+
+  // Failing that, we have to go back to the original mantissa of y (still in
+  // r1) and work out exactly how many bits of it to check.
+  rsb     r3, r3, #32  // opposite of the amount we shifted y right by
+  lsls    r1, r1, r3   // shift y left by that amount instead
+
+  // Now if Z is set, we do round to even, which works by just clearing the low
+  // bit of the output mantissa. This undoes the round-up if we rounded up to
+  // an odd mantissa, and otherwise, makes no difference.
+  biceq   r0, r0, #1
+
+  // And now we're done.
+  bx      lr
+
+LOCAL_LABEL(add_carry):
+  // This is the separate code path in which adding the mantissas together
+  // caused a carry off the top of the word, so that the exponent of the output
+  // incremented (even before rounding). Start by shifting the carry bit back
+  // in.
+  rrx     r0, r12
+
+  // Now recombine the sign and exponent, and do the basic rounding (apart from
+  // round to even), in the same way as the non-carrying code path above.
+  // However this time we don't decrement r2, because we want our exponent to
+  // come out bigger by 1 than in the other code path.
+  movs    r0, r0, lsr #8       // shift mantissa down to the right position
+  adc     r0, r0, r2, lsl #23  // recombine with sign+exponent, and round
+
+  // Note that the mantissa cannot have overflowed during rounding: if it has
+  // all bits 1 before rounding, both operands must also have had all mantissa
+  // bits 1, and the same exponent - which implies the round bit was 0.
+  //
+  // So we definitely have the correct output exponent. There are two problems
+  // left: we might need to round to even, and we might have overflowed.
+
+  // First, do the cheap check that _usually_ rules out round-to-even. We only
+  // do this if C is set (i.e. if we rounded up), and we end up with Z=0 if no
+  // RTE. This relies on also having Z=0 already, in the case where we _didn't_
+  // round up - and that must be true because the last time we set the flags it
+  // was by shifting down the output mantissa, and that will always have had
+  // its leading bit set.
+  tstcs   r12, #255       // test one more bit than on the no-carry path
+
+  // Now if Z=1 then we need to do the full check for RTE. But first, prepare a
+  // version of the output value shifted left by 1 where it's convenient to
+  // check its exponent for overflow. (We couldn't do that until we'd finished
+  // with r12 by testing it in the previous instruction.)
+  mov     r12, r0, lsl #1
+
+  // Now, if we need to check for RTE, go off and do it.
+  beq     LOCAL_LABEL(add_roundeven_ovf)
+
+  // Otherwise, we still need to check for overflow.
+  cmp     r12, #0xff000000  // if r12 >= this, the exponent has overflowed
+  bxlo    lr                // so if not, we can leave
+  b       LOCAL_LABEL(add_ovf)          // but if so, go and handle overflow
+
+LOCAL_LABEL(add_roundeven_ovf):
+  // We came here if we detected a need to do the full check for RTE. But we
+  // may _also_ have overflowed, and just not have noticed yet.
+
+  // Same round-to-even check as in the non-carry case above.
+  rsb     r3, r3, #32  // opposite of the amount we shifted y right by
+  lsls    r1, r1, r3   // shift y left by that amount instead
+  biceq   r0, r0, #1   // and if the remaining bits are all 0, round to even
+
+  // Now check for overflow, and if none, we're done.
+  cmp     r12, #0xff000000  // if r12 >= this, the exponent has overflowed
+  bxlo    lr                // so if not, we can leave
+
+  // If we get here, we have definitely overflowed. Moreover, the exponent
+  // field of the number is exactly 0xff. So all we have to do is clear the
+  // mantissa, to make it into an infinity of the output sign.
+LOCAL_LABEL(add_ovf):
+  bfc     r0, #0, #23
+  bx      lr
+
+LOCAL_LABEL(add_uncommon):
+  // We come here if the entry-point check says that at least one of x and y
+  // has an uncommon (FF or 00) exponent. So we have at least one NaN,
+  // infinity, denormal or zero, but we don't know which, or which operand it's
+  // in. And we could have any combination of those types of input, in _both_
+  // operands.
+
+  // Detect FF exponents (NaNs or infinities) and branch again for those.
+  mov     r12, #0xFF000000
+  bics    r2, r12, r0, lsl #1
+  bicsne  r2, r12, r1, lsl #1
+  beq     LOCAL_LABEL(add_naninf)
+
+  // Now we know both inputs are finite, but there may be denormals or zeroes.
+  // So it's safe to do the same sign check and cross-jump as we did on the
+  // fast path.
+  teq     r0, r1             // opposite signs?
+  eormi   r1, r1, #1 << 31   // if so, negate the second operand
+  bmi     LOCAL_LABEL(sub_zerodenorm)    // and cross-jump to the fsub version of this code
+LOCAL_LABEL(add_zerodenorm):
+  // Now we know x and y have the same sign, and at least one of them is zero
+  // or denormal. If there aren't any zeroes, we'll end up rejoining the fast
+  // path, so we must set up all the same registers, and do our checks for zero
+  // in line with that.
+  //
+  // Start by exactly repeating the initial fast-path setup code: sort into
+  // magnitude order, get the output sign+exponent and the exponent shift.
+  subs    r2, r0, r1           // compare inputs, also keeping x-y
+  sublo   r0, r0, r2           // if x<y then turn x into y, using value in r2
+  addlo   r1, r1, r2           // and similarly turn y into x
+  mov     r2, r0, lsr #23      // get exponent of x (the sign bit will cancel)
+  sub     r3, r2, r1, lsr #23  // subtract exponent of y to get shift count
+
+  // Shift y's mantissa up to the top of r1. We know y has exponent 0 (at least
+  // one of the inputs does, and we've sorted them by now). So we definitely
+  // don't need to set the leading bit on y's mantissa; also, if r1 becomes
+  // zero, then we know we have an addition to 0, and otherwise, we know both
+  // inputs are nonzero.
+  movs    r1, r1, lsl #8       // is y zero?
+  bxeq    lr                   // if so, just return x
+
+  // Now we know there aren't any zeroes, and that y is a denormal. x might or
+  // might not be a denormal, so we must check that and decide whether to set
+  // its top mantissa bit.
+  mov     r0, r0, lsl #8       // shift mantissa of x to the top of r0
+  tst     r2, #255             // is x's exponent 0? If so, it's denormal
+  orrne   r0, r0, #1 << 31     // if not, set leading bit of x,
+  subne   r3, r3, #1           //   adjust exponent difference,
+  bne     LOCAL_LABEL(add_doadd)           //  and go back to mainstream
+
+  // If both operands are denormals, addition becomes trivial: denormals and
+  // the smallest exponent of normalised numbers both multiply the mantissa by
+  // the same power of 2, so we can just add the mantissas together and put the
+  // output sign back on.
+  add     r0, r0, r1           // make the output mantissa
+  mov     r0, r0, lsr #8       // shift it into position
+  orr     r0, r0, r2, lsl #23  // put the sign back at the top
+  bx      lr                   // done!
+
+LOCAL_LABEL(add_naninf):
+  // We come here if at least one input is a NaN or infinity. If either or both
+  // inputs are NaN then we hand off to __fnan2 which will propagate a NaN from
+  // the input.
+  mov     r12, #0xFF000000
+  cmp     r12, r0, lsl #1          // if (r0 << 1) > 0xFF000000, r0 is a NaN
+  blo     SYMBOL_NAME(__compiler_rt_fnan2)
+  cmp     r12, r1, lsl #1
+  blo     SYMBOL_NAME(__compiler_rt_fnan2)
+
+LOCAL_LABEL(add_inf):
+  // No NaNs, so we have at least one infinity. Almost all additions involving
+  // an infinity return the input infinity unchanged. The only exception is if
+  // there are two infinities that have opposite signs (which can happen even
+// in fadd, since on this code path we haven't cross-jumped into fsub),
+  // where we return NaN.
+  eor     r2, r0, r1               // see how the two inputs differ
+  cmp     r2, #0x80000000          // +inf + -inf?
+  subeq   r0, r2, #0x00400000      // if so, make the default output QNaN
+  bxeq    lr                       // and return it
+  cmp     r12, r0, lsl #1          // otherwise, is r0 the infinity?
+  movne   r0, r1                   // no, so it's r1
+  bx      lr                       // return the infinite input unchanged
+
+END_COMPILERRT_FUNCTION(__aeabi_fadd)
+
+DEFINE_COMPILERRT_FUNCTION(__aeabi_frsub)
+  // Reversed subtraction, that is, compute y-x, where x is in r0 and y in r1.
+  //
+  // We could implement this by simply swapping r0 with r1. But the point of
+  // having a reversed-subtract in the first place is to avoid the caller
+  // having to do that, so if we do it ourselves, it wastes all the time they
+  // saved. So instead, on the fast path, we redo the sign check our own way
+// and branch to add_magnitude or sub_magnitude.
+
+  // First rule out denormals and zeroes, using the same test as fadd and fsub.
+  eor     r2, r0, r0, lsl #1
+  eor     r3, r1, r1, lsl #1
+  tst     r2, #0x7F000000
+  tstne   r3, #0x7F000000
+  beq     LOCAL_LABEL(rsb_uncommon)
+
+  // Now we know we only have finite inputs, it's safe to implement the
+  // reversal of the operand order by flipping signs. (Preserving the sign of
+  // an input NaN was the only case where that wasn't right.)
+
+  eor     r0, r0, #1 << 31  // flip sign of the operand we're subtracting
+  teq     r0, r1            // are the signs now the same?
+  bpl     LOCAL_LABEL(add_magnitude)    // if so, we're doing magnitude addition
+  eor     r1, r1, #1 << 31  // otherwise, flip the other sign too
+  b       LOCAL_LABEL(sub_magnitude)    // and we're doing magnitude subtraction
+
+LOCAL_LABEL(rsb_uncommon):
+  // Any uncommon operands to frsub are handled by just swapping the two
+  // operands and going to fsub's handler. We're off the main fast path now, so
+  // there's no need to try to optimise it any harder.
+  eor     r0, r0, r1
+  eor     r1, r1, r0
+  eor     r0, r0, r1
+  b       LOCAL_LABEL(sub_uncommon)
+
+END_COMPILERRT_FUNCTION(__aeabi_frsub)
+
+#if __ARM_PCS_VFP
+DEFINE_COMPILERRT_FUNCTION(__subsf3)
+  push {r4, lr}
+  vmov r0, s0
+  vmov r1, s1
+  bl __aeabi_fsub
+  vmov s0, r0
+  pop {r4, pc}
+#else
+DEFINE_COMPILERRT_FUNCTION_ALIAS(__subsf3, __aeabi_fsub)
+#endif
+
+DEFINE_COMPILERRT_FUNCTION(__aeabi_fsub)
+  // Main entry point for subtraction.
+  //
+  // Start by testing for uncommon operands in the usual way.
+  eor     r2, r0, r0, lsl #1
+  eor     r3, r1, r1, lsl #1
+  tst     r2, #0x7F000000
+  tstne   r3, #0x7F000000
+  beq     LOCAL_LABEL(sub_uncommon)
+
+  // Check the signs, and if they're unequal, cross-jump into fadd to do
+  // magnitude addition. (Now we've excluded NaNs, it's safe to flip the sign
+  // of y.)
+  teq     r0, r1
+  eormi   r1, r1, #1 << 31
+  bmi     LOCAL_LABEL(add_magnitude)
+
+LOCAL_LABEL(sub_magnitude):
+  // If we get here, we're subtracting operands with equal signs (i.e. a
+  // magnitude subtraction). First thing to do is put operands in magnitude
+  // order, so that x >= y. However, if they are swapped, we must also negate
+  // both of them, since x - y = (-y) - (-x).
+  subs    r2, r0, r1            // LO if we must swap the operands
+#if !__thumb__
+  // Conditional on LO, swap the operands, by adding/subtracting the difference
+  // between them that we just wrote into r2. Negate them both in the process
+  // by flipping the high bit of r2 first.
+  eorlo   r2, r2, #1 << 31
+  sublo   r0, r0, r2
+  addlo   r1, r1, r2
+#else
+  // In Thumb, conditionally branch round these three instructions, instead of
+  // conditionally executing them with an ITTT LO. Rationale: on the simpler
+  // Thumb-only cores such as Cortex-M3, a branch only takes two cycles and an
+  // IT costs one, so this saves two cycles in the untaken case and doesn't
+  // impact the taken case at all.
+  bhs     0f
+  eor     r2, r2, #1 << 31
+  sub     r0, r0, r2
+  add     r1, r1, r2
+0:
+#endif
+
+  // Save the sign and exponent of the larger operand to use for the result (up
+  // to renormalisation), and calculate the exponent difference for shifting
+  // one mantissa relative to the other.
+  mov     r2, r0, lsr #23      // r2 = sign<<8 + exponent
+  sub     r3, r2, r1, lsr #23  // shift = 0..254 (sign bits cancel)
+
+  // Shift the mantissas up to the top of the words, and OR in the leading 1
+  // for each.
+  mov     r12, #1 << 31
+  orr     r0, r12, r0, lsl #8
+  orr     r1, r12, r1, lsl #8
+
+LOCAL_LABEL(sub_dosub):
+  // Here we perform the actual subtraction. We either fell through from the
+  // code above, or jumped back to here after handling an input denormal.
+  //
+  // We get here with:
+  //   Operands known to be numeric rather than zero/infinity/NaN;
+  //   r0 = mantissa of larger operand (in high 24 bits);
+  //   r1 = mantissa of smaller operand (in high 24 bits);
+  //   r2 = result sign/exponent (in low 9 bits)
+  //   r3 = exponent difference.
+  //
+  // Begin calculating the output mantissa by shifting y's mantissa right and
+  // subtracting. This may leave the mantissa too large by one, if the bits
+  // shifted out of y are nonzero. We correct this during rounding if
+  // necessary.
+#if !__thumb__
+  subs    r12, r0, r1, lsr r3    // MI if high bit set
+#else
+  // Thumb can't fold a register-controlled shift into a sub, so we must use
+  // two separate instructions.
+  lsr     r12, r1, r3
+  subs    r12, r0, r12
+#endif
+
+  // This may have cleared the high bit of the output mantissa, in which case
+  // we must renormalise. Our strategy is to split into three code paths, on
+  // two of which an awkward case is known not to arise:
+  //  * no need to renormalise at all => underflow can't happen
+  //  * shift up by exactly 1 bit
+  //  * shift up by more than 1 bit => rounding can't happen (result is exact)
+  //
+  // First branch out of line for the first case, which we can detect because
+  // the N flag tells us whether the top mantissa bit is still set.
+  bmi     LOCAL_LABEL(sub_renorm_0)
+
+  // Now we know we're renormalising by at least one bit, which also means
+  // underflow is a risk.
+  //
+  // If we're shifting by only one bit, then underflow can only occur if the
+  // exponent was originally 1. So test both those conditions together, and if
+  // the shift is only one bit _and_ the exponent is > 1, we know we can
+  // renormalise by one bit and not worry about underflow.
+  tst     r2, #254  // test all but low bit of exponent; also clears N
+#if !__thumb__
+  movsne  r0, r12, lsl #1  // set N if non-underflowing _and_ top bit now set
+#else
+  // In Thumb, there's no advantage in combining the two tests, since the IT
+  // between them costs a cycle. Do the explicit branch now to sub_underflow
+  // (because now we _know_ we have underflow).
+  beq     LOCAL_LABEL(sub_underflow)
+  // And then unconditionally do the shift.
+  movs    r0, r12, lsl #1  // check whether 2nd bit is cleared (PL)
+#endif
+  // After all that, N is clear if we still haven't set the top mantissa bit,
+  // either because we shifted up by a bit and it didn't help, or (in Arm state
+  // only) because we detected underflow and didn't do the shift at all.
+  //
+  // The case of 'haven't yet done the shift' is reliably indicated by the Z
+  // flag being set, because if we did do the shift, it will always have
+  // cleared Z.
+  bpl     LOCAL_LABEL(sub_renorm_orunder)
+
+  // If we get here, we've renormalised by one bit (and have already shifted
+  // the mantissa up), and we also know there's no underflow.
+  //
+  // Recombine the sign+exponent with the fraction. We must also decrement the
+  // exponent, to account for the one-bit renormalisation. We do that by using
+  // ASR to shift the mantissa right: its top bit is currently set, so the ASR
+  // effectively puts -1 in the bits that are being added to the exponent.
+  movs    r0, r0, asr #8  // also sets C if we need to round up
+  adc     r0, r0, r2, lsl #23 // recombine, and also do basic rounding
+
+  // If C was not set, then we've rounded down. Therefore, no need to round to
+  // even, and also, no need to compensate for having shifted nonzero bits out
+  // of the subtrahend. We can just return.
+  bxcc    lr
+
+  // If any bit shifted out of the 32-bit output mantissa is nonzero, then we
+  // can also return, because we know we're rounding _up_ (and not to even),
+  // and again, bits shifted out of the subtrahend don't matter because their
+  // combined loss can't exceed the gain from one of these guard bits.
+  tst     r12, #0x3F
+  bxne    lr
+
+  // Otherwise, we must do the full check for round to even.
+  b       LOCAL_LABEL(sub_roundeven)
+
+LOCAL_LABEL(sub_renorm_0):
+  // We come here if no renormalisation is necessary, and therefore also no
+  // underflow can happen.
+  //
+  // Since the leading bit is set, we need to decrement the exponent, to
+  // account for the leading bit adding 1 to it when we recombine.
+  movs    r0, r12, lsr #8  // also sets C if we need to round up
+  sub     r2, r2, #1       // adjust exponent
+  adc     r0, r0, r2, lsl #23 // recombine, and also do basic rounding
+
+  // As in the 1-bit case above, if we didn't round up just now then we're
+  // done, and if any bit shifted out of r12 just now was nonzero then we're
+  // also done.
+  bxcc    lr               // rounding down, done
+  tst     r12, #0x7F
+  bxne    lr               // nonzero guard bit, rounding up, done
+
+  // Otherwise, fall through to the full check for round to even.
+LOCAL_LABEL(sub_roundeven):
+  // Same round-to-even check as in the fadd cases: find all the bits we
+  // shifted out of y's mantissa and check whether all of them are zero.
+  rsb     r3, r3, #32
+  lsls    r1, r1, r3       // set Z if we're rounding to even
+
+  // Unlike the addition case, if we aren't rounding to even then the result is
+  // currently too _big_: the top 32 bits of the output mantissa looked as if
+  // they were on a rounding boundary, but those nonzero bits shifted off the
+  // bottom of the mantissa make the true value slightly smaller than it
+  // looked, so in fact we're just _below_ a rounding boundary. But we've
+  // already rounded it up! So in the non-RTE case we must decrement the
+  // output value.
+  subne   r0, r0, #1       // no RTE, so undo round up
+  biceq   r0, r0, #1       // yes RTE, so clear low bit of output
+  bx      lr
+
+LOCAL_LABEL(sub_renorm_orunder):
+  // We come here if _either_ of these is true:
+  //
+  //  1. we've shifted the output mantissa left by one bit already but its top
+  //     bit is still 0, so we must renormalise by more than 1 bit (and this
+  //     may cause an underflow that we haven't detected yet)
+  //
+  //  2. (Arm only) we have detected an underflow already, not yet shifted the
+  //     output mantissa at all, and haven't yet branched to sub_underflow.
+
+  // Get the output sign bit by itself in r3. This is needed by the code below,
+  // and also used by sub_underflow, so if we compute it before the (Arm-only)
+  // branch to sub_underflow then it doesn't have to be duplicated there.
+  mov     r3, r2, lsr #8   // r3 now has just the output sign, in bit 0
+
+#if !__thumb__
+  // Arm state: we did a combined check for cases 1 and 2 above, so this is
+  // where we separate them and go off to handle underflow in case 2. As stated
+  // above, the Z flag indicates an already-detected underflow.
+  beq     LOCAL_LABEL(sub_underflow)
+#endif
+
+  // Now we know that we must renormalise by at least 2 bits, which may also
+  // give a denormal or zero result.
+  //
+  // This means no rounding can possibly be needed: if the subtraction cleared
+  // the top two bits of the mantissa, it means we computed A-B and found it
+  // was less than A/2, so B > A/2, so the exponent difference was at most 1.
+  // Hence the result mantissa fits in 24 bits even before renormalisation, and
+  // the top bit is clear, so it fits in 23 bits, i.e. it is exact.
+  //
+  // (That argument applies to the result before denormalisation. But any
+  // subtraction delivering a denormal result must also be exact: the inputs to
+  // subtraction are integer multiples of the smallest denormal, hence so is
+  // the result.)
+
+  // Start by shifting up by two bits (we already know the top 2 bits are
+  // clear). In the process, test if the entire mantissa is actually zero.
+  //
+  // If the mantissa is zero, we can safely return +0. (In default IEEE
+  // round-to-nearest mode, the only case of addition/subtraction that delivers
+  // -0 is if you add two zeroes _both_ of which are -0, or the equivalent
+  // subtraction. And those cases won't have come here, because they were
+  // additions of like-signed inputs or subtraction of opposite-signed inputs,
+  // so they go to fadd instead of fsub.)
+  movs    r0, r0, lsr #2
+  bxeq    lr               // result is zero, which r0 already contains
+
+  // Determine how many more bits we need to shift the mantissa up, by counting
+  // its leading zeroes. Adjust the exponent, and shift the mantissa into its
+  // final position (assuming the output is still a normalised number).
+  clz     r12, r0          // compute the shift / exponent adjustment
+  sub     r2, r2, r12      // adjust exponent
+  lsl     r0, r0, r12      // shift mantissa up to the top of the word
+  lsr     r0, r0, #8       // and then down to its final position
+
+  // Check for underflow. This occurs precisely when the adjustment to the
+  // exponent in the bottom 8 bits of r2 carried into its sign bit (because at
+  // the moment the value in r2 is one lower than the true output exponent, so
+  // that adding the leading 1 bit in the mantissa will increment it back to
+  // the correct value). So we can check the sign bit in r2 against the copy of
+  // it we saved in r3 earlier. If no underflow, then we can just recombine the
+  // sign and exponent with the mantissa (no rounding is needed on this branch)
+  // and return.
+  teq     r3, r2, lsr #8      // Exponent underflow?
+  addeq   r0, r0, r2, lsl #23 // if so, trivially put the output back together
+  bxeq    lr                  // and return
+
+  // Now we _have_ underflowed, and the out-of-range exponent stored in the low
+  // 8 bits of r2 tells us by how much: if it's -n, then we need to shift the
+  // normalised mantissa down by n bits. So to make the output denormal, all we
+  // have to do is to shift the mantissa down and recombine it with the
+  // original sign in r3.
+  //
+  // Bit 8 of r2 contains a corrupted version of the sign bit, but we can
+  // safely ignore that, because the semantics of AArch32 register-controlled
+  // shift instructions are that only the low 8 bits of the shift-count
+  // register are examined. So that sign bit is too high up to affect what
+  // happens.
+
+  rsb     r2, r2, #0           // r2 is now the shift count
+LOCAL_LABEL(sub_do_underflow):             // we can also come here from below
+  mov     r0, r0, lsr r2       // shift the mantissa down
+  orr     r0, r0, r3, lsl #31  // put the sign back on
+  bx      lr                   // and return
+
+LOCAL_LABEL(sub_underflow):
+  // We come here if we detected underflow in the 'renormalise by 1 bit' case.
+  // So the input exponent must have been 1, and we shift the mantissa by only
+  // one bit. The only question is whether we put the output sign on: if the
+  // result is actually zero, we don't need to, because a subtraction giving a
+  // zero output always gives +0 (as mentioned above).
+  movs    r0, r12, lsr #8         // Denormalise and check if result is zero
+  bxeq    lr                      // Return +0 if result is zero
+#if __thumb__
+  // Get the output sign in r3. In Arm this was already done just after start
+  // of sub_renorm_orunder, which all underflows went through. But in Thumb we
+  // might have come straight here without setting up r3.
+  mov     r3, r2, lsr #8
+#endif
+  orr     r0, r0, r3, lsl #31  // put the sign back on
+  bx      lr                   // and return
+
+LOCAL_LABEL(sub_uncommon):
+  // We come here if the entry-point check says that at least one of x and y
+  // has an uncommon (FF or 00) exponent. So we have at least one NaN,
+  // infinity, denormal or zero, but we don't know which, or which operand it's
+  // in. And we could have any combination of those types of input, in _both_
+  // operands.
+
+  // Detect FF exponents (NaNs or infinities) and branch again for those.
+  mov     r12, #0xFF000000
+  bics    r2, r12, r0, lsl #1
+  bicsne  r2, r12, r1, lsl #1
+  beq     LOCAL_LABEL(sub_naninf)
+
+  // Now we know both inputs are finite, but there may be denormals or zeroes.
+  // So it's safe to do the same sign check and cross-jump as we did on the
+  // fast path.
+  teq     r0, r1             // opposite signs?
+  eormi   r1, r1, #1 << 31   // if so, negate the second operand
+  bmi     LOCAL_LABEL(add_zerodenorm)    // and cross-jump to the fadd version of this code
+
+LOCAL_LABEL(sub_zerodenorm):
+  // Now we know x and y have the same sign, and at least one of them is zero
+  // or denormal. If there aren't any zeroes, we'll end up rejoining the fast
+  // path, so we must set up all the same registers, and do our checks for zero
+  // in line with that.
+  //
+  // Start by exactly repeating the initial fast-path setup code: sort into
+  // magnitude order, get the output sign+exponent and the exponent shift.
+  subs    r2, r0, r1         // compare inputs, also keeping x-y
+  eorlo   r2, r2, #1 << 31   // if misordered, flip high bit of difference
+  sublo   r0, r0, r2         // and use that to swap and sign-flip
+  addlo   r1, r1, r2         //   the two inputs
+  mov     r2, r0, lsr #23      // r2 = sign<<8 + exponent
+  sub     r3, r2, r1, lsr #23  // shift = 0..254 (sign bits cancel)
+
+  // Shift y's mantissa up to the top of r1. We know y has exponent 0 (at least
+  // one of the inputs does, and we've sorted them by now). So we definitely
+  // don't need to set the leading bit on y's mantissa; also, if r1 becomes
+  // zero, then we know we're subtracting 0 from x.
+  movs    r1, r1, lsl #8
+  beq     LOCAL_LABEL(sub_yzero)
+
+  // Now we know there aren't any zeroes, and that y is a denormal. x might or
+  // might not be a denormal, so we must check that and decide whether to set
+  // its top mantissa bit.
+  mov     r0, r0, lsl #8       // shift mantissa of x to the top of r0
+  tst     r2, #255             // is x's exponent 0? If so, it's denormal
+  orrne   r0, r0, #1 << 31     // if not, set leading bit of x,
+  subne   r3, r3, #1           //   adjust exponent difference,
+
+  b       LOCAL_LABEL(sub_dosub)
+
+LOCAL_LABEL(sub_yzero):
+  // Here, we know y = 0, so we're subtracting 0 from x. For most values of x,
+  // we return x unchanged: subtracting 0 makes no difference. But if x is
+  // _also_ 0 then we must return +0, rather than whatever x's sign of zero is.
+  // (Because +0 is always the sign of zero you return when subtracting a
+  // number from itself).
+  movs    r12, r0, lsl #1      // test if x = 0 (bottom 31 bits all zero)
+  moveq   r0, #0               // if so, replace x with +0
+  bx      lr
+
+LOCAL_LABEL(sub_naninf):
+  // We come here if at least one input is a NaN or infinity. If either or both
+  // inputs are NaN then we hand off to __fnan2 which will propagate a NaN from
+  // the input.
+  mov     r12, #0xFF000000
+  cmp     r12, r0, lsl #1          // if (r0 << 1) > 0xFF000000, r0 is a NaN
+  blo     SYMBOL_NAME(__compiler_rt_fnan2)
+  cmp     r12, r1, lsl #1
+  blo     SYMBOL_NAME(__compiler_rt_fnan2)
+
+  // Otherwise, we have no NaNs and at least one infinity, so we're returning
+  // either infinity, or NaN for an (inf-inf) subtraction. We can safely handle
+  // all these cases by flipping the sign of y and going to fadd_inf.
+  eor     r1, r1, #0x80000000
+  b       LOCAL_LABEL(add_inf)
+
+END_COMPILERRT_FUNCTION(__aeabi_fsub)
 
 NO_EXEC_STACK_DIRECTIVE
diff --git a/compiler-rt/lib/builtins/arm/thumb1/addsf3.S b/compiler-rt/lib/builtins/arm/thumb1/addsf3.S
new file mode 100644
index 0000000000000..808f154884980
--- /dev/null
+++ b/compiler-rt/lib/builtins/arm/thumb1/addsf3.S
@@ -0,0 +1,888 @@
+//===-- addsf3.S - Add/subtract single precision floating point numbers ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the __addsf3 and __subsf3 functions (single precision
+// floating point number addition and subtraction), with the IEEE-754 default
+// rounding (to nearest, ties to even), for the Thumb1 ISA.
+//
+//===----------------------------------------------------------------------===//
+
+#include "../../assembly.h"
+
+  .syntax unified
+  .text
+  .p2align 2
+
+// General structure of this code:
+//
+// There are three actual entry points here, for addition, subtraction and
+// reversed subtraction (just taking the operands the other way round, so that
+// it returns y-x instead of x-y). But the first thing the functions do (after
+// checking for NaNs) is to sort out whether the magnitudes of the two inputs
+// are being added (x+y with like signs, or x-y with different signs), or
+// subtracted. So fadd jumps across into the middle of fsub if it sees that the
+// signs are different, and vice versa. Then the main code path in fadd handles
+// magnitude addition, and the one in fsub handles magnitude subtraction.
+//
+// NaNs are checked first, so that an input NaN can be propagated exactly,
+// including its sign bit. After ruling out that case, it's safe to flip the
+// sign of one of the inputs, so that during the cross-calls, x - y can be
+// rewritten as x + (-y) and vice versa.
+
+#if __ARM_PCS_VFP
+DEFINE_COMPILERRT_FUNCTION(__addsf3)
+  push {r4, lr}
+  vmov r0, s0
+  vmov r1, s1
+  bl __aeabi_fadd
+  vmov s0, r0
+  pop {r4, pc}
+#else
+DEFINE_COMPILERRT_FUNCTION_ALIAS(__addsf3, __aeabi_fadd)
+#endif
+
+DEFINE_COMPILERRT_THUMB_FUNCTION(__aeabi_fadd)
+  push {r4,r5,r6,lr}
+
+  movs    r5, #1
+  lsls    r5, r5, #31  // all cross-branches will expect to have r5==0x80000000
+
+  // Extract the exponents into r2 and r3. In the process, test for all
+  // uncommon values (infinities, NaNs, denormals and zeroes) and branch out of
+  // line if any are found.
+  //
+  // Uncommon operands with exponent 0xFF (NaNs and infinities) "win" over
+  // those with exponent 0 (zeroes and denormals), in the sense that if there's
+  // one of each, the 0xFF one determines the result. But we check for exponent
+  // 0 first, because that way we get it as a by-product of extracting the
+  // exponents in the first place without needing a separate compare
+  // instruction. So the zero/denorm handler will have to finish up the NaN
+  // check as its first task.
+  lsls    r2, r0, #1
+  lsls    r3, r1, #1
+  lsrs    r2, r2, #24
+  beq     LOCAL_LABEL(add_zerodenorm_x)
+  lsrs    r3, r3, #24
+  beq     LOCAL_LABEL(add_zerodenorm_y)
+  cmp     r2, #255
+  beq     LOCAL_LABEL(add_naninf)
+  cmp     r3, #255
+  beq     LOCAL_LABEL(add_naninf)
+
+  // Now we have two normalised numbers. If their signs are opposite, we should
+  // be subtracting their magnitudes rather than adding, so cross-jump to fsub
+  // (via a trampoline that negates y).
+  movs    r4, r0
+  eors    r4, r4, r1         // set N if signs are unequal
+  bmi     LOCAL_LABEL(add_sub)
+LOCAL_LABEL(add_magnitude):
+  // If we get here, we're adding operands with equal signs (i.e. a magnitude
+  // addition). First thing to do is put the operands in magnitude order, so
+  // that x >= y.
+  subs    r4, r0, r1
+  bhs     LOCAL_LABEL(add_swapped)
+  subs    r0, r0, r4
+  adds    r1, r1, r4
+  // We must also swap the pre-extracted exponents here.
+  eors    r2, r2, r3
+  eors    r3, r3, r2
+  eors    r2, r2, r3
+LOCAL_LABEL(add_swapped):
+  // Keep the sign and exponent of the larger input, to use as the sign and
+  // exponent of the output (up to carries and overflows). Also calculate the
+  // exponent difference, which tells us how far we'll need to shift y's
+  // mantissa right to add it to x's.
+  lsrs    r6, r0, #23
+  subs    r3, r2, r3
+
+  // Extract both mantissas, moved up to the top of the word, with the leading
+  // 1 made explicit. We put y's extracted mantissa in a different register
+  // (r4), because we'll want to keep the original y for use in add_check_rte.
+  lsls    r0, r0, #8
+  lsls    r4, r1, #8
+  orrs    r0, r0, r5
+  orrs    r4, r4, r5
+
+LOCAL_LABEL(add_doadd):
+  // Here we perform the actual addition. We either fell through from the code
+  // above, or jumped back to here after handling an input denormal.
+  //
+  // We get here with:
+  //   Operands known to be numeric rather than zero/infinity/NaN;
+  //   r0 = mantissa of larger operand (in high 24 bits);
+  //   r4 = mantissa of smaller operand (in high 24 bits);
+  //   r1 = original (or nearly so) smaller operand;
+  //   r6 = result sign and exponent (in low 9 bits);
+  //   r2 = exponent of x
+  //   r3 = exponent difference.
+  //
+  // For normal inputs, the mantissa registers (r0,r4) will have the top bit
+  // set. Denormals will leave that bit clear, treating the number as
+  // 0.[mantissa] x 2^(fixed exponent) instead of renormalising to 1.[mantissa]
+  // x 2^(variable exponent) as a multiplication would want.
+
+  // Actually shift the smaller mantissa downwards and add them together.
+  lsrs    r4, r4, r3
+  adds    r5, r0, r4
+
+  // If that addition carried off the top of r5, then the number has increased
+  // its exponent. Diverge into a completely separate code path for that case,
+  // because there we must check for overflow. We'll return to the label below
+  // if no overflow.
+  bcs     LOCAL_LABEL(add_carry)
+LOCAL_LABEL(add_renormed):
+  // Now we have the output mantissa in r5, with the leading bit at position
+  // 31. The precise sum may be slightly more than that, if r4 != (y << r3).
+  //
+  // Shift the mantissa down to its final position, and use the carry flag (bit
+  // shifted off the bottom) to see if we need to round.
+  lsrs    r0, r5, #8
+  bcc     LOCAL_LABEL(add_rounded)
+
+  // If we fall through to here, then we need to round up, and also check if we
+  // need to round to even. This occurs if all the bits of y's mantissa shifted
+  // off the bottom are zero except for the round bit.
+  //
+  // Some of those bits are in r5 (the 32-bit version of the sum's mantissa).
+  // It's cheap to check those, and should exclude _most_ cases where
+  // round-to-even isn't needed.
+  adds    r0, r0, #1          // simple round up
+  lsls    r5, r5, #(32-7)     // check top 7 bits
+  beq     LOCAL_LABEL(add_check_rte)      // if those are zero, go to full RTE check
+LOCAL_LABEL(add_rounded):
+  // Put the sign+exponent back on. The leading bit of the mantissa increments
+  // the exponent field unwantedly, so we must decrement r6 first to compensate
+  // for that.
+  subs    r6, r6, #1
+  lsls    r6, r6, #23
+  adds    r0, r0, r6
+  // If we haven't overflowed, it's now safe to return.
+  cmp     r2, #255
+  bge     LOCAL_LABEL(add_overflow)
+  pop     {r4,r5,r6,pc}
+
+LOCAL_LABEL(add_overflow):
+  // We have overflow, so we need to return an infinity of the correct sign. r0
+  // already has the correct sign and exponent, so all we need to do is clear
+  // its mantissa.
+  lsrs    r0, r0, #23
+  lsls    r0, r0, #23
+  pop     {r4,r5,r6,pc}
+
+LOCAL_LABEL(add_sub):
+  // We come here when fadd discovered it needed to subtract. Negate the second
+  // operand and cross-jump into fsub.
+  //
+  // The cross-jump is done using BL, for greater branch range. That clobbers
+  // lr, but that's OK, we weren't keeping anything in it at this point.
+  eors    r1, r1, r5
+  bl      LOCAL_LABEL(sub_magnitude)
+
+LOCAL_LABEL(add_carry):
+  // We come here if we carried a 1 bit off the top of r5 where we computed the
+  // sum's mantissa. Shift back down by one and put a 1 bit in at the top.
+  //
+  // That would be easy with the RRX instruction from general AArch32, but we
+  // don't have that here. Instead we OR in a 1 at the bottom, and move it to
+  // the top by rotating right.
+  //
+  // A danger of shifting r5 down by a bit is that we lose the bit at the very
+  // bottom, which might be important if it's the only nonzero bit below the
+  // output mantissa, because then it determines whether we do RTE or not.
+  // Fortunately, another copy of the same bit is still at the bottom of r4
+  // (the shifted version of y's mantissa which we added to x's to make the
+  // version of r5 _before_ we shifted it down). So the full RTE check will
+  // have to remember to check that bit.
+  movs    r0, #1
+  orrs    r5, r5, r0         // set low bit of r5
+  rors    r5, r5, r0         // and rotate right so that's now the high bit
+
+  // Carrying off the top of the mantissa means that the output exponent must
+  // be increased by 1. Increment both copies: the exponent by itself in r2
+  // (used for overflow checking) and the exponent + sign in r6.
+  adds    r2, r2, #1
+  adds    r6, r6, #1
+
+  // Now go back to the common code path for rounding and overflow checking.
+  b       LOCAL_LABEL(add_renormed)
+
+LOCAL_LABEL(add_check_rte):
+  // We come here to do the full (and therefore expensive) check for round-to-
+  // even: is our output number exactly on a rounding boundary, half way
+  // between two representable numbers? That is, of the bits _not_ included in
+  // the output mantissa, is the topmost bit 1 and all the rest 0?
+  //
+  // We only come here at all if we have already rounded the number up. So we
+  // already know the topmost one of the lost bits is 1, and all we have to
+  // check is whether the rest are 0.
+  //
+  // Also, we've already checked all the bits that were still in the 32-bit
+  // version of the output mantissa, so we don't need to check those again ...
+  //
+  // ... well, _nearly_ all, because in the add_carry case, we shifted r5 down
+  // by a bit _before_ that check. So we do need to re-check that one bit.
+  //
+  // The basic strategy is: r4 still contains the version of y's mantissa that
+  // we shifted down before adding it to x. And r1 contains more or less the
+  // original version of all of y, including the same mantissa. So if we shift
+  // r4 back up again and XOR it with r1, we clear all the bits that we've
+  // already checked, and leave only the ones we haven't.
+
+  // Start by deliberately throwing away the low bit of r4, in case that
+  // corresponded to the bit we lost off the bottom of r5 in add_carry. This
+  // means we won't clear it in the XOR, and therefore, _will_ check it.
+  lsrs    r4, r4, #1
+
+  // Shift r4 back up by the same amount we shifted it down, and shift r1 to
+  // the corresponding position, so that we can XOR them. The most convenient
+  // way to do this is not to modify the variable shift count in r3, and
+  // compensate for it by selecting the shift of r1 appropriately.
+  //
+  // As it happens, we end up with the implicit leading 1 bit of the mantissa
+  // in bit 30 of the result - or rather, it would be if we'd set it, which in
+  // r1 we haven't, because that's still the whole original input float.
+  lsls    r4, r4, r3
+  lsls    r1, r1, #7
+  eors    r1, r1, r4
+
+  // But r1 wasn't just the mantissa of y; it also had the exponent, and its
+  // leading bit was implicit. So the topmost two bits of r1 are useless: in r1
+  // they're part of the exponent field. Exclude them from consideration.
+  //
+  // This doesn't lead to dropping any bit we really care about, because we're
+  // never interested in the actual leading 1 bit of y's mantissa for round-to-
+  // even purposes. Why not? Because we already know the round bit (the one
+  // just off the bottom of the output mantissa) is a 1, which must have come
+  // from y (it's too low down to come from x), and we only care about checking
+  // all the bits below _that_. So y's leading 1 must be at least as high up as
+  // the round bit, and therefore, isn't one of the bits we currently need to
+  // check.
+  lsls    r1, r1, #2
+
+  // Now if all those bits are zero, we're rounding to even. If _not_, we're
+  // finished rounding, so go back to add_rounded to continue the main code
+  // path.
+  bne     LOCAL_LABEL(add_rounded)
+
+  // Clear the low bit of the output (rounding to even) and go back to the main
+  // code path.
+  movs    r4, #1
+  bics    r0, r0, r4
+  b       LOCAL_LABEL(add_rounded)
+
+LOCAL_LABEL(add_naninf):
+  // We come here if at least one input is a NaN or infinity. If either or both
+  // inputs are NaN then we hand off to __fnan2 which will propagate a NaN from
+  // the input.
+  //
+  // On entry, we know r5 = 0x80000000 from the initial uncommon check. Also,
+  // we already extracted the exponents of x and y into r2 and r3.
+  asrs    r4, r5, #7    // so r4 = 0xFF000000
+  lsls    r6, r0, #1    // r6 > r4 iff x is NaN
+  cmp     r6, r4
+  bhi     LOCAL_LABEL(add_nan)
+  lsls    r6, r1, #1    // r6 > r4 iff y is NaN
+  cmp     r6, r4
+  bhi     LOCAL_LABEL(add_nan)
+
+  // No NaNs, so we have at least one infinity. Almost all additions involving
+  // an infinity return the input infinity unchanged. The only exception is if
+  // there are two infinities that have opposite signs (which can happen even
+  // in fadd, since on this code path we haven't cross-jumped into fsub),
+  // where we return NaN.
+  cmp     r2, r3        // at least one exponent is 0xFF, so if EQ, both are
+  beq     LOCAL_LABEL(add_infinf)   //   and therefore we're adding infinity to infinity
+
+  // With one infinity, we just find which register it's in, and return it.
+  cmp     r2, #255
+  beq     LOCAL_LABEL(add_ret_exact)  // just return x
+LOCAL_LABEL(add_retb): // we reuse this code in the denormal handler
+  movs    r0, r1          // otherwise, return y
+LOCAL_LABEL(add_ret_exact):
+  pop     {r4,r5,r6,pc}
+
+LOCAL_LABEL(add_infinf):
+  // With two infinities, we must check their relative sign. If they're the
+  // same sign, we have no problem.
+  movs    r4, r0
+  eors    r4, r4, r1
+  bpl     LOCAL_LABEL(add_ret_exact)  // identical infinities, so just return one
+
+  // But if we're adding two infinities of opposite sign, make a default quiet
+  // NaN and return that.
+  ldr     r0, =0x7fc00000
+  pop     {r4,r5,r6,pc}
+
+LOCAL_LABEL(add_nan):
+  bl      SYMBOL_NAME(__compiler_rt_fnan2)
+  pop     {r4,r5,r6,pc}
+
+LOCAL_LABEL(add_zerodenorm_x):
+  // We come here if we found x was 0 or a denormal. We haven't set up r3 as
+  // the exponent of y yet.
+  lsrs    r3, r3, #24
+
+  // Also, we checked for zero/denorm before checking for infinities and NaNs.
+  // We know x isn't an infinity or NaN, but we must check y.
+  cmp     r3, #255
+  beq     LOCAL_LABEL(add_naninf)
+
+  // Fall through to the next section. This repeats a pointless check for x
+  // being NaN or infinity, but it would cost more cycles to branch round it.
+
+LOCAL_LABEL(add_zerodenorm_y):
+  // We come here if we found y was 0 or a denormal, but also by falling
+  // through from above. So we may not yet have checked x for infinity/NaN. But
+  // we have checked that y isn't.
+  cmp     r2, #255
+  beq     LOCAL_LABEL(add_naninf)
+
+  // Now at least one of x,y is zero or denormal, and neither is infinite or
+  // NaN. We haven't yet checked the signs and cross-jumped to fsub, but we can
+  // handle all the zero cases without having to:
+  //
+  //  - if x = -y (including both being zero), return 0 of the appropriate sign
+  //  - if x = 0, return y (including the case of same-signed zeroes)
+  //  - if y = 0, return x
+  subs    r6, r0, r1     // are x and y equal
+  cmp     r6, r5         //   except for opposite sign bits? (r5 = 0x80000000)
+  beq     LOCAL_LABEL(add_diffsame)
+  lsls    r6, r1, #1     // is y zero?
+  beq     LOCAL_LABEL(add_ret_exact) // if so, return x
+  lsls    r6, r0, #1     // is x zero?
+  beq     LOCAL_LABEL(add_retb)      // if so, return y
+
+  // Now we've dealt with all the possibilities involving zeroes, so we have
+  // either one denormal or two denormals. These cases are harder, and we don't
+  // want to handle both signs at once, so check the signs and cross-branch
+  // into fsub if they're different.
+  movs    r6, r1
+  eors    r6, r6, r0
+  bpl     LOCAL_LABEL(add_denorm)
+  eors    r1, r1, r5
+  bl      LOCAL_LABEL(sub_denorm)
+LOCAL_LABEL(add_denorm):
+  // Sort the operands into magnitude order. Now we know they have the same
+  // sign, unsigned comparison is good enough for that.
+  subs    r6, r0, r1
+  bhs     0f
+  subs    r0, r0, r6
+  adds    r1, r1, r6
+0:
+
+  // We know one exponent is 0, so check if the other is too. We do this by
+  // adding the two exponents together, achieving two things in one
+  // instruction: it gets the nonzero exponent (if any) into r2 (saving us
+  // swapping r2 with r3 in the sorting step above), and it sets Z if both were
+  // zero.
+  adds    r2, r2, r3
+  beq     LOCAL_LABEL(add_denorm2)
+
+  // Now exactly one operand is denormal, and it's y. We must go back to
+  // add_doadd with all the registers appropriately set up.
+  lsrs    r6, r0, #23  // r6 == sign and exponent of x
+  lsls    r4, r1, #8   // r4 == mantissa of y, with leading bit clear
+  lsls    r0, r0, #8
+  orrs    r0, r0, r5   // set high bit on mantissa of x
+  subs    r3, r2, #1   // denormals are shifted as if they had exponent 1
+  b       LOCAL_LABEL(add_doadd)
+
+LOCAL_LABEL(add_diffsame):
+  // Here we only support round-to-nearest mode, so the difference of two
+  // identical things always returns +0.
+  movs    r0, #0
+  pop     {r4,r5,r6,pc}
+
+LOCAL_LABEL(add_denorm2):
+  // Here, x,y are both denormal, and we know we're doing magnitude addition.
+  // So we can add the mantissas like ordinary integers, and if they carry into
+  // the exponent, that's still the correct answer. But we have to avoid adding
+  // two copies of the sign bit, so we clear that from y first.
+  bics    r1, r1, r5  // clear sign bit of y
+  adds    r0, r0, r1  // add mantissas
+  pop     {r4,r5,r6,pc}
+
+END_COMPILERRT_FUNCTION(__aeabi_fadd)
+
+DEFINE_COMPILERRT_THUMB_FUNCTION(__aeabi_frsub)
+  // Reversed subtraction, that is, compute y-x, where x is in r0 and y in r1.
+  //
+  // We could implement this by simply swapping r0 with r1. But the point of
+  // having a reversed-subtract in the first place is to avoid the caller
+  // having to do that, so if we do it ourselves, it wastes all the time they
+  // saved. So instead, on the fast path, we redo the sign check our own way
+  // and branch to fadd_magnitude or fsub_magnitude.
+
+  push {r4,r5,r6,lr}
+
+  movs    r5, #1
+  lsls    r5, r5, #31 // all cross-branches will expect to have r5 = 0x80000000
+
+  // Extract the exponents and test for uncommon values. Note that we do the
+  // zero/denormal tests the opposite way round from fsub, because we swap the
+  // operands before branching to the corresponding fsub code, so this way our
+  // first branch will enter fsub with the first of _its_ operands checked.
+  lsls    r2, r0, #1
+  lsls    r3, r1, #1
+  lsrs    r3, r3, #24
+  beq     LOCAL_LABEL(rsb_zerodenorm_y)
+  lsrs    r2, r2, #24
+  beq     LOCAL_LABEL(rsb_zerodenorm_x)
+  cmp     r2, #255
+  beq     LOCAL_LABEL(rsb_naninf)
+  cmp     r3, #255
+  beq     LOCAL_LABEL(rsb_naninf)
+
+  // Decide which of fadd_magnitude and fsub_magnitude to branch to, and do so.
+  eors    r0, r0, r5
+  movs    r4, r0
+  eors    r4, r4, r1
+  bpl     LOCAL_LABEL(rsb_add)
+  eors    r1, r1, r5
+  bl      LOCAL_LABEL(sub_magnitude)
+LOCAL_LABEL(rsb_add):
+  bl      LOCAL_LABEL(add_magnitude)
+
+  // Any uncommon operands to frsub are handled by just swapping the two
+  // operands and going to fsub's handler. We're off the main fast path now, so
+  // there's no need to try to optimise it any harder.
+LOCAL_LABEL(rsb_zerodenorm_y):
+  push    {r0,r2}
+  push    {r1,r3}
+  pop     {r0,r2}
+  pop     {r1,r3}
+  bl      LOCAL_LABEL(sub_zerodenorm_x)  // we just swapped x and y, so now x is 0/denorm
+LOCAL_LABEL(rsb_zerodenorm_x):
+  push    {r0,r2}
+  push    {r1,r3}
+  pop     {r0,r2}
+  pop     {r1,r3}
+  bl      LOCAL_LABEL(sub_zerodenorm_y)  // similarly, now we know y is 0/denorm
+LOCAL_LABEL(rsb_naninf):
+  push    {r0,r2}
+  push    {r1,r3}
+  pop     {r0,r2}
+  pop     {r1,r3}
+  bl      LOCAL_LABEL(sub_naninf)
+
+END_COMPILERRT_FUNCTION(__aeabi_frsub)
+
+#if __ARM_PCS_VFP
+DEFINE_COMPILERRT_FUNCTION(__subsf3)
+  push {r4, lr}
+  vmov r0, s0
+  vmov r1, s1
+  bl __aeabi_fsub
+  vmov s0, r0
+  pop {r4, pc}
+#else
+DEFINE_COMPILERRT_FUNCTION_ALIAS(__subsf3, __aeabi_fsub)
+#endif
+
+DEFINE_COMPILERRT_THUMB_FUNCTION(__aeabi_fsub)
+  // Main entry point for subtraction.
+  push {r4,r5,r6,lr}
+
+  movs    r5, #1
+  lsls    r5, r5, #31
+
+  // Extract the exponents into r2 and r3 and test for all uncommon values,
+  // similarly to fadd.
+  lsls    r2, r0, #1
+  lsls    r3, r1, #1
+  lsrs    r2, r2, #24
+  beq     LOCAL_LABEL(sub_zerodenorm_x)
+  lsrs    r3, r3, #24
+  beq     LOCAL_LABEL(sub_zerodenorm_y)
+  cmp     r2, #255
+  beq     LOCAL_LABEL(sub_naninf)
+  cmp     r3, #255
+  beq     LOCAL_LABEL(sub_naninf)
+
+  // Check the signs, and if they're unequal, cross-jump into fadd to do
+  // magnitude addition. (Now we've excluded NaNs, it's safe to flip the sign
+  // of y.)
+  movs    r4, r0
+  eors    r4, r4, r1
+  bmi     LOCAL_LABEL(sub_add)
+LOCAL_LABEL(sub_magnitude):
+  // If we get here, we're subtracting operands with equal signs (i.e. a
+  // magnitude subtraction). First thing to do is put operands in magnitude
+  // order, so that x >= y. However, if they are swapped, we must also negate
+  // both of them, since A - B = (-B) - (-A).
+  subs    r4, r0, r1
+  bhs     LOCAL_LABEL(sub_swapped)
+  eors    r4, r4, r5
+  subs    r0, r0, r4
+  adds    r1, r1, r4
+  // We must also swap the pre-extracted exponents here.
+  eors    r2, r2, r3
+  eors    r3, r3, r2
+  eors    r2, r2, r3
+LOCAL_LABEL(sub_swapped):
+  // Save the sign and exponent of the larger operand to use for the result (up
+  // to renormalisation), and calculate the exponent difference for shifting
+  // one mantissa relative to the other.
+  lsrs    r6, r0, #23
+  subs    r3, r2, r3
+
+  // Shift the mantissas up to the top of the words. In the process we put y's
+  // shifted mantissa into a separate register, keeping the original for later
+  // reference. Also, although we set the leading bit of y, we _clear_ the
+  // leading bit of x, which is just as quick and saves us having to decrement
+  // the output exponent later to compensate.
+  lsls    r0, r0, #8
+  lsls    r4, r1, #8
+  bics    r0, r0, r5
+  orrs    r4, r4, r5
+
+LOCAL_LABEL(sub_dosub): // we may come back here after sorting out denorms
+
+  // We get here with:
+  //   Operands known to be numeric rather than zero/infinity/NaN;
+  //   r0 = mantissa of larger operand (in top 24 bits, with high bit clear)
+  //   r4 = mantissa of smaller operand (in top 24 bits, with high bit set)
+  //   r1 = original smaller operand (up to maybe a sign flip)
+  //   r6 = result sign/exponent (in low 9 bits)
+  //   r2 = plain result exponent (in low 8 bits, i.e. r6 & 0xFF)
+  //   r3 = exponent difference.
+  //
+  // Begin calculating the output mantissa by shifting y's mantissa right and
+  // subtracting. This may leave the mantissa too large by one, if the bits
+  // shifted out of y are nonzero. We correct this during rounding if
+  // necessary.
+  lsrs    r4, r4, r3
+  subs    r5, r0, r4
+
+  // This may have cleared the high bit of the output mantissa, in which case
+  // we must renormalise. Our strategy is to split into three code paths, on
+  // two of which an awkward case is known not to arise:
+  //  * no need to renormalise at all => underflow can't happen
+  //  * shift up by exactly 1 bit
+  //  * shift up by more than 1 bit => rounding can't happen (result is exact)
+  //
+  // First branch out of line for the first case, which we can detect because
+  // the N flag tells us whether the top mantissa bit is still set.
+  bpl     LOCAL_LABEL(sub_renormed)
+
+  // Renormalise by one bit, and check the new top bit to see if we need to
+  // renormalise by more than that.
+  lsls    r5, r5, #1
+  bpl     LOCAL_LABEL(sub_renorm_big) // if new top bit still clear, renormalise by more
+  // Decrement both exponent registers (r6 with the sign, r2 without). We
+  // decrement r6 by 2 instead of 1, because now the output mantissa has the
+  // top bit set, so we must compensate when we put the sign and exponent back
+  // on.
+  //
+  // The extra decrement of r6 might carry into the sign bit. This doesn't
+  // matter on the fast path, because the leading bit in the mantissa will undo
+  // it. But we need to account for it in the underflow handler for this path.
+  subs    r6, r6, #2
+  subs    r2, r2, #1
+  // The decrement of the pure exponent value also doubles as a check for
+  // underflow, because we underflowed precisely if the exponent went to 0.
+  beq     LOCAL_LABEL(sub_underflow_1)
+LOCAL_LABEL(sub_renormed):
+  // Now we have the output mantissa in r5. It may or may not have the high bit
+  // set, depending on which branch of the code we've come through. But r6 has
+  // been adjusted appropriately, so that we can make a basically right output
+  // value (before rounding) by adding r6 << 23 to r5 >> 8.
+  //
+  // If any nonzero bits were shifted off the bottom of y, then the true value
+  // of the output mantissa might be slightly _less_ than the value in r5.
+  // However the maximum difference is about 2^{-7} ULP relative to the final
+  // result (because it's at most one ULP of the 32-bit output mantissa in r5).
+  // So it doesn't affect the result in round-to-nearest mode unless it puts us
+  // just below a rounding boundary, which means we can ignore it until the
+  // full round-to-even check.
+  lsls    r6, r6, #23  // prepare sign and exponent
+  lsrs    r0, r5, #8   // shift down, and put the round bit into C
+  bcs     LOCAL_LABEL(sub_round)   // diverge based on round bit
+  // If the round bit shifted off the bottom of r5 was clear, then we're not
+  // rounding up, so we can make the output value and finish immediately.
+  adds    r0, r0, r6   // reconstitute output value without rounding
+  pop     {r4,r5,r6,pc}
+LOCAL_LABEL(sub_round):
+  // Otherwise, we're rounding, in three stages. First round up; then cheaply
+  // check the low bits of r5 (the 32-bit version of the mantissa) so that we
+  // can rule out round-to-even if any of those is nonzero; finally, in as few
+  // cases as possible, check the rest of y's mantissa to check for RTE fully.
+  adcs    r0, r0, r6      // reconstitute output value while rounding up
+  lsls    r5, r5, #(32-7) // check first 7 guard bits
+  beq     LOCAL_LABEL(sub_check_rte)  // if they're all 0, do the full check for RTE
+  pop     {r4,r5,r6,pc}   // otherwise we're done
+
+LOCAL_LABEL(sub_add):
+  // Trampoline to cross-jump to fadd, because a 16-bit branch won't reach that
+  // far. Also a convenient place to flip y's sign, so we only have to do it
+  // once.
+  eors    r1, r1, r5      // we know r5 = 0x80000000
+  bl      LOCAL_LABEL(add_magnitude)  // clobbers lr, which doesn't matter
+
+LOCAL_LABEL(sub_check_rte):
+  // Full check for round-to-even, in the same style as fadd_check_rte: r4
+  // still contains the version of y's mantissa that we shifted down before
+  // subtracting from x, and r1 contains the original version of that mantissa.
+  // So if we shift r4 back up again and XOR it with r1, we clear all the bits
+  // that we've already checked, and leave only the ones we haven't. The only
+  // exception is the leading mantissa bit, which is implicit in r1, but this
+  // can never affect round-to-even, because if we rounded at all then the
+  // round bit must have come from y, so the leading bit of y is at the round
+  // bit or above, hence not one of the bits we're checking for RTE.
+  lsls    r4, r4, r3  // undo the shift of y's mantissa
+  lsls    r1, r1, #8  // shift y's original mantissa back to the same place
+  eors    r1, r1, r4  // find any differences
+  lsls    r1, r1, #1  // but ignore the leading mantissa bit
+  beq     LOCAL_LABEL(sub_rte)    // if all bits now clear, we're rounding to even
+
+  // If we're not RTEing, we must undo the simplistic rounding we've already
+  // done. (We incremented the result based on the belief that the shifted-off
+  // data started 0x80xxx, but it turns out that xxx is slightly negative, so
+  // actually we had 0x7Fyyy.)
+  subs    r0, r0, #1
+  pop     {r4,r5,r6,pc}
+LOCAL_LABEL(sub_rte):
+  // Actually round to even, by clearing the low bit of the output.
+  movs    r4, #1
+  bics    r0, r0, r4
+  pop     {r4,r5,r6,pc}
+
+LOCAL_LABEL(sub_renorm_big):
+  // Now we know that we must renormalise by at least 2 bits, which may also
+  // give a denormal or zero result.
+  //
+  // This means no rounding can possibly be needed: if the subtraction cleared
+  // the top two bits of the mantissa, it means we computed A-B and found it
+  // was less than A/2, so B > A/2, so the exponent difference was at most 1.
+  // Hence the result mantissa fits in 24 bits even before renormalisation, and
+  // the top bit is clear, so it fits in 23 bits, i.e. it is exact.
+
+  // Detect an actual zero result, and go and return it.
+  beq     LOCAL_LABEL(sub_diffsame)
+
+  // Renormalise by binary search. (16-bit Thumb has no CLZ instruction.) We'll
+  // accumulate the total exponent adjustment in r0. It starts at 1 rather than
+  // 0, because we've shifted the mantissa left by one bit already.
+  movs    r0, #1
+
+  // If the top 16 bits of r5 are clear, shift up by 16 and adjust r0 to match.
+  lsrs    r3, r5, #(32-16)
+  bne     0f
+  lsls    r5, r5, #16
+  adds    r0, r0, #16
+0:
+  // Same for 8 bits
+  lsrs    r3, r5, #(32-8)
+  bne     0f
+  lsls    r5, r5, #8
+  adds    r0, r0, #8
+0:
+  // 4 bits
+  lsrs    r3, r5, #(32-4)
+  bne     0f
+  lsls    r5, r5, #4
+  adds    r0, r0, #4
+0:
+  // 2 bits
+  lsrs    r3, r5, #(32-2)
+  bne     0f
+  lsls    r5, r5, #2
+  adds    r0, r0, #2
+0:
+  // 1 bit
+  lsrs    r3, r5, #(32-1)
+  bne     0f
+  lsls    r5, r5, #1
+  adds    r0, r0, #1
+0:
+
+  // Update our two copies of the exponent (with sign in r6, without in r2).
+  subs    r6, r6, r0
+  subs    r2, r2, r0
+  // Shift the mantissa and exponent into the right places to combine them.
+  lsls    r4, r5, #1              // clear leading bit of mantissa
+  lsrs    r0, r4, #9              // and shift it down
+  lsls    r4, r6, #23             // shift sign and exponent up
+  adds    r0, r0, r4              // put them together
+  // Check for underflow, which occurs if the output exponent is less than 1
+  // (including having gone negative).
+  cmp     r2, #1
+  blt     LOCAL_LABEL(sub_underflow_2)
+  pop     {r4,r5,r6,pc}
+
+LOCAL_LABEL(sub_diffsame):
+  // Here we only support round-to-nearest mode, so the difference of two
+  // identical things always returns +0.
+  movs    r0, #0
+  pop     {r4,r5,r6,pc}
+
+LOCAL_LABEL(sub_underflow_1):
+  // We come here if renormalising by one bit reduced the output exponent to
+  // zero. In other words, the output value in x is denormal (hence exact) and
+  // wants shifting down by exactly 9 bits (8 bits of exponent plus the bit we
+  // already shifted it by), and then the sign bit putting back on.
+  //
+  // Also, before we get the sign bit from r6, we must add 1 to it, because of
+  // the possibility that decrementing it carried into the sign bit.
+  adds    r6, r6, #1    // undo potential sign-flipping carry
+  lsrs    r6, r6, #8    // isolate the sign bit
+  lsls    r6, r6, #31   // and shift it up to the top
+  lsrs    r0, r5, #9    // construct the output mantissa
+  orrs    r0, r0, r6    // and combine with the sign bit
+  pop     {r4,r5,r6,pc}
+
+LOCAL_LABEL(sub_underflow_2):
+  // We come here if multi-bit renormalisation found a denormal. The mantissa
+  // has its leading bit set at the top of r5, so it needs shifting down 8 bits
+  // to where it would be in a normalised number, and then further: if the
+  // output exponent is 0 (meaning the exponent just below a normalised number)
+  // then we shift one extra bit, if it's -1 then we shift two extra bits, and
+  // so on. So in total we shift down by 8 + (1 - exp) = 9 - exp.
+  rsbs    r4, r6, #0
+  adds    r4, r4, #9
+  lsrs    r5, r5, r4    // shift mantissa into place
+
+  // Extract the sign bit from r6 and combine it with that denormal. r6 could
+  // be 0 or could be negative, so we must add enough to it to make it reliably
+  // positive. Any offset that works is fine; we'll use 0xc0, which is the
+  // offset used by IEEE 754:1985 underflow intermediate values.
+  adds    r6, r6, #0xc0 // rebias to correct sign bit
+  lsrs    r6, r6, #8    // isolate the sign bit
+  lsls    r0, r6, #31   // and shift it up to the top
+  adds    r0, r0, r5    // combine with the denormalised mantissa
+  pop     {r4,r5,r6,pc}
+
+LOCAL_LABEL(sub_naninf):
+  // We come here if at least one of x,y is a NaN or infinity; their exponents
+  // are reliably in r2 and r3 respectively.
+  //
+  // If either or both inputs are NaN then we hand off to
+  // __compiler_rt_fnan2, which will propagate a NaN from the input.
+  //
+  asrs    r4, r5, #7    // so r4 = 0xFF000000
+  lsls    r6, r0, #1    // r6 > r4 iff x is NaN
+  cmp     r6, r4
+  bhi     LOCAL_LABEL(sub_nan)
+  lsls    r6, r1, #1    // r6 > r4 iff y is NaN
+  cmp     r6, r4
+  bhi     LOCAL_LABEL(sub_nan)
+
+  // No NaNs, so we have at least one infinity. Almost all subtractions
+  // involving an infinity return an infinite result. The only exception is
+  // subtracting two infinities that have the same sign, where we return NaN.
+  cmp     r2, r3        // at least one exponent is 0xFF, so if EQ, both are
+  beq     LOCAL_LABEL(sub_infinf)
+
+  // If x is infinite and y is finite, return x.
+  cmp     r2, #255
+  beq     LOCAL_LABEL(sub_ret_exact)
+LOCAL_LABEL(sub_retminusy):
+  // If x is finite and y is infinite, return -y.
+  movs    r0, r1
+  eors    r0, r0, r5    // negate y
+LOCAL_LABEL(sub_retx):
+LOCAL_LABEL(sub_ret_exact):
+  pop     {r4,r5,r6,pc}
+LOCAL_LABEL(sub_infinf):
+  // With two infinities, we must check their relative sign. If they have
+  // opposite sign, we just return x (which is the one with the same sign as
+  // the output).
+  movs    r4, r0
+  eors    r4, r4, r1
+  bmi     LOCAL_LABEL(sub_ret_exact)
+
+  // But if we're subtracting two infinities of the same sign, make a default
+  // quiet NaN and return that.
+  ldr     r0, =0x7fc00000
+  pop     {r4,r5,r6,pc}
+
+LOCAL_LABEL(sub_nan):
+  bl      SYMBOL_NAME(__compiler_rt_fnan2)
+  pop     {r4,r5,r6,pc}
+
+LOCAL_LABEL(sub_zerodenorm_x):
+  // We come here if we found x was 0 or a denormal. We haven't set up r3 as
+  // the exponent of y yet.
+  lsrs    r3, r3, #24
+
+  // Also, we checked for zero/denorm before checking for infinities and NaNs.
+  // We know x isn't an infinity or NaN, but we must check y.
+  cmp     r3, #255
+  beq     LOCAL_LABEL(sub_naninf)
+
+  // Fall through to the next section. This repeats a pointless check for x
+  // being NaN or infinity, but it would cost more cycles to branch round it.
+
+LOCAL_LABEL(sub_zerodenorm_y):
+  // We come here if we found y was 0 or a denormal, but also by falling
+  // through from above. So we may not yet have checked x for infinity/NaN. But
+  // we have checked that y isn't.
+  cmp     r2, #255
+  beq     LOCAL_LABEL(sub_naninf)
+
+  // Now at least one of x,y is zero or denormal, and neither is infinite or
+  // NaN. We haven't yet checked the signs and cross-jumped to fsub, but we can
+  // handle all the zero cases without having to:
+  //
+  //  - if x = -y (including both being zero), return 0 of the appropriate sign
+  //  - if y = 0, return x (including the case of oppositely signed zeroes)
+  //  - if x = 0 and y != 0, return -y
+  cmp     r0, r1         // are x and y equal?
+  beq     LOCAL_LABEL(sub_diffsame)
+  lsls    r6, r1, #1     // is y zero?
+  beq     LOCAL_LABEL(sub_retx)      // if so, return x
+  lsls    r6, r0, #1     // is x zero?
+  beq     LOCAL_LABEL(sub_retminusy) // if so, return -y
+
+  // Now we've dealt with all the possibilities involving zeroes, so we have
+  // either one denormal or two denormals. These cases are harder, and we don't
+  // want to handle both signs at once, so check the signs and cross-branch
+  // into fadd if they're different.
+  movs    r6, r1
+  eors    r6, r6, r0
+  bpl     LOCAL_LABEL(sub_denorm)
+  eors    r1, r1, r5
+  bl      LOCAL_LABEL(add_denorm)
+LOCAL_LABEL(sub_denorm):
+  // Sort the operands into magnitude order. Now we know they have the same
+  // sign, unsigned comparison is good enough for that.
+  subs    r6, r0, r1
+  bhs     0f
+  eors    r6, r6, r5              // flip the signs in the process
+  subs    r0, r0, r6
+  adds    r1, r1, r6
+0:
+
+  // We know one exponent is 0, so check if the other is too. We do this by
+  // adding the two exponents together, achieving two things in one
+  // instruction: it gets the nonzero exponent (if any) into r2 (saving us
+  // swapping r2 with r3 in the sorting step above), and it sets Z if both were
+  // zero.
+  adds    r2, r2, r3
+  beq     LOCAL_LABEL(sub_denorm2)
+
+  // Now exactly one operand is denormal, and it's y. We must go back to
+  // fsub_dosub with all the registers appropriately set up.
+  lsrs    r6, r0, #23  // r6 == sign and exponent of x
+  lsls    r4, r1, #8   // r4 == mantissa of y, with leading bit clear
+  lsls    r0, r0, #8
+  bics    r0, r0, r5   // clear high bit on mantissa of x
+  subs    r3, r2, #1   // denormals are shifted as if they had exponent 1
+  b       LOCAL_LABEL(sub_dosub)
+
+LOCAL_LABEL(sub_denorm2):
+  // Here, x,y are both denormal, and we know we're doing magnitude subtraction.
+  // So we can subtract the mantissas like ordinary integers. But we have to
+  // avoid subtracting y's sign bit from x's.
+  bics    r1, r1, r5  // clear sign bit of y
+  subs    r0, r0, r1  // subtract mantissas
+  pop     {r4,r5,r6,pc}
+
+END_COMPILERRT_FUNCTION(__aeabi_fsub)
+
+NO_EXEC_STACK_DIRECTIVE
diff --git a/compiler-rt/test/builtins/Unit/addsf3_test.c b/compiler-rt/test/builtins/Unit/addsf3_test.c
new file mode 100644
index 0000000000000..a08ba8b91056a
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/addsf3_test.c
@@ -0,0 +1,384 @@
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_addsf3
+
+#include "int_lib.h"
+#include <inttypes.h>
+#include <stdio.h>
+
+#include "fp_test.h"
+
+// By default this test uses compareResultF to check the returned floats, which
+// accepts any returned NaN if the expected result is the canonical NaN value
+// 0x7fc00000. For the Arm assembler FP implementations, which commit to a
+// more detailed handling of NaNs, we tighten up the check and include some
+// extra test cases specific to that NaN policy.
+#if __arm__ || __thumb__
+#  define EXPECT_EXACT_RESULTS
+#  define ARM_NAN_HANDLING
+#endif
+
+// Returns: a + b
+COMPILER_RT_ABI float __addsf3(float a, float b);
+
+int test__addsf3(int line, uint32_t a_rep, uint32_t b_rep, uint32_t expected_rep) {
+  float a = fromRep32(a_rep), b = fromRep32(b_rep);
+  float x = __addsf3(a, b);
+#ifdef EXPECT_EXACT_RESULTS
+  int ret = toRep32(x) != expected_rep;
+#else
+  int ret = compareResultF(x, expected_rep);
+#endif
+
+  if (ret) {
+    printf("error at line %d: __addsf3(%08" PRIx32 ", %08" PRIx32 ") = %08" PRIx32
+           ", expected %08" PRIx32 "\n",
+           line, a_rep, b_rep, toRep32(x), expected_rep);
+  }
+  return ret;
+}
+
+#define test__addsf3(a,b,x) (test__addsf3)(__LINE__,a,b,x)
+
+int main() {
+  int status = 0;
+
+  status |= test__addsf3(0x00000000, 0x00000000, 0x00000000);
+  status |= test__addsf3(0x00000000, 0x007fffff, 0x007fffff);
+  status |= test__addsf3(0x00000000, 0x3f800000, 0x3f800000);
+  status |= test__addsf3(0x00000000, 0x7f000000, 0x7f000000);
+  status |= test__addsf3(0x00000000, 0x7f800000, 0x7f800000);
+  status |= test__addsf3(0x00000000, 0x80000000, 0x00000000);
+  status |= test__addsf3(0x00000000, 0x807fffff, 0x807fffff);
+  status |= test__addsf3(0x00000000, 0x80800000, 0x80800000);
+  status |= test__addsf3(0x00000000, 0xff800000, 0xff800000);
+  status |= test__addsf3(0x00000001, 0x00000001, 0x00000002);
+  status |= test__addsf3(0x00000001, 0x3f7fffff, 0x3f7fffff);
+  status |= test__addsf3(0x00000001, 0x3f800000, 0x3f800000);
+  status |= test__addsf3(0x00000001, 0x3ffffffe, 0x3ffffffe);
+  status |= test__addsf3(0x00000001, 0x3fffffff, 0x3fffffff);
+  status |= test__addsf3(0x00000001, 0x7effffff, 0x7effffff);
+  status |= test__addsf3(0x00000001, 0x7f000000, 0x7f000000);
+  status |= test__addsf3(0x00000001, 0x7f7ffffe, 0x7f7ffffe);
+  status |= test__addsf3(0x00000001, 0x7f7fffff, 0x7f7fffff);
+  status |= test__addsf3(0x00000001, 0x80000001, 0x00000000);
+  status |= test__addsf3(0x00000002, 0x80000001, 0x00000001);
+  status |= test__addsf3(0x00000003, 0x00000000, 0x00000003);
+  status |= test__addsf3(0x00000003, 0x7f800000, 0x7f800000);
+  status |= test__addsf3(0x00000003, 0x80000000, 0x00000003);
+  status |= test__addsf3(0x00000003, 0x80000002, 0x00000001);
+  status |= test__addsf3(0x00000003, 0xc0a00000, 0xc0a00000);
+  status |= test__addsf3(0x00000003, 0xff000000, 0xff000000);
+  status |= test__addsf3(0x00000003, 0xff800000, 0xff800000);
+  status |= test__addsf3(0x00000004, 0x00000004, 0x00000008);
+  status |= test__addsf3(0x007ffffc, 0x807ffffc, 0x00000000);
+  status |= test__addsf3(0x007ffffd, 0x807ffffe, 0x80000001);
+  status |= test__addsf3(0x007fffff, 0x007fffff, 0x00fffffe);
+  status |= test__addsf3(0x007fffff, 0x807ffffe, 0x00000001);
+  status |= test__addsf3(0x007fffff, 0x80800000, 0x80000001);
+  status |= test__addsf3(0x00800000, 0x00000000, 0x00800000);
+  status |= test__addsf3(0x00800000, 0x00800000, 0x01000000);
+  status |= test__addsf3(0x00800000, 0x80800000, 0x00000000);
+  status |= test__addsf3(0x00800001, 0x80800000, 0x00000001);
+  status |= test__addsf3(0x00800001, 0x80800002, 0x80000001);
+  status |= test__addsf3(0x00ffffff, 0x81000000, 0x80000001);
+  status |= test__addsf3(0x00ffffff, 0x81000002, 0x80000005);
+  status |= test__addsf3(0x00ffffff, 0x81000004, 0x80000009);
+  status |= test__addsf3(0x01000000, 0x80ffffff, 0x00000001);
+  status |= test__addsf3(0x01000001, 0x80800001, 0x00800001);
+  status |= test__addsf3(0x01000001, 0x80ffffff, 0x00000003);
+  status |= test__addsf3(0x01000002, 0x80800001, 0x00800003);
+  status |= test__addsf3(0x017fffff, 0x81800000, 0x80000002);
+  status |= test__addsf3(0x01800000, 0x817fffff, 0x00000002);
+  status |= test__addsf3(0x01800001, 0x817fffff, 0x00000006);
+  status |= test__addsf3(0x01800002, 0x81000003, 0x01000001);
+  status |= test__addsf3(0x3f7fffff, 0x80000001, 0x3f7fffff);
+  status |= test__addsf3(0x3f800000, 0x3f800000, 0x40000000);
+  status |= test__addsf3(0x3f800000, 0x3f800003, 0x40000002);
+  status |= test__addsf3(0x3f800000, 0x40000000, 0x40400000);
+  status |= test__addsf3(0x3f800000, 0x40e00000, 0x41000000);
+  status |= test__addsf3(0x3f800000, 0x80000000, 0x3f800000);
+  status |= test__addsf3(0x3f800000, 0xbf800000, 0x00000000);
+  status |= test__addsf3(0x3f800001, 0x3f800000, 0x40000000);
+  status |= test__addsf3(0x3f800001, 0xbf800000, 0x34000000);
+  status |= test__addsf3(0x3f800001, 0xbf800002, 0xb4000000);
+  status |= test__addsf3(0x3ffffffc, 0xbffffffd, 0xb4000000);
+  status |= test__addsf3(0x3fffffff, 0xc0000000, 0xb4000000);
+  status |= test__addsf3(0x40000000, 0x34000000, 0x40000000);
+  status |= test__addsf3(0x40000000, 0x3f800000, 0x40400000);
+  status |= test__addsf3(0x40000000, 0x40000000, 0x40800000);
+  status |= test__addsf3(0x40000000, 0x40000001, 0x40800000);
+  status |= test__addsf3(0x40000000, 0xbfffffff, 0x34000000);
+  status |= test__addsf3(0x40000000, 0xc0000000, 0x00000000);
+  status |= test__addsf3(0x40000000, 0xc0000001, 0xb4800000);
+  status |= test__addsf3(0x40000000, 0xc0a00000, 0xc0400000);
+  status |= test__addsf3(0x40000001, 0x34000000, 0x40000002);
+  status |= test__addsf3(0x40000001, 0x40000002, 0x40800002);
+  status |= test__addsf3(0x40000001, 0xbf800001, 0x3f800001);
+  status |= test__addsf3(0x40000002, 0xbf800001, 0x3f800003);
+  status |= test__addsf3(0x40000002, 0xbf800003, 0x3f800001);
+  status |= test__addsf3(0x40000004, 0xc0000003, 0x34800000);
+  status |= test__addsf3(0x40400000, 0x40400000, 0x40c00000);
+  status |= test__addsf3(0x407fffff, 0x33ffffff, 0x407fffff);
+  status |= test__addsf3(0x407fffff, 0x34000000, 0x40800000);
+  status |= test__addsf3(0x407fffff, 0xc07ffffe, 0x34800000);
+  status |= test__addsf3(0x407fffff, 0xc0800002, 0xb5a00000);
+  status |= test__addsf3(0x40800001, 0xc07fffff, 0x35400000);
+  status |= test__addsf3(0x40a00000, 0x00000000, 0x40a00000);
+  status |= test__addsf3(0x40a00000, 0x80000000, 0x40a00000);
+  status |= test__addsf3(0x40a00000, 0xbf800000, 0x40800000);
+  status |= test__addsf3(0x40a00000, 0xc0a00000, 0x00000000);
+  status |= test__addsf3(0x7d800001, 0xfd7fffff, 0x72400000);
+  status |= test__addsf3(0x7e7fffff, 0xfe7ffffe, 0x72800000);
+  status |= test__addsf3(0x7e7fffff, 0xfe800002, 0xf3a00000);
+  status |= test__addsf3(0x7e800000, 0x7e800000, 0x7f000000);
+  status |= test__addsf3(0x7e800000, 0xfe7fffff, 0x72800000);
+  status |= test__addsf3(0x7e800000, 0xfe800001, 0xf3000000);
+  status |= test__addsf3(0x7e800001, 0x7e800000, 0x7f000000);
+  status |= test__addsf3(0x7e800001, 0xff000001, 0xfe800001);
+  status |= test__addsf3(0x7e800002, 0xfe000003, 0x7e000001);
+  status |= test__addsf3(0x7e800004, 0xfe800003, 0x73000000);
+  status |= test__addsf3(0x7efffffe, 0x7efffffe, 0x7f7ffffe);
+  status |= test__addsf3(0x7efffffe, 0x7effffff, 0x7f7ffffe);
+  status |= test__addsf3(0x7effffff, 0x3f800000, 0x7effffff);
+  status |= test__addsf3(0x7effffff, 0x7f000000, 0x7f800000);
+  status |= test__addsf3(0x7effffff, 0xbf800000, 0x7effffff);
+  status |= test__addsf3(0x7effffff, 0xff000000, 0xf3000000);
+  status |= test__addsf3(0x7f000000, 0x3f800000, 0x7f000000);
+  status |= test__addsf3(0x7f000000, 0x7f000000, 0x7f800000);
+  status |= test__addsf3(0x7f000000, 0x7f800000, 0x7f800000);
+  status |= test__addsf3(0x7f000000, 0xbf800000, 0x7f000000);
+  status |= test__addsf3(0x7f000000, 0xff000000, 0x00000000);
+  status |= test__addsf3(0x7f000000, 0xff800000, 0xff800000);
+  status |= test__addsf3(0x7f000001, 0x7f000000, 0x7f800000);
+  status |= test__addsf3(0x7f000001, 0xff000000, 0x73800000);
+  status |= test__addsf3(0x7f000001, 0xff000002, 0xf3800000);
+  status |= test__addsf3(0x7f000002, 0xfe800001, 0x7e800003);
+  status |= test__addsf3(0x7f7ffffe, 0x3f800000, 0x7f7ffffe);
+  status |= test__addsf3(0x7f7ffffe, 0x7f7ffffe, 0x7f800000);
+  status |= test__addsf3(0x7f7ffffe, 0x7f7fffff, 0x7f800000);
+  status |= test__addsf3(0x7f7ffffe, 0xbf800000, 0x7f7ffffe);
+  status |= test__addsf3(0x7f7ffffe, 0xff7fffff, 0xf3800000);
+  status |= test__addsf3(0x7f7fffff, 0x3f800000, 0x7f7fffff);
+  status |= test__addsf3(0x7f7fffff, 0x80000001, 0x7f7fffff);
+  status |= test__addsf3(0x7f7fffff, 0xbf800000, 0x7f7fffff);
+  status |= test__addsf3(0x7f7fffff, 0xff7fffff, 0x00000000);
+  status |= test__addsf3(0x7f800000, 0x00000000, 0x7f800000);
+  status |= test__addsf3(0x7f800000, 0x007fffff, 0x7f800000);
+  status |= test__addsf3(0x7f800000, 0x7f000000, 0x7f800000);
+  status |= test__addsf3(0x7f800000, 0x7f800000, 0x7f800000);
+  status |= test__addsf3(0x7f800000, 0x80000000, 0x7f800000);
+  status |= test__addsf3(0x7f800000, 0x807fffff, 0x7f800000);
+  status |= test__addsf3(0x7f800000, 0xff000000, 0x7f800000);
+  status |= test__addsf3(0x80000000, 0x00000000, 0x00000000);
+  status |= test__addsf3(0x80000000, 0x007fffff, 0x007fffff);
+  status |= test__addsf3(0x80000000, 0x7f000000, 0x7f000000);
+  status |= test__addsf3(0x80000000, 0x7f800000, 0x7f800000);
+  status |= test__addsf3(0x80000000, 0x80000000, 0x80000000);
+  status |= test__addsf3(0x80000000, 0x807fffff, 0x807fffff);
+  status |= test__addsf3(0x80000000, 0x80800000, 0x80800000);
+  status |= test__addsf3(0x80000000, 0xbf800000, 0xbf800000);
+  status |= test__addsf3(0x80000000, 0xff800000, 0xff800000);
+  status |= test__addsf3(0x80000001, 0x00000001, 0x00000000);
+  status |= test__addsf3(0x80000001, 0x80000001, 0x80000002);
+  status |= test__addsf3(0x80000001, 0xbf7fffff, 0xbf7fffff);
+  status |= test__addsf3(0x80000001, 0xbf800000, 0xbf800000);
+  status |= test__addsf3(0x80000001, 0xbffffffe, 0xbffffffe);
+  status |= test__addsf3(0x80000001, 0xbfffffff, 0xbfffffff);
+  status |= test__addsf3(0x80000001, 0xfeffffff, 0xfeffffff);
+  status |= test__addsf3(0x80000001, 0xff000000, 0xff000000);
+  status |= test__addsf3(0x80000001, 0xff7ffffe, 0xff7ffffe);
+  status |= test__addsf3(0x80000001, 0xff7fffff, 0xff7fffff);
+  status |= test__addsf3(0x80000002, 0x00000001, 0x80000001);
+  status |= test__addsf3(0x80000003, 0x00000000, 0x80000003);
+  status |= test__addsf3(0x80000003, 0x00000002, 0x80000001);
+  status |= test__addsf3(0x80000003, 0x40400000, 0x40400000);
+  status |= test__addsf3(0x80000003, 0x7f000000, 0x7f000000);
+  status |= test__addsf3(0x80000003, 0x7f800000, 0x7f800000);
+  status |= test__addsf3(0x80000003, 0x80000000, 0x80000003);
+  status |= test__addsf3(0x80000003, 0xff800000, 0xff800000);
+  status |= test__addsf3(0x80000004, 0x80000004, 0x80000008);
+  status |= test__addsf3(0x807ffffd, 0x007ffffe, 0x00000001);
+  status |= test__addsf3(0x807fffff, 0x007ffffe, 0x80000001);
+  status |= test__addsf3(0x807fffff, 0x007fffff, 0x00000000);
+  status |= test__addsf3(0x807fffff, 0x00800000, 0x00000001);
+  status |= test__addsf3(0x807fffff, 0x807fffff, 0x80fffffe);
+  status |= test__addsf3(0x80800000, 0x00000000, 0x80800000);
+  status |= test__addsf3(0x80800000, 0x00800000, 0x00000000);
+  status |= test__addsf3(0x80800001, 0x00800000, 0x80000001);
+  status |= test__addsf3(0x80800001, 0x00800002, 0x00000001);
+  status |= test__addsf3(0x80ffffff, 0x01000000, 0x00000001);
+  status |= test__addsf3(0x80ffffff, 0x01000002, 0x00000005);
+  status |= test__addsf3(0x80ffffff, 0x01000004, 0x00000009);
+  status |= test__addsf3(0x81000000, 0x00ffffff, 0x80000001);
+  status |= test__addsf3(0x81000001, 0x00800001, 0x80800001);
+  status |= test__addsf3(0x81000001, 0x00ffffff, 0x80000003);
+  status |= test__addsf3(0x81000002, 0x00800001, 0x80800003);
+  status |= test__addsf3(0x817fffff, 0x01800000, 0x00000002);
+  status |= test__addsf3(0x81800000, 0x017fffff, 0x80000002);
+  status |= test__addsf3(0x81800001, 0x017fffff, 0x80000006);
+  status |= test__addsf3(0x81800002, 0x01000003, 0x81000001);
+  status |= test__addsf3(0xbf800000, 0x80000000, 0xbf800000);
+  status |= test__addsf3(0xbf800000, 0xbf800003, 0xc0000002);
+  status |= test__addsf3(0xbf800001, 0x3f800000, 0xb4000000);
+  status |= test__addsf3(0xbf800001, 0x3f800002, 0x34000000);
+  status |= test__addsf3(0xbf800001, 0xbf800000, 0xc0000000);
+  status |= test__addsf3(0xbffffffc, 0x3ffffffd, 0x34000000);
+  status |= test__addsf3(0xbfffffff, 0x00000001, 0xbfffffff);
+  status |= test__addsf3(0xbfffffff, 0x40000000, 0x34000000);
+  status |= test__addsf3(0xc0000000, 0x3fffffff, 0xb4000000);
+  status |= test__addsf3(0xc0000000, 0x40000001, 0x34800000);
+  status |= test__addsf3(0xc0000000, 0xc0000001, 0xc0800000);
+  status |= test__addsf3(0xc0000001, 0x3f800001, 0xbf800001);
+  status |= test__addsf3(0xc0000001, 0xc0000002, 0xc0800002);
+  status |= test__addsf3(0xc0000002, 0x3f800001, 0xbf800003);
+  status |= test__addsf3(0xc0000002, 0x3f800003, 0xbf800001);
+  status |= test__addsf3(0xc0000004, 0x40000003, 0xb4800000);
+  status |= test__addsf3(0xc0400000, 0x40400000, 0x00000000);
+  status |= test__addsf3(0xc07fffff, 0x407ffffe, 0xb4800000);
+  status |= test__addsf3(0xc07fffff, 0x40800002, 0x35a00000);
+  status |= test__addsf3(0xc07fffff, 0xb3ffffff, 0xc07fffff);
+  status |= test__addsf3(0xc07fffff, 0xb4000000, 0xc0800000);
+  status |= test__addsf3(0xc0800001, 0x407fffff, 0xb5400000);
+  status |= test__addsf3(0xfd800001, 0x7d7fffff, 0xf2400000);
+  status |= test__addsf3(0xfe7fffff, 0x7e7ffffe, 0xf2800000);
+  status |= test__addsf3(0xfe7fffff, 0x7e800002, 0x73a00000);
+  status |= test__addsf3(0xfe800000, 0x7e7fffff, 0xf2800000);
+  status |= test__addsf3(0xfe800000, 0x7e800001, 0x73000000);
+  status |= test__addsf3(0xfe800001, 0x7f000001, 0x7e800001);
+  status |= test__addsf3(0xfe800001, 0xfe800000, 0xff000000);
+  status |= test__addsf3(0xfe800002, 0x7e000003, 0xfe000001);
+  status |= test__addsf3(0xfe800004, 0x7e800003, 0xf3000000);
+  status |= test__addsf3(0xfefffffe, 0x7efffffe, 0x00000000);
+  status |= test__addsf3(0xfefffffe, 0xfefffffe, 0xff7ffffe);
+  status |= test__addsf3(0xfefffffe, 0xfeffffff, 0xff7ffffe);
+  status |= test__addsf3(0xfeffffff, 0x3f800000, 0xfeffffff);
+  status |= test__addsf3(0xfeffffff, 0x7f000000, 0x73000000);
+  status |= test__addsf3(0xfeffffff, 0xbf800000, 0xfeffffff);
+  status |= test__addsf3(0xfeffffff, 0xff000000, 0xff800000);
+  status |= test__addsf3(0xff000000, 0x00000000, 0xff000000);
+  status |= test__addsf3(0xff000000, 0x3f800000, 0xff000000);
+  status |= test__addsf3(0xff000000, 0x7f800000, 0x7f800000);
+  status |= test__addsf3(0xff000000, 0x80000000, 0xff000000);
+  status |= test__addsf3(0xff000000, 0xbf800000, 0xff000000);
+  status |= test__addsf3(0xff000000, 0xff000000, 0xff800000);
+  status |= test__addsf3(0xff000000, 0xff800000, 0xff800000);
+  status |= test__addsf3(0xff000001, 0x7f000000, 0xf3800000);
+  status |= test__addsf3(0xff000001, 0x7f000002, 0x73800000);
+  status |= test__addsf3(0xff000001, 0xff000000, 0xff800000);
+  status |= test__addsf3(0xff000002, 0x7e800001, 0xfe800003);
+  status |= test__addsf3(0xff7ffffe, 0x3f800000, 0xff7ffffe);
+  status |= test__addsf3(0xff7ffffe, 0x7f7fffff, 0x73800000);
+  status |= test__addsf3(0xff7ffffe, 0xbf800000, 0xff7ffffe);
+  status |= test__addsf3(0xff7ffffe, 0xff7ffffe, 0xff800000);
+  status |= test__addsf3(0xff7ffffe, 0xff7fffff, 0xff800000);
+  status |= test__addsf3(0xff7fffff, 0x00000001, 0xff7fffff);
+  status |= test__addsf3(0xff7fffff, 0x3f800000, 0xff7fffff);
+  status |= test__addsf3(0xff7fffff, 0xbf800000, 0xff7fffff);
+  status |= test__addsf3(0xff800000, 0x00000000, 0xff800000);
+  status |= test__addsf3(0xff800000, 0x007fffff, 0xff800000);
+  status |= test__addsf3(0xff800000, 0x7f000000, 0xff800000);
+  status |= test__addsf3(0xff800000, 0x80000000, 0xff800000);
+  status |= test__addsf3(0xff800000, 0x807fffff, 0xff800000);
+  status |= test__addsf3(0xff800000, 0xff000000, 0xff800000);
+  status |= test__addsf3(0xff800000, 0xff800000, 0xff800000);
+  status |= test__addsf3(0x7f7fffff, 0x74ffffff, 0x7f800000);
+  status |= test__addsf3(0x3f7fffff, 0x34004000, 0x3f800001);
+  status |= test__addsf3(0x3f800001, 0x23800000, 0x3f800001);
+  status |= test__addsf3(0xbbebe66d, 0x3b267c1f, 0xbb98a85e);
+  status |= test__addsf3(0x01f5b166, 0x81339a37, 0x019be44a);
+
+  // Test that the result of an operation is a NaN at all when it should be.
+  //
+// In most configurations these tests' results are checked using
+  // compareResultF, so we set all the answers to the canonical NaN 0x7fc00000,
+  // which causes compareResultF to accept any NaN encoding. We also use the
+  // same value as the input NaN in tests that have one, so that even in
+  // EXPECT_EXACT_RESULTS mode these tests should pass, because 0x7fc00000 is
+  // still the exact expected NaN.
+  status |= test__addsf3(0x7f800000, 0xff800000, 0x7fc00000);
+  status |= test__addsf3(0xff800000, 0x7f800000, 0x7fc00000);
+  status |= test__addsf3(0x3f800000, 0x7fc00000, 0x7fc00000);
+  status |= test__addsf3(0x7fc00000, 0x3f800000, 0x7fc00000);
+  status |= test__addsf3(0x7fc00000, 0x7fc00000, 0x7fc00000);
+
+#ifdef ARM_NAN_HANDLING
+  // Tests specific to the NaN handling of Arm hardware, mimicked by
+  // arm/addsf3.S:
+  //
+  //  - a quiet NaN is distinguished by the top mantissa bit being 1
+  //
+  //  - if a signalling NaN appears in the input, the output quiet NaN is
+  //    obtained by setting its top mantissa bit and leaving everything else
+  //    unchanged
+  //
+  //  - if both operands are signalling NaNs then the output NaN is derived
+  //    from the first operand
+  //
+  //  - if both operands are quiet NaNs then the output NaN is the first
+  //    operand
+  //
+  //  - invalid operations not involving an input NaN return the quiet
+  //    NaN with fewest bits set, 0x7fc00000.
+
+  status |= test__addsf3(0x00000000, 0x7fad4be3, 0x7fed4be3);
+  status |= test__addsf3(0x00000000, 0x7fdf48c7, 0x7fdf48c7);
+  status |= test__addsf3(0x00000001, 0x7f970eba, 0x7fd70eba);
+  status |= test__addsf3(0x00000001, 0x7fc35716, 0x7fc35716);
+  status |= test__addsf3(0x007fffff, 0x7fbf52d6, 0x7fff52d6);
+  status |= test__addsf3(0x007fffff, 0x7fc7a2df, 0x7fc7a2df);
+  status |= test__addsf3(0x3f800000, 0x7f987a85, 0x7fd87a85);
+  status |= test__addsf3(0x3f800000, 0x7fc50124, 0x7fc50124);
+  status |= test__addsf3(0x7f7fffff, 0x7f95fd6f, 0x7fd5fd6f);
+  status |= test__addsf3(0x7f7fffff, 0x7ffc28dc, 0x7ffc28dc);
+  status |= test__addsf3(0x7f800000, 0x7f8dd790, 0x7fcdd790);
+  status |= test__addsf3(0x7f800000, 0x7fd2ef2b, 0x7fd2ef2b);
+  status |= test__addsf3(0x7f800000, 0xff800000, 0x7fc00000);
+  status |= test__addsf3(0x7f99b09d, 0x00000000, 0x7fd9b09d);
+  status |= test__addsf3(0x7f93541e, 0x00000001, 0x7fd3541e);
+  status |= test__addsf3(0x7f9fc002, 0x007fffff, 0x7fdfc002);
+  status |= test__addsf3(0x7fb5db77, 0x3f800000, 0x7ff5db77);
+  status |= test__addsf3(0x7f9f5d92, 0x7f7fffff, 0x7fdf5d92);
+  status |= test__addsf3(0x7fac7a36, 0x7f800000, 0x7fec7a36);
+  status |= test__addsf3(0x7fb42008, 0x7fb0ee07, 0x7ff42008);
+  status |= test__addsf3(0x7f8bd740, 0x7fc7aaf1, 0x7fcbd740);
+  status |= test__addsf3(0x7f9bb57b, 0x80000000, 0x7fdbb57b);
+  status |= test__addsf3(0x7f951a78, 0x80000001, 0x7fd51a78);
+  status |= test__addsf3(0x7f9ba63b, 0x807fffff, 0x7fdba63b);
+  status |= test__addsf3(0x7f89463c, 0xbf800000, 0x7fc9463c);
+  status |= test__addsf3(0x7fb63563, 0xff7fffff, 0x7ff63563);
+  status |= test__addsf3(0x7f90886e, 0xff800000, 0x7fd0886e);
+  status |= test__addsf3(0x7fe8c15e, 0x00000000, 0x7fe8c15e);
+  status |= test__addsf3(0x7fe915ae, 0x00000001, 0x7fe915ae);
+  status |= test__addsf3(0x7ffa9b42, 0x007fffff, 0x7ffa9b42);
+  status |= test__addsf3(0x7fdad0f5, 0x3f800000, 0x7fdad0f5);
+  status |= test__addsf3(0x7fd10dcb, 0x7f7fffff, 0x7fd10dcb);
+  status |= test__addsf3(0x7fd08e8a, 0x7f800000, 0x7fd08e8a);
+  status |= test__addsf3(0x7fc3a9e6, 0x7f91a816, 0x7fd1a816);
+  status |= test__addsf3(0x7fdb229c, 0x7fc26c68, 0x7fdb229c);
+  status |= test__addsf3(0x7fc9f6bb, 0x80000000, 0x7fc9f6bb);
+  status |= test__addsf3(0x7ffa178b, 0x80000001, 0x7ffa178b);
+  status |= test__addsf3(0x7fef2a0b, 0x807fffff, 0x7fef2a0b);
+  status |= test__addsf3(0x7ffc885b, 0xbf800000, 0x7ffc885b);
+  status |= test__addsf3(0x7fd26e8c, 0xff7fffff, 0x7fd26e8c);
+  status |= test__addsf3(0x7fc55329, 0xff800000, 0x7fc55329);
+  status |= test__addsf3(0x80000000, 0x7fa833ae, 0x7fe833ae);
+  status |= test__addsf3(0x80000000, 0x7fc4df63, 0x7fc4df63);
+  status |= test__addsf3(0x80000001, 0x7f98827d, 0x7fd8827d);
+  status |= test__addsf3(0x80000001, 0x7fd7acc5, 0x7fd7acc5);
+  status |= test__addsf3(0x807fffff, 0x7fad19c0, 0x7fed19c0);
+  status |= test__addsf3(0x807fffff, 0x7ffe1907, 0x7ffe1907);
+  status |= test__addsf3(0xbf800000, 0x7fa95487, 0x7fe95487);
+  status |= test__addsf3(0xbf800000, 0x7fd2bbee, 0x7fd2bbee);
+  status |= test__addsf3(0xff7fffff, 0x7f86ba21, 0x7fc6ba21);
+  status |= test__addsf3(0xff7fffff, 0x7feb00d7, 0x7feb00d7);
+  status |= test__addsf3(0xff800000, 0x7f800000, 0x7fc00000);
+  status |= test__addsf3(0xff800000, 0x7f857fdc, 0x7fc57fdc);
+  status |= test__addsf3(0xff800000, 0x7fde0397, 0x7fde0397);
+#endif // ARM_NAN_HANDLING
+
+  return status;
+}
diff --git a/compiler-rt/test/builtins/Unit/subsf3_test.c b/compiler-rt/test/builtins/Unit/subsf3_test.c
new file mode 100644
index 0000000000000..b9c1b2ac4362a
--- /dev/null
+++ b/compiler-rt/test/builtins/Unit/subsf3_test.c
@@ -0,0 +1,382 @@
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_addsf3
+
+#include "int_lib.h"
+#include <inttypes.h>
+#include <stdio.h>
+
+#include "fp_test.h"
+
+// By default this test uses compareResultF to check the returned floats, which
+// accepts any returned NaN if the expected result is the canonical NaN value
+// 0x7fc00000. For the Arm assembler FP implementations, which commit to a
+// more detailed handling of NaNs, we tighten up the check and include some
+// extra test cases specific to that NaN policy.
+#if __arm__ || __thumb__
+#  define EXPECT_EXACT_RESULTS
+#  define ARM_NAN_HANDLING
+#endif
+
+// Returns: a - b
+COMPILER_RT_ABI float __subsf3(float a, float b);
+
+int test__subsf3(int line, uint32_t a_rep, uint32_t b_rep, uint32_t expected_rep) {
+  float a = fromRep32(a_rep), b = fromRep32(b_rep);
+  float x = __subsf3(a, b);
+#ifdef EXPECT_EXACT_RESULTS
+  int ret = toRep32(x) != expected_rep;
+#else
+  int ret = compareResultF(x, expected_rep);
+#endif
+
+  if (ret) {
+    printf("error at line %d: __subsf3(%08" PRIx32 ", %08" PRIx32 ") = %08" PRIx32
+           ", expected %08" PRIx32 "\n",
+           line, a_rep, b_rep, toRep32(x), expected_rep);
+  }
+  return ret;
+}
+
+#define test__subsf3(a,b,x) test__subsf3(__LINE__,a,b,x)
+
+int main() {
+  int status = 0;
+
+  status |= test__subsf3(0x00000000, 0x00000000, 0x00000000);
+  status |= test__subsf3(0x00000000, 0x007fffff, 0x807fffff);
+  status |= test__subsf3(0x00000000, 0x00800000, 0x80800000);
+  status |= test__subsf3(0x00000000, 0x7f800000, 0xff800000);
+  status |= test__subsf3(0x00000000, 0x80000000, 0x00000000);
+  status |= test__subsf3(0x00000000, 0x807fffff, 0x007fffff);
+  status |= test__subsf3(0x00000000, 0xbf800000, 0x3f800000);
+  status |= test__subsf3(0x00000000, 0xff000000, 0x7f000000);
+  status |= test__subsf3(0x00000000, 0xff800000, 0x7f800000);
+  status |= test__subsf3(0x00000001, 0x00000001, 0x00000000);
+  status |= test__subsf3(0x00000001, 0x80000001, 0x00000002);
+  status |= test__subsf3(0x00000001, 0xbf7fffff, 0x3f7fffff);
+  status |= test__subsf3(0x00000001, 0xbf800000, 0x3f800000);
+  status |= test__subsf3(0x00000001, 0xbffffffe, 0x3ffffffe);
+  status |= test__subsf3(0x00000001, 0xbfffffff, 0x3fffffff);
+  status |= test__subsf3(0x00000001, 0xfeffffff, 0x7effffff);
+  status |= test__subsf3(0x00000001, 0xff000000, 0x7f000000);
+  status |= test__subsf3(0x00000001, 0xff7ffffe, 0x7f7ffffe);
+  status |= test__subsf3(0x00000001, 0xff7fffff, 0x7f7fffff);
+  status |= test__subsf3(0x00000002, 0x00000001, 0x00000001);
+  status |= test__subsf3(0x00000003, 0x00000000, 0x00000003);
+  status |= test__subsf3(0x00000003, 0x00000002, 0x00000001);
+  status |= test__subsf3(0x00000003, 0x40a00000, 0xc0a00000);
+  status |= test__subsf3(0x00000003, 0x7f000000, 0xff000000);
+  status |= test__subsf3(0x00000003, 0x7f800000, 0xff800000);
+  status |= test__subsf3(0x00000003, 0x80000000, 0x00000003);
+  status |= test__subsf3(0x00000003, 0xff800000, 0x7f800000);
+  status |= test__subsf3(0x00000004, 0x80000004, 0x00000008);
+  status |= test__subsf3(0x007ffffc, 0x007ffffc, 0x00000000);
+  status |= test__subsf3(0x007ffffd, 0x007ffffe, 0x80000001);
+  status |= test__subsf3(0x007fffff, 0x007ffffe, 0x00000001);
+  status |= test__subsf3(0x007fffff, 0x00800000, 0x80000001);
+  status |= test__subsf3(0x007fffff, 0x807fffff, 0x00fffffe);
+  status |= test__subsf3(0x00800000, 0x00800000, 0x00000000);
+  status |= test__subsf3(0x00800000, 0x80000000, 0x00800000);
+  status |= test__subsf3(0x00800000, 0x80800000, 0x01000000);
+  status |= test__subsf3(0x00800001, 0x00800000, 0x00000001);
+  status |= test__subsf3(0x00800001, 0x00800002, 0x80000001);
+  status |= test__subsf3(0x00ffffff, 0x01000000, 0x80000001);
+  status |= test__subsf3(0x00ffffff, 0x01000002, 0x80000005);
+  status |= test__subsf3(0x00ffffff, 0x01000004, 0x80000009);
+  status |= test__subsf3(0x01000000, 0x00ffffff, 0x00000001);
+  status |= test__subsf3(0x01000001, 0x00800001, 0x00800001);
+  status |= test__subsf3(0x01000001, 0x00ffffff, 0x00000003);
+  status |= test__subsf3(0x01000002, 0x00800001, 0x00800003);
+  status |= test__subsf3(0x017fffff, 0x01800000, 0x80000002);
+  status |= test__subsf3(0x01800000, 0x017fffff, 0x00000002);
+  status |= test__subsf3(0x01800001, 0x017fffff, 0x00000006);
+  status |= test__subsf3(0x01800002, 0x01000003, 0x01000001);
+  status |= test__subsf3(0x3f7fffff, 0x00000001, 0x3f7fffff);
+  status |= test__subsf3(0x3f800000, 0x00000000, 0x3f800000);
+  status |= test__subsf3(0x3f800000, 0x3f800000, 0x00000000);
+  status |= test__subsf3(0x3f800000, 0xbf800000, 0x40000000);
+  status |= test__subsf3(0x3f800000, 0xbf800003, 0x40000002);
+  status |= test__subsf3(0x3f800000, 0xc0000000, 0x40400000);
+  status |= test__subsf3(0x3f800000, 0xc0e00000, 0x41000000);
+  status |= test__subsf3(0x3f800001, 0x3f800000, 0x34000000);
+  status |= test__subsf3(0x3f800001, 0x3f800002, 0xb4000000);
+  status |= test__subsf3(0x3f800001, 0xbf800000, 0x40000000);
+  status |= test__subsf3(0x3ffffffc, 0x3ffffffd, 0xb4000000);
+  status |= test__subsf3(0x3fffffff, 0x40000000, 0xb4000000);
+  status |= test__subsf3(0x40000000, 0x3fffffff, 0x34000000);
+  status |= test__subsf3(0x40000000, 0x40000000, 0x00000000);
+  status |= test__subsf3(0x40000000, 0x40000001, 0xb4800000);
+  status |= test__subsf3(0x40000000, 0x40a00000, 0xc0400000);
+  status |= test__subsf3(0x40000000, 0xb4000000, 0x40000000);
+  status |= test__subsf3(0x40000000, 0xbf800000, 0x40400000);
+  status |= test__subsf3(0x40000000, 0xc0000000, 0x40800000);
+  status |= test__subsf3(0x40000000, 0xc0000001, 0x40800000);
+  status |= test__subsf3(0x40000001, 0x3f800001, 0x3f800001);
+  status |= test__subsf3(0x40000001, 0xb4000000, 0x40000002);
+  status |= test__subsf3(0x40000001, 0xc0000002, 0x40800002);
+  status |= test__subsf3(0x40000002, 0x3f800001, 0x3f800003);
+  status |= test__subsf3(0x40000002, 0x3f800003, 0x3f800001);
+  status |= test__subsf3(0x40000004, 0x40000003, 0x34800000);
+  status |= test__subsf3(0x40400000, 0xc0400000, 0x40c00000);
+  status |= test__subsf3(0x407fffff, 0x407ffffe, 0x34800000);
+  status |= test__subsf3(0x407fffff, 0x40800002, 0xb5a00000);
+  status |= test__subsf3(0x407fffff, 0xb3ffffff, 0x407fffff);
+  status |= test__subsf3(0x407fffff, 0xb4000000, 0x40800000);
+  status |= test__subsf3(0x40800001, 0x407fffff, 0x35400000);
+  status |= test__subsf3(0x40a00000, 0x00000000, 0x40a00000);
+  status |= test__subsf3(0x40a00000, 0x3f800000, 0x40800000);
+  status |= test__subsf3(0x40a00000, 0x40a00000, 0x00000000);
+  status |= test__subsf3(0x40a00000, 0x80000000, 0x40a00000);
+  status |= test__subsf3(0x7d800001, 0x7d7fffff, 0x72400000);
+  status |= test__subsf3(0x7e7fffff, 0x7e7ffffe, 0x72800000);
+  status |= test__subsf3(0x7e7fffff, 0x7e800002, 0xf3a00000);
+  status |= test__subsf3(0x7e800000, 0x7e7fffff, 0x72800000);
+  status |= test__subsf3(0x7e800000, 0x7e800001, 0xf3000000);
+  status |= test__subsf3(0x7e800000, 0xfe800000, 0x7f000000);
+  status |= test__subsf3(0x7e800001, 0x7f000001, 0xfe800001);
+  status |= test__subsf3(0x7e800001, 0xfe800000, 0x7f000000);
+  status |= test__subsf3(0x7e800002, 0x7e000003, 0x7e000001);
+  status |= test__subsf3(0x7e800004, 0x7e800003, 0x73000000);
+  status |= test__subsf3(0x7efffffe, 0xfefffffe, 0x7f7ffffe);
+  status |= test__subsf3(0x7efffffe, 0xfeffffff, 0x7f7ffffe);
+  status |= test__subsf3(0x7effffff, 0x3f800000, 0x7effffff);
+  status |= test__subsf3(0x7effffff, 0x7f000000, 0xf3000000);
+  status |= test__subsf3(0x7effffff, 0xbf800000, 0x7effffff);
+  status |= test__subsf3(0x7effffff, 0xff000000, 0x7f800000);
+  status |= test__subsf3(0x7f000000, 0x3f800000, 0x7f000000);
+  status |= test__subsf3(0x7f000000, 0x7f000000, 0x00000000);
+  status |= test__subsf3(0x7f000000, 0x7f800000, 0xff800000);
+  status |= test__subsf3(0x7f000000, 0xbf800000, 0x7f000000);
+  status |= test__subsf3(0x7f000000, 0xff000000, 0x7f800000);
+  status |= test__subsf3(0x7f000000, 0xff800000, 0x7f800000);
+  status |= test__subsf3(0x7f000001, 0x7f000000, 0x73800000);
+  status |= test__subsf3(0x7f000001, 0x7f000002, 0xf3800000);
+  status |= test__subsf3(0x7f000001, 0xff000000, 0x7f800000);
+  status |= test__subsf3(0x7f000002, 0x7e800001, 0x7e800003);
+  status |= test__subsf3(0x7f7ffffe, 0x3f800000, 0x7f7ffffe);
+  status |= test__subsf3(0x7f7ffffe, 0x7f7fffff, 0xf3800000);
+  status |= test__subsf3(0x7f7ffffe, 0xbf800000, 0x7f7ffffe);
+  status |= test__subsf3(0x7f7ffffe, 0xff7ffffe, 0x7f800000);
+  status |= test__subsf3(0x7f7ffffe, 0xff7fffff, 0x7f800000);
+  status |= test__subsf3(0x7f7fffff, 0x00000001, 0x7f7fffff);
+  status |= test__subsf3(0x7f7fffff, 0x3f800000, 0x7f7fffff);
+  status |= test__subsf3(0x7f7fffff, 0x7f7fffff, 0x00000000);
+  status |= test__subsf3(0x7f7fffff, 0xbf800000, 0x7f7fffff);
+  status |= test__subsf3(0x7f800000, 0x00000000, 0x7f800000);
+  status |= test__subsf3(0x7f800000, 0x007fffff, 0x7f800000);
+  status |= test__subsf3(0x7f800000, 0x7f000000, 0x7f800000);
+  status |= test__subsf3(0x7f800000, 0x80000000, 0x7f800000);
+  status |= test__subsf3(0x7f800000, 0x807fffff, 0x7f800000);
+  status |= test__subsf3(0x7f800000, 0xff000000, 0x7f800000);
+  status |= test__subsf3(0x7f800000, 0xff800000, 0x7f800000);
+  status |= test__subsf3(0x80000000, 0x00000000, 0x80000000);
+  status |= test__subsf3(0x80000000, 0x007fffff, 0x807fffff);
+  status |= test__subsf3(0x80000000, 0x00800000, 0x80800000);
+  status |= test__subsf3(0x80000000, 0x3f800000, 0xbf800000);
+  status |= test__subsf3(0x80000000, 0x7f800000, 0xff800000);
+  status |= test__subsf3(0x80000000, 0x80000000, 0x00000000);
+  status |= test__subsf3(0x80000000, 0x807fffff, 0x007fffff);
+  status |= test__subsf3(0x80000000, 0xff000000, 0x7f000000);
+  status |= test__subsf3(0x80000000, 0xff800000, 0x7f800000);
+  status |= test__subsf3(0x80000001, 0x00000001, 0x80000002);
+  status |= test__subsf3(0x80000001, 0x3f7fffff, 0xbf7fffff);
+  status |= test__subsf3(0x80000001, 0x3f800000, 0xbf800000);
+  status |= test__subsf3(0x80000001, 0x3ffffffe, 0xbffffffe);
+  status |= test__subsf3(0x80000001, 0x3fffffff, 0xbfffffff);
+  status |= test__subsf3(0x80000001, 0x7effffff, 0xfeffffff);
+  status |= test__subsf3(0x80000001, 0x7f000000, 0xff000000);
+  status |= test__subsf3(0x80000001, 0x7f7ffffe, 0xff7ffffe);
+  status |= test__subsf3(0x80000001, 0x7f7fffff, 0xff7fffff);
+  status |= test__subsf3(0x80000001, 0x80000001, 0x00000000);
+  status |= test__subsf3(0x80000002, 0x80000001, 0x80000001);
+  status |= test__subsf3(0x80000003, 0x00000000, 0x80000003);
+  status |= test__subsf3(0x80000003, 0x7f800000, 0xff800000);
+  status |= test__subsf3(0x80000003, 0x80000000, 0x80000003);
+  status |= test__subsf3(0x80000003, 0x80000002, 0x80000001);
+  status |= test__subsf3(0x80000003, 0xc0400000, 0x40400000);
+  status |= test__subsf3(0x80000003, 0xff000000, 0x7f000000);
+  status |= test__subsf3(0x80000003, 0xff800000, 0x7f800000);
+  status |= test__subsf3(0x80000004, 0x00000004, 0x80000008);
+  status |= test__subsf3(0x807ffffd, 0x807ffffe, 0x00000001);
+  status |= test__subsf3(0x807fffff, 0x007fffff, 0x80fffffe);
+  status |= test__subsf3(0x807fffff, 0x807ffffe, 0x80000001);
+  status |= test__subsf3(0x807fffff, 0x807fffff, 0x00000000);
+  status |= test__subsf3(0x807fffff, 0x80800000, 0x00000001);
+  status |= test__subsf3(0x80800000, 0x80000000, 0x80800000);
+  status |= test__subsf3(0x80800000, 0x80800000, 0x00000000);
+  status |= test__subsf3(0x80800001, 0x80800000, 0x80000001);
+  status |= test__subsf3(0x80800001, 0x80800002, 0x00000001);
+  status |= test__subsf3(0x80ffffff, 0x81000000, 0x00000001);
+  status |= test__subsf3(0x80ffffff, 0x81000002, 0x00000005);
+  status |= test__subsf3(0x80ffffff, 0x81000004, 0x00000009);
+  status |= test__subsf3(0x81000000, 0x80ffffff, 0x80000001);
+  status |= test__subsf3(0x81000001, 0x80800001, 0x80800001);
+  status |= test__subsf3(0x81000001, 0x80ffffff, 0x80000003);
+  status |= test__subsf3(0x81000002, 0x80800001, 0x80800003);
+  status |= test__subsf3(0x817fffff, 0x81800000, 0x00000002);
+  status |= test__subsf3(0x81800000, 0x817fffff, 0x80000002);
+  status |= test__subsf3(0x81800001, 0x817fffff, 0x80000006);
+  status |= test__subsf3(0x81800002, 0x81000003, 0x81000001);
+  status |= test__subsf3(0xbf800000, 0x00000000, 0xbf800000);
+  status |= test__subsf3(0xbf800000, 0x3f800003, 0xc0000002);
+  status |= test__subsf3(0xbf800001, 0x3f800000, 0xc0000000);
+  status |= test__subsf3(0xbf800001, 0xbf800000, 0xb4000000);
+  status |= test__subsf3(0xbf800001, 0xbf800002, 0x34000000);
+  status |= test__subsf3(0xbffffffc, 0xbffffffd, 0x34000000);
+  status |= test__subsf3(0xbfffffff, 0x80000001, 0xbfffffff);
+  status |= test__subsf3(0xbfffffff, 0xc0000000, 0x34000000);
+  status |= test__subsf3(0xc0000000, 0x40000001, 0xc0800000);
+  status |= test__subsf3(0xc0000000, 0xbfffffff, 0xb4000000);
+  status |= test__subsf3(0xc0000000, 0xc0000001, 0x34800000);
+  status |= test__subsf3(0xc0000001, 0x40000002, 0xc0800002);
+  status |= test__subsf3(0xc0000001, 0xbf800001, 0xbf800001);
+  status |= test__subsf3(0xc0000002, 0xbf800001, 0xbf800003);
+  status |= test__subsf3(0xc0000002, 0xbf800003, 0xbf800001);
+  status |= test__subsf3(0xc0000004, 0xc0000003, 0xb4800000);
+  status |= test__subsf3(0xc0400000, 0xc0400000, 0x00000000);
+  status |= test__subsf3(0xc07fffff, 0x33ffffff, 0xc07fffff);
+  status |= test__subsf3(0xc07fffff, 0x34000000, 0xc0800000);
+  status |= test__subsf3(0xc07fffff, 0xc07ffffe, 0xb4800000);
+  status |= test__subsf3(0xc07fffff, 0xc0800002, 0x35a00000);
+  status |= test__subsf3(0xc0800001, 0xc07fffff, 0xb5400000);
+  status |= test__subsf3(0xfd800001, 0xfd7fffff, 0xf2400000);
+  status |= test__subsf3(0xfe7fffff, 0xfe7ffffe, 0xf2800000);
+  status |= test__subsf3(0xfe7fffff, 0xfe800002, 0x73a00000);
+  status |= test__subsf3(0xfe800000, 0xfe7fffff, 0xf2800000);
+  status |= test__subsf3(0xfe800000, 0xfe800001, 0x73000000);
+  status |= test__subsf3(0xfe800001, 0x7e800000, 0xff000000);
+  status |= test__subsf3(0xfe800001, 0xff000001, 0x7e800001);
+  status |= test__subsf3(0xfe800002, 0xfe000003, 0xfe000001);
+  status |= test__subsf3(0xfe800004, 0xfe800003, 0xf3000000);
+  status |= test__subsf3(0xfefffffe, 0x7efffffe, 0xff7ffffe);
+  status |= test__subsf3(0xfefffffe, 0x7effffff, 0xff7ffffe);
+  status |= test__subsf3(0xfefffffe, 0xfefffffe, 0x00000000);
+  status |= test__subsf3(0xfeffffff, 0x3f800000, 0xfeffffff);
+  status |= test__subsf3(0xfeffffff, 0x7f000000, 0xff800000);
+  status |= test__subsf3(0xfeffffff, 0xbf800000, 0xfeffffff);
+  status |= test__subsf3(0xfeffffff, 0xff000000, 0x73000000);
+  status |= test__subsf3(0xff000000, 0x00000000, 0xff000000);
+  status |= test__subsf3(0xff000000, 0x3f800000, 0xff000000);
+  status |= test__subsf3(0xff000000, 0x7f000000, 0xff800000);
+  status |= test__subsf3(0xff000000, 0x7f800000, 0xff800000);
+  status |= test__subsf3(0xff000000, 0x80000000, 0xff000000);
+  status |= test__subsf3(0xff000000, 0xbf800000, 0xff000000);
+  status |= test__subsf3(0xff000000, 0xff800000, 0x7f800000);
+  status |= test__subsf3(0xff000001, 0x7f000000, 0xff800000);
+  status |= test__subsf3(0xff000001, 0xff000000, 0xf3800000);
+  status |= test__subsf3(0xff000001, 0xff000002, 0x73800000);
+  status |= test__subsf3(0xff000002, 0xfe800001, 0xfe800003);
+  status |= test__subsf3(0xff7ffffe, 0x3f800000, 0xff7ffffe);
+  status |= test__subsf3(0xff7ffffe, 0x7f7ffffe, 0xff800000);
+  status |= test__subsf3(0xff7ffffe, 0x7f7fffff, 0xff800000);
+  status |= test__subsf3(0xff7ffffe, 0xbf800000, 0xff7ffffe);
+  status |= test__subsf3(0xff7ffffe, 0xff7fffff, 0x73800000);
+  status |= test__subsf3(0xff7fffff, 0x3f800000, 0xff7fffff);
+  status |= test__subsf3(0xff7fffff, 0x80000001, 0xff7fffff);
+  status |= test__subsf3(0xff7fffff, 0xbf800000, 0xff7fffff);
+  status |= test__subsf3(0xff800000, 0x00000000, 0xff800000);
+  status |= test__subsf3(0xff800000, 0x007fffff, 0xff800000);
+  status |= test__subsf3(0xff800000, 0x7f000000, 0xff800000);
+  status |= test__subsf3(0xff800000, 0x7f800000, 0xff800000);
+  status |= test__subsf3(0xff800000, 0x80000000, 0xff800000);
+  status |= test__subsf3(0xff800000, 0x807fffff, 0xff800000);
+  status |= test__subsf3(0xff800000, 0xff000000, 0xff800000);
+  status |= test__subsf3(0x46f99cee, 0x4656466d, 0x468e79b8);
+  status |= test__subsf3(0x007ffff7, 0x00f7ffff, 0x80780008);
+  status |= test__subsf3(0x80ffffbf, 0x80800000, 0x807fffbf);
+
+  // Test that the result of an operation is a NaN at all when it should be.
+  //
+// In most configurations these tests' results are checked using
+  // compareResultF, so we set all the answers to the canonical NaN 0x7fc00000,
+  // which causes compareResultF to accept any NaN encoding. We also use the
+  // same value as the input NaN in tests that have one, so that even in
+  // EXPECT_EXACT_RESULTS mode these tests should pass, because 0x7fc00000 is
+  // still the exact expected NaN.
+  status |= test__subsf3(0x7f800000, 0x7f800000, 0x7fc00000);
+  status |= test__subsf3(0xff800000, 0xff800000, 0x7fc00000);
+  status |= test__subsf3(0x3f800000, 0x7fc00000, 0x7fc00000);
+  status |= test__subsf3(0x7fc00000, 0x3f800000, 0x7fc00000);
+  status |= test__subsf3(0x7fc00000, 0x7fc00000, 0x7fc00000);
+
+#ifdef ARM_NAN_HANDLING
+  // Tests specific to the NaN handling of Arm hardware, mimicked by the
+  // subtraction function in arm/addsf3.S:
+  //
+  //  - a quiet NaN is distinguished by the top mantissa bit being 1
+  //
+  //  - if a signalling NaN appears in the input, the output quiet NaN is
+  //    obtained by setting its top mantissa bit and leaving everything else
+  //    unchanged
+  //
+  //  - if both operands are signalling NaNs then the output NaN is derived
+  //    from the first operand
+  //
+  //  - if both operands are quiet NaNs then the output NaN is the first
+  //    operand
+  //
+  //  - invalid operations not involving an input NaN return the quiet
+  //    NaN with fewest bits set, 0x7fc00000.
+
+  status |= test__subsf3(0x00000000, 0x7fad4be3, 0x7fed4be3);
+  status |= test__subsf3(0x00000000, 0x7fdf48c7, 0x7fdf48c7);
+  status |= test__subsf3(0x00000001, 0x7f970eba, 0x7fd70eba);
+  status |= test__subsf3(0x00000001, 0x7fc35716, 0x7fc35716);
+  status |= test__subsf3(0x007fffff, 0x7fbf52d6, 0x7fff52d6);
+  status |= test__subsf3(0x007fffff, 0x7fc7a2df, 0x7fc7a2df);
+  status |= test__subsf3(0x3f800000, 0x7f987a85, 0x7fd87a85);
+  status |= test__subsf3(0x3f800000, 0x7fc50124, 0x7fc50124);
+  status |= test__subsf3(0x7f7fffff, 0x7f95fd6f, 0x7fd5fd6f);
+  status |= test__subsf3(0x7f7fffff, 0x7ffc28dc, 0x7ffc28dc);
+  status |= test__subsf3(0x7f800000, 0x7f800000, 0x7fc00000);
+  status |= test__subsf3(0x7f800000, 0x7f8dd790, 0x7fcdd790);
+  status |= test__subsf3(0x7f800000, 0x7fd2ef2b, 0x7fd2ef2b);
+  status |= test__subsf3(0x7f99b09d, 0x00000000, 0x7fd9b09d);
+  status |= test__subsf3(0x7f93541e, 0x00000001, 0x7fd3541e);
+  status |= test__subsf3(0x7f9fc002, 0x007fffff, 0x7fdfc002);
+  status |= test__subsf3(0x7fb5db77, 0x3f800000, 0x7ff5db77);
+  status |= test__subsf3(0x7f9f5d92, 0x7f7fffff, 0x7fdf5d92);
+  status |= test__subsf3(0x7fac7a36, 0x7f800000, 0x7fec7a36);
+  status |= test__subsf3(0x7fb42008, 0x7fb0ee07, 0x7ff42008);
+  status |= test__subsf3(0x7f8bd740, 0x7fc7aaf1, 0x7fcbd740);
+  status |= test__subsf3(0x7f9bb57b, 0x80000000, 0x7fdbb57b);
+  status |= test__subsf3(0x7f951a78, 0x80000001, 0x7fd51a78);
+  status |= test__subsf3(0x7f9ba63b, 0x807fffff, 0x7fdba63b);
+  status |= test__subsf3(0x7f89463c, 0xbf800000, 0x7fc9463c);
+  status |= test__subsf3(0x7fb63563, 0xff7fffff, 0x7ff63563);
+  status |= test__subsf3(0x7f90886e, 0xff800000, 0x7fd0886e);
+  status |= test__subsf3(0x7fe8c15e, 0x00000000, 0x7fe8c15e);
+  status |= test__subsf3(0x7fe915ae, 0x00000001, 0x7fe915ae);
+  status |= test__subsf3(0x7ffa9b42, 0x007fffff, 0x7ffa9b42);
+  status |= test__subsf3(0x7fdad0f5, 0x3f800000, 0x7fdad0f5);
+  status |= test__subsf3(0x7fd10dcb, 0x7f7fffff, 0x7fd10dcb);
+  status |= test__subsf3(0x7fd08e8a, 0x7f800000, 0x7fd08e8a);
+  status |= test__subsf3(0x7fc3a9e6, 0x7f91a816, 0x7fd1a816);
+  status |= test__subsf3(0x7fdb229c, 0x7fc26c68, 0x7fdb229c);
+  status |= test__subsf3(0x7fc9f6bb, 0x80000000, 0x7fc9f6bb);
+  status |= test__subsf3(0x7ffa178b, 0x80000001, 0x7ffa178b);
+  status |= test__subsf3(0x7fef2a0b, 0x807fffff, 0x7fef2a0b);
+  status |= test__subsf3(0x7ffc885b, 0xbf800000, 0x7ffc885b);
+  status |= test__subsf3(0x7fd26e8c, 0xff7fffff, 0x7fd26e8c);
+  status |= test__subsf3(0x7fc55329, 0xff800000, 0x7fc55329);
+  status |= test__subsf3(0x80000000, 0x7fa833ae, 0x7fe833ae);
+  status |= test__subsf3(0x80000000, 0x7fc4df63, 0x7fc4df63);
+  status |= test__subsf3(0x80000001, 0x7f98827d, 0x7fd8827d);
+  status |= test__subsf3(0x80000001, 0x7fd7acc5, 0x7fd7acc5);
+  status |= test__subsf3(0x807fffff, 0x7fad19c0, 0x7fed19c0);
+  status |= test__subsf3(0x807fffff, 0x7ffe1907, 0x7ffe1907);
+  status |= test__subsf3(0xbf800000, 0x7fa95487, 0x7fe95487);
+  status |= test__subsf3(0xbf800000, 0x7fd2bbee, 0x7fd2bbee);
+  status |= test__subsf3(0xff7fffff, 0x7f86ba21, 0x7fc6ba21);
+  status |= test__subsf3(0xff7fffff, 0x7feb00d7, 0x7feb00d7);
+  status |= test__subsf3(0xff800000, 0x7f857fdc, 0x7fc57fdc);
+  status |= test__subsf3(0xff800000, 0x7fde0397, 0x7fde0397);
+  status |= test__subsf3(0xff800000, 0xff800000, 0x7fc00000);
+#endif // ARM_NAN_HANDLING
+
+  return status;
+}

>From 46e7005df1ec7b058114b7d864a597b48f2d2706 Mon Sep 17 00:00:00 2001
From: Simon Tatham <simon.tatham at arm.com>
Date: Thu, 5 Feb 2026 13:49:21 +0000
Subject: [PATCH 2/3] clang-format

---
 compiler-rt/test/builtins/Unit/addsf3_test.c | 9 +++++----
 compiler-rt/test/builtins/Unit/subsf3_test.c | 9 +++++----
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/compiler-rt/test/builtins/Unit/addsf3_test.c b/compiler-rt/test/builtins/Unit/addsf3_test.c
index a08ba8b91056a..5d20970047d8a 100644
--- a/compiler-rt/test/builtins/Unit/addsf3_test.c
+++ b/compiler-rt/test/builtins/Unit/addsf3_test.c
@@ -24,7 +24,8 @@
 // Returns: a + b
 COMPILER_RT_ABI float __addsf3(float a, float b);
 
-int test__addsf3(int line, uint32_t a_rep, uint32_t b_rep, uint32_t expected_rep) {
+int test__addsf3(int line, uint32_t a_rep, uint32_t b_rep,
+                 uint32_t expected_rep) {
   float a = fromRep32(a_rep), b = fromRep32(b_rep);
   float x = __addsf3(a, b);
 #ifdef EXPECT_EXACT_RESULTS
@@ -34,14 +35,14 @@ int test__addsf3(int line, uint32_t a_rep, uint32_t b_rep, uint32_t expected_rep
 #endif
 
   if (ret) {
-    printf("error at line %d: __addsf3(%08" PRIx32 ", %08" PRIx32 ") = %08" PRIx32
-           ", expected %08" PRIx32 "\n",
+    printf("error at line %d: __addsf3(%08" PRIx32 ", %08" PRIx32
+           ") = %08" PRIx32 ", expected %08" PRIx32 "\n",
            line, a_rep, b_rep, toRep32(x), expected_rep);
   }
   return ret;
 }
 
-#define test__addsf3(a,b,x) (test__addsf3)(__LINE__,a,b,x)
+#define test__addsf3(a, b, x) (test__addsf3)(__LINE__, a, b, x)
 
 int main() {
   int status = 0;
diff --git a/compiler-rt/test/builtins/Unit/subsf3_test.c b/compiler-rt/test/builtins/Unit/subsf3_test.c
index b9c1b2ac4362a..11a87d1526785 100644
--- a/compiler-rt/test/builtins/Unit/subsf3_test.c
+++ b/compiler-rt/test/builtins/Unit/subsf3_test.c
@@ -24,7 +24,8 @@
 // Returns: a - b
 COMPILER_RT_ABI float __subsf3(float a, float b);
 
-int test__subsf3(int line, uint32_t a_rep, uint32_t b_rep, uint32_t expected_rep) {
+int test__subsf3(int line, uint32_t a_rep, uint32_t b_rep,
+                 uint32_t expected_rep) {
   float a = fromRep32(a_rep), b = fromRep32(b_rep);
   float x = __subsf3(a, b);
 #ifdef EXPECT_EXACT_RESULTS
@@ -34,14 +35,14 @@ int test__subsf3(int line, uint32_t a_rep, uint32_t b_rep, uint32_t expected_rep
 #endif
 
   if (ret) {
-    printf("error at line %d: __subsf3(%08" PRIx32 ", %08" PRIx32 ") = %08" PRIx32
-           ", expected %08" PRIx32 "\n",
+    printf("error at line %d: __subsf3(%08" PRIx32 ", %08" PRIx32
+           ") = %08" PRIx32 ", expected %08" PRIx32 "\n",
            line, a_rep, b_rep, toRep32(x), expected_rep);
   }
   return ret;
 }
 
-#define test__subsf3(a,b,x) test__subsf3(__LINE__,a,b,x)
+#define test__subsf3(a, b, x) test__subsf3(__LINE__, a, b, x)
 
 int main() {
   int status = 0;

>From 0c7ea5933fe27b3d6526f3cac2f20fdcc794dc0f Mon Sep 17 00:00:00 2001
From: Simon Tatham <simon.tatham at arm.com>
Date: Thu, 5 Feb 2026 17:09:40 +0000
Subject: [PATCH 3/3] Update to use set_special_properties

---
 compiler-rt/lib/builtins/CMakeLists.txt | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt
index 1366d4aa75c03..c741f9bf9c3d9 100644
--- a/compiler-rt/lib/builtins/CMakeLists.txt
+++ b/compiler-rt/lib/builtins/CMakeLists.txt
@@ -535,8 +535,7 @@ set(thumb1_base_SOURCES
   ${GENERIC_SOURCES}
 )
 # arm/addsf3.S implements both addition and subtraction via cross-branching
-set_property(SOURCE arm/addsf3.S PROPERTY crt_supersedes subsf3.c)
-set_property(SOURCE arm/addsf3.S DIRECTORY ${COMPILER_RT_SOURCE_DIR} PROPERTY crt_provides subsf3)
+set_special_properties(arm/addsf3.S SUPERSEDES subsf3.c PROVIDES subsf3)
 set_special_properties(arm/adddf3.S SUPERSEDES subdf3.c PROVIDES subdf3)
 
 if(COMPILER_RT_ARM_OPTIMIZED_FP)
@@ -554,8 +553,8 @@ if(COMPILER_RT_ARM_OPTIMIZED_FP)
     arm/funder.c
     ${thumb1_base_SOURCES}
   )
-  set_property(SOURCE arm/thumb1/addsf3.S PROPERTY crt_supersedes subsf3.c)
-  set_property(SOURCE arm/thumb1/addsf3.S DIRECTORY ${COMPILER_RT_SOURCE_DIR} PROPERTY crt_provides subsf3)
+  set_special_properties(arm/thumb1/addsf3.S
+    SUPERSEDES subsf3.c PROVIDES subsf3)
   set_special_properties(arm/thumb1/cmpdf2.S
     SUPERSEDES comparedf2.c PROVIDES comparedf2)
   set_special_properties(arm/thumb1/cmpsf2.S



More information about the llvm-branch-commits mailing list