[compiler-rt] [compiler-rt][ARM] Optimized mulsf3 and divsf3 (PR #161546)

Wed Oct 1 10:56:49 PDT 2025

================
@@ -0,0 +1,608 @@
+//===-- divsf3.S - single-precision floating point division ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements single-precision soft-float division with the IEEE-754
+// default rounding (to nearest, ties to even), in optimized AArch32 assembly
+// language suitable to be built as either Arm or Thumb2.
+//
+//===----------------------------------------------------------------------===//
+
+#include "../assembly.h"
+
+
+  .syntax unified
+  .text
+  .p2align 2
+
+DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_fdiv, __divsf3)
+
+DEFINE_COMPILERRT_FUNCTION(__divsf3)
+  // Extract the exponents of the inputs into r2 and r3, occupying bits 16-23
+  // of each register so that there will be space lower down to store extra
+  // data without exponent arithmetic carrying into it. In the process, check
+  // both exponents for 00 or FF and branch out of line to handle all the
+  // uncommon types of value (infinity, NaN, zero, denormals).
+  //
+  // Chaining conditional instructions like this means that the second
+  // instruction (setting up r3) might not be executed at all, so fdiv_uncommon
+  // will have to redo it just in case. That saves an instruction here,
+  // executed for _all_ inputs, and moves it to the uncommon path run for only
+  // some inputs.
+  MOV     r12, #0xFF0000
+  ANDS    r2, r12, r0, LSR #7   // r2 has exponent of numerator. (Is it 0?)
+  ANDSNE  r3, r12, r1, LSR #7   // r3 has exponent of denominator. (Is it 0?)
+  TEQNE   r2, r12               // if neither was 0, is one FF?
+  TEQNE   r3, r12               // or the other?
+  BEQ     LOCAL_LABEL(uncommon)         // branch out of line if any answer was yes
+
+  // Calculate the output sign, which is always just the XOR of the input
+  // signs. Store it in bit 8 of r2, below the numerator exponent.
+  TEQ     r0, r1                // is the output sign bit 1?
+  ORRMI   r2, r2, #0x100        // if so, set bit 8 of r2
+
+  // Isolate the mantissas of both values, by setting bit 23 of each one and
+  // clearing the 8 bits above that.
+  //
+  // In the process, swap the register allocations (which doesn't cost extra
+  // instructions if we do it as part of this manipulation). We want the
+  // numerator not to be in r0, because r0 is where we'll build up the quotient
+  // while subtracting things from the numerator.
+  ORR     r12, r0, #1 << 23
+  ORR     r0, r1, #1 << 23
+  BIC     r1, r12, #0xFF000000
+  BIC     r0, r0, #0xFF000000
+
+LOCAL_LABEL(div):
+  // Start of the main division. We get here knowing that:
+  //
+  //   r0 = mantissa of denominator, with the leading 1 at bit 23
+  //   r1 = mantissa of numerator, similarly
+  //   r2 = (exponent of numerator << 16) + (result sign << 8)
+  //   r3 = (exponent of denominator << 16)
+
+  PUSH    {r14}                 // we'll need an extra register
+
+  // Calculate the initial result exponent by just subtracting the two input
+  // exponents. This doesn't affect the sign bit lower down in r2.
+  SUB     r2, r2, r3
+
+  // That initial exponent might need to be adjusted by 1, depending on whether
+  // dividing the mantissas gives a value >=1 or <1. We don't need to wait
+  // until the division is finished to work that out: we can tell immediately
+  // by just comparing the mantissas.
+  //
+  // The basic idea is to do the comparison in a way that sets the C flag if
+  // numerator >= denominator. Then we recombine the sign and exponent by doing
+  // "ADC r2, r2, r2, ASR #16": the exponent in the top half of r2 is shifted
+  // down to the low 8 bits, just below the sign bit, and using ADC rather than
+  // ADD folds in the conditional increment from the mantissa comparison.
+  //
+  // If we're not incrementing the output exponent, we instead shift the
+  // numerator mantissa left by 1, so that it _is_ greater than the denominator
+  // mantissa. Otherwise we'd generate only a 22-bit quotient, instead of 23.
+  //
+  // The exponent also needs to be rebiased, so that dividing two numbers the
+  // same gives an output exponent of 0x7F. If the two inputs have the same
+  // exponent then we'll have computed an exponent of 0 via the SUB instruction
+  // above; if the mantissas are the same as well then the ADC will increment
+  // it; also, the leading bit of the quotient will increment the exponent
+  // again when we recombine it with the output mantissa later. So we need to
+  // add (0x7F - 2) to the mantissa now, to make an exponent of 0 from the SUB
+  // come to 0x7F after both of those increments.
+  //
+  // Putting all of that together, what we _want_ to do is this:
+  //
+  // [#1]   CMP     r1, r0                // set C if num >= den
+  // [#2]   MOVLO   r1, r1, LSL #1        // if num < den, shift num left
+  // [#3]   ADD     r2, r2, #0x7D0000     // rebias exponent
+  // [#4]   ADC     r2, r2, r2, ASR #16   // combine sign + exp + adjustment
+  //
+  // However, we only do the first of those four instructions right here. The
+  // other three are distributed through the code below, after unrelated load
+  // or multiply instructions which will have a result delay slot on simple
+  // CPUs. Each is labelled "exponent setup [#n]" in a comment.
+  //
+  // (Since instruction #4 depends on the flags set up by #2, we must avoid
+  // clobbering the flags in _any_ of the instructions interleaved with this!)
+  CMP     r1, r0                // exponent setup [#1]
+
+  // Start the mantissa division by making an approximation to the reciprocal
+  // of the denominator. We first obtain an 8-bit approximation using a table
+  // lookup indexed by the top 7 denominator bits (counting the leading 1, so
+  // really there are only 6 bits in the table index).
+  //
+  // (r0 >> 17) is the table index, and its top bit is always set, so it ranges
+  // from 64 to 127 inclusive. So we point the base register 64 bytes before
+  // the actual table.
+  ADR     r12, LOCAL_LABEL(tab) - 64
+#if __thumb__
+  // Thumb can't do this particular shift+add+load in one instruction - it only
+  // supports left shifts of 0 to 3 bits, not right shifts of 17. So we must
+  // calculate the load offset separately.
+  ADD     r14, r12, r0, LSR #17
+  LDRB    r14, [r14]
+#else
+  LDRB    r14, [r12, r0, LSR #17]
+#endif
+
+  // Now do an iteration of Newton-Raphson to improve that 8-bit approximation
+  // to have 15-16 accurate bits.
+  //
+  // Basics of Newton-Raphson for finding a reciprocal: if you want to find 1/d
+  // and you have some approximation x, your next approximation is X = x(2-dx).
+  // Looked at one way, this is the result of applying the N-R formula
+  // X=x-f(x)/f'(x) to the function f(x) = 1/x - d. Another way to look at it
+  // is to suppose that dx = 1 - e, for some e which is small (because dx is
+  // already reasonably close to 1). Then you want to double the number of
+  // correct bits in the next approximation, i.e. square the error. So you want
+  // dX = 1-e^2 = (1-e)(1+e) = dx(2-dx). Cancelling d gives X = x(2-dx) again.
+  //
+  // In this situation, we're working in fixed-point integers rather than real
+  // numbers, and all the scales are different:
+  //  * our input denominator d is in the range [2^23,2^24)
+  //  * our input approximation x is in the range [2^7,2^8)
+  //  * we want the output approximation to be in the range [2^15,2^16)
+  // Those factors combine to mean that we want
+  //   x(2^32-dx) / 2^23
+  // = (2^9 x) - (dx^2 / 2^23)
+  //
+  // But we also want to compute this using ordinary MUL, not a long multiply
+  // instruction (those are slower). So we need to worry about the product
+  // overflowing. dx fits in 32 bits, because it's the product of something
+  // <2^24 with something <2^8; but we must shift it right before multiplying
+  // by x again.
+
+  MUL     r12, r0, r14          // r12  = dx
+  MOVLO   r1, r1, LSL #1        //   exponent setup [#2] in the MUL delay slot
+  MVN     r12, r12, LSR #8      // r12 ~= -dx/2^8
+  MUL     r3, r12, r14          // r3  ~= -dx^2/2^8
+  MOV     r14, r14, LSL #9      // r14  = 2^9 x
+  ADD     r14, r14, r3, ASR #15 // r14 ~= 2^9 x - dx^2 / 2^23
+
+  // Now r14 is a 16-bit approximation to the reciprocal of the input mantissa,
+  // scaled by 2^39 (so that the min mantissa 2^23 would have reciprocal 2^16
+  // in principle, and the max mantissa 2^24-1 would have reciprocal just over
+  // 2^15). The error is always negative (r14 is an underestimate of the true
+  // value), and the maximum error is 6 and a bit ULP (that is, the true
+  // reciprocal is strictly less than (r14+7)). Also, r14 is always strictly
+  // less than 0x10000 (even in the case of the min mantissa, where the true
+  // value would be _exactly_ 0x10000), which eliminates a case of integer
+  // overflow.
+  //
+  // All of these properties of the reciprocal approximation are checked by
+  // exhaustively iterating over all 2^23 possible input mantissas. (The nice
+  // thing about doing this in single rather than double precision!)
+  //
+  // Now we extract most of the quotient by two steps of long division, using
+  // the reciprocal estimate to identify a multiple of the denominator to
+  // subtract from the numerator. To avoid integer overflow, the numerator
+  // mantissa is shifted down 8 bits so that it's less than 0x10000. After we
+  // calculate an approximate quotient, we shift the numerator left and
+  // subtract that multiple of the denominator, moving the next portion of the
+  // numerator into range for the next iteration.
+
+  // First iteration of long division. We shift the numerator left 11 bits, and
+  // since the quotient approximation is scaled by 2^31, we must shift that
+  // right by 20 to make the right product to subtract from the numerator.
+  MOV     r12, r1, LSR #8       // shift the numerator down
+  MUL     r12, r14, r12         // make the quotient approximation
+  MOV     r1, r1, LSL #11       // shift numerator left, ready for subtraction
+  MOV     r3, r12, LSR #20      // make first 12-bit block of quotient bits
+  MLS     r1, r0, r3, r1        // subtract that multiple of den from num
+
+  ADD     r2, r2, #0x7D0000     //   exponent setup [#3] in the MLS delay slot
+
+  // Second iteration of long division. Differences from the first step: this
+  // time we shift the numerator 12 bits instead of 11, so that the total of
+  // both steps is 23 bits, i.e. we've shifted up by exactly the full width of
+  // the output mantissa. Also, the block of output quotient bits is left in a
+  // different register: it was in r3 the first time, and this time it's in
+  // r12, so that we still have both available at the end of the process.
+  MOV     r12, r1, LSR #8       // shift the numerator down
+  MUL     r12, r14, r12         // make the quotient approximation
+  MOV     r1, r1, LSL #12       // shift numerator left, ready for subtraction
+  MOV     r12, r12, LSR #19     // make second 11-bit block of quotient
+  MLS     r1, r0, r12, r1       // subtract that multiple of den from num
+
+  ADC     r2, r2, r2, ASR #16   //   exponent setup [#4] in the MLS delay slot
+
+  // Now r1 contains the original numerator, shifted left 23, minus _some_
+  // multiple of the original denominator (which is still in r0). The bounds on
+  // the error in the above steps should make the error at most 1: that is, we
+  // may have to subtract the denominator one more time to make r1 < r0, and
+  // increment the quotient by one more.
+  //
+  // Our quotient is still in two pieces, computed separately in the above long
+  // division steps. We fold the final increment into the same instruction that
+  // recombines them, by doing the comparison in such a way that it sets the
+  // carry flag if the increment is needed.
+
+  CMP     r1, r0                // Set carry flag if num >= den
+  SUBHS   r1, r1, r0            // If so, subtract den from num
+  ADC     r3, r12, r3, LSL #12  // Recombine quotient halves, plus optional +1
+
+  // We've finished with r14 as a temporary register, so we can unstack it now.
+  POP     {r14}
+
+  // Now r3 contains the _rounded-down_ output quotient, and r1 contains the
+  // remainder. That is, (denominator * r3 + r1) = (numerator << 23), and
+  // 0 <= r1 < denominator.
+  //
+  // Next we must round to nearest, by checking if r1 is greater than half the
+  // denominator. In division, it's not possible to hit an exact round-to-even
+  // halfway case, so we don't need to spend any time checking for it.
+  //
+  // Proof of no round-to-even: define the 'width' of a dyadic rational to be
+  // the distance between the lowest and highest 1 bits in its binary
+  // representation, or equivalently, the index of its high bit if you scale it
+  // by a power of 2 to make it an odd integer. E.g. any actual power of 2 has
+  // width 0, and all of 0b11110, 0b1111, 0b11.11 and 0b0.01111 have width 3.
+  // Then for any dyadic rationals a,b, width(ab) >= width(a)+width(b). Let w
+  // be the maximum width that the input precision supports (so that for single
+  // precision, w=23). Then if some division n/d were a round-to-even case, the
+  // true quotient q=n/d would have width exactly w+1. But we have qd=n, so
+  // width(n) >= width(q)+width(d) > w, which can't happen, because n is in the
+  // input precision, hence had width <= w.)
+  //
+  // So we don't need to check for an exact _halfway_ case and clear the low
+  // bit of the quotient after rounding up, as addition and multiplication both
+  // need to do. But we do need to remember if the quotient itself was exact,
+  // that is, if there was no remainder at all. That's needed in underflow
+  // handling.
+
+  // The rounding check wants to compare remainder with denominator/2. But of
+  // course in integers it's easier to compare 2*remainder with denominator. So
+  // we start by shifting the remainder left by 1, and in the process, set Z if
+  // it's exactly 0 (i.e. the result needs no rounding at all).
+  LSLS    r1, r1, #1
+  // Now trial-subtract the denominator. We don't do this at all if the result
+  // was exact. If we do do it, r1 goes negative precisely if we need to round
+  // up, which sets the C flag. (The previous instruction will have left C
+  // clear, since r1 had its top 8 bits all clear. So now C is set _only_ if
+  // we're rounding up.)
+  SUBSNE  r1, r1, r0
+  // Recombine the quotient with the sign + exponent, and use the C flag from
+  // the previous instruction to increment the quotient if we're rounding up.
+  ADC     r0, r3, r2, LSL #23
+
+  // If we haven't either overflowed or underflowed, we're done. We can
+  // identify most of the safe cases by doing an unsigned comparison of the
+  // initial output exponent (in the top half of r2) with 0xFC: if 0 <= r2 <
+  // 0xFC0000 then we have neither underflow nor overflow.
+  //
+  // Rationale: the value in the top half of r2 had three chances to be
+  // incremented before becoming the exponent field of the actual output float.
+  // It was incremented if we found the numerator mantissa was >= the
+  // denominator (producing the value in the _bottom_ half of r2, which we just
+  // ADCed into the output). Then it gets unconditionally incremented again
+  // when the ADC combines it with the leading mantissa bit. And finally,
+  // round-up might increment it a third time. So 0xFC is the smallest value
+  // that can possibly turn into the overflowed value 0xFF after all those
+  // increments.
+  //
+  // On the underflow side, (top half of r2) = 0 corresponds to a value of 1 in
+  // the final result's exponent field (and then rounding might increase it
+  // further); if the exponent was less than that then r2 wraps round and looks
+  // like a very large positive integer from the point of view of this unsigned
+  // comparison.
+  CMP     r2, #0xFC0000
+  BXLO    lr
+
+  // The same comparison will have set the N and V flags to reflect the result
+  // of comparing r2 with 0xFC0000 as a _signed_ integer. That reliably
+  // distinguishes potential underflow (r2 is negative) from potential overflow
+  // (r2 is positive and at least 0xFC0000)
+  BGE     LOCAL_LABEL(overflow)
+
+  // Here we might or might not have underflow (but we know we don't have
+  // overflow). To check more carefully, we look at the _bottom_ half of r2,
+  // which contains the exponent after the first adjustment (for num >= denom),
+  // That is, it's still off by 1 (compensating for the leading quotient bit),
+  // and is also before rounding.
+  //
+  // We neglect the effect of rounding: division results that are tiny (less
+  // than the smallest normalised number) before rounding, but then round up to
+  // the smallest normal number, are an acceptable edge case to handle slowly.
+  // We pass those to funder without worrying about them.
+  //
+  // So we want to check whether the bottom half of r2 was negative. It would
+  // be nice to check bits 8-15 of it, but unfortunately, it's already been
+  // combined with the sign (at bit 8), so those bits don't tell us anything
+  // useful. Instead we look at the top 4 bits of the exponent field, i.e. the
+  // 0xF0 bits. The largest _non_-overflowing exponent that might reach here is
+  // less than 3, so it doesn't reach those bits; the smallest possible
+  // underflow, obtained by dividing the smallest denormal by the largest
+  // finite number, is -151 (before the leading bit increments it), which will
+  // set the low 8 bits of r2 to 0x69. That is, the 0xF0 nibble of r2 will be
+  // 0x60 or greater for a (pre-rounding) underflow, and zero for a
+  // non-underflow.
+
+  TST     r2, #0xF0
+  BXEQ    lr                    // no underflow after all; return
+
+  // Rebias the exponent for funder, which also corrects the sign bit.
+  ADD     r0, r0, #192 << 23
+  // Tell funder whether the true value is greater or less than the number in
+  // r0. This is obtained from the sign of the remainder (still in r1), with
+  // the only problem being that it's currently reversed. So negate r1 (leaving
+  // 0 at 0 to indicate exactness).
+  RSBS    r1, r1, #0
+  B     SYMBOL_NAME(__compiler_rt_funder)
+
+LOCAL_LABEL(overflow):
+  // Here we might or might not have overflow (but we know we don't have
+  // underflow). We must check whether we really have overflowed.
+  //
+  // For this it's easiest to check the exponent field in the actual output
+  // value in r0, after _all_ the adjustments have been completed. The largest
+  // overflowed exponent is 0x193, and the smallest exponent that can reach
+  // this is 0xFD (we checked against 0xFC above, but then the leading quotient
+  // bit incremented it). So it's enough to shift the output left by one
+  // (moving the exponent field to the top), increment it once more (so that
+  // the smallest overflowed exponent 0xFF wraps round to 0), and then compare
+  // against 0xFE000000 as an unsigned integer.
+  MOV     r12, r0, LSL #1
+  ADD     r12, r12, #1 << 24
+  CMP     r12, #0xFE << 24      // Check for exp = 253 or 254
+  BXHS    lr
+  // We have actual overflow. Rebias r0 to bring the exponent back into range,
+  // which ensures its sign is correct. Then make an infinity of that sign to
+  // return.
+  SUBS    r0, r0, #0xC0 << 23
+  MOVS    r12, #0xFF            // exponent of infinity
+  ORRS    r12, r12, r0, LSR #23 // exponent and sign at bottom of r12
+  MOVS    r0, r12, LSL #23      // shift it up to the top of r0 to return
+  BX      lr
+
+LOCAL_LABEL(uncommon):
+  // We come here from the start of the function if either input is an uncommon
+  // value: zero, denormal, infinity or NaN.
+  //
+  // We arrive here with r12 = 0xFF000000, and r2 containing the exponent of x
+  // in bits 16..23. But r3 doesn't necessarily contain the exponent of y,
+  // because the instruction that set it up was conditional. So first we
+  // unconditionally repeat it.
+  AND     r3, r12, r1, LSR #7
+
+  // In all cases not involving a NaN as output, the sign of the output is made
+  // in the same way as for finite numbers, as the XOR of the input signs. So
+  // repeat the sign setup from the main branch.
+  TEQ     r0, r1                // is the output sign bit 1?
+  ORRMI   r2, r2, #0x100        // if so, set bit 8 of r2
+
+  // Detect infinities and NaNs, by checking if either of r2 or r3 is at least
+  // 0xFF0000.
+  CMP     r2, #0xFF0000
+  CMPLO   r3, #0xFF0000
+  BHS     LOCAL_LABEL(inf_NaN)
+
+  // Now we know there are no infinities or NaNs, but there's at least one zero
+  // or denormal.
+  MOVS    r12, r1, LSL #1       // is y zero?
+  BEQ     LOCAL_LABEL(divbyzero)        // if so, go and handle division by zero
+  MOVS    r12, r0, LSL #1       // is x zero? (now we know that y is not)
+  MOVEQ   r0, r2, LSL #23       // if so, 0/nonzero is just 0 (of right sign)
+  BXEQ    lr
+
+  // Now we've eliminated zeroes as well, leaving only denormals: either x or
+  // y, or both, is a denormal. Call fnorm2 to convert both into a normalised
+  // mantissa and a (potentially small) exponent.
+  AND     r12, r2, #0x100       // save the result sign from r2
+  LSR     r2, #16               // shift extracted exponents down to bit 0
+  LSR     r3, #16               // where fnorm2 will expect them
+  PUSH    {r0, r1, r2, r3, r12, lr}
+  MOV     r0, sp                // tell fnorm2 where to find its data
+  BL      SYMBOL_NAME(__compiler_rt_fnorm2)
+  POP     {r0, r1, r2, r3, r12, lr}
+  LSL     r3, #16               // shift exponents back up to bit 16
+  ORR     r2, r12, r2, LSL #16  // and put the result sign back in r2
+
+  // Now rejoin the main code path, having finished the setup it will expect:
+  // swap x and y, and shift the fractions back down to the low 24 bits.
+  MOV     r12, r0, LSR #8
+  MOV     r0, r1, LSR #8
+  MOV     r1, r12
+  B       LOCAL_LABEL(div)
+
+LOCAL_LABEL(inf_NaN):
+  // We come here if at least one input is a NaN or infinity. If either or both
+  // inputs are NaN then we hand off to fnan2 to propagate a NaN from the
+  // input.
+  MOV     r12, #0xFF000000
+  CMP     r12, r0, LSL #1       // if (r0 << 1) > 0xFF000000, r0 is a NaN
+  BLO     SYMBOL_NAME(__compiler_rt_fnan2)
+  CMP     r12, r1, LSL #1
+  BLO     SYMBOL_NAME(__compiler_rt_fnan2)
+
+  // No NaNs, so we have three options: inf/inf = NaN, inf/finite = inf, and
+  // finite/inf = 0.
+
+  // If both operands are infinity, we return a NaN. Since we know at
+  // least _one_ is infinity, we can test this by checking if they're
+  // equal apart from the sign bits.
+  EOR     r3, r0, r1
+  LSLS    r3, #1                // were all bits of XOR zero other than top?
+  BEQ     LOCAL_LABEL(invalid)          // if so, both operands are infinity
+
+  // See if x is infinite
+  CMP     r12, r0, LSL #1       // (r0 << 1) == 0xFF000000?
+  BEQ     LOCAL_LABEL(infret)           // if so, infinity/finite = infinity
+
+  // y is infinite and x is not, so we return a zero of the
+  // combined sign.
+  EOR     r0, r0, r1            // calculate the right sign
+  AND     r0, r0, #0x80000000   // throw away everything else
+  BX      lr
+
+LOCAL_LABEL(divbyzero):
+  // Here, we know y is zero. But we don't know if x is zero or nonzero. So we
+  // might be calculating 0/0 (invalid operation, generating a NaN), or
+  // nonzero/0 (the IEEE "division by zero" exception, generating infinity).
+  MOVS    r12, r0, LSL #1       // is x zero too?
+  BEQ     LOCAL_LABEL(invalid)          // if so, go and return a NaN
+
+LOCAL_LABEL(infret):
+  // Here, we're either dividing infinity by a finite number, or dividing a
+  // nonzero number by 0. (Or both, if we're dividing infinity by 0.) In all
+  // these cases we return infinity with the sign from r2.
+  //
+  // If we were implementing IEEE exceptions, we'd have to separate these
+  // cases: infinity / finite is not an _exception_, it just returns infinity,
+  // whereas (finite and nonzero) / 0 is a division-by-zero exception. But here
+  // we're not implementing exceptions, so we can treat all three cases the
+  // same.
+  //
+  // r2 contains the output sign in bit 8, which is a convenient place to find
+  // it when making an infinity, because we can fill in the 8 exponent bits
+  // below that and then shift it left.
+  ORR     r2, r2, #0xff         // sign + maximum exponent
+  LSL     r0, r2, #23           // shift up to the top
+  BX      lr
+
+LOCAL_LABEL(invalid):
+  // Return the default NaN, from an invalid operation (either dividing
+  // infinity by infinity, or 0 by 0).
+  LDR     r0, =0x7FC00000
+  BX      lr
+
+// Finally, the lookup table for the initial reciprocal approximation.
+//
+// The table index is made from the top 7 bits of the denominator mantissa. But
+// the topmost bit is always 1, so only the other 6 bits vary. So it only has
+// 64 entries, not 128.
+//
+// Each table entry is a single byte, with its top bit set. So the table
+// entries correspond to the reciprocal of a 7-bit mantissa prefix scaled up by
+// 2^14, or the reciprocal of a whole 24-bit mantissa scaled up by 2^31.
+//
+// Each of these 64 entries corresponds to a large interval of possible
+// mantissas. For example, if the top 7 bits are 1000001 then the overall
+// mantissa could be anything from 0x820000 to 0x83FFFF. And because the output
+// of this table provides more bits than the input, there are several choices
+// of 8-bit reciprocal approximation for a number in that interval. The
+// reciprocal of 0x820000 starts with 0xFC plus a fraction, and the reciprocal
+// of 0x83FFFF starts with 0xF9 minus a fraction, so there are four reasonable
+// choices for that table entry: F9, FA, FB or FC. Which do we pick?
+//
+// The table below is generated by choosing whichever value minimises the
+// maximum possible error _after_ the approximation is improved by the
+// Newton-Raphson step. In the example above, we end up with FA.
+//
+// The Python code below will regenerate the table, complete with the per-entry
+// comments.
+
+/*
+
+for prefix in range(64, 128):
+    best = None
+
+    # Max and min 23-bit mantissas with this 7-bit prefix
+    mmin, mmax = prefix * 2**17, (prefix + 1) * 2**17 - 1
+
+    # Max and min table entry corresponding to the reciprocal of something in
+    # that range of mantissas: round up the reciprocal of mmax, and round down
+    # the reciprocal of mmin. Also clamp to the range [0x80,0xff], because
+    # 0x100 can't be used as a table entry due to not fitting in a byte, even
+    # though it's the exact reciprocal of the overall-smallest mantissa
+    # 0x800000.
+    gmin = max(128, (2**31 + mmin - 1) // mmax)
+    gmax = min(255, 2**31 // mmin)
+
+    # For each of those table entries, compute the result of starting from that
+    # value and doing a Newton-Raphson iteration, with the mantissa at each end
+    # of the mantissa interval. One of these will be the worst possible error.
+    # Choose the table entry whose worst error is as small as possible.
+    #
+    # (To find the extreme values of a more general function on an interval,
+    # you must consider its values not only at the interval endpoints but also
+    # any turning points within the interval. Here, the function has only one
+    # turning point, and by construction it takes value 0 there, so we needn't
+    # worry.)
+    g = max(
+        range(gmin, gmax + 1),
+        key=lambda g: min(
+            (g * (2**32 - d * g) / 2**23 - 2**39 / d) for d in [mmin, mmax]
+        ),
+    )
+
+    print(f"  .byte 0x{g:02x}  // input [0x{mmin:06x},0x{mmax:06x}]"
+          f", candidate outputs [0x{gmin:02x},0x{gmax:02x}]"
+    )
+
+*/
+
+  .p2align 2  // make sure we start on a 32-bit boundary, even in Thumb
----------------
compnerd wrote:

I think that changing this to `4-byte boundary` is better than `32-bit boundary` as it can be confusing when scanning over the comment and code.

https://github.com/llvm/llvm-project/pull/161546