[libc-commits] [libc] [libc][math] Implement atan2f correctly rounded to all rounding modes. (PR #86716)

Fri Mar 29 09:14:42 PDT 2024

================
@@ -0,0 +1,299 @@
+//===-- Single-precision atan2f function ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/atan2f.h"
+#include "inv_trigf_utils.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/FPUtil/double_double.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/FPUtil/nearest_integer.h"
+#include "src/__support/FPUtil/rounding_mode.h"
+#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+
+namespace LIBC_NAMESPACE {
+
+namespace {
+
+// Look up tables for accurate pass:
+
+// atan(i/16) with i = 0..16, generated by Sollya with:
+// > for i from 0 to 16 do {
+//     a = round(atan(i/16), D, RN);
+//     b = round(atan(i/16) - a, D, RN);
+//     print("{", b, ",", a, "},");
+//   };
+constexpr fputil::DoubleDouble ATAN_I[17] = {
+    {0.0, 0.0},
+    {-0x1.c934d86d23f1dp-60, 0x1.ff55bb72cfdeap-5},
+    {-0x1.cd37686760c17p-59, 0x1.fd5ba9aac2f6ep-4},
+    {0x1.347b0b4f881cap-58, 0x1.7b97b4bce5b02p-3},
+    {0x1.8ab6e3cf7afbdp-57, 0x1.f5b75f92c80ddp-3},
+    {-0x1.963a544b672d8p-57, 0x1.362773707ebccp-2},
+    {-0x1.c63aae6f6e918p-56, 0x1.6f61941e4def1p-2},
+    {-0x1.24dec1b50b7ffp-56, 0x1.a64eec3cc23fdp-2},
+    {0x1.a2b7f222f65e2p-56, 0x1.dac670561bb4fp-2},
+    {-0x1.d5b495f6349e6p-56, 0x1.0657e94db30dp-1},
+    {-0x1.928df287a668fp-58, 0x1.1e00babdefeb4p-1},
+    {0x1.1021137c71102p-55, 0x1.345f01cce37bbp-1},
+    {0x1.2419a87f2a458p-56, 0x1.4978fa3269ee1p-1},
+    {0x1.0028e4bc5e7cap-57, 0x1.5d58987169b18p-1},
+    {-0x1.8c34d25aadef6p-56, 0x1.700a7c5784634p-1},
+    {-0x1.bf76229d3b917p-56, 0x1.819d0b7158a4dp-1},
+    {0x1.1a62633145c07p-55, 0x1.921fb54442d18p-1},
+};
+
+// Taylor polynomial, generated by Sollya with:
+// > for i from 0 to 8 do {
+//     j = (-1)^(i + 1)/(2*i + 1);
+//     a = round(j, D, RN);
+//     b = round(j - a, D, RN);
+//     print("{", b, ",", a, "},");
+//   };
+constexpr fputil::DoubleDouble COEFFS[9] = {
+    {0.0, 1.0},                                      // 1
+    {-0x1.5555555555555p-56, -0x1.5555555555555p-2}, // -1/3
+    {-0x1.999999999999ap-57, 0x1.999999999999ap-3},  // 1/5
+    {-0x1.2492492492492p-57, -0x1.2492492492492p-3}, // -1/7
+    {0x1.c71c71c71c71cp-58, 0x1.c71c71c71c71cp-4},   // 1/9
+    {0x1.745d1745d1746p-59, -0x1.745d1745d1746p-4},  // -1/11
+    {-0x1.3b13b13b13b14p-58, 0x1.3b13b13b13b14p-4},  // 1/13
+    {-0x1.1111111111111p-60, -0x1.1111111111111p-4}, // -1/15
+    {0x1.e1e1e1e1e1e1ep-61, 0x1.e1e1e1e1e1e1ep-5},   // 1/17
+};
+
+// Veltkamp's splitting of a double precision into hi + lo, where the hi part is
+// slightly smaller than an even split, so that the product of
+//   hi * s * k is exact,
+// where:
+//   s is single precsion,
+//   0 < k < 16 is an integer.
+// This is used when FMA instruction is not available.
+[[maybe_unused]] LIBC_INLINE constexpr fputil::DoubleDouble split_d(double a) {
----------------
nickdesaulniers wrote:

That's not the semantics of this keyword.  https://gist.github.com/nico/93df46db7f5e1d0bd941d63663db95b1

The `inline` keyword is much more watered down than its name implies.

Simply by putting it in an anonymous namespace outside of a header (so that it's no longer externally visible) and having only one call site (and enabling any optimization level), the inline cost model (in LLVM) will provide an near-infinite discount to the "cost" of inlining.

https://godbolt.org/z/c53vb4KEr

Once inlined, the definition no longer has any callers and is dead code eliminated.

You can verify this via:

```diff

diff --git a/libc/src/math/generic/atan2f.cpp b/libc/src/math/generic/atan2f.cpp
index d50ac6608836..c8e943d13466 100644
--- a/libc/src/math/generic/atan2f.cpp
+++ b/libc/src/math/generic/atan2f.cpp
@@ -81,7 +81,7 @@ constexpr fputil::DoubleDouble COEFFS[9] = {
 //                     = 33.
 // Thus, the Veltkamp splitting constant is C = 2^33 + 1.
 // This is used when FMA instruction is not available.
-[[maybe_unused]] LIBC_INLINE constexpr fputil::DoubleDouble split_d(double a) {
+[[maybe_unused]] constexpr fputil::DoubleDouble split_d(double a) {
   fputil::DoubleDouble r{0.0, 0.0};
   constexpr double C = 0x1.0p33 + 1.0;
   double t1 = C * a;
@@ -98,7 +98,7 @@ constexpr fputil::DoubleDouble COEFFS[9] = {
 //   idx, k_d   = round( 2^4 * num_d / den_d )
 //   final_sign = sign of the final result
 //   const_term = the constant term in the final expression.
-LIBC_INLINE float atan2f_double_double(double num_d, double den_d, double q_d,
+float atan2f_double_double(double num_d, double den_d, double q_d,
                                        int idx, double k_d, double final_sign,
                                        const fputil::DoubleDouble &const_term) {
   fputil::DoubleDouble q;
```
```
$ ninja libc
$ llvm-nm ./libc/src/math/generic/CMakeFiles/libc.src.math.generic.atan2f.dir/atan2f.cpp.o | llvm-cxxfilt | grep -e split_d -e atan2f_double_double
$ llvm-objdump -dr ./libc/src/math/generic/CMakeFiles/libc.src.math.generic.atan2f.dir/atan2f.cpp.o | llvm-cxxfilt | grep call -A1
     2a6: e8 00 00 00 00               	callq	0x2ab <atan2f+0x2ab>
< call to polyeval >
$
```

---

If you want stronger guarantees wrt. inline substitution, you must use the function attributes always_inline or noinline.  Those are heavy handed, and I don't recommend using them until one has benchmarks or disassembly showing they're not getting inlining and need it. (I've seen codebases put always_inline on everything, at the extreme cost to compile time and resulting code size).  They should not be necessary here.

`inline` has interesting semantics when used with `static` or `extern` on definitions in a header. There's also a function attribute `gnu_inline`, changes to `extern inline` in C99 (and newer) modes, and a `-f` flag for `gnu_inline`/C90 `extern inline` semantics.

https://github.com/llvm/llvm-project/pull/86716