[libc-commits] [libc] a629621 - [libc][math] Improve atanf performance. (#85463)

Mon Mar 18 18:10:55 PDT 2024

Author: lntue
Date: 2024-03-18T21:10:50-04:00
New Revision: a629621454838207ca49bcfd773d5aa0671659f1

URL: https://github.com/llvm/llvm-project/commit/a629621454838207ca49bcfd773d5aa0671659f1
DIFF: https://github.com/llvm/llvm-project/commit/a629621454838207ca49bcfd773d5aa0671659f1.diff

LOG: [libc][math] Improve atanf performance. (#85463)

Simplify the range reduction logic and computations. Performance test
using `perf.sh` from CORE-MATH project on Ryzen 5900X:

Before:
```
$ ./perf.sh atanf --rdtsc --path1
LIBC-location: /home/lnt/experiment/llvm-project/build/projects/libc/lib/libllvmlibc.a
GNU libc version: 2.35
GNU libc release: stable
-- CORE-MATH reciprocal throughput --
[####################] 100 %
Ntrial = 20 ; Min = 14.369 + 0.556 clc/call; Median-Min = 0.613 clc/call; Max = 15.061 clc/call;
-- System LIBC reciprocal throughput --
[####################] 100 %
Ntrial = 20 ; Min = 26.180 + 0.015 clc/call; Median-Min = 0.014 clc/call; Max = 26.260 clc/call;
-- LIBC reciprocal throughput --
[####################] 100 %
Ntrial = 20 ; Min = 21.237 + 0.022 clc/call; Median-Min = 0.020 clc/call; Max = 21.272 clc/call;

$ ./perf.sh atanf --rdtsc --path1 --latency
LIBC-location: /home/lnt/experiment/llvm-project/build/projects/libc/lib/libllvmlibc.a
GNU libc version: 2.35
GNU libc release: stable
-- CORE-MATH latency --
[####################] 100 %
Ntrial = 20 ; Min = 50.505 + 0.045 clc/call; Median-Min = 0.037 clc/call; Max = 50.579 clc/call;
-- System LIBC latency --
[####################] 100 %
Ntrial = 20 ; Min = 62.438 + 0.836 clc/call; Median-Min = 0.049 clc/call; Max = 64.498 clc/call;
-- LIBC latency --
[####################] 100 %
Ntrial = 20 ; Min = 67.781 + 0.546 clc/call; Median-Min = 0.028 clc/call; Max = 68.844 clc/call;
```

After:
```
$ ./perf.sh atanf --rdtsc --path2
LIBC-location: /home/lnt/experiment/llvm/llvm-project/build/projects/libc/lib/libllvmlibc.a
GNU libc version: 2.35
GNU libc release: stable
-- CORE-MATH reciprocal throughput --
[####################] 100 %
Ntrial = 20 ; Min = 14.400 + 0.353 clc/call; Median-Min = 0.404 clc/call; Max = 14.863 clc/call;
-- System LIBC reciprocal throughput --
[####################] 100 %
Ntrial = 20 ; Min = 25.247 + 0.783 clc/call; Median-Min = 0.714 clc/call; Max = 26.238 clc/call;
-- LIBC reciprocal throughput --
[####################] 100 %
Ntrial = 20 ; Min = 13.751 + 0.158 clc/call; Median-Min = 0.140 clc/call; Max = 14.006 clc/call;

$ ./perf.sh atanf --rdtsc --path2 --latency
LIBC-location: /home/lnt/experiment/llvm/llvm-project/build/projects/libc/lib/libllvmlibc.a
GNU libc version: 2.35
GNU libc release: stable
-- CORE-MATH latency --
[####################] 100 %
Ntrial = 20 ; Min = 51.837 + 0.073 clc/call; Median-Min = 0.058 clc/call; Max = 52.000 clc/call;
-- System LIBC latency --
[####################] 100 %
Ntrial = 20 ; Min = 62.487 + 1.878 clc/call; Median-Min = 1.965 clc/call; Max = 64.569 clc/call;
OK
-- LIBC latency --
[####################] 100 %
Ntrial = 20 ; Min = 55.414 + 1.312 clc/call; Median-Min = 0.345 clc/call; Max = 58.362 clc/call;
```

Added: 
    

Modified: 
    libc/src/math/generic/CMakeLists.txt
    libc/src/math/generic/atanf.cpp
    libc/src/math/generic/inv_trigf_utils.cpp
    libc/src/math/generic/inv_trigf_utils.h
    libc/test/src/math/CMakeLists.txt
    libc/test/src/math/atanf_test.cpp
    libc/test/src/math/exhaustive/atanf_test.cpp
    utils/bazel/llvm-project-overlay/libc/BUILD.bazel

Removed: 
    libc/test/src/math/inv_trigf_utils_test.cpp


################################################################################
diff  --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index b0a35c652cd587..176c32e47768df 100644

--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -2155,16 +2155,9 @@ add_object_library(
   SRCS
     inv_trigf_utils.cpp
   DEPENDS
-    .math_utils
-    libc.src.__support.FPUtil.fp_bits
-    libc.src.__support.FPUtil.fenv_impl
-    libc.src.__support.FPUtil.nearest_integer
-    libc.src.__support.FPUtil.nearest_integer_operations
+    libc.src.__support.FPUtil.multiply_add
     libc.src.__support.FPUtil.polyeval
     libc.src.__support.common
-    libc.include.errno
-    libc.src.errno.errno
-    libc.include.math
 )
 
 add_entrypoint_object(
@@ -2211,8 +2204,11 @@ add_entrypoint_object(
     ../atanf.h
   DEPENDS
     .inv_trigf_utils
-    .math_utils
+    libc.src.__support.FPUtil.except_value_utils
     libc.src.__support.FPUtil.fp_bits
+    libc.src.__support.FPUtil.multiply_add
+    libc.src.__support.FPUtil.nearest_integer
+    libc.src.__support.FPUtil.polyeval
     libc.src.__support.FPUtil.rounding_mode
     libc.src.__support.macros.optimization
   COMPILE_OPTIONS

diff  --git a/libc/src/math/generic/atanf.cpp b/libc/src/math/generic/atanf.cpp
index e0f8a1bfc2ecc9..69fd45ddd767e5 100644
--- a/libc/src/math/generic/atanf.cpp
+++ b/libc/src/math/generic/atanf.cpp
@@ -7,11 +7,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/math/atanf.h"
-#include "math_utils.h"
+#include "inv_trigf_utils.h"
 #include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/FPUtil/except_value_utils.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/FPUtil/nearest_integer.h"
 #include "src/__support/FPUtil/rounding_mode.h"
 #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
-#include "src/math/generic/inv_trigf_utils.h"
 
 namespace LIBC_NAMESPACE {
 
@@ -19,48 +22,100 @@ LLVM_LIBC_FUNCTION(float, atanf, (float x)) {
   using FPBits = typename fputil::FPBits<float>;
   using Sign = fputil::Sign;
 
-  // x == 0.0
-  if (LIBC_UNLIKELY(x == 0.0f))
-    return x;
+  constexpr double FINAL_SIGN[2] = {1.0, -1.0};
+  constexpr double SIGNED_PI_OVER_2[2] = {0x1.921fb54442d18p0,
+                                          -0x1.921fb54442d18p0};
 
-  FPBits xbits(x);
-  Sign sign = xbits.sign();
-  xbits.set_sign(Sign::POS);
+  FPBits x_bits(x);
+  Sign sign = x_bits.sign();
+  x_bits.set_sign(Sign::POS);
+  uint32_t x_abs = x_bits.uintval();
 
-  if (LIBC_UNLIKELY(xbits.is_inf_or_nan())) {
-    if (xbits.is_inf())
-      return static_cast<float>(
-          opt_barrier(sign.is_neg() ? -M_MATH_PI_2 : M_MATH_PI_2));
-    else
+  // x is inf or nan, |x| < 2^-4 or |x|= > 16.
+  if (LIBC_UNLIKELY(x_abs <= 0x3d80'0000U || x_abs >= 0x4180'0000U)) {
+    double x_d = static_cast<double>(x);
+    double const_term = 0.0;
+    if (LIBC_UNLIKELY(x_abs >= 0x4180'0000)) {
+      // atan(+-Inf) = +-pi/2.
+      if (x_bits.is_inf())
+        return static_cast<float>(SIGNED_PI_OVER_2[sign.is_neg()]);
+      if (x_bits.is_nan())
+        return x;
+      // x >= 16
+      x_d = -1.0 / x_d;
+      const_term = SIGNED_PI_OVER_2[sign.is_neg()];
+    }
+    // 0 <= x < 1/16;
+    if (LIBC_UNLIKELY(x_bits.is_zero()))
       return x;
-  }
-  // |x| == 0.06905200332403183
-  if (LIBC_UNLIKELY(xbits.uintval() == 0x3d8d6b23U)) {
-    if (fputil::fenv_is_round_to_nearest()) {
-      // 0.06894256919622421
-      FPBits br(0x3d8d31c3U);
-      br.set_sign(sign);
-      return br.get_val();
+    // x <= 2^-12;
+    if (LIBC_UNLIKELY(x_abs < 0x3980'0000)) {
+#if defined(LIBC_TARGET_CPU_HAS_FMA)
+      return fputil::multiply_add(x, -0x1.0p-25f, x);
+#else
+      double x_d = static_cast<double>(x);
+      return static_cast<float>(fputil::multiply_add(x_d, -0x1.0p-25, x_d));
+#endif // LIBC_TARGET_CPU_HAS_FMA
     }
+    // Use Taylor polynomial:
+    //   atan(x) ~ x * (1 - x^2 / 3 + x^4 / 5 - x^6 / 7 + x^8 / 9 - x^10 / 11).
+    double x2 = x_d * x_d;
+    double x4 = x2 * x2;
+    double c0 = fputil::multiply_add(x2, ATAN_COEFFS[0][1], ATAN_COEFFS[0][0]);
+    double c1 = fputil::multiply_add(x2, ATAN_COEFFS[0][3], ATAN_COEFFS[0][2]);
+    double c2 = fputil::multiply_add(x2, ATAN_COEFFS[0][5], ATAN_COEFFS[0][4]);
+    double p = fputil::polyeval(x4, c0, c1, c2);
+    double r = fputil::multiply_add(x_d, p, const_term);
+    return static_cast<float>(r);
   }
 
-  // |x| == 1.8670953512191772
-  if (LIBC_UNLIKELY(xbits.uintval() == 0x3feefcfbU)) {
-    int rounding_mode = fputil::quick_get_round();
-    if (sign.is_neg()) {
-      if (rounding_mode == FE_DOWNWARD) {
-        // -1.0790828466415405
-        return FPBits(0xbf8a1f63U).get_val();
-      }
-    } else {
-      if (rounding_mode == FE_UPWARD) {
-        // 1.0790828466415405
-        return FPBits(0x3f8a1f63U).get_val();
-      }
+  // Range reduction steps:
+  // 1)  atan(x) = sign(x) * atan(|x|)
+  // 2)  If |x| > 1, atan(|x|) = pi/2 - atan(1/|x|)
+  // 3)  For 1/16 < x <= 1, we find k such that: |x - k/16| <= 1/32.
+  // 4)  Then we use polynomial approximation:
+  //   atan(x) ~ atan((k/16) + (x - (k/16)) * Q(x - k/16)
+  //           = P(x - k/16)
+  double x_d, const_term, final_sign;
+  int idx;
+
+  if (x_abs > 0x3f80'0000U) {
+    // Exceptional value:
+    if (LIBC_UNLIKELY(x_abs == 0x3ffe'2ec1U)) { // |x| = 0x1.fc5d82p+0
+      return sign.is_pos() ? fputil::round_result_slightly_up(0x1.1ab2fp0f)
+                           : fputil::round_result_slightly_down(-0x1.1ab2fp0f);
     }
+    // |x| > 1, we need to invert x, so we will perform range reduction in
+    // double precision.
+    x_d = 1.0 / static_cast<double>(x_bits.get_val());
+    double k_d = fputil::nearest_integer(x_d * 0x1.0p4);
+    x_d = fputil::multiply_add(k_d, -0x1.0p-4, x_d);
+    idx = static_cast<int>(k_d);
+    final_sign = FINAL_SIGN[sign.is_pos()];
+    // Adjust constant term of the polynomial by +- pi/2.
+    const_term = fputil::multiply_add(final_sign, ATAN_COEFFS[idx][0],
+                                      SIGNED_PI_OVER_2[sign.is_neg()]);
+  } else {
+    // Exceptional value:
+    if (LIBC_UNLIKELY(x_abs == 0x3dbb'6ac7U)) { // |x| = 0x1.76d58ep-4
+      return sign.is_pos()
+                 ? fputil::round_result_slightly_up(0x1.75cb06p-4f)
+                 : fputil::round_result_slightly_down(-0x1.75cb06p-4f);
+    }
+    // Perform range reduction in single precision.
+    float x_f = x_bits.get_val();
+    float k_f = fputil::nearest_integer(x_f * 0x1.0p4f);
+    x_f = fputil::multiply_add(k_f, -0x1.0p-4f, x_f);
+    x_d = static_cast<double>(x_f);
+    idx = static_cast<int>(k_f);
+    final_sign = FINAL_SIGN[sign.is_neg()];
+    const_term = final_sign * ATAN_COEFFS[idx][0];
   }
 
-  return static_cast<float>(atan_eval(x));
+  double p = atan_eval(x_d, idx);
+  double r = fputil::multiply_add(final_sign * x_d, p, const_term);
+
+  return static_cast<float>(r);
 }
 
 } // namespace LIBC_NAMESPACE

diff  --git a/libc/src/math/generic/inv_trigf_utils.cpp b/libc/src/math/generic/inv_trigf_utils.cpp
index 8013c0470affb1..93d5bcbf7b567d 100644
--- a/libc/src/math/generic/inv_trigf_utils.cpp
+++ b/libc/src/math/generic/inv_trigf_utils.cpp
@@ -10,19 +10,71 @@
 
 namespace LIBC_NAMESPACE {
 
-// N[Table[ArcTan[x], {x, 1/16, 16/16, 1/16}], 40]
-alignas(64) const double ATAN_T[ATAN_T_SIZE] = {
-    0x1.ff55bb72cfdeap-5, 0x1.fd5ba9aac2f6ep-4, 0x1.7b97b4bce5b02p-3,
-    0x1.f5b75f92c80ddp-3, 0x1.362773707ebccp-2, 0x1.6f61941e4def1p-2,
-    0x1.a64eec3cc23fdp-2, 0x1.dac670561bb4fp-2, 0x1.0657e94db30d0p-1,
-    0x1.1e00babdefeb4p-1, 0x1.345f01cce37bbp-1, 0x1.4978fa3269ee1p-1,
-    0x1.5d58987169b18p-1, 0x1.700a7c5784634p-1, 0x1.819d0b7158a4dp-1,
-    0x1.921fb54442d18p-1};
-
-// for(int i = 0; i < 5; i++)
-//     printf("%.13a,\n", (-2 * (i % 2) + 1) * 1.0 / (2 * i + 1));
-alignas(64) const double ATAN_K[5] = {
-    0x1.0000000000000p+0, -0x1.5555555555555p-2, 0x1.999999999999ap-3,
-    -0x1.2492492492492p-3, 0x1.c71c71c71c71cp-4};
+// Polynomial approximation for 0 <= x <= 1:
+//   atan(x) ~ atan((i/16) + (x - (i/16)) * Q(x - i/16)
+//           = P(x - i/16)
+// Generated by Sollya with:
+// > for i from 1 to 16 do {
+//     mid_point = i/16;
+//     P = fpminimax(atan(mid_point + x), 7, [|D...|], [-1/32, 1/32]);
+//     print("{", coeff(P, 0), ",", coeff(P, 1), ",", coeff(P, 2), ",",
+//           coeff(P, 3), ",", coeff(P, 4), ",", coeff(P, 5), ",", coeff(P, 6),
+//           ",", coeff(P, 7), "},");
+//   };
+// For i = 0, ATAN_COEFFS[0][j] = (-1)^j * (1/(2*j + 1)) is the odd coefficients
+// of the Taylor polynomial of atan(x).
+double ATAN_COEFFS[17][8] = {
+    {0x1.0000000000000p+0, -0x1.5555555555555p-2, 0x1.999999999999ap-3,
+     -0x1.2492492492492p-3, 0x1.c71c71c71c71cp-4, -0x1.745d1745d1746p-4,
+     0x1.3b13b13b13b14p-4, -0x1.1111111111111p-4},
+    {0x1.ff55bb72cfdb1p-5, 0x1.fe01fe01fe1bp-1, -0x1.fc05f80821d1ap-5,
+     -0x1.4d6930419fc5fp-2, 0x1.f61b9f6d69313p-5, 0x1.8208a32f4346cp-3,
+     -0x1.ecb8fc53d04efp-5, -0x1.060710cb59cbcp-3},
+    {0x1.fd5ba9aac2f3cp-4, 0x1.f81f81f81f96ap-1, -0x1.f05e09cf4c1b2p-4,
+     -0x1.368c3aac7543ep-2, 0x1.d9b14bddfac55p-4, 0x1.4048e55ec725ep-3,
+     -0x1.b98ca3c1594b5p-4, -0x1.664eabaabbc16p-4},
+    {0x1.7b97b4bce5ae7p-3, 0x1.ee9c7f8458f06p-1, -0x1.665c226c8dc69p-3,
+     -0x1.1344bb77961b7p-2, 0x1.42ac97745d3ccp-3, 0x1.c32e142047ec1p-4,
+     -0x1.137ae41ab96cbp-3, -0x1.1a6ae8c09a4b6p-5},
+    {0x1.f5b75f92c80c6p-3, 0x1.e1e1e1e1e1ed4p-1, -0x1.c5894d101ad4p-3,
+     -0x1.ce6de02b38c38p-3, 0x1.78a3920c336b9p-3, 0x1.dd5ff94a9d499p-5,
+     -0x1.1ac2d3f9d072ep-3, 0x1.0af9735dff373p-6},
+    {0x1.362773707ebc5p-2, 0x1.d272ca3fc5b8bp-1, -0x1.0997e8ae90cb6p-2,
+     -0x1.6cf6667146798p-3, 0x1.8dd1dff17f3d3p-3, 0x1.24860eced656fp-7,
+     -0x1.f4220e8f18ed5p-4, 0x1.b700aed7cdc34p-5},
+    {0x1.6f61941e4deeep-2, 0x1.c0e070381c115p-1, -0x1.2726dd1347c7ep-2,
+     -0x1.09f37b3ad010dp-3, 0x1.85eaca5196f5cp-3, -0x1.04d640117852ap-5,
+     -0x1.802c2956871c7p-4, 0x1.2992b45df0ee7p-4},
+    {0x1.a64eec3cc23fep-2, 0x1.adbe87f94906bp-1, -0x1.3b9d8eab5eae5p-2,
+     -0x1.57c09646faabbp-4, 0x1.6795330e73aep-3, -0x1.f2d89a702a652p-5,
+     -0x1.f3afd90a9d4d7p-5, 0x1.3261723d3f153p-4},
+    {0x1.dac670561bb53p-2, 0x1.999999999998fp-1, -0x1.47ae147afd8cap-2,
+     -0x1.5d867c3dfd72ap-5, 0x1.3a92a76cba833p-3, -0x1.3ec460286928ap-4,
+     -0x1.ed02ff86892acp-6, 0x1.0a674c8f05727p-4},
+    {0x1.0657e94db30d2p-1, 0x1.84f00c27805ffp-1, -0x1.4c62cb564f677p-2,
+     -0x1.e6495b262dfe7p-8, 0x1.063c34eca262bp-3, -0x1.58b78dc79b5aep-4,
+     -0x1.4623815233be1p-8, 0x1.93afe94328089p-5},
+    {0x1.1e00babdefeb6p-1, 0x1.702e05c0b8159p-1, -0x1.4af2b78236bd6p-2,
+     0x1.5d0b7ea46ed08p-6, 0x1.a124870236935p-4, -0x1.519e1ec133a88p-4,
+     0x1.a54632a3f48c7p-7, 0x1.099ca0945096dp-5},
+    {0x1.345f01cce37bdp-1, 0x1.5babcc647fa7ep-1, -0x1.449db09443a67p-2,
+     0x1.655caac78a0fcp-5, 0x1.3bbbdb0d09efap-4, -0x1.34a306c27e021p-4,
+     0x1.83fe749c7966p-6, 0x1.2057cc96d9edcp-6},
+    {0x1.4978fa3269ee2p-1, 0x1.47ae147ae146bp-1, -0x1.3a92a305652e1p-2,
+     0x1.ec21b5172657fp-5, 0x1.c2f8c45d2f4eep-5, -0x1.0ba99c4aeb8acp-4,
+     0x1.d716a4af4d1d6p-6, 0x1.97fba0a9696dep-8},
+    {0x1.5d58987169b19p-1, 0x1.34679ace0133cp-1, -0x1.2ddfb03920e2fp-2,
+     0x1.2491307c0fa0bp-4, 0x1.29c7eca0136fp-5, -0x1.bca792caa6f1cp-5,
+     0x1.e5d92545576bcp-6, -0x1.8ca76fcf5ccd2p-10},
+    {0x1.700a7c5784634p-1, 0x1.21fb78121fb71p-1, -0x1.1f6a8499ea541p-2,
+     0x1.41b15e5e77bcfp-4, 0x1.59bc9bf54fb02p-6, -0x1.63b54ff058e0fp-5,
+     0x1.c8da01221306fp-6, -0x1.906b2c274c39cp-8},
+    {0x1.819d0b7158a4dp-1, 0x1.107fbbe01107cp-1, -0x1.0feeb40897d4ep-2,
+     0x1.50e5afb95f5d6p-4, 0x1.2a7c2f0c7495dp-7, -0x1.12bd2bb5062cdp-5,
+     0x1.93e8ceb89afebp-6, -0x1.10da9b8c6b731p-7},
+    {0x1.921fb54442d18p-1, 0x1.fffffffffffebp-2, -0x1.fffffffffcbbcp-3,
+     0x1.555555564e2fep-4, -0x1.20b17d5dd89dcp-30, -0x1.9999c5ad71711p-6,
+     0x1.5558b76e7aaf9p-6, -0x1.236e803c6c1f6p-7},
+};
 
 } // namespace LIBC_NAMESPACE

diff  --git a/libc/src/math/generic/inv_trigf_utils.h b/libc/src/math/generic/inv_trigf_utils.h
index 20f912de2ac008..308bdc44e3f8ac 100644
--- a/libc/src/math/generic/inv_trigf_utils.h
+++ b/libc/src/math/generic/inv_trigf_utils.h
@@ -9,85 +9,32 @@
 #ifndef LLVM_LIBC_SRC_MATH_GENERIC_INV_TRIGF_UTILS_H
 #define LLVM_LIBC_SRC_MATH_GENERIC_INV_TRIGF_UTILS_H
 
-#include "math_utils.h"
-#include "src/__support/FPUtil/FEnvImpl.h"
-#include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/FPUtil/PolyEval.h"
-#include "src/__support/FPUtil/nearest_integer.h"
+#include "src/__support/FPUtil/multiply_add.h"
 #include "src/__support/common.h"
 
-#include <errno.h>
-
 namespace LIBC_NAMESPACE {
 
 // PI and PI / 2
 constexpr double M_MATH_PI = 0x1.921fb54442d18p+1;
 constexpr double M_MATH_PI_2 = 0x1.921fb54442d18p+0;
 
-// atan table size
-constexpr int ATAN_T_BITS = 4;
-constexpr int ATAN_T_SIZE = 1 << ATAN_T_BITS;
-
-// N[Table[ArcTan[x], {x, 1/8, 8/8, 1/8}], 40]
-extern const double ATAN_T[ATAN_T_SIZE];
-extern const double ATAN_K[5];
-
-// The main idea of the function is to use formula
-// atan(u) + atan(v) = atan((u+v)/(1-uv))
-
-// x should be positive, normal finite value
-LIBC_INLINE double atan_eval(double x) {
-  using FPB = fputil::FPBits<double>;
-  using Sign = fputil::Sign;
-  // Added some small value to umin and umax mantissa to avoid possible rounding
-  // errors.
-  FPB::StorageType umin =
-      FPB::create_value(Sign::POS, FPB::EXP_BIAS - ATAN_T_BITS - 1,
-                        0x100000000000UL)
-          .uintval();
-  FPB::StorageType umax =
-      FPB::create_value(Sign::POS, FPB::EXP_BIAS + ATAN_T_BITS,
-                        0xF000000000000UL)
-          .uintval();
-
-  FPB bs(x);
-  auto x_abs = bs.abs().uintval();
-
-  if (x_abs <= umin) {
-    double pe = LIBC_NAMESPACE::fputil::polyeval(
-        x * x, 0.0, ATAN_K[1], ATAN_K[2], ATAN_K[3], ATAN_K[4]);
-    return fputil::multiply_add(pe, x, x);
-  }
-
-  if (x_abs >= umax) {
-    double one_over_x_m = -1.0 / x;
-    double one_over_x2 = one_over_x_m * one_over_x_m;
-    double pe = LIBC_NAMESPACE::fputil::polyeval(
-        one_over_x2, ATAN_K[0], ATAN_K[1], ATAN_K[2], ATAN_K[3]);
-    return fputil::multiply_add(pe, one_over_x_m,
-                                bs.is_neg() ? (-M_MATH_PI_2) : (M_MATH_PI_2));
-  }
+extern double ATAN_COEFFS[17][8];
 
-  double pos_x = FPB(x_abs).get_val();
-  bool one_over_x = pos_x > 1.0;
-  if (one_over_x) {
-    pos_x = 1.0 / pos_x;
-  }
+// For |x| <= 1/32 and 1 <= i <= 16, return Q(x) such that:
+//   Q(x) ~ (atan(x + i/16) - atan(i/16)) / x.
+LIBC_INLINE constexpr double atan_eval(double x, int i) {
+  double x2 = x * x;
 
-  double near_x = fputil::nearest_integer(pos_x * ATAN_T_SIZE);
-  int val = static_cast<int>(near_x);
-  near_x *= 1.0 / ATAN_T_SIZE;
+  double c0 = fputil::multiply_add(x, ATAN_COEFFS[i][2], ATAN_COEFFS[i][1]);
+  double c1 = fputil::multiply_add(x, ATAN_COEFFS[i][4], ATAN_COEFFS[i][3]);
+  double c2 = fputil::multiply_add(x, ATAN_COEFFS[i][6], ATAN_COEFFS[i][5]);
 
-  double v = (pos_x - near_x) / fputil::multiply_add(near_x, pos_x, 1.0);
-  double v2 = v * v;
-  double pe = LIBC_NAMESPACE::fputil::polyeval(v2, ATAN_K[0], ATAN_K[1],
-                                               ATAN_K[2], ATAN_K[3], ATAN_K[4]);
-  double result;
-  if (one_over_x)
-    result = M_MATH_PI_2 - fputil::multiply_add(pe, v, ATAN_T[val - 1]);
-  else
-    result = fputil::multiply_add(pe, v, ATAN_T[val - 1]);
-  return bs.is_neg() ? -result : result;
+  double x4 = x2 * x2;
+  double d1 = fputil::multiply_add(x2, c1, c0);
+  double d2 = fputil::multiply_add(x2, ATAN_COEFFS[i][7], c2);
+  double p = fputil::multiply_add(x4, d2, d1);
+  return p;
 }
 
 // > Q = fpminimax(asin(x)/x, [|0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20|],
@@ -99,7 +46,7 @@ constexpr double ASIN_COEFFS[10] = {0x1.5555555540fa1p-3, 0x1.333333512edc2p-4,
                                     -0x1.df946fa875ddp-8, 0x1.02311ecf99c28p-5};
 
 // Evaluate P(x^2) - 1, where P(x^2) ~ asin(x)/x
-LIBC_INLINE double asin_eval(double xsq) {
+LIBC_INLINE constexpr double asin_eval(double xsq) {
   double x4 = xsq * xsq;
   double r1 = fputil::polyeval(x4, ASIN_COEFFS[0], ASIN_COEFFS[2],
                                ASIN_COEFFS[4], ASIN_COEFFS[6], ASIN_COEFFS[8]);

diff  --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt
index 9d62b1cf7ef730..026bcd12928bdf 100644
--- a/libc/test/src/math/CMakeLists.txt
+++ b/libc/test/src/math/CMakeLists.txt
@@ -1645,20 +1645,6 @@ add_fp_unittest(
     libc.src.__support.FPUtil.fp_bits
 )
 
-add_fp_unittest(
-  inv_trigf_utils_test
-  NEED_MPFR
-  SUITE
-    libc-math-unittests
-  HDRS
-    in_float_range_test_helper.h
-  SRCS
-    inv_trigf_utils_test.cpp
-  DEPENDS
-    libc.src.math.generic.inv_trigf_utils
-    libc.src.__support.FPUtil.fp_bits
-)
-
 add_fp_unittest(
   scalbn_test
   NEED_MPFR

diff  --git a/libc/test/src/math/atanf_test.cpp b/libc/test/src/math/atanf_test.cpp
index fdd69d3dc7914f..e51932fc495525 100644
--- a/libc/test/src/math/atanf_test.cpp
+++ b/libc/test/src/math/atanf_test.cpp
@@ -53,8 +53,15 @@ TEST_F(LlvmLibcAtanfTest, InFloatRange) {
 
 // For small values, tanh(x) is x.
 TEST_F(LlvmLibcAtanfTest, SpecialValues) {
-  uint32_t val_arr[] = {0x3d8d6b23U, 0x3feefcfbU, 0xbd8d6b23U,
-                        0xbfeefcfbU, 0x7F800000U, 0xFF800000U};
+  uint32_t val_arr[] = {
+      0x3d8d6b23U, // x = 0x1.1ad646p-4f
+      0x3feefcfbU, // x = 0x1.ddf9f6p+0f
+      0xbd8d6b23U, // x = -0x1.1ad646p-4f
+      0xbfeefcfbU, // x = -0x1.ddf9f6p+0f
+      0x7F800000U, // x = +Inf
+      0xFF800000U, // x = -Inf
+      0xbffe2ec1U, // x = -0x1.fc5d82p+0f
+  };
   for (uint32_t v : val_arr) {
     float x = FPBits(v).get_val();
     EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Atan, x,

diff  --git a/libc/test/src/math/exhaustive/atanf_test.cpp b/libc/test/src/math/exhaustive/atanf_test.cpp
index 508c288b050c55..6f23231d426741 100644
--- a/libc/test/src/math/exhaustive/atanf_test.cpp
+++ b/libc/test/src/math/exhaustive/atanf_test.cpp
@@ -25,7 +25,7 @@ TEST_F(LlvmLibcAtanfExhaustiveTest, PostiveRange) {
 }
 
 // Range: [-Inf, 0];
-static constexpr uint32_t NEG_START = 0xb000'0000U;
+static constexpr uint32_t NEG_START = 0x8000'0000U;
 static constexpr uint32_t NEG_STOP = 0xff80'0000U;
 
 TEST_F(LlvmLibcAtanfExhaustiveTest, NegativeRange) {

diff  --git a/libc/test/src/math/inv_trigf_utils_test.cpp b/libc/test/src/math/inv_trigf_utils_test.cpp
deleted file mode 100644
index ebf21765b26d28..00000000000000
--- a/libc/test/src/math/inv_trigf_utils_test.cpp
+++ /dev/null
@@ -1,34 +0,0 @@
-//===-- Unittests for supfuncf --------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "in_float_range_test_helper.h"
-#include "include/llvm-libc-macros/math-macros.h"
-#include "src/__support/FPUtil/FPBits.h"
-#include "src/math/generic/inv_trigf_utils.h"
-#include "test/UnitTest/FPMatcher.h"
-#include "test/UnitTest/Test.h"
-#include "utils/MPFRWrapper/MPFRUtils.h"
-
-using LlvmLibcAtanfTest = LIBC_NAMESPACE::testing::FPTest<float>;
-
-namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
-
-constexpr int def_count = 100003;
-constexpr float def_prec = 0.500001f;
-
-auto f_normal = [](float x) -> bool { return !(isnan(x) || isinf(x)); };
-
-TEST_F(LlvmLibcAtanfTest, InPositiveRange) {
-  CHECK_DATA(0.0f, inf, mpfr::Operation::Atan, LIBC_NAMESPACE::atan_eval,
-             f_normal, def_count, def_prec);
-}
-
-TEST_F(LlvmLibcAtanfTest, InNegativeRange) {
-  CHECK_DATA(-0.0f, neg_inf, mpfr::Operation::Atan, LIBC_NAMESPACE::atan_eval,
-             f_normal, def_count, def_prec);
-}

diff  --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 0ec8e350281df9..01bb8b4298cdef 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -1252,13 +1252,9 @@ libc_support_library(
     hdrs = ["src/math/generic/inv_trigf_utils.h"],
     deps = [
         ":__support_common",
-        ":__support_fputil_fenv_impl",
         ":__support_fputil_fma",
-        ":__support_fputil_fp_bits",
         ":__support_fputil_multiply_add",
-        ":__support_fputil_nearest_integer",
         ":__support_fputil_polyeval",
-        ":math_utils",
     ],
 )