[libc-commits] [libc] [libc][math][c23] Add f16fmaf C23 math function (PR #95483)

Fri Jun 14 08:38:55 PDT 2024

https://github.com/overmighty updated https://github.com/llvm/llvm-project/pull/95483

>From 0f9a842b067f0ce4868c68219b0364ae8863935a Mon Sep 17 00:00:00 2001
From: OverMighty <its.overmighty at gmail.com>
Date: Fri, 14 Jun 2024 00:57:03 +0200
Subject: [PATCH 01/10] [libc][math][c23] Add f16fmaf C23 math function

---
 libc/config/linux/aarch64/entrypoints.txt   |   1 +
 libc/config/linux/x86_64/entrypoints.txt    |   1 +
 libc/docs/math/index.rst                    |   2 +
 libc/spec/stdc.td                           |   2 +
 libc/src/__support/FPUtil/FMA.h             |  32 ++---
 libc/src/__support/FPUtil/generic/FMA.h     | 145 +++++++++++--------
 libc/src/__support/FPUtil/multiply_add.h    |   4 +-
 libc/src/__support/big_int.h                |  16 +++
 libc/src/math/CMakeLists.txt                |   2 +
 libc/src/math/f16fmaf.h                     |  20 +++
 libc/src/math/generic/CMakeLists.txt        |  12 ++
 libc/src/math/generic/expm1f.cpp            |   2 +-
 libc/src/math/generic/f16fmaf.cpp           |  19 +++
 libc/src/math/generic/fma.cpp               |   2 +-
 libc/src/math/generic/fmaf.cpp              |   2 +-
 libc/src/math/generic/range_reduction_fma.h |  25 ++--
 libc/test/src/math/CMakeLists.txt           |  21 ++-
 libc/test/src/math/FmaTest.h                | 147 ++++++++++++--------
 libc/test/src/math/f16fmaf_test.cpp         |  25 ++++
 libc/test/src/math/smoke/CMakeLists.txt     |  18 ++-
 libc/test/src/math/smoke/FmaTest.h          |  96 ++++++++-----
 libc/test/src/math/smoke/f16fmaf_test.cpp   |  13 ++
 libc/test/src/math/smoke/fma_test.cpp       |   6 +-
 libc/test/src/math/smoke/fmaf_test.cpp      |   6 +-
 libc/test/src/sched/CMakeLists.txt          |  34 ++---
 libc/utils/MPFRWrapper/MPFRUtils.cpp        |  78 ++++++-----
 libc/utils/MPFRWrapper/MPFRUtils.h          |  48 +++++--
 27 files changed, 513 insertions(+), 266 deletions(-)
 create mode 100644 libc/src/math/f16fmaf.h
 create mode 100644 libc/src/math/generic/f16fmaf.cpp
 create mode 100644 libc/test/src/math/f16fmaf_test.cpp
 create mode 100644 libc/test/src/math/smoke/f16fmaf_test.cpp

diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt
index 2b2d0985a8992..dab747ca0ac74 100644
--- a/libc/config/linux/aarch64/entrypoints.txt
+++ b/libc/config/linux/aarch64/entrypoints.txt
@@ -503,6 +503,7 @@ if(LIBC_TYPES_HAS_FLOAT16)
     libc.src.math.canonicalizef16
     libc.src.math.ceilf16
     libc.src.math.copysignf16
+    libc.src.math.f16fmaf
     libc.src.math.f16sqrtf
     libc.src.math.fabsf16
     libc.src.math.fdimf16
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index 2d36ca296c3a4..45914fe9f7ad2 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -535,6 +535,7 @@ if(LIBC_TYPES_HAS_FLOAT16)
     libc.src.math.canonicalizef16
     libc.src.math.ceilf16
     libc.src.math.copysignf16
+    libc.src.math.f16fmaf
     libc.src.math.f16sqrtf
     libc.src.math.fabsf16
     libc.src.math.fdimf16
diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst
index 790786147c164..293edd1c15100 100644
--- a/libc/docs/math/index.rst
+++ b/libc/docs/math/index.rst
@@ -124,6 +124,8 @@ Basic Operations
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | dsub             | N/A              | N/A             |                        | N/A                  |                        | 7.12.14.2              | F.10.11                    |
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
+| f16fma           | |check|          |                 |                        | N/A                  |                        | 7.12.14.5              | F.10.11                    |
++------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | fabs             | |check|          | |check|         | |check|                | |check|              | |check|                | 7.12.7.3               | F.10.4.3                   |
 +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | fadd             | N/A              |                 |                        | N/A                  |                        | 7.12.14.1              | F.10.11                    |
diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td
index 7c4135032a0b2..b089b596b0958 100644
--- a/libc/spec/stdc.td
+++ b/libc/spec/stdc.td
@@ -715,6 +715,8 @@ def StdC : StandardSpec<"stdc"> {
 
           GuardedFunctionSpec<"totalordermagf16", RetValSpec<IntType>, [ArgSpec<Float16Ptr>, ArgSpec<Float16Ptr>], "LIBC_TYPES_HAS_FLOAT16">,
 
+          GuardedFunctionSpec<"f16fmaf", RetValSpec<Float16Type>, [ArgSpec<FloatType>, ArgSpec<FloatType>, ArgSpec<FloatType>], "LIBC_TYPES_HAS_FLOAT16">,
+
           GuardedFunctionSpec<"f16sqrtf", RetValSpec<Float16Type>, [ArgSpec<FloatType>], "LIBC_TYPES_HAS_FLOAT16">,
       ]
   >;
diff --git a/libc/src/__support/FPUtil/FMA.h b/libc/src/__support/FPUtil/FMA.h
index c277da49538bf..cf01a317d7359 100644
--- a/libc/src/__support/FPUtil/FMA.h
+++ b/libc/src/__support/FPUtil/FMA.h
@@ -10,41 +10,29 @@
 #define LLVM_LIBC_SRC___SUPPORT_FPUTIL_FMA_H
 
 #include "src/__support/CPP/type_traits.h"
+#include "src/__support/FPUtil/generic/FMA.h"
 #include "src/__support/macros/properties/architectures.h"
 #include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA
 
-#if defined(LIBC_TARGET_CPU_HAS_FMA)
-
 namespace LIBC_NAMESPACE {
 namespace fputil {
 
-template <typename T>
-LIBC_INLINE cpp::enable_if_t<cpp::is_same_v<T, float>, T> fma(T x, T y, T z) {
-  return __builtin_fmaf(x, y, z);
+template <typename OutType, typename InType>
+LIBC_INLINE OutType fma(InType x, InType y, InType z) {
+  return generic::fma<OutType>(x, y, z);
 }
 
-template <typename T>
-LIBC_INLINE cpp::enable_if_t<cpp::is_same_v<T, double>, T> fma(T x, T y, T z) {
-  return __builtin_fma(x, y, z);
+#ifdef LIBC_TARGET_CPU_HAS_FMA
+template <> LIBC_INLINE float fma(float x, float y, float z) {
+  return __builtin_fmaf(x, y, z);
 }
 
-} // namespace fputil
-} // namespace LIBC_NAMESPACE
-
-#else
-// FMA instructions are not available
-#include "generic/FMA.h"
-
-namespace LIBC_NAMESPACE {
-namespace fputil {
-
-template <typename T> LIBC_INLINE T fma(T x, T y, T z) {
-  return generic::fma(x, y, z);
+template <> LIBC_INLINE double fma(double x, double y, double z) {
+  return __builtin_fma(x, y, z);
 }
+#endif // LIBC_TARGET_CPU_HAS_FMA
 
 } // namespace fputil
 } // namespace LIBC_NAMESPACE
 
-#endif
-
 #endif // LLVM_LIBC_SRC___SUPPORT_FPUTIL_FMA_H
diff --git a/libc/src/__support/FPUtil/generic/FMA.h b/libc/src/__support/FPUtil/generic/FMA.h
index f403aa7333b39..3bbb30476e5d8 100644
--- a/libc/src/__support/FPUtil/generic/FMA.h
+++ b/libc/src/__support/FPUtil/generic/FMA.h
@@ -10,20 +10,28 @@
 #define LLVM_LIBC_SRC___SUPPORT_FPUTIL_GENERIC_FMA_H
 
 #include "src/__support/CPP/bit.h"
+#include "src/__support/CPP/limits.h"
 #include "src/__support/CPP/type_traits.h"
-#include "src/__support/FPUtil/FEnvImpl.h"
 #include "src/__support/FPUtil/FPBits.h"
 #include "src/__support/FPUtil/rounding_mode.h"
+#include "src/__support/big_int.h"
 #include "src/__support/macros/attributes.h"   // LIBC_INLINE
 #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
-#include "src/__support/uint128.h"
+
+#include "hdr/fenv_macros.h"
 
 namespace LIBC_NAMESPACE {
 namespace fputil {
 namespace generic {
 
-template <typename T> LIBC_INLINE T fma(T x, T y, T z);
+template <typename OutType, typename InType>
+LIBC_INLINE cpp::enable_if_t<cpp::is_floating_point_v<OutType> &&
+                                 cpp::is_floating_point_v<InType> &&
+                                 sizeof(OutType) <= sizeof(InType),
+                             OutType>
+fma(InType x, InType y, InType z);
 
+#ifndef LIBC_TARGET_CPU_HAS_FMA
 // TODO(lntue): Implement fmaf that is correctly rounded to all rounding modes.
 // The implementation below only is only correct for the default rounding mode,
 // round-to-nearest tie-to-even.
@@ -74,17 +82,20 @@ template <> LIBC_INLINE float fma<float>(float x, float y, float z) {
 
   return static_cast<float>(bit_sum.get_val());
 }
+#endif // LIBC_TARGET_CPU_HAS_FMA
 
 namespace internal {
 
 // Extract the sticky bits and shift the `mantissa` to the right by
 // `shift_length`.
-LIBC_INLINE bool shift_mantissa(int shift_length, UInt128 &mant) {
-  if (shift_length >= 128) {
+template <typename T>
+LIBC_INLINE cpp::enable_if_t<is_unsigned_integral_or_big_int_v<T>, bool>
+shift_mantissa(int shift_length, T &mant) {
+  if (shift_length >= cpp::numeric_limits<T>::digits) {
     mant = 0;
     return true; // prod_mant is non-zero.
   }
-  UInt128 mask = (UInt128(1) << shift_length) - 1;
+  T mask = (T(1) << shift_length) - 1;
   bool sticky_bits = (mant & mask) != 0;
   mant >>= shift_length;
   return sticky_bits;
@@ -92,11 +103,29 @@ LIBC_INLINE bool shift_mantissa(int shift_length, UInt128 &mant) {
 
 } // namespace internal
 
-template <> LIBC_INLINE double fma<double>(double x, double y, double z) {
-  using FPBits = fputil::FPBits<double>;
+template <typename OutType, typename InType>
+LIBC_INLINE cpp::enable_if_t<cpp::is_floating_point_v<OutType> &&
+                                 cpp::is_floating_point_v<InType> &&
+                                 sizeof(OutType) <= sizeof(InType),
+                             OutType>
+fma(InType x, InType y, InType z) {
+  using OutFPBits = fputil::FPBits<OutType>;
+  using OutStorageType = typename OutFPBits::StorageType;
+  using InFPBits = fputil::FPBits<InType>;
+  using InStorageType = typename InFPBits::StorageType;
+
+  constexpr int IN_EXPLICIT_MANT_LEN = InFPBits::FRACTION_LEN + 1;
+  constexpr size_t PROD_LEN = 2 * (IN_EXPLICIT_MANT_LEN);
+  constexpr size_t TMP_RESULT_LEN = cpp::bit_ceil(PROD_LEN + 1);
+  using TmpResultType = UInt<TMP_RESULT_LEN>;
+
+  constexpr size_t EXTRA_FRACTION_LEN =
+      TMP_RESULT_LEN - 1 - OutFPBits::FRACTION_LEN;
+  constexpr TmpResultType EXTRA_FRACTION_STICKY_MASK =
+      (TmpResultType(1) << (EXTRA_FRACTION_LEN - 1)) - 1;
 
   if (LIBC_UNLIKELY(x == 0 || y == 0 || z == 0)) {
-    return x * y + z;
+    return static_cast<OutType>(x * y + z);
   }
 
   int x_exp = 0;
@@ -104,35 +133,35 @@ template <> LIBC_INLINE double fma<double>(double x, double y, double z) {
   int z_exp = 0;
 
   // Normalize denormal inputs.
-  if (LIBC_UNLIKELY(FPBits(x).is_subnormal())) {
-    x_exp -= 52;
-    x *= 0x1.0p+52;
+  if (LIBC_UNLIKELY(InFPBits(x).is_subnormal())) {
+    x_exp -= InFPBits::FRACTION_LEN;
+    x *= InType(InStorageType(1) << InFPBits::FRACTION_LEN);
   }
-  if (LIBC_UNLIKELY(FPBits(y).is_subnormal())) {
-    y_exp -= 52;
-    y *= 0x1.0p+52;
+  if (LIBC_UNLIKELY(InFPBits(y).is_subnormal())) {
+    y_exp -= InFPBits::FRACTION_LEN;
+    y *= InType(InStorageType(1) << InFPBits::FRACTION_LEN);
   }
-  if (LIBC_UNLIKELY(FPBits(z).is_subnormal())) {
-    z_exp -= 52;
-    z *= 0x1.0p+52;
+  if (LIBC_UNLIKELY(InFPBits(z).is_subnormal())) {
+    z_exp -= InFPBits::FRACTION_LEN;
+    z *= InType(InStorageType(1) << InFPBits::FRACTION_LEN);
   }
 
-  FPBits x_bits(x), y_bits(y), z_bits(z);
+  InFPBits x_bits(x), y_bits(y), z_bits(z);
   const Sign z_sign = z_bits.sign();
   Sign prod_sign = (x_bits.sign() == y_bits.sign()) ? Sign::POS : Sign::NEG;
   x_exp += x_bits.get_biased_exponent();
   y_exp += y_bits.get_biased_exponent();
   z_exp += z_bits.get_biased_exponent();
 
-  if (LIBC_UNLIKELY(x_exp == FPBits::MAX_BIASED_EXPONENT ||
-                    y_exp == FPBits::MAX_BIASED_EXPONENT ||
-                    z_exp == FPBits::MAX_BIASED_EXPONENT))
-    return x * y + z;
+  if (LIBC_UNLIKELY(x_exp == InFPBits::MAX_BIASED_EXPONENT ||
+                    y_exp == InFPBits::MAX_BIASED_EXPONENT ||
+                    z_exp == InFPBits::MAX_BIASED_EXPONENT))
+    return static_cast<OutType>(x * y + z);
 
   // Extract mantissa and append hidden leading bits.
-  UInt128 x_mant = x_bits.get_explicit_mantissa();
-  UInt128 y_mant = y_bits.get_explicit_mantissa();
-  UInt128 z_mant = z_bits.get_explicit_mantissa();
+  InStorageType x_mant = x_bits.get_explicit_mantissa();
+  InStorageType y_mant = y_bits.get_explicit_mantissa();
+  TmpResultType z_mant = z_bits.get_explicit_mantissa();
 
   // If the exponent of the product x*y > the exponent of z, then no extra
   // precision beside the entire product x*y is needed.  On the other hand, when
@@ -144,21 +173,24 @@ template <> LIBC_INLINE double fma<double>(double x, double y, double z) {
   // - prod :     1bb...bb....b
   // In that case, in order to store the exact result, we need at least
   //   (Length of prod) - (MantissaLength of z) = 2*(52 + 1) - 52 = 54.
+  // TODO:                            53? (Explicit mantissa.) ^
   // Overall, before aligning the mantissas and exponents, we can simply left-
   // shift the mantissa of z by at least 54, and left-shift the product of x*y
   // by (that amount - 52).  After that, it is enough to align the least
+  // TODO:             ^ 54?
   // significant bit, given that we keep track of the round and sticky bits
   // after the least significant bit.
   // We pick shifting z_mant by 64 bits so that technically we can simply use
   // the original mantissa as high part when constructing 128-bit z_mant. So the
   // mantissa of prod will be left-shifted by 64 - 54 = 10 initially.
 
-  UInt128 prod_mant = x_mant * y_mant << 10;
+  TmpResultType prod_mant = TmpResultType(x_mant) * y_mant;
   int prod_lsb_exp =
-      x_exp + y_exp - (FPBits::EXP_BIAS + 2 * FPBits::FRACTION_LEN + 10);
+      x_exp + y_exp - (InFPBits::EXP_BIAS + 2 * InFPBits::FRACTION_LEN);
 
-  z_mant <<= 64;
-  int z_lsb_exp = z_exp - (FPBits::FRACTION_LEN + 64);
+  constexpr int RESULT_MIN_LEN = PROD_LEN - InFPBits::FRACTION_LEN;
+  z_mant <<= RESULT_MIN_LEN;
+  int z_lsb_exp = z_exp - (InFPBits::FRACTION_LEN + RESULT_MIN_LEN);
   bool round_bit = false;
   bool sticky_bits = false;
   bool z_shifted = false;
@@ -198,46 +230,40 @@ template <> LIBC_INLINE double fma<double>(double x, double y, double z) {
     }
   }
 
-  uint64_t result = 0;
+  OutStorageType result = 0;
   int r_exp = 0; // Unbiased exponent of the result
 
   // Normalize the result.
   if (prod_mant != 0) {
-    uint64_t prod_hi = static_cast<uint64_t>(prod_mant >> 64);
-    int lead_zeros =
-        prod_hi ? cpp::countl_zero(prod_hi)
-                : 64 + cpp::countl_zero(static_cast<uint64_t>(prod_mant));
+    int lead_zeros = cpp::countl_zero(prod_mant);
     // Move the leading 1 to the most significant bit.
     prod_mant <<= lead_zeros;
-    // The lower 64 bits are always sticky bits after moving the leading 1 to
-    // the most significant bit.
-    sticky_bits |= (static_cast<uint64_t>(prod_mant) != 0);
-    result = static_cast<uint64_t>(prod_mant >> 64);
-    // Change prod_lsb_exp the be the exponent of the least significant bit of
-    // the result.
-    prod_lsb_exp += 64 - lead_zeros;
-    r_exp = prod_lsb_exp + 63;
+    prod_lsb_exp -= lead_zeros;
+    r_exp = prod_lsb_exp + (cpp::numeric_limits<TmpResultType>::digits - 1) -
+            InFPBits::EXP_BIAS + OutFPBits::EXP_BIAS;
 
     if (r_exp > 0) {
       // The result is normal.  We will shift the mantissa to the right by
       // 63 - 52 = 11 bits (from the locations of the most significant bit).
       // Then the rounding bit will correspond the 11th bit, and the lowest
       // 10 bits are merged into sticky bits.
-      round_bit = (result & 0x0400ULL) != 0;
-      sticky_bits |= (result & 0x03ffULL) != 0;
-      result >>= 11;
+      round_bit =
+          (prod_mant & (TmpResultType(1) << (EXTRA_FRACTION_LEN - 1))) != 0;
+      sticky_bits |= (prod_mant & EXTRA_FRACTION_STICKY_MASK) != 0;
+      result = static_cast<OutStorageType>(prod_mant >> EXTRA_FRACTION_LEN);
     } else {
-      if (r_exp < -52) {
+      if (r_exp < -OutFPBits::FRACTION_LEN) {
         // The result is smaller than 1/2 of the smallest denormal number.
         sticky_bits = true; // since the result is non-zero.
         result = 0;
       } else {
         // The result is denormal.
-        uint64_t mask = 1ULL << (11 - r_exp);
-        round_bit = (result & mask) != 0;
-        sticky_bits |= (result & (mask - 1)) != 0;
-        if (r_exp > -52)
-          result >>= 12 - r_exp;
+        TmpResultType mask = TmpResultType(1) << (EXTRA_FRACTION_LEN - r_exp);
+        round_bit = (prod_mant & mask) != 0;
+        sticky_bits |= (prod_mant & (mask - 1)) != 0;
+        if (r_exp > -OutFPBits::FRACTION_LEN)
+          result = static_cast<OutStorageType>(
+              prod_mant >> (EXTRA_FRACTION_LEN + 1 - r_exp));
         else
           result = 0;
       }
@@ -251,20 +277,21 @@ template <> LIBC_INLINE double fma<double>(double x, double y, double z) {
 
   // Finalize the result.
   int round_mode = fputil::quick_get_round();
-  if (LIBC_UNLIKELY(r_exp >= FPBits::MAX_BIASED_EXPONENT)) {
+  if (LIBC_UNLIKELY(r_exp >= OutFPBits::MAX_BIASED_EXPONENT)) {
     if ((round_mode == FE_TOWARDZERO) ||
         (round_mode == FE_UPWARD && prod_sign.is_neg()) ||
         (round_mode == FE_DOWNWARD && prod_sign.is_pos())) {
-      return FPBits::max_normal(prod_sign).get_val();
+      return OutFPBits::max_normal(prod_sign).get_val();
     }
-    return FPBits::inf(prod_sign).get_val();
+    return OutFPBits::inf(prod_sign).get_val();
   }
 
   // Remove hidden bit and append the exponent field and sign bit.
-  result = (result & FPBits::FRACTION_MASK) |
-           (static_cast<uint64_t>(r_exp) << FPBits::FRACTION_LEN);
+  result = static_cast<OutStorageType>(
+      (result & OutFPBits::FRACTION_MASK) |
+      (static_cast<OutStorageType>(r_exp) << OutFPBits::FRACTION_LEN));
   if (prod_sign.is_neg()) {
-    result |= FPBits::SIGN_MASK;
+    result |= OutFPBits::SIGN_MASK;
   }
 
   // Rounding.
@@ -277,7 +304,7 @@ template <> LIBC_INLINE double fma<double>(double x, double y, double z) {
       ++result;
   }
 
-  return cpp::bit_cast<double>(result);
+  return cpp::bit_cast<OutType>(result);
 }
 
 } // namespace generic
diff --git a/libc/src/__support/FPUtil/multiply_add.h b/libc/src/__support/FPUtil/multiply_add.h
index 82932da5417c8..622914e4265c9 100644
--- a/libc/src/__support/FPUtil/multiply_add.h
+++ b/libc/src/__support/FPUtil/multiply_add.h
@@ -45,11 +45,11 @@ namespace LIBC_NAMESPACE {
 namespace fputil {
 
 LIBC_INLINE float multiply_add(float x, float y, float z) {
-  return fma(x, y, z);
+  return fma<float>(x, y, z);
 }
 
 LIBC_INLINE double multiply_add(double x, double y, double z) {
-  return fma(x, y, z);
+  return fma<double>(x, y, z);
 }
 
 } // namespace fputil
diff --git a/libc/src/__support/big_int.h b/libc/src/__support/big_int.h
index 40ad6eeed7ac2..27cb3b783470c 100644
--- a/libc/src/__support/big_int.h
+++ b/libc/src/__support/big_int.h
@@ -983,6 +983,12 @@ using UInt = BigInt<Bits, false, internal::WordTypeSelectorT<Bits>>;
 template <size_t Bits>
 using Int = BigInt<Bits, true, internal::WordTypeSelectorT<Bits>>;
 
+// Provides limits of BigInt.
+template <size_t Bits, bool Signed, typename T>
+struct cpp::numeric_limits<BigInt<Bits, Signed, T>> {
+  LIBC_INLINE_VAR static constexpr int digits = Bits - Signed;
+};
+
 // Provides limits of U/Int<128>.
 template <> class cpp::numeric_limits<UInt<128>> {
 public:
@@ -1073,6 +1079,16 @@ template <typename T>
 using make_integral_or_big_int_signed_t =
     typename make_integral_or_big_int_signed<T>::type;
 
+// is_unsigned_integral_or_big_int
+template <typename T>
+struct is_unsigned_integral_or_big_int
+    : cpp::bool_constant<
+          cpp::is_same_v<T, make_integral_or_big_int_unsigned_t<T>>> {};
+
+template <typename T>
+LIBC_INLINE_VAR constexpr bool is_unsigned_integral_or_big_int_v =
+    is_unsigned_integral_or_big_int<T>::value;
+
 namespace cpp {
 
 // Specialization of cpp::bit_cast ('bit.h') from T to BigInt.
diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt
index df8e6c0b253da..4472367d6c073 100644
--- a/libc/src/math/CMakeLists.txt
+++ b/libc/src/math/CMakeLists.txt
@@ -99,6 +99,8 @@ add_math_entrypoint_object(exp10f)
 add_math_entrypoint_object(expm1)
 add_math_entrypoint_object(expm1f)
 
+add_math_entrypoint_object(f16fmaf)
+
 add_math_entrypoint_object(f16sqrtf)
 
 add_math_entrypoint_object(fabs)
diff --git a/libc/src/math/f16fmaf.h b/libc/src/math/f16fmaf.h
new file mode 100644
index 0000000000000..d92cb43c292eb
--- /dev/null
+++ b/libc/src/math/f16fmaf.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for f16fmaf -----------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_F16FMAF_H
+#define LLVM_LIBC_SRC_MATH_F16FMAF_H
+
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE {
+
+float16 f16fmaf(float x, float y, float z);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_MATH_F16FMAF_H
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index f1f7d6c367be2..706bfc4b08670 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -3602,6 +3602,18 @@ add_entrypoint_object(
     -O3
 )
 
+add_entrypoint_object(
+  f16fmaf
+  SRCS
+    f16fmaf.cpp
+  HDRS
+    ../f16fmaf.h
+  DEPENDS
+    libc.src.__support.FPUtil.fma
+  COMPILE_OPTIONS
+    -O3
+)
+
 add_entrypoint_object(
   f16sqrtf
   SRCS
diff --git a/libc/src/math/generic/expm1f.cpp b/libc/src/math/generic/expm1f.cpp
index 037e60021b296..6b9f07476a650 100644
--- a/libc/src/math/generic/expm1f.cpp
+++ b/libc/src/math/generic/expm1f.cpp
@@ -104,7 +104,7 @@ LLVM_LIBC_FUNCTION(float, expm1f, (float x)) {
         // intermediate results as it is more efficient than using an emulated
         // version of FMA.
 #if defined(LIBC_TARGET_CPU_HAS_FMA)
-      return fputil::fma(x, x, x);
+      return fputil::fma<float>(x, x, x);
 #else
       double xd = x;
       return static_cast<float>(fputil::multiply_add(xd, xd, xd));
diff --git a/libc/src/math/generic/f16fmaf.cpp b/libc/src/math/generic/f16fmaf.cpp
new file mode 100644
index 0000000000000..09f2712639335
--- /dev/null
+++ b/libc/src/math/generic/f16fmaf.cpp
@@ -0,0 +1,19 @@
+//===-- Implementation of f16fmaf function --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/f16fmaf.h"
+#include "src/__support/FPUtil/FMA.h"
+#include "src/__support/common.h"
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(float16, f16fmaf, (float x, float y, float z)) {
+  return fputil::fma<float16>(x, y, z);
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/fma.cpp b/libc/src/math/generic/fma.cpp
index e27e5baeddf58..7937766dccd71 100644
--- a/libc/src/math/generic/fma.cpp
+++ b/libc/src/math/generic/fma.cpp
@@ -14,7 +14,7 @@
 namespace LIBC_NAMESPACE {
 
 LLVM_LIBC_FUNCTION(double, fma, (double x, double y, double z)) {
-  return fputil::fma(x, y, z);
+  return fputil::fma<double>(x, y, z);
 }
 
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/fmaf.cpp b/libc/src/math/generic/fmaf.cpp
index 7512b82005d0f..d367a069ea7d8 100644
--- a/libc/src/math/generic/fmaf.cpp
+++ b/libc/src/math/generic/fmaf.cpp
@@ -14,7 +14,7 @@
 namespace LIBC_NAMESPACE {
 
 LLVM_LIBC_FUNCTION(float, fmaf, (float x, float y, float z)) {
-  return fputil::fma(x, y, z);
+  return fputil::fma<float>(x, y, z);
 }
 
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/math/generic/range_reduction_fma.h b/libc/src/math/generic/range_reduction_fma.h
index aee8cbb1332a6..82b4ae1c705e1 100644
--- a/libc/src/math/generic/range_reduction_fma.h
+++ b/libc/src/math/generic/range_reduction_fma.h
@@ -33,8 +33,8 @@ static constexpr double THIRTYTWO_OVER_PI[5] = {
 //   k = round(x * 32 / pi) and y = (x * 32 / pi) - k.
 LIBC_INLINE int64_t small_range_reduction(double x, double &y) {
   double kd = fputil::nearest_integer(x * THIRTYTWO_OVER_PI[0]);
-  y = fputil::fma(x, THIRTYTWO_OVER_PI[0], -kd);
-  y = fputil::fma(x, THIRTYTWO_OVER_PI[1], y);
+  y = fputil::fma<double>(x, THIRTYTWO_OVER_PI[0], -kd);
+  y = fputil::fma<double>(x, THIRTYTWO_OVER_PI[1], y);
   return static_cast<int64_t>(kd);
 }
 
@@ -54,12 +54,13 @@ LIBC_INLINE int64_t large_range_reduction(double x, int x_exp, double &y) {
     prod_hi.set_uintval(prod_hi.uintval() &
                         ((x_exp < 55) ? (~0xfffULL) : (~0ULL))); // |x| < 2^55
     double k_hi = fputil::nearest_integer(prod_hi.get_val());
-    double truncated_prod = fputil::fma(x, THIRTYTWO_OVER_PI[0], -k_hi);
-    double prod_lo = fputil::fma(x, THIRTYTWO_OVER_PI[1], truncated_prod);
+    double truncated_prod = fputil::fma<double>(x, THIRTYTWO_OVER_PI[0], -k_hi);
+    double prod_lo =
+        fputil::fma<double>(x, THIRTYTWO_OVER_PI[1], truncated_prod);
     double k_lo = fputil::nearest_integer(prod_lo);
-    y = fputil::fma(x, THIRTYTWO_OVER_PI[1], truncated_prod - k_lo);
-    y = fputil::fma(x, THIRTYTWO_OVER_PI[2], y);
-    y = fputil::fma(x, THIRTYTWO_OVER_PI[3], y);
+    y = fputil::fma<double>(x, THIRTYTWO_OVER_PI[1], truncated_prod - k_lo);
+    y = fputil::fma<double>(x, THIRTYTWO_OVER_PI[2], y);
+    y = fputil::fma<double>(x, THIRTYTWO_OVER_PI[3], y);
 
     return static_cast<int64_t>(k_lo);
   }
@@ -74,12 +75,12 @@ LIBC_INLINE int64_t large_range_reduction(double x, int x_exp, double &y) {
   prod_hi.set_uintval(prod_hi.uintval() &
                       ((x_exp < 110) ? (~0xfffULL) : (~0ULL))); // |x| < 2^110
   double k_hi = fputil::nearest_integer(prod_hi.get_val());
-  double truncated_prod = fputil::fma(x, THIRTYTWO_OVER_PI[1], -k_hi);
-  double prod_lo = fputil::fma(x, THIRTYTWO_OVER_PI[2], truncated_prod);
+  double truncated_prod = fputil::fma<double>(x, THIRTYTWO_OVER_PI[1], -k_hi);
+  double prod_lo = fputil::fma<double>(x, THIRTYTWO_OVER_PI[2], truncated_prod);
   double k_lo = fputil::nearest_integer(prod_lo);
-  y = fputil::fma(x, THIRTYTWO_OVER_PI[2], truncated_prod - k_lo);
-  y = fputil::fma(x, THIRTYTWO_OVER_PI[3], y);
-  y = fputil::fma(x, THIRTYTWO_OVER_PI[4], y);
+  y = fputil::fma<double>(x, THIRTYTWO_OVER_PI[2], truncated_prod - k_lo);
+  y = fputil::fma<double>(x, THIRTYTWO_OVER_PI[3], y);
+  y = fputil::fma<double>(x, THIRTYTWO_OVER_PI[4], y);
 
   return static_cast<int64_t>(k_lo);
 }
diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt
index 79e6e89a5324e..bb364c3f0a175 100644
--- a/libc/test/src/math/CMakeLists.txt
+++ b/libc/test/src/math/CMakeLists.txt
@@ -1455,11 +1455,12 @@ add_fp_unittest(
     libc-math-unittests
   SRCS
     fmaf_test.cpp
+  HDRS
+    FmaTest.h
   DEPENDS
     libc.src.math.fmaf
     libc.src.stdlib.rand
     libc.src.stdlib.srand
-    libc.src.__support.FPUtil.fp_bits
   FLAGS
     FMA_OPT__ONLY
 )
@@ -1471,11 +1472,12 @@ add_fp_unittest(
     libc-math-unittests
   SRCS
     fma_test.cpp
+  HDRS
+    FmaTest.h
   DEPENDS
     libc.src.math.fma
     libc.src.stdlib.rand
     libc.src.stdlib.srand
-    libc.src.__support.FPUtil.fp_bits
 )
 
 add_fp_unittest(
@@ -1888,6 +1890,21 @@ add_fp_unittest(
     libc.src.__support.FPUtil.fp_bits
 )
 
+add_fp_unittest(
+  f16fmaf_test
+  NEED_MPFR
+  SUITE
+    libc-math-unittests
+  SRCS
+    f16fmaf_test.cpp
+  HDRS
+    FmaTest.h
+  DEPENDS
+    libc.src.math.f16fmaf
+    libc.src.stdlib.rand
+    libc.src.stdlib.srand
+)
+
 add_subdirectory(generic)
 add_subdirectory(smoke)
 
diff --git a/libc/test/src/math/FmaTest.h b/libc/test/src/math/FmaTest.h
index 5a40f694ebd10..0e0f05aad5d13 100644
--- a/libc/test/src/math/FmaTest.h
+++ b/libc/test/src/math/FmaTest.h
@@ -9,7 +9,6 @@
 #ifndef LLVM_LIBC_TEST_SRC_MATH_FMATEST_H
 #define LLVM_LIBC_TEST_SRC_MATH_FMATEST_H
 
-#include "src/__support/FPUtil/FPBits.h"
 #include "src/stdlib/rand.h"
 #include "src/stdlib/srand.h"
 #include "test/UnitTest/FEnvSafeTest.h"
@@ -19,85 +18,115 @@
 
 namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
 
-template <typename T>
+template <typename OutType, typename InType = OutType>
 class FmaTestTemplate : public LIBC_NAMESPACE::testing::FEnvSafeTest {
-private:
-  using Func = T (*)(T, T, T);
-  using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
-  using StorageType = typename FPBits::StorageType;
-
-  const T min_subnormal = FPBits::min_subnormal(Sign::POS).get_val();
-  const T min_normal = FPBits::min_normal(Sign::POS).get_val();
-  const T max_normal = FPBits::max_normal(Sign::POS).get_val();
-  const T inf = FPBits::inf(Sign::POS).get_val();
-  const T neg_inf = FPBits::inf(Sign::NEG).get_val();
-  const T zero = FPBits::zero(Sign::POS).get_val();
-  const T neg_zero = FPBits::zero(Sign::NEG).get_val();
-  const T nan = FPBits::quiet_nan().get_val();
-
-  static constexpr StorageType MAX_NORMAL = FPBits::max_normal().uintval();
-  static constexpr StorageType MIN_NORMAL = FPBits::min_normal().uintval();
-  static constexpr StorageType MAX_SUBNORMAL =
-      FPBits::max_subnormal().uintval();
-  static constexpr StorageType MIN_SUBNORMAL =
-      FPBits::min_subnormal().uintval();
-
-  StorageType get_random_bit_pattern() {
-    StorageType bits{0};
-    for (StorageType i = 0; i < sizeof(StorageType) / 2; ++i) {
+
+  struct OutConstants {
+    DECLARE_SPECIAL_CONSTANTS(OutType)
+  };
+
+  struct InConstants {
+    DECLARE_SPECIAL_CONSTANTS(InType)
+  };
+
+  using OutFPBits = typename OutConstants::FPBits;
+  using OutStorageType = typename OutConstants::StorageType;
+  using InFPBits = typename InConstants::FPBits;
+  using InStorageType = typename InConstants::StorageType;
+
+  static constexpr OutStorageType OUT_MIN_NORMAL_U =
+      OutFPBits::min_normal().uintval();
+  static constexpr InStorageType IN_MAX_NORMAL_U =
+      InFPBits::max_normal().uintval();
+  static constexpr InStorageType IN_MIN_NORMAL_U =
+      InFPBits::min_normal().uintval();
+  static constexpr InStorageType IN_MAX_SUBNORMAL_U =
+      InFPBits::max_subnormal().uintval();
+  static constexpr InStorageType IN_MIN_SUBNORMAL_U =
+      InFPBits::min_subnormal().uintval();
+
+  OutConstants out;
+  InConstants in;
+
+  InStorageType get_random_bit_pattern() {
+    InStorageType bits{0};
+    for (InStorageType i = 0; i < sizeof(InStorageType) / 2; ++i) {
       bits = (bits << 2) + static_cast<uint16_t>(LIBC_NAMESPACE::rand());
     }
     return bits;
   }
 
 public:
-  void test_special_numbers(Func func) {
-    EXPECT_FP_EQ(func(zero, zero, zero), zero);
-    EXPECT_FP_EQ(func(zero, neg_zero, neg_zero), neg_zero);
-    EXPECT_FP_EQ(func(inf, inf, zero), inf);
-    EXPECT_FP_EQ(func(neg_inf, inf, neg_inf), neg_inf);
-    EXPECT_FP_EQ(func(inf, zero, zero), nan);
-    EXPECT_FP_EQ(func(inf, neg_inf, inf), nan);
-    EXPECT_FP_EQ(func(nan, zero, inf), nan);
-    EXPECT_FP_EQ(func(inf, neg_inf, nan), nan);
+  using FmaFunc = OutType (*)(InType, InType, InType);
+
+  void test_special_numbers(FmaFunc func) {
+    EXPECT_FP_EQ(out.zero, func(in.zero, in.zero, in.zero));
+    EXPECT_FP_EQ(out.neg_zero, func(in.zero, in.neg_zero, in.neg_zero));
+    EXPECT_FP_EQ(out.inf, func(in.inf, in.inf, in.zero));
+    EXPECT_FP_EQ(out.neg_inf, func(in.neg_inf, in.inf, in.neg_inf));
+    EXPECT_FP_EQ(out.aNaN, func(in.inf, in.zero, in.zero));
+    EXPECT_FP_EQ(out.aNaN, func(in.inf, in.neg_inf, in.inf));
+    EXPECT_FP_EQ(out.aNaN, func(in.aNaN, in.zero, in.inf));
+    EXPECT_FP_EQ(out.aNaN, func(in.inf, in.neg_inf, in.aNaN));
 
     // Test underflow rounding up.
-    EXPECT_FP_EQ(func(T(0.5), min_subnormal, min_subnormal),
-                 FPBits(StorageType(2)).get_val());
+    EXPECT_FP_EQ(OutFPBits(OutStorageType(2)).get_val(),
+                 func(OutType(0.5), out.min_denormal, out.min_denormal));
+
+    if constexpr (sizeof(OutType) < sizeof(InType)) {
+      EXPECT_FP_EQ(out.zero,
+                   func(InType(0.5), in.min_denormal, in.min_denormal));
+    }
     // Test underflow rounding down.
-    T v = FPBits(MIN_NORMAL + StorageType(1)).get_val();
-    EXPECT_FP_EQ(func(T(1) / T(MIN_NORMAL << 1), v, min_normal), v);
+    OutType v = OutFPBits(static_cast<OutStorageType>(OUT_MIN_NORMAL_U +
+                                                      OutStorageType(1)))
+                    .get_val();
+    EXPECT_FP_EQ(v, func(OutType(1) / OutType(OUT_MIN_NORMAL_U << 1), v,
+                         out.min_normal));
+
+    if constexpr (sizeof(OutType) < sizeof(InType)) {
+      InType v = InFPBits(static_cast<InStorageType>(IN_MIN_NORMAL_U +
+                                                     InStorageType(1)))
+                     .get_val();
+      EXPECT_FP_EQ(
+          out.min_normal,
+          func(InType(1) / InType(IN_MIN_NORMAL_U << 1), v, out.min_normal));
+    }
     // Test overflow.
-    T z = max_normal;
-    EXPECT_FP_EQ(func(T(1.75), z, -z), T(0.75) * z);
+    OutType z = out.max_normal;
+    EXPECT_FP_EQ(OutType(0.75) * z, func(InType(1.75), z, -z));
     // Exact cancellation.
-    EXPECT_FP_EQ(func(T(3.0), T(5.0), -T(15.0)), T(0.0));
-    EXPECT_FP_EQ(func(T(-3.0), T(5.0), T(15.0)), T(0.0));
+    EXPECT_FP_EQ(out.zero, func(InType(3.0), InType(5.0), -InType(15.0)));
+    EXPECT_FP_EQ(out.zero, func(InType(-3.0), InType(5.0), InType(15.0)));
   }
 
-  void test_subnormal_range(Func func) {
-    constexpr StorageType COUNT = 100'001;
-    constexpr StorageType STEP = (MAX_SUBNORMAL - MIN_SUBNORMAL) / COUNT;
+  void test_subnormal_range(FmaFunc func) {
+    constexpr InStorageType COUNT = 100'001;
+    constexpr InStorageType STEP =
+        (IN_MAX_SUBNORMAL_U - IN_MIN_SUBNORMAL_U) / COUNT;
     LIBC_NAMESPACE::srand(1);
-    for (StorageType v = MIN_SUBNORMAL, w = MAX_SUBNORMAL;
-         v <= MAX_SUBNORMAL && w >= MIN_SUBNORMAL; v += STEP, w -= STEP) {
-      T x = FPBits(get_random_bit_pattern()).get_val(), y = FPBits(v).get_val(),
-        z = FPBits(w).get_val();
-      mpfr::TernaryInput<T> input{x, y, z};
+    for (InStorageType v = IN_MIN_SUBNORMAL_U, w = IN_MAX_SUBNORMAL_U;
+         v <= IN_MAX_SUBNORMAL_U && w >= IN_MIN_SUBNORMAL_U;
+         v += STEP, w -= STEP) {
+      InType x = InFPBits(get_random_bit_pattern()).get_val();
+      InType y = InFPBits(v).get_val();
+      InType z = InFPBits(w).get_val();
+      mpfr::TernaryInput<InType> input{x, y, z};
       ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Fma, input, func(x, y, z),
                                      0.5);
     }
   }
 
-  void test_normal_range(Func func) {
-    constexpr StorageType COUNT = 100'001;
-    constexpr StorageType STEP = (MAX_NORMAL - MIN_NORMAL) / COUNT;
+  void test_normal_range(FmaFunc func) {
+    constexpr InStorageType COUNT = 100'001;
+    constexpr InStorageType STEP = (IN_MAX_NORMAL_U - IN_MIN_NORMAL_U) / COUNT;
     LIBC_NAMESPACE::srand(1);
-    for (StorageType v = MIN_NORMAL, w = MAX_NORMAL;
-         v <= MAX_NORMAL && w >= MIN_NORMAL; v += STEP, w -= STEP) {
-      T x = FPBits(v).get_val(), y = FPBits(w).get_val(),
-        z = FPBits(get_random_bit_pattern()).get_val();
-      mpfr::TernaryInput<T> input{x, y, z};
+    for (InStorageType v = IN_MIN_NORMAL_U, w = IN_MAX_NORMAL_U;
+         v <= IN_MAX_NORMAL_U && w >= IN_MIN_NORMAL_U; v += STEP, w -= STEP) {
+      InType x = InFPBits(v).get_val();
+      InType y = InFPBits(w).get_val();
+      InType z = InFPBits(get_random_bit_pattern()).get_val();
+      mpfr::TernaryInput<InType> input{x, y, z};
       ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Fma, input, func(x, y, z),
                                      0.5);
     }
diff --git a/libc/test/src/math/f16fmaf_test.cpp b/libc/test/src/math/f16fmaf_test.cpp
new file mode 100644
index 0000000000000..dc197e297c11e
--- /dev/null
+++ b/libc/test/src/math/f16fmaf_test.cpp
@@ -0,0 +1,25 @@
+//===-- Unittests for f16fmaf ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FmaTest.h"
+
+#include "src/math/f16fmaf.h"
+
+using LlvmLibcF16fmafTest = FmaTestTemplate<float16, float>;
+
+TEST_F(LlvmLibcF16fmafTest, SpecialNumbers) {
+  test_special_numbers(&LIBC_NAMESPACE::f16fmaf);
+}
+
+TEST_F(LlvmLibcF16fmafTest, SubnormalRange) {
+  test_subnormal_range(&LIBC_NAMESPACE::f16fmaf);
+}
+
+TEST_F(LlvmLibcF16fmafTest, NormalRange) {
+  test_normal_range(&LIBC_NAMESPACE::f16fmaf);
+}
diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt
index 3e9edc51b004f..a67d0437592d5 100644
--- a/libc/test/src/math/smoke/CMakeLists.txt
+++ b/libc/test/src/math/smoke/CMakeLists.txt
@@ -3109,9 +3109,10 @@ add_fp_unittest(
     libc-math-smoke-tests
   SRCS
     fmaf_test.cpp
+  HDRS
+    FmaTest.h
   DEPENDS
     libc.src.math.fmaf
-    libc.src.__support.FPUtil.fp_bits
   FLAGS
     FMA_OPT__ONLY
 )
@@ -3122,9 +3123,10 @@ add_fp_unittest(
     libc-math-smoke-tests
   SRCS
     fma_test.cpp
+  HDRS
+    FmaTest.h
   DEPENDS
     libc.src.math.fma
-    libc.src.__support.FPUtil.fp_bits
 )
 
 add_fp_unittest(
@@ -3551,6 +3553,18 @@ add_fp_unittest(
     libc.src.math.totalordermagf16
 )
 
+add_fp_unittest(
+  f16fmaf_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    f16fmaf_test.cpp
+  HDRS
+    FmaTest.h
+  DEPENDS
+    libc.src.math.f16fmaf
+)
+
 add_fp_unittest(
   f16sqrtf_test
   SUITE
diff --git a/libc/test/src/math/smoke/FmaTest.h b/libc/test/src/math/smoke/FmaTest.h
index 7063ecf199837..3edd3aceb4847 100644
--- a/libc/test/src/math/smoke/FmaTest.h
+++ b/libc/test/src/math/smoke/FmaTest.h
@@ -9,51 +9,85 @@
 #ifndef LLVM_LIBC_TEST_SRC_MATH_FMATEST_H
 #define LLVM_LIBC_TEST_SRC_MATH_FMATEST_H
 
-#include "src/__support/FPUtil/FPBits.h"
 #include "test/UnitTest/FEnvSafeTest.h"
 #include "test/UnitTest/FPMatcher.h"
 #include "test/UnitTest/Test.h"
 
-template <typename T>
+template <typename OutType, typename InType = OutType>
 class FmaTestTemplate : public LIBC_NAMESPACE::testing::FEnvSafeTest {
-private:
-  using Func = T (*)(T, T, T);
-  using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
-  using StorageType = typename FPBits::StorageType;
 
-  const T inf = FPBits::inf(Sign::POS).get_val();
-  const T neg_inf = FPBits::inf(Sign::NEG).get_val();
-  const T zero = FPBits::zero(Sign::POS).get_val();
-  const T neg_zero = FPBits::zero(Sign::NEG).get_val();
-  const T nan = FPBits::quiet_nan().get_val();
+  struct OutConstants {
+    DECLARE_SPECIAL_CONSTANTS(OutType)
+  };
+
+  struct InConstants {
+    DECLARE_SPECIAL_CONSTANTS(InType)
+  };
+
+  using OutFPBits = typename OutConstants::FPBits;
+  using OutStorageType = typename OutConstants::StorageType;
+  using InFPBits = typename InConstants::FPBits;
+  using InStorageType = typename InConstants::StorageType;
+
+  static constexpr OutStorageType OUT_MIN_NORMAL_U =
+      OutFPBits::min_normal().uintval();
+  static constexpr InStorageType IN_MIN_NORMAL_U =
+      InFPBits::min_normal().uintval();
+
+  OutConstants out;
+  InConstants in;
 
 public:
-  void test_special_numbers(Func func) {
-    EXPECT_FP_EQ(func(zero, zero, zero), zero);
-    EXPECT_FP_EQ(func(zero, neg_zero, neg_zero), neg_zero);
-    EXPECT_FP_EQ(func(inf, inf, zero), inf);
-    EXPECT_FP_EQ(func(neg_inf, inf, neg_inf), neg_inf);
-    EXPECT_FP_EQ(func(inf, zero, zero), nan);
-    EXPECT_FP_EQ(func(inf, neg_inf, inf), nan);
-    EXPECT_FP_EQ(func(nan, zero, inf), nan);
-    EXPECT_FP_EQ(func(inf, neg_inf, nan), nan);
+  using FmaFunc = OutType (*)(InType, InType, InType);
+
+  void test_special_numbers(FmaFunc func) {
+    EXPECT_FP_EQ(out.zero, func(in.zero, in.zero, in.zero));
+    EXPECT_FP_EQ(out.neg_zero, func(in.zero, in.neg_zero, in.neg_zero));
+    EXPECT_FP_EQ(out.inf, func(in.inf, in.inf, in.zero));
+    EXPECT_FP_EQ(out.neg_inf, func(in.neg_inf, in.inf, in.neg_inf));
+    EXPECT_FP_EQ(out.aNaN, func(in.inf, in.zero, in.zero));
+    EXPECT_FP_EQ(out.aNaN, func(in.inf, in.neg_inf, in.inf));
+    EXPECT_FP_EQ(out.aNaN, func(in.aNaN, in.zero, in.inf));
+    EXPECT_FP_EQ(out.aNaN, func(in.inf, in.neg_inf, in.aNaN));
 
     // Test underflow rounding up.
-    EXPECT_FP_EQ(func(T(0.5), FPBits::min_subnormal().get_val(),
-                      FPBits::min_subnormal().get_val()),
-                 FPBits(StorageType(2)).get_val());
+    EXPECT_FP_EQ(OutFPBits(OutStorageType(2)).get_val(),
+                 func(OutType(0.5), out.min_denormal, out.min_denormal));
+
+    if constexpr (sizeof(OutType) < sizeof(InType)) {
+      EXPECT_FP_EQ(out.zero,
+                   func(InType(0.5), in.min_denormal, in.min_denormal));
+    }
     // Test underflow rounding down.
-    StorageType MIN_NORMAL = FPBits::min_normal().uintval();
-    T v = FPBits(MIN_NORMAL + StorageType(1)).get_val();
-    EXPECT_FP_EQ(
-        func(T(1) / T(MIN_NORMAL << 1), v, FPBits::min_normal().get_val()), v);
+    OutType v = OutFPBits(static_cast<OutStorageType>(OUT_MIN_NORMAL_U +
+                                                      OutStorageType(1)))
+                    .get_val();
+    EXPECT_FP_EQ(v, func(OutType(1) / OutType(OUT_MIN_NORMAL_U << 1), v,
+                         out.min_normal));
+
+    if constexpr (sizeof(OutType) < sizeof(InType)) {
+      InType v = InFPBits(static_cast<InStorageType>(IN_MIN_NORMAL_U +
+                                                     InStorageType(1)))
+                     .get_val();
+      EXPECT_FP_EQ(
+          out.min_normal,
+          func(InType(1) / InType(IN_MIN_NORMAL_U << 1), v, out.min_normal));
+    }
     // Test overflow.
-    T z = FPBits::max_normal().get_val();
-    EXPECT_FP_EQ(func(T(1.75), z, -z), T(0.75) * z);
+    OutType z = out.max_normal;
+    EXPECT_FP_EQ(OutType(0.75) * z, func(InType(1.75), z, -z));
     // Exact cancellation.
-    EXPECT_FP_EQ(func(T(3.0), T(5.0), -T(15.0)), T(0.0));
-    EXPECT_FP_EQ(func(T(-3.0), T(5.0), T(15.0)), T(0.0));
+    EXPECT_FP_EQ(out.zero, func(InType(3.0), InType(5.0), -InType(15.0)));
+    EXPECT_FP_EQ(out.zero, func(InType(-3.0), InType(5.0), InType(15.0)));
   }
 };
 
+#define LIST_FMA_TESTS(T, func)                                                \
+  using LlvmLibcFmaTest = FmaTestTemplate<T>;                                  \
+  TEST_F(LlvmLibcFmaTest, SpecialNumbers) { test_special_numbers(&func); }
+
+#define LIST_NARROWING_FMA_TESTS(OutType, InType, func)                        \
+  using LlvmLibcFmaTest = FmaTestTemplate<OutType, InType>;                    \
+  TEST_F(LlvmLibcFmaTest, SpecialNumbers) { test_special_numbers(&func); }
+
 #endif // LLVM_LIBC_TEST_SRC_MATH_FMATEST_H
diff --git a/libc/test/src/math/smoke/f16fmaf_test.cpp b/libc/test/src/math/smoke/f16fmaf_test.cpp
new file mode 100644
index 0000000000000..5e3aec768c191
--- /dev/null
+++ b/libc/test/src/math/smoke/f16fmaf_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for f16fmaf ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FmaTest.h"
+
+#include "src/math/f16fmaf.h"
+
+LIST_NARROWING_FMA_TESTS(float16, float, LIBC_NAMESPACE::f16fmaf)
diff --git a/libc/test/src/math/smoke/fma_test.cpp b/libc/test/src/math/smoke/fma_test.cpp
index 4460b80d9ad65..c5d802a532eb0 100644
--- a/libc/test/src/math/smoke/fma_test.cpp
+++ b/libc/test/src/math/smoke/fma_test.cpp
@@ -10,8 +10,4 @@
 
 #include "src/math/fma.h"
 
-using LlvmLibcFmaTest = FmaTestTemplate<double>;
-
-TEST_F(LlvmLibcFmaTest, SpecialNumbers) {
-  test_special_numbers(&LIBC_NAMESPACE::fma);
-}
+LIST_FMA_TESTS(double, LIBC_NAMESPACE::fma)
diff --git a/libc/test/src/math/smoke/fmaf_test.cpp b/libc/test/src/math/smoke/fmaf_test.cpp
index a645efb8776d0..09e9c504b942a 100644
--- a/libc/test/src/math/smoke/fmaf_test.cpp
+++ b/libc/test/src/math/smoke/fmaf_test.cpp
@@ -10,8 +10,4 @@
 
 #include "src/math/fmaf.h"
 
-using LlvmLibcFmafTest = FmaTestTemplate<float>;
-
-TEST_F(LlvmLibcFmafTest, SpecialNumbers) {
-  test_special_numbers(&LIBC_NAMESPACE::fmaf);
-}
+LIST_FMA_TESTS(float, LIBC_NAMESPACE::fmaf)
diff --git a/libc/test/src/sched/CMakeLists.txt b/libc/test/src/sched/CMakeLists.txt
index 9dda4ea16e101..3dd56c72fc86d 100644
--- a/libc/test/src/sched/CMakeLists.txt
+++ b/libc/test/src/sched/CMakeLists.txt
@@ -40,23 +40,23 @@ add_libc_unittest(
     libc.src.sched.sched_get_priority_max
 )
 
-add_libc_unittest(
-  scheduler_test
-  SUITE
-    libc_sched_unittests
-  SRCS
-    param_and_scheduler_test.cpp
-  DEPENDS
-    libc.include.sched
-    libc.src.errno.errno
-    libc.src.sched.sched_getscheduler
-    libc.src.sched.sched_setscheduler
-    libc.src.sched.sched_getparam
-    libc.src.sched.sched_setparam
-    libc.src.sched.sched_get_priority_min
-    libc.src.sched.sched_get_priority_max
-    libc.src.unistd.getuid
-)
+# add_libc_unittest(
+#   scheduler_test
+#   SUITE
+#     libc_sched_unittests
+#   SRCS
+#     param_and_scheduler_test.cpp
+#   DEPENDS
+#     libc.include.sched
+#     libc.src.errno.errno
+#     libc.src.sched.sched_getscheduler
+#     libc.src.sched.sched_setscheduler
+#     libc.src.sched.sched_getparam
+#     libc.src.sched.sched_setparam
+#     libc.src.sched.sched_get_priority_min
+#     libc.src.sched.sched_get_priority_max
+#     libc.src.unistd.getuid
+# )
 
 add_libc_unittest(
   sched_rr_get_interval_test
diff --git a/libc/utils/MPFRWrapper/MPFRUtils.cpp b/libc/utils/MPFRWrapper/MPFRUtils.cpp
index 100c6b1644b16..2eac4dd8e199d 100644
--- a/libc/utils/MPFRWrapper/MPFRUtils.cpp
+++ b/libc/utils/MPFRWrapper/MPFRUtils.cpp
@@ -922,46 +922,49 @@ template void explain_binary_operation_one_output_error<long double>(
     Operation, const BinaryInput<long double> &, long double, double,
     RoundingMode);
 
-template <typename T>
-void explain_ternary_operation_one_output_error(Operation op,
-                                                const TernaryInput<T> &input,
-                                                T libc_result,
-                                                double ulp_tolerance,
-                                                RoundingMode rounding) {
-  unsigned int precision = get_precision<T>(ulp_tolerance);
+template <typename InputType, typename OutputType>
+void explain_ternary_operation_one_output_error(
+    Operation op, const TernaryInput<InputType> &input, OutputType libc_result,
+    double ulp_tolerance, RoundingMode rounding) {
+  unsigned int precision = get_precision<InputType>(ulp_tolerance);
   MPFRNumber mpfrX(input.x, precision);
   MPFRNumber mpfrY(input.y, precision);
   MPFRNumber mpfrZ(input.z, precision);
-  FPBits<T> xbits(input.x);
-  FPBits<T> ybits(input.y);
-  FPBits<T> zbits(input.z);
+  FPBits<InputType> xbits(input.x);
+  FPBits<InputType> ybits(input.y);
+  FPBits<InputType> zbits(input.z);
   MPFRNumber mpfr_result = ternary_operation_one_output(
       op, input.x, input.y, input.z, precision, rounding);
   MPFRNumber mpfrMatchValue(libc_result);
 
   tlog << "Input decimal: x: " << mpfrX.str() << " y: " << mpfrY.str()
        << " z: " << mpfrZ.str() << '\n';
-  tlog << " First input bits: " << str(FPBits<T>(input.x)) << '\n';
-  tlog << "Second input bits: " << str(FPBits<T>(input.y)) << '\n';
-  tlog << " Third input bits: " << str(FPBits<T>(input.z)) << '\n';
+  tlog << " First input bits: " << str(FPBits<InputType>(input.x)) << '\n';
+  tlog << "Second input bits: " << str(FPBits<InputType>(input.y)) << '\n';
+  tlog << " Third input bits: " << str(FPBits<InputType>(input.z)) << '\n';
 
   tlog << "Libc result: " << mpfrMatchValue.str() << '\n'
        << "MPFR result: " << mpfr_result.str() << '\n';
-  tlog << "Libc floating point result bits: " << str(FPBits<T>(libc_result))
-       << '\n';
+  tlog << "Libc floating point result bits: "
+       << str(FPBits<OutputType>(libc_result)) << '\n';
   tlog << "              MPFR rounded bits: "
-       << str(FPBits<T>(mpfr_result.as<T>())) << '\n';
+       << str(FPBits<OutputType>(mpfr_result.as<OutputType>())) << '\n';
   tlog << "ULP error: " << mpfr_result.ulp_as_mpfr_number(libc_result).str()
        << '\n';
 }
 
-template void explain_ternary_operation_one_output_error<float>(
+template void explain_ternary_operation_one_output_error(
     Operation, const TernaryInput<float> &, float, double, RoundingMode);
-template void explain_ternary_operation_one_output_error<double>(
+template void explain_ternary_operation_one_output_error(
     Operation, const TernaryInput<double> &, double, double, RoundingMode);
-template void explain_ternary_operation_one_output_error<long double>(
-    Operation, const TernaryInput<long double> &, long double, double,
-    RoundingMode);
+template void
+explain_ternary_operation_one_output_error(Operation,
+                                           const TernaryInput<long double> &,
+                                           long double, double, RoundingMode);
+#ifdef LIBC_TYPES_HAS_FLOAT16
+template void explain_ternary_operation_one_output_error(
+    Operation, const TernaryInput<float> &, float16, double, RoundingMode);
+#endif
 
 template <typename InputType, typename OutputType>
 bool compare_unary_operation_single_output(Operation op, InputType input,
@@ -1069,12 +1072,13 @@ template bool compare_binary_operation_one_output<long double>(
     Operation, const BinaryInput<long double> &, long double, double,
     RoundingMode);
 
-template <typename T>
+template <typename InputType, typename OutputType>
 bool compare_ternary_operation_one_output(Operation op,
-                                          const TernaryInput<T> &input,
-                                          T libc_result, double ulp_tolerance,
+                                          const TernaryInput<InputType> &input,
+                                          OutputType libc_result,
+                                          double ulp_tolerance,
                                           RoundingMode rounding) {
-  unsigned int precision = get_precision<T>(ulp_tolerance);
+  unsigned int precision = get_precision<InputType>(ulp_tolerance);
   MPFRNumber mpfr_result = ternary_operation_one_output(
       op, input.x, input.y, input.z, precision, rounding);
   double ulp = mpfr_result.ulp(libc_result);
@@ -1082,13 +1086,23 @@ bool compare_ternary_operation_one_output(Operation op,
   return (ulp <= ulp_tolerance);
 }
 
-template bool compare_ternary_operation_one_output<float>(
-    Operation, const TernaryInput<float> &, float, double, RoundingMode);
-template bool compare_ternary_operation_one_output<double>(
-    Operation, const TernaryInput<double> &, double, double, RoundingMode);
-template bool compare_ternary_operation_one_output<long double>(
-    Operation, const TernaryInput<long double> &, long double, double,
-    RoundingMode);
+template bool compare_ternary_operation_one_output(Operation,
+                                                   const TernaryInput<float> &,
+                                                   float, double, RoundingMode);
+template bool compare_ternary_operation_one_output(Operation,
+                                                   const TernaryInput<double> &,
+                                                   double, double,
+                                                   RoundingMode);
+template bool
+compare_ternary_operation_one_output(Operation,
+                                     const TernaryInput<long double> &,
+                                     long double, double, RoundingMode);
+#ifdef LIBC_TYPES_HAS_FLOAT16
+template bool compare_ternary_operation_one_output(Operation,
+                                                   const TernaryInput<float> &,
+                                                   float16, double,
+                                                   RoundingMode);
+#endif
 
 } // namespace internal
 
diff --git a/libc/utils/MPFRWrapper/MPFRUtils.h b/libc/utils/MPFRWrapper/MPFRUtils.h
index 805678b96c2ef..c8c0506164bb2 100644
--- a/libc/utils/MPFRWrapper/MPFRUtils.h
+++ b/libc/utils/MPFRWrapper/MPFRUtils.h
@@ -10,6 +10,7 @@
 #define LLVM_LIBC_UTILS_MPFRWRAPPER_MPFRUTILS_H
 
 #include "src/__support/CPP/type_traits.h"
+#include "src/__support/CPP/type_traits/type_identity.h"
 #include "test/UnitTest/RoundingModeUtils.h"
 #include "test/UnitTest/Test.h"
 
@@ -129,6 +130,18 @@ struct AreMatchingBinaryInputAndBinaryOutput<BinaryInput<T>, BinaryOutput<T>> {
   static constexpr bool VALUE = cpp::is_floating_point_v<T>;
 };
 
+template <typename T> struct IsTernaryInput {
+  static constexpr bool VALUE = false;
+};
+
+template <typename T> struct IsTernaryInput<TernaryInput<T>> {
+  static constexpr bool VALUE = true;
+};
+
+template <typename T> struct MakeScalarInput : cpp::type_identity<T> {};
+template <typename T>
+struct MakeScalarInput<TernaryInput<T>> : cpp::type_identity<T> {};
+
 template <typename InputType, typename OutputType>
 bool compare_unary_operation_single_output(Operation op, InputType input,
                                            OutputType libc_output,
@@ -152,10 +165,11 @@ bool compare_binary_operation_one_output(Operation op,
                                          T libc_output, double ulp_tolerance,
                                          RoundingMode rounding);
 
-template <typename T>
+template <typename InputType, typename OutputType>
 bool compare_ternary_operation_one_output(Operation op,
-                                          const TernaryInput<T> &input,
-                                          T libc_output, double ulp_tolerance,
+                                          const TernaryInput<InputType> &input,
+                                          OutputType libc_output,
+                                          double ulp_tolerance,
                                           RoundingMode rounding);
 
 template <typename InputType, typename OutputType>
@@ -180,12 +194,10 @@ void explain_binary_operation_one_output_error(Operation op,
                                                double ulp_tolerance,
                                                RoundingMode rounding);
 
-template <typename T>
-void explain_ternary_operation_one_output_error(Operation op,
-                                                const TernaryInput<T> &input,
-                                                T match_value,
-                                                double ulp_tolerance,
-                                                RoundingMode rounding);
+template <typename InputType, typename OutputType>
+void explain_ternary_operation_one_output_error(
+    Operation op, const TernaryInput<InputType> &input, OutputType match_value,
+    double ulp_tolerance, RoundingMode rounding);
 
 template <Operation op, bool silent, typename InputType, typename OutputType>
 class MPFRMatcher : public testing::Matcher<OutputType> {
@@ -234,7 +246,8 @@ class MPFRMatcher : public testing::Matcher<OutputType> {
                                                 rounding);
   }
 
-  template <typename T> bool match(const TernaryInput<T> &in, T out) {
+  template <typename T, typename U>
+  bool match(const TernaryInput<T> &in, U out) {
     return compare_ternary_operation_one_output(op, in, out, ulp_tolerance,
                                                 rounding);
   }
@@ -260,7 +273,8 @@ class MPFRMatcher : public testing::Matcher<OutputType> {
                                               rounding);
   }
 
-  template <typename T> void explain_error(const TernaryInput<T> &in, T out) {
+  template <typename T, typename U>
+  void explain_error(const TernaryInput<T> &in, U out) {
     explain_ternary_operation_one_output_error(op, in, out, ulp_tolerance,
                                                rounding);
   }
@@ -272,10 +286,14 @@ class MPFRMatcher : public testing::Matcher<OutputType> {
 // types.
 template <Operation op, typename InputType, typename OutputType>
 constexpr bool is_valid_operation() {
-  constexpr bool IS_NARROWING_OP = op == Operation::Sqrt &&
-                                   cpp::is_floating_point_v<InputType> &&
-                                   cpp::is_floating_point_v<OutputType> &&
-                                   sizeof(OutputType) <= sizeof(InputType);
+  constexpr bool IS_NARROWING_OP =
+      (op == Operation::Sqrt && cpp::is_floating_point_v<InputType> &&
+       cpp::is_floating_point_v<OutputType> &&
+       sizeof(OutputType) <= sizeof(InputType)) ||
+      (op == Operation::Fma && internal::IsTernaryInput<InputType>::VALUE &&
+       cpp::is_floating_point_v<
+           typename internal::MakeScalarInput<InputType>::type> &&
+       cpp::is_floating_point_v<OutputType>);
   if (IS_NARROWING_OP)
     return true;
   return (Operation::BeginUnaryOperationsSingleOutput < op &&

>From 06e598e490065f25788abe3faa33460a0b463c43 Mon Sep 17 00:00:00 2001
From: OverMighty <its.overmighty at gmail.com>
Date: Fri, 14 Jun 2024 01:16:03 +0200
Subject: [PATCH 02/10] [libc] Undo disabling of scheduler_test

---
 libc/test/src/sched/CMakeLists.txt | 34 +++++++++++++++---------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/libc/test/src/sched/CMakeLists.txt b/libc/test/src/sched/CMakeLists.txt
index 3dd56c72fc86d..9dda4ea16e101 100644
--- a/libc/test/src/sched/CMakeLists.txt
+++ b/libc/test/src/sched/CMakeLists.txt
@@ -40,23 +40,23 @@ add_libc_unittest(
     libc.src.sched.sched_get_priority_max
 )
 
-# add_libc_unittest(
-#   scheduler_test
-#   SUITE
-#     libc_sched_unittests
-#   SRCS
-#     param_and_scheduler_test.cpp
-#   DEPENDS
-#     libc.include.sched
-#     libc.src.errno.errno
-#     libc.src.sched.sched_getscheduler
-#     libc.src.sched.sched_setscheduler
-#     libc.src.sched.sched_getparam
-#     libc.src.sched.sched_setparam
-#     libc.src.sched.sched_get_priority_min
-#     libc.src.sched.sched_get_priority_max
-#     libc.src.unistd.getuid
-# )
+add_libc_unittest(
+  scheduler_test
+  SUITE
+    libc_sched_unittests
+  SRCS
+    param_and_scheduler_test.cpp
+  DEPENDS
+    libc.include.sched
+    libc.src.errno.errno
+    libc.src.sched.sched_getscheduler
+    libc.src.sched.sched_setscheduler
+    libc.src.sched.sched_getparam
+    libc.src.sched.sched_setparam
+    libc.src.sched.sched_get_priority_min
+    libc.src.sched.sched_get_priority_max
+    libc.src.unistd.getuid
+)
 
 add_libc_unittest(
   sched_rr_get_interval_test

>From 05bc63cba9427b74f1392a2864d33fe1f97ca133 Mon Sep 17 00:00:00 2001
From: OverMighty <its.overmighty at gmail.com>
Date: Fri, 14 Jun 2024 01:18:42 +0200
Subject: [PATCH 03/10] [libc] Fix includes and dependencies

---
 libc/src/__support/FPUtil/generic/CMakeLists.txt | 3 +++
 libc/src/math/generic/CMakeLists.txt             | 1 +
 libc/utils/MPFRWrapper/MPFRUtils.h               | 1 -
 3 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/libc/src/__support/FPUtil/generic/CMakeLists.txt b/libc/src/__support/FPUtil/generic/CMakeLists.txt
index 595656e3e8d90..a8a95ba3f15ff 100644
--- a/libc/src/__support/FPUtil/generic/CMakeLists.txt
+++ b/libc/src/__support/FPUtil/generic/CMakeLists.txt
@@ -19,12 +19,15 @@ add_header_library(
   HDRS
     FMA.h
   DEPENDS
+    libc.hdr.fenv_macros
     libc.src.__support.common
     libc.src.__support.CPP.bit
+    libc.src.__support.CPP.limits
     libc.src.__support.CPP.type_traits
     libc.src.__support.FPUtil.fenv_impl
     libc.src.__support.FPUtil.fp_bits
     libc.src.__support.FPUtil.rounding_mode
+    libc.src.__support.big_int
     libc.src.__support.macros.optimization
     libc.src.__support.uint128
 )
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 706bfc4b08670..aa0069d821d0d 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -3609,6 +3609,7 @@ add_entrypoint_object(
   HDRS
     ../f16fmaf.h
   DEPENDS
+    libc.src.__support.macros.properties.types
     libc.src.__support.FPUtil.fma
   COMPILE_OPTIONS
     -O3
diff --git a/libc/utils/MPFRWrapper/MPFRUtils.h b/libc/utils/MPFRWrapper/MPFRUtils.h
index c8c0506164bb2..9221b4a457454 100644
--- a/libc/utils/MPFRWrapper/MPFRUtils.h
+++ b/libc/utils/MPFRWrapper/MPFRUtils.h
@@ -10,7 +10,6 @@
 #define LLVM_LIBC_UTILS_MPFRWRAPPER_MPFRUTILS_H
 
 #include "src/__support/CPP/type_traits.h"
-#include "src/__support/CPP/type_traits/type_identity.h"
 #include "test/UnitTest/RoundingModeUtils.h"
 #include "test/UnitTest/Test.h"
 

>From b112ac4a94b2969bf400ee2a156bcba9d5c27b7f Mon Sep 17 00:00:00 2001
From: OverMighty <its.overmighty at gmail.com>
Date: Fri, 14 Jun 2024 01:19:47 +0200
Subject: [PATCH 04/10] [libc] Add blank line between template declaration and
 specialization

---
 libc/utils/MPFRWrapper/MPFRUtils.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libc/utils/MPFRWrapper/MPFRUtils.h b/libc/utils/MPFRWrapper/MPFRUtils.h
index 9221b4a457454..0b4f42a72ec81 100644
--- a/libc/utils/MPFRWrapper/MPFRUtils.h
+++ b/libc/utils/MPFRWrapper/MPFRUtils.h
@@ -138,6 +138,7 @@ template <typename T> struct IsTernaryInput<TernaryInput<T>> {
 };
 
 template <typename T> struct MakeScalarInput : cpp::type_identity<T> {};
+
 template <typename T>
 struct MakeScalarInput<TernaryInput<T>> : cpp::type_identity<T> {};
 

>From 7e7c3542e7c1730502bc761db5e0236fa8bd24e7 Mon Sep 17 00:00:00 2001
From: OverMighty <its.overmighty at gmail.com>
Date: Fri, 14 Jun 2024 15:10:27 +0200
Subject: [PATCH 05/10] [libc][math] Remove useless header guard around
 fputil::generic::fma<float>

---
 libc/src/__support/FPUtil/generic/FMA.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/libc/src/__support/FPUtil/generic/FMA.h b/libc/src/__support/FPUtil/generic/FMA.h
index 3bbb30476e5d8..4cbbe9cbb81e7 100644
--- a/libc/src/__support/FPUtil/generic/FMA.h
+++ b/libc/src/__support/FPUtil/generic/FMA.h
@@ -31,7 +31,6 @@ LIBC_INLINE cpp::enable_if_t<cpp::is_floating_point_v<OutType> &&
                              OutType>
 fma(InType x, InType y, InType z);
 
-#ifndef LIBC_TARGET_CPU_HAS_FMA
 // TODO(lntue): Implement fmaf that is correctly rounded to all rounding modes.
 // The implementation below only is only correct for the default rounding mode,
 // round-to-nearest tie-to-even.
@@ -82,7 +81,6 @@ template <> LIBC_INLINE float fma<float>(float x, float y, float z) {
 
   return static_cast<float>(bit_sum.get_val());
 }
-#endif // LIBC_TARGET_CPU_HAS_FMA
 
 namespace internal {
 

>From e409c828618637891af4ae4bffec2d03a6298a51 Mon Sep 17 00:00:00 2001
From: OverMighty <its.overmighty at gmail.com>
Date: Fri, 14 Jun 2024 15:38:31 +0200
Subject: [PATCH 06/10] [libc] Misc cleanup

---
 libc/spec/stdc.td                       | 5 ++---
 libc/src/__support/FPUtil/generic/FMA.h | 2 +-
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td
index b089b596b0958..b7375fb411220 100644
--- a/libc/spec/stdc.td
+++ b/libc/spec/stdc.td
@@ -474,10 +474,11 @@ def StdC : StandardSpec<"stdc"> {
 
           FunctionSpec<"fmul", RetValSpec<FloatType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>,
 
-
           FunctionSpec<"fma", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>, ArgSpec<DoubleType>]>,
           FunctionSpec<"fmaf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>, ArgSpec<FloatType>]>,
 
+          GuardedFunctionSpec<"f16fmaf", RetValSpec<Float16Type>, [ArgSpec<FloatType>, ArgSpec<FloatType>, ArgSpec<FloatType>], "LIBC_TYPES_HAS_FLOAT16">,
+
           FunctionSpec<"fmod", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>,
           FunctionSpec<"fmodf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>]>,
           FunctionSpec<"fmodl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>]>,
@@ -715,8 +716,6 @@ def StdC : StandardSpec<"stdc"> {
 
           GuardedFunctionSpec<"totalordermagf16", RetValSpec<IntType>, [ArgSpec<Float16Ptr>, ArgSpec<Float16Ptr>], "LIBC_TYPES_HAS_FLOAT16">,
 
-          GuardedFunctionSpec<"f16fmaf", RetValSpec<Float16Type>, [ArgSpec<FloatType>, ArgSpec<FloatType>, ArgSpec<FloatType>], "LIBC_TYPES_HAS_FLOAT16">,
-
           GuardedFunctionSpec<"f16sqrtf", RetValSpec<Float16Type>, [ArgSpec<FloatType>], "LIBC_TYPES_HAS_FLOAT16">,
       ]
   >;
diff --git a/libc/src/__support/FPUtil/generic/FMA.h b/libc/src/__support/FPUtil/generic/FMA.h
index 4cbbe9cbb81e7..d954006f8de69 100644
--- a/libc/src/__support/FPUtil/generic/FMA.h
+++ b/libc/src/__support/FPUtil/generic/FMA.h
@@ -113,7 +113,7 @@ fma(InType x, InType y, InType z) {
   using InStorageType = typename InFPBits::StorageType;
 
   constexpr int IN_EXPLICIT_MANT_LEN = InFPBits::FRACTION_LEN + 1;
-  constexpr size_t PROD_LEN = 2 * (IN_EXPLICIT_MANT_LEN);
+  constexpr size_t PROD_LEN = 2 * IN_EXPLICIT_MANT_LEN;
   constexpr size_t TMP_RESULT_LEN = cpp::bit_ceil(PROD_LEN + 1);
   using TmpResultType = UInt<TMP_RESULT_LEN>;
 

>From bd971fe9e52be5a5059a9b9c5f51b70c6569f3a8 Mon Sep 17 00:00:00 2001
From: OverMighty <its.overmighty at gmail.com>
Date: Fri, 14 Jun 2024 15:51:01 +0200
Subject: [PATCH 07/10] [libc] Replace 128-bit BigInt numeric_limits with
 generic BigInt specialization

---
 libc/src/__support/big_int.h | 27 +++++++++------------------
 1 file changed, 9 insertions(+), 18 deletions(-)

diff --git a/libc/src/__support/big_int.h b/libc/src/__support/big_int.h
index 27cb3b783470c..c30c3ece54a30 100644
--- a/libc/src/__support/big_int.h
+++ b/libc/src/__support/big_int.h
@@ -986,26 +986,15 @@ using Int = BigInt<Bits, true, internal::WordTypeSelectorT<Bits>>;
 // Provides limits of BigInt.
 template <size_t Bits, bool Signed, typename T>
 struct cpp::numeric_limits<BigInt<Bits, Signed, T>> {
-  LIBC_INLINE_VAR static constexpr int digits = Bits - Signed;
-};
-
-// Provides limits of U/Int<128>.
-template <> class cpp::numeric_limits<UInt<128>> {
-public:
-  LIBC_INLINE static constexpr UInt<128> max() { return UInt<128>::max(); }
-  LIBC_INLINE static constexpr UInt<128> min() { return UInt<128>::min(); }
-  // Meant to match std::numeric_limits interface.
-  // NOLINTNEXTLINE(readability-identifier-naming)
-  LIBC_INLINE_VAR static constexpr int digits = 128;
-};
-
-template <> class cpp::numeric_limits<Int<128>> {
-public:
-  LIBC_INLINE static constexpr Int<128> max() { return Int<128>::max(); }
-  LIBC_INLINE static constexpr Int<128> min() { return Int<128>::min(); }
+  LIBC_INLINE static constexpr BigInt<Bits, Signed, T> max() {
+    return BigInt<Bits, Signed, T>::max();
+  }
+  LIBC_INLINE static constexpr BigInt<Bits, Signed, T> min() {
+    return BigInt<Bits, Signed, T>::min();
+  }
   // Meant to match std::numeric_limits interface.
   // NOLINTNEXTLINE(readability-identifier-naming)
-  LIBC_INLINE_VAR static constexpr int digits = 128;
+  LIBC_INLINE_VAR static constexpr int digits = Bits - Signed;
 };
 
 // type traits to determine whether a T is a BigInt.
@@ -1086,6 +1075,8 @@ struct is_unsigned_integral_or_big_int
           cpp::is_same_v<T, make_integral_or_big_int_unsigned_t<T>>> {};
 
 template <typename T>
+// Meant to look like <type_traits> helper variable templates.
+// NOLINTNEXTLINE(readability-identifier-naming)
 LIBC_INLINE_VAR constexpr bool is_unsigned_integral_or_big_int_v =
     is_unsigned_integral_or_big_int<T>::value;
 

>From 5dae03b46e367836b4403049ef4edb5d71112646 Mon Sep 17 00:00:00 2001
From: OverMighty <its.overmighty at gmail.com>
Date: Fri, 14 Jun 2024 17:17:12 +0200
Subject: [PATCH 08/10] [libc][math] Fix sign of FMA with exact cancellation

---
 libc/src/__support/FPUtil/generic/FMA.h | 11 ++++++++---
 libc/test/src/math/smoke/FmaTest.h      | 24 +++++++++++++++++++++---
 2 files changed, 29 insertions(+), 6 deletions(-)

diff --git a/libc/src/__support/FPUtil/generic/FMA.h b/libc/src/__support/FPUtil/generic/FMA.h
index d954006f8de69..754efc0f5c21d 100644
--- a/libc/src/__support/FPUtil/generic/FMA.h
+++ b/libc/src/__support/FPUtil/generic/FMA.h
@@ -231,6 +231,8 @@ fma(InType x, InType y, InType z) {
   OutStorageType result = 0;
   int r_exp = 0; // Unbiased exponent of the result
 
+  int round_mode = fputil::quick_get_round();
+
   // Normalize the result.
   if (prod_mant != 0) {
     int lead_zeros = cpp::countl_zero(prod_mant);
@@ -269,12 +271,15 @@ fma(InType x, InType y, InType z) {
       r_exp = 0;
     }
   } else {
-    // Return +0.0 when there is exact cancellation, i.e., x*y == -z exactly.
-    prod_sign = Sign::POS;
+    // When there is exact cancellation, i.e., x*y == -z exactly, return -0.0 if
+    // rounding downward and +0.0 for other rounding modes.
+    if (round_mode == FE_DOWNWARD)
+      prod_sign = Sign::NEG;
+    else
+      prod_sign = Sign::POS;
   }
 
   // Finalize the result.
-  int round_mode = fputil::quick_get_round();
   if (LIBC_UNLIKELY(r_exp >= OutFPBits::MAX_BIASED_EXPONENT)) {
     if ((round_mode == FE_TOWARDZERO) ||
         (round_mode == FE_UPWARD && prod_sign.is_neg()) ||
diff --git a/libc/test/src/math/smoke/FmaTest.h b/libc/test/src/math/smoke/FmaTest.h
index 3edd3aceb4847..f942de37654dd 100644
--- a/libc/test/src/math/smoke/FmaTest.h
+++ b/libc/test/src/math/smoke/FmaTest.h
@@ -58,6 +58,7 @@ class FmaTestTemplate : public LIBC_NAMESPACE::testing::FEnvSafeTest {
       EXPECT_FP_EQ(out.zero,
                    func(InType(0.5), in.min_denormal, in.min_denormal));
     }
+
     // Test underflow rounding down.
     OutType v = OutFPBits(static_cast<OutStorageType>(OUT_MIN_NORMAL_U +
                                                       OutStorageType(1)))
@@ -73,12 +74,29 @@ class FmaTestTemplate : public LIBC_NAMESPACE::testing::FEnvSafeTest {
           out.min_normal,
           func(InType(1) / InType(IN_MIN_NORMAL_U << 1), v, out.min_normal));
     }
+
     // Test overflow.
     OutType z = out.max_normal;
-    EXPECT_FP_EQ(OutType(0.75) * z, func(InType(1.75), z, -z));
+    EXPECT_FP_EQ_ALL_ROUNDING(OutType(0.75) * z, func(InType(1.75), z, -z));
+
     // Exact cancellation.
-    EXPECT_FP_EQ(out.zero, func(InType(3.0), InType(5.0), -InType(15.0)));
-    EXPECT_FP_EQ(out.zero, func(InType(-3.0), InType(5.0), InType(15.0)));
+    EXPECT_FP_EQ_ROUNDING_NEAREST(
+        out.zero, func(InType(3.0), InType(5.0), InType(-15.0)));
+    EXPECT_FP_EQ_ROUNDING_UPWARD(out.zero,
+                                 func(InType(3.0), InType(5.0), InType(-15.0)));
+    EXPECT_FP_EQ_ROUNDING_TOWARD_ZERO(
+        out.zero, func(InType(3.0), InType(5.0), InType(-15.0)));
+    EXPECT_FP_EQ_ROUNDING_DOWNWARD(
+        out.neg_zero, func(InType(3.0), InType(5.0), InType(-15.0)));
+
+    EXPECT_FP_EQ_ROUNDING_NEAREST(
+        out.zero, func(InType(-3.0), InType(5.0), InType(15.0)));
+    EXPECT_FP_EQ_ROUNDING_UPWARD(out.zero,
+                                 func(InType(-3.0), InType(5.0), InType(15.0)));
+    EXPECT_FP_EQ_ROUNDING_TOWARD_ZERO(
+        out.zero, func(InType(-3.0), InType(5.0), InType(15.0)));
+    EXPECT_FP_EQ_ROUNDING_DOWNWARD(
+        out.neg_zero, func(InType(-3.0), InType(5.0), InType(15.0)));
   }
 };
 

>From 9d4dc22bb68c02a43123cd8fc6732cf0bd9eac7f Mon Sep 17 00:00:00 2001
From: OverMighty <its.overmighty at gmail.com>
Date: Fri, 14 Jun 2024 17:34:32 +0200
Subject: [PATCH 09/10] [libc] Remove braces from single-statement ifs as per
 LLVM code style

---
 libc/src/__support/FPUtil/generic/FMA.h | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/libc/src/__support/FPUtil/generic/FMA.h b/libc/src/__support/FPUtil/generic/FMA.h
index 754efc0f5c21d..950db8c1ac671 100644
--- a/libc/src/__support/FPUtil/generic/FMA.h
+++ b/libc/src/__support/FPUtil/generic/FMA.h
@@ -71,11 +71,10 @@ template <> LIBC_INLINE float fma<float>(float x, float y, float z) {
     // Update sticky bits if t != 0.0 and the least (52 - 23 - 1 = 28) bits are
     // zero.
     if (!t.is_zero() && ((bit_sum.get_mantissa() & 0xfff'ffffULL) == 0)) {
-      if (bit_sum.sign() != t.sign()) {
+      if (bit_sum.sign() != t.sign())
         bit_sum.set_mantissa(bit_sum.get_mantissa() + 1);
-      } else if (bit_sum.get_mantissa()) {
+      else if (bit_sum.get_mantissa())
         bit_sum.set_mantissa(bit_sum.get_mantissa() - 1);
-      }
     }
   }
 
@@ -122,9 +121,8 @@ fma(InType x, InType y, InType z) {
   constexpr TmpResultType EXTRA_FRACTION_STICKY_MASK =
       (TmpResultType(1) << (EXTRA_FRACTION_LEN - 1)) - 1;
 
-  if (LIBC_UNLIKELY(x == 0 || y == 0 || z == 0)) {
+  if (LIBC_UNLIKELY(x == 0 || y == 0 || z == 0))
     return static_cast<OutType>(x * y + z);
-  }
 
   int x_exp = 0;
   int y_exp = 0;
@@ -293,9 +291,8 @@ fma(InType x, InType y, InType z) {
   result = static_cast<OutStorageType>(
       (result & OutFPBits::FRACTION_MASK) |
       (static_cast<OutStorageType>(r_exp) << OutFPBits::FRACTION_LEN));
-  if (prod_sign.is_neg()) {
+  if (prod_sign.is_neg())
     result |= OutFPBits::SIGN_MASK;
-  }
 
   // Rounding.
   if (round_mode == FE_TONEAREST) {

>From eb0ff6bb8809f47358be7c0dece4f212b365ffb9 Mon Sep 17 00:00:00 2001
From: OverMighty <its.overmighty at gmail.com>
Date: Fri, 14 Jun 2024 17:36:07 +0200
Subject: [PATCH 10/10] [libc][math] Remove test_special_numbers from FMA MPFR
 unit test duplicated from smoke test

---
 libc/test/src/math/FmaTest.h        | 41 -----------------------------
 libc/test/src/math/f16fmaf_test.cpp |  4 ---
 libc/test/src/math/fma_test.cpp     |  4 ---
 libc/test/src/math/fmaf_test.cpp    |  4 ---
 4 files changed, 53 deletions(-)

diff --git a/libc/test/src/math/FmaTest.h b/libc/test/src/math/FmaTest.h
index 0e0f05aad5d13..53895e7d633c2 100644
--- a/libc/test/src/math/FmaTest.h
+++ b/libc/test/src/math/FmaTest.h
@@ -59,47 +59,6 @@ class FmaTestTemplate : public LIBC_NAMESPACE::testing::FEnvSafeTest {
 public:
   using FmaFunc = OutType (*)(InType, InType, InType);
 
-  void test_special_numbers(FmaFunc func) {
-    EXPECT_FP_EQ(out.zero, func(in.zero, in.zero, in.zero));
-    EXPECT_FP_EQ(out.neg_zero, func(in.zero, in.neg_zero, in.neg_zero));
-    EXPECT_FP_EQ(out.inf, func(in.inf, in.inf, in.zero));
-    EXPECT_FP_EQ(out.neg_inf, func(in.neg_inf, in.inf, in.neg_inf));
-    EXPECT_FP_EQ(out.aNaN, func(in.inf, in.zero, in.zero));
-    EXPECT_FP_EQ(out.aNaN, func(in.inf, in.neg_inf, in.inf));
-    EXPECT_FP_EQ(out.aNaN, func(in.aNaN, in.zero, in.inf));
-    EXPECT_FP_EQ(out.aNaN, func(in.inf, in.neg_inf, in.aNaN));
-
-    // Test underflow rounding up.
-    EXPECT_FP_EQ(OutFPBits(OutStorageType(2)).get_val(),
-                 func(OutType(0.5), out.min_denormal, out.min_denormal));
-
-    if constexpr (sizeof(OutType) < sizeof(InType)) {
-      EXPECT_FP_EQ(out.zero,
-                   func(InType(0.5), in.min_denormal, in.min_denormal));
-    }
-    // Test underflow rounding down.
-    OutType v = OutFPBits(static_cast<OutStorageType>(OUT_MIN_NORMAL_U +
-                                                      OutStorageType(1)))
-                    .get_val();
-    EXPECT_FP_EQ(v, func(OutType(1) / OutType(OUT_MIN_NORMAL_U << 1), v,
-                         out.min_normal));
-
-    if constexpr (sizeof(OutType) < sizeof(InType)) {
-      InType v = InFPBits(static_cast<InStorageType>(IN_MIN_NORMAL_U +
-                                                     InStorageType(1)))
-                     .get_val();
-      EXPECT_FP_EQ(
-          out.min_normal,
-          func(InType(1) / InType(IN_MIN_NORMAL_U << 1), v, out.min_normal));
-    }
-    // Test overflow.
-    OutType z = out.max_normal;
-    EXPECT_FP_EQ(OutType(0.75) * z, func(InType(1.75), z, -z));
-    // Exact cancellation.
-    EXPECT_FP_EQ(out.zero, func(InType(3.0), InType(5.0), -InType(15.0)));
-    EXPECT_FP_EQ(out.zero, func(InType(-3.0), InType(5.0), InType(15.0)));
-  }
-
   void test_subnormal_range(FmaFunc func) {
     constexpr InStorageType COUNT = 100'001;
     constexpr InStorageType STEP =
diff --git a/libc/test/src/math/f16fmaf_test.cpp b/libc/test/src/math/f16fmaf_test.cpp
index dc197e297c11e..e4ca88b8810e1 100644
--- a/libc/test/src/math/f16fmaf_test.cpp
+++ b/libc/test/src/math/f16fmaf_test.cpp
@@ -12,10 +12,6 @@
 
 using LlvmLibcF16fmafTest = FmaTestTemplate<float16, float>;
 
-TEST_F(LlvmLibcF16fmafTest, SpecialNumbers) {
-  test_special_numbers(&LIBC_NAMESPACE::f16fmaf);
-}
-
 TEST_F(LlvmLibcF16fmafTest, SubnormalRange) {
   test_subnormal_range(&LIBC_NAMESPACE::f16fmaf);
 }
diff --git a/libc/test/src/math/fma_test.cpp b/libc/test/src/math/fma_test.cpp
index 20224d99894be..dd761382631d5 100644
--- a/libc/test/src/math/fma_test.cpp
+++ b/libc/test/src/math/fma_test.cpp
@@ -276,10 +276,6 @@ struct LlvmLibcFmaTest : public FmaTestTemplate<double> {
   }
 };
 
-TEST_F(LlvmLibcFmaTest, SpecialNumbers) {
-  test_special_numbers(&LIBC_NAMESPACE::fma);
-}
-
 TEST_F(LlvmLibcFmaTest, SubnormalRange) {
   test_subnormal_range(&LIBC_NAMESPACE::fma);
 }
diff --git a/libc/test/src/math/fmaf_test.cpp b/libc/test/src/math/fmaf_test.cpp
index b607d4a66f8eb..0e498d46ecfb0 100644
--- a/libc/test/src/math/fmaf_test.cpp
+++ b/libc/test/src/math/fmaf_test.cpp
@@ -12,10 +12,6 @@
 
 using LlvmLibcFmafTest = FmaTestTemplate<float>;
 
-TEST_F(LlvmLibcFmafTest, SpecialNumbers) {
-  test_special_numbers(&LIBC_NAMESPACE::fmaf);
-}
-
 TEST_F(LlvmLibcFmafTest, SubnormalRange) {
   test_subnormal_range(&LIBC_NAMESPACE::fmaf);
 }