[libc-commits] [libc] b8e8012 - [libc][math] fmod/fmodf implementation.

Fri Jun 24 14:09:47 PDT 2022

Author: Kirill Okhotnikov
Date: 2022-06-24T23:09:14+02:00
New Revision: b8e8012aa2ede55be77173b918b4d6363b55c9d1

URL: https://github.com/llvm/llvm-project/commit/b8e8012aa2ede55be77173b918b4d6363b55c9d1
DIFF: https://github.com/llvm/llvm-project/commit/b8e8012aa2ede55be77173b918b4d6363b55c9d1.diff

LOG: [libc][math] fmod/fmodf implementation.

This is a implementation of find remainder fmod function from standard libm.
The underline algorithm is developed by myself, but probably it was first
invented before.
Some features of the implementation:
1. The code is written on more-or-less modern C++.
2. One general implementation for both float and double precision numbers.
3. Spitted platform/architecture dependent and independent code and tests.
4. Tests covers 100% of the code for both float and double numbers. Tests cases with NaN/Inf etc is copied from glibc.
5. The new implementation in general 2-4 times faster for “regular” x,y values. It can be 20 times faster for x/y huge value, but can also be 2 times slower for double denormalized range (according to perf tests provided).
6. Two different implementation of division loop are provided. In some platforms division can be very time consuming operation. Depend on platform it can be 3-10 times slower than multiplication.

Performance tests:

The test is based on core-math project (https://gitlab.inria.fr/core-math/core-math). By Tue Ly suggestion I took hypot function and use it as template for fmod. Preserving all test cases.

`./check.sh <--special|--worst> fmodf` passed.
`CORE_MATH_PERF_MODE=rdtsc ./perf.sh fmodf` results are

```
GNU libc version: 2.35
GNU libc release: stable
21.166 <-- FPU
51.031 <-- current glibc
37.659 <-- this fmod version.
```

Added: 
    libc/src/__support/FPUtil/generic/FMod.h
    libc/src/math/fmod.h
    libc/src/math/fmodf.h
    libc/src/math/generic/fmod.cpp
    libc/src/math/generic/fmodf.cpp
    libc/test/src/math/FModTest.h
    libc/test/src/math/differential_testing/fmod_diff.cpp
    libc/test/src/math/differential_testing/fmod_perf.cpp
    libc/test/src/math/differential_testing/fmodf_diff.cpp
    libc/test/src/math/differential_testing/fmodf_perf.cpp
    libc/test/src/math/exhaustive/fmod_generic_impl_test.cpp
    libc/test/src/math/fmod_test.cpp
    libc/test/src/math/fmodf_test.cpp

Modified: 
    libc/config/darwin/arm/entrypoints.txt
    libc/config/linux/aarch64/entrypoints.txt
    libc/config/linux/x86_64/entrypoints.txt
    libc/config/windows/entrypoints.txt
    libc/docs/math.rst
    libc/spec/stdc.td
    libc/src/__support/FPUtil/FPBits.h
    libc/src/__support/FPUtil/Hypot.h
    libc/src/__support/FPUtil/builtin_wrappers.h
    libc/src/__support/FPUtil/generic/CMakeLists.txt
    libc/src/__support/FPUtil/generic/FMA.h
    libc/src/__support/FPUtil/generic/sqrt.h
    libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h
    libc/src/__support/str_to_float.h
    libc/src/math/CMakeLists.txt
    libc/src/math/generic/CMakeLists.txt
    libc/test/src/math/CMakeLists.txt
    libc/test/src/math/differential_testing/CMakeLists.txt
    libc/test/src/math/exhaustive/CMakeLists.txt
    libc/utils/MPFRWrapper/MPFRUtils.cpp
    libc/utils/MPFRWrapper/MPFRUtils.h

Removed: 
    


################################################################################
diff  --git a/libc/config/darwin/arm/entrypoints.txt b/libc/config/darwin/arm/entrypoints.txt
index a8f882f04cc21..5c44d6a20c023 100644

--- a/libc/config/darwin/arm/entrypoints.txt
+++ b/libc/config/darwin/arm/entrypoints.txt
@@ -131,6 +131,8 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.fmin
     libc.src.math.fminf
     libc.src.math.fminl
+    libc.src.math.fmod
+    libc.src.math.fmodf
     libc.src.math.frexp
     libc.src.math.frexpf
     libc.src.math.frexpl

diff  --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt
index 01b4fac24af78..7b1abab3b395e 100644
--- a/libc/config/linux/aarch64/entrypoints.txt
+++ b/libc/config/linux/aarch64/entrypoints.txt
@@ -150,6 +150,8 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.fmin
     libc.src.math.fminf
     libc.src.math.fminl
+    libc.src.math.fmod
+    libc.src.math.fmodf
     libc.src.math.frexp
     libc.src.math.frexpf
     libc.src.math.frexpl

diff  --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index c744e37fae49d..e6a63ec2cd5b5 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -156,6 +156,8 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.fmax
     libc.src.math.fmaxf
     libc.src.math.fmaxl
+    libc.src.math.fmod
+    libc.src.math.fmodf
     libc.src.math.frexp
     libc.src.math.frexpf
     libc.src.math.frexpl

diff  --git a/libc/config/windows/entrypoints.txt b/libc/config/windows/entrypoints.txt
index 8d7cf6713c86e..503c1f120da23 100644
--- a/libc/config/windows/entrypoints.txt
+++ b/libc/config/windows/entrypoints.txt
@@ -133,6 +133,8 @@ set(TARGET_LIBM_ENTRYPOINTS
     libc.src.math.fmax
     libc.src.math.fmaxf
     libc.src.math.fmaxl
+    libc.src.math.fmod
+    libc.src.math.fmodf
     libc.src.math.frexp
     libc.src.math.frexpf
     libc.src.math.frexpl

diff  --git a/libc/docs/math.rst b/libc/docs/math.rst
index 2603ffb1ff736..42c402ffda2a8 100644
--- a/libc/docs/math.rst
+++ b/libc/docs/math.rst
@@ -76,7 +76,7 @@ fdim           |check|          |check|         |check|
 floor          |check|          |check|         |check|  
 fmax           |check|          |check|         |check|  
 fmin           |check|          |check|         |check|  
-fmod
+fmod           |check|          |check|
 fpclassify
 frexp          |check|          |check|         |check|  
 ilogb          |check|          |check|         |check|  

diff  --git a/libc/spec/stdc.td b/libc/spec/stdc.td
index 90218443d0991..72809cc694cf2 100644
--- a/libc/spec/stdc.td
+++ b/libc/spec/stdc.td
@@ -378,6 +378,10 @@ def StdC : StandardSpec<"stdc"> {
           FunctionSpec<"fma", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>, ArgSpec<DoubleType>]>,
           FunctionSpec<"fmaf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>, ArgSpec<FloatType>]>,
 
+          FunctionSpec<"fmod", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>,
+
+          FunctionSpec<"fmodf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>]>,
+
           FunctionSpec<"frexp", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<IntPtr>]>,
           FunctionSpec<"frexpf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<IntPtr>]>,
           FunctionSpec<"frexpl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>, ArgSpec<IntPtr>]>,

diff  --git a/libc/src/__support/FPUtil/FPBits.h b/libc/src/__support/FPUtil/FPBits.h
index 6743fee9b1a48..a30e41f2d0467 100644
--- a/libc/src/__support/FPUtil/FPBits.h
+++ b/libc/src/__support/FPUtil/FPBits.h
@@ -13,6 +13,8 @@
 
 #include "src/__support/CPP/Bit.h"
 #include "src/__support/CPP/TypeTraits.h"
+#include "src/__support/FPUtil/builtin_wrappers.h"
+#include "src/__support/common.h"
 
 #include "FloatProperties.h"
 #include <stdint.h>
@@ -160,6 +162,33 @@ template <typename T> struct FPBits {
     bits.set_mantissa(v);
     return T(bits);
   }
+
+  // The function convert integer number and unbiased exponent to proper float
+  // T type:
+  //   Result = number * 2^(ep+1 - exponent_bias)
+  // Be careful!
+  //   1) "ep" is raw exponent value.
+  //   2) The function add to +1 to ep for seamless normalized to denormalized
+  //      transition.
+  //   3) The function did not check exponent high limit.
+  //   4) "number" zero value is not processed correctly.
+  //   5) Number is unsigned, so the result can be only positive.
+  inline static constexpr FPBits<T> make_value(UIntType number, int ep) {
+    FPBits<T> result;
+    // offset: +1 for sign, but -1 for implicit first bit
+    int lz = fputil::unsafe_clz(number) - FloatProp::EXPONENT_WIDTH;
+    number <<= lz;
+    ep -= lz;
+
+    if (likely(ep >= 0)) {
+      // Implicit number bit will be removed by mask
+      result.set_mantissa(number);
+      result.set_unbiased_exponent(ep + 1);
+    } else {
+      result.set_mantissa(number >> -ep);
+    }
+    return result;
+  }
 };
 
 } // namespace fputil

diff  --git a/libc/src/__support/FPUtil/Hypot.h b/libc/src/__support/FPUtil/Hypot.h
index 0e3cefeeca576..c4eb0abe6de75 100644
--- a/libc/src/__support/FPUtil/Hypot.h
+++ b/libc/src/__support/FPUtil/Hypot.h
@@ -26,7 +26,7 @@ template <typename T>
 static inline T find_leading_one(T mant, int &shift_length) {
   shift_length = 0;
   if (mant > 0) {
-    shift_length = (sizeof(mant) * 8) - 1 - clz(mant);
+    shift_length = (sizeof(mant) * 8) - 1 - unsafe_clz(mant);
   }
   return T(1) << shift_length;
 }

diff  --git a/libc/src/__support/FPUtil/builtin_wrappers.h b/libc/src/__support/FPUtil/builtin_wrappers.h
index 723299592d808..3924610bc7168 100644
--- a/libc/src/__support/FPUtil/builtin_wrappers.h
+++ b/libc/src/__support/FPUtil/builtin_wrappers.h
@@ -17,6 +17,15 @@ namespace fputil {
 // __builtin_clz/ctz* rather than using the exactly-sized aliases from stdint.h.
 // This way, we can avoid making any assumptions about integer sizes and let the
 // compiler match for us.
+namespace __internal {
+
+template <typename T> static inline int correct_zero(T val, int bits) {
+  if (val == T(0))
+    return sizeof(T(0)) * 8;
+  else
+    return bits;
+}
+
 template <typename T> static inline int clz(T val);
 template <> inline int clz<unsigned int>(unsigned int val) {
   return __builtin_clz(val);
@@ -38,6 +47,23 @@ template <> inline int ctz<unsigned long int>(unsigned long int val) {
 template <> inline int ctz<unsigned long long int>(unsigned long long int val) {
   return __builtin_ctzll(val);
 }
+} // namespace __internal
+
+template <typename T> static inline int safe_ctz(T val) {
+  return __internal::correct_zero(val, __internal::ctz(val));
+}
+
+template <typename T> static inline int unsafe_ctz(T val) {
+  return __internal::ctz(val);
+}
+
+template <typename T> static inline int safe_clz(T val) {
+  return __internal::correct_zero(val, __internal::clz(val));
+}
+
+template <typename T> static inline int unsafe_clz(T val) {
+  return __internal::clz(val);
+}
 
 template <typename T> static inline bool isnan(T val) {
   return __builtin_isnan(val);

diff  --git a/libc/src/__support/FPUtil/generic/CMakeLists.txt b/libc/src/__support/FPUtil/generic/CMakeLists.txt
index 227bd601ddb39..f19b887dac18c 100644
--- a/libc/src/__support/FPUtil/generic/CMakeLists.txt
+++ b/libc/src/__support/FPUtil/generic/CMakeLists.txt
@@ -14,3 +14,9 @@ add_header_library(
   DEPENDS
     libc.src.__support.CPP.uint128
 )
+
+add_header_library(
+  fmod
+  HDRS
+    FMod.h
+)

diff  --git a/libc/src/__support/FPUtil/generic/FMA.h b/libc/src/__support/FPUtil/generic/FMA.h
index c487cca0ce115..b71f5bf64dc3d 100644
--- a/libc/src/__support/FPUtil/generic/FMA.h
+++ b/libc/src/__support/FPUtil/generic/FMA.h
@@ -207,8 +207,9 @@ template <> inline double fma<double>(double x, double y, double z) {
   // Normalize the result.
   if (prod_mant != 0) {
     uint64_t prod_hi = static_cast<uint64_t>(prod_mant >> 64);
-    int lead_zeros =
-        prod_hi ? clz(prod_hi) : 64 + clz(static_cast<uint64_t>(prod_mant));
+    int lead_zeros = prod_hi
+                         ? unsafe_clz(prod_hi)
+                         : 64 + unsafe_clz(static_cast<uint64_t>(prod_mant));
     // Move the leading 1 to the most significant bit.
     prod_mant <<= lead_zeros;
     // The lower 64 bits are always sticky bits after moving the leading 1 to

diff  --git a/libc/src/__support/FPUtil/generic/FMod.h b/libc/src/__support/FPUtil/generic/FMod.h
new file mode 100644
index 0000000000000..3a03f931657ab
--- /dev/null
+++ b/libc/src/__support/FPUtil/generic/FMod.h
@@ -0,0 +1,312 @@
+//===-- Common header for fmod implementations ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_SUPPORT_FPUTIL_GENERIC_FMOD_H
+#define LLVM_LIBC_SRC_SUPPORT_FPUTIL_GENERIC_FMOD_H
+
+#include "src/__support/CPP/Limits.h"
+#include "src/__support/CPP/TypeTraits.h"
+#include "src/__support/FPUtil/FEnvImpl.h"
+#include "src/__support/FPUtil/FPBits.h"
+#include "src/__support/FPUtil/builtin_wrappers.h"
+#include "src/__support/common.h"
+#include "src/math/generic/math_utils.h"
+
+namespace __llvm_libc {
+namespace fputil {
+namespace generic {
+
+//  Objective:
+//    The  algorithm uses  integer arithmetic  (max uint64_t)  for general case.
+//    Some common  cases, like  abs(x) < abs(y)  or  abs(x) < 1000 *  abs(y) are
+//    treated specially to increase  performance.  The part of checking  special
+//    cases, numbers NaN, INF etc. treated separately.
+//
+//  Objective:
+//    1) FMod definition (https://cplusplus.com/reference/cmath/fmod/):
+//       fmod = numer - tquot * denom, where tquot is the truncated
+//       (i.e., rounded towards zero) result of: numer/denom.
+//    2) FMod with negative x and/or y can be trivially converted to fmod for
+//       positive x and y. Therefore the algorithm below works only with
+//       positive numbers.
+//    3) All positive floating point numbers can be represented as m * 2^e,
+//       where "m" is positive integer and "e" is signed.
+//    4) FMod function can be calculated in integer numbers (x > y):
+//         fmod = m_x * 2^e_x - tquot * m_y * 2^e_y
+//              = 2^e_y * (m_x * 2^(e_x - e^y) - tquot * m_y).
+//       All variables in parentheses are unsigned integers.
+//
+//  Mathematical background:
+//    Input x,y in the algorithm is represented (mathematically) like m_x*2^e_x
+//    and m_y*2^e_y. This is an ambiguous number representation. For example:
+//      m * 2^e = (2 * m) * 2^(e-1)
+//    The algorithm uses the facts that
+//      r = a % b = (a % (N * b)) % b,
+//      (a * c) % (b * c) = (a % b) * c
+//    where N is positive  integer number. a, b and c - positive. Let's  adopt
+//    the formula for representation above.
+//      a = m_x * 2^e_x, b = m_y * 2^e_y, N = 2^k
+//      r(k) = a % b = (m_x * 2^e_x) % (2^k * m_y * 2^e_y)
+//           = 2^(e_y + k) * (m_x * 2^(e_x - e_y - k) % m_y)
+//      r(k) = m_r * 2^e_r = (m_x % m_y) * 2^(m_y + k)
+//           = (2^p * (m_x % m_y) * 2^(e_y + k - p))
+//        m_r = 2^p * (m_x % m_y), e_r = m_y + k - p
+//
+//  Algorithm description:
+//  First, let write x = m_x * 2^e_x and y = m_y * 2^e_y with m_x, m_y, e_x, e_y
+//  are integers (m_x amd m_y positive).
+//  Then the naive  implementation of the fmod function with a simple
+//  for/while loop:
+//      while (e_x > e_y) {
+//        m_x *= 2; --e_x; //  m_x * 2^e_x == 2 * m_x * 2^(e_x - 1)
+//        m_x %= m_y;
+//      }
+//  On the other hand, the algorithm exploits the fact that m_x, m_y are the
+//  mantissas of floating point numbers, which use less bits than the storage
+//  integers: 24 / 32 for floats and 53 / 64 for doubles, so if in each step of
+//  the iteration, we can left shift m_x as many bits as the storage integer
+//  type can hold, the exponent reduction per step will be at least 32 - 24 = 8
+//  for floats and 64 - 53 = 11 for doubles (double example below):
+//      while (e_x > e_y) {
+//        m_x <<= 11; e_x -= 11; //  m_x * 2^e_x == 2^11 * m_x * 2^(e_x - 11)
+//        m_x %= m_y;
+//      }
+//  Some extra improvements are done:
+//    1) Shift m_y maximum to the right, which can significantly improve
+//       performance for small integer numbers (y = 3 for example).
+//       The m_x shift in the loop can be 62 instead of 11 for double.
+//    2) For some architectures with very slow division, it can be better to
+//       calculate inverse value ones, and after do multiplication in the loop.
+//    3) "likely" special cases are treated specially to improve performance.
+//
+//  Simple example:
+//  The examples below use byte for simplicity.
+//    1) Shift hy maximum to right without losing bits and increase iy value
+//       m_y = 0b00101100 e_y = 20 after shift m_y = 0b00001011 e_y = 22.
+//    2) m_x = m_x % m_y.
+//    3) Move m_x maximum to left. Note that after (m_x = m_x % m_y) CLZ in m_x
+//    is not lower than CLZ in m_y. m_x=0b00001001 e_x = 100, m_x=0b10010000,
+//       e_x = 100-4 = 96.
+//    4) Repeat (2) until e_x == e_y.
+//
+//  Complexity analysis (double):
+//    Converting x,y to (m_x,e_x),(m_y, e_y): CTZ/shift/AND/OR/if. Loop  count:
+//      (m_x - m_y) / (64 -  "length of m_y").
+//      max("length of m_y")  = 53,
+//      max(e_x - e_y)  = 2048
+//    Maximum operation is  186. For rare "unrealistic" cases.
+//
+//  Special cases (double):
+//    Supposing  that  case  where |y| > 1e-292 and |x/y|<2000  is  very  common
+//    special processing is implemented. No m_y alignment, no loop:
+//      result = (m_x * 2^(e_x - e_y)) % m_y.
+//    When x and y are both subnormal (rare case but...) the
+//      result = m_x % m_y.
+//    Simplified conversion back to double.
+
+// Exceptional cases handler according to cppreference.com
+//    https://en.cppreference.com/w/cpp/numeric/math/fmod
+// and POSIX standard described in Linux man
+//   https://man7.org/linux/man-pages/man3/fmod.3p.html
+// C standard for the function is not full, so not by default (although it can
+// be implemented in another handler.
+template <typename T> struct FModExceptionalInputHandler {
+
+  static_assert(cpp::IsFloatingPointType<T>::Value,
+                "FModCStandardWrapper instantiated with invalid type.");
+
+  static bool PreCheck(T x, T y, T &out) {
+    if (likely(y != 0 && fputil::isfinite(y) && fputil::isfinite(x))) {
+      return false;
+    }
+
+    if (fputil::isnan(x) || fputil::isnan(y)) {
+      out = fputil::quiet_NaN(T(0));
+      return true;
+    }
+
+    if (fputil::isinf(x) || y == 0) {
+      fputil::set_except(FE_INVALID);
+      out = with_errno(fputil::quiet_NaN(T(0)), EDOM);
+      return true;
+    }
+
+    if (fputil::isinf(y)) {
+      out = x;
+      return true;
+    }
+
+    // case where x == 0
+    out = x;
+    return true;
+  }
+};
+
+template <typename T> struct FModFastMathWrapper {
+
+  static_assert(cpp::IsFloatingPointType<T>::Value,
+                "FModFastMathWrapper instantiated with invalid type.");
+
+  static bool PreCheck(T, T, T &) { return false; }
+};
+
+template <typename T> class FModDivisionSimpleHelper {
+private:
+  using intU_t = typename FPBits<T>::UIntType;
+
+public:
+  inline constexpr static intU_t execute(int exp_
diff , int sides_zeroes_count,
+                                         intU_t m_x, intU_t m_y) {
+    while (exp_
diff  > sides_zeroes_count) {
+      exp_
diff  -= sides_zeroes_count;
+      m_x <<= sides_zeroes_count;
+      m_x %= m_y;
+    }
+    m_x <<= exp_
diff ;
+    m_x %= m_y;
+    return m_x;
+  }
+};
+
+template <typename T> class FModDivisionInvMultHelper {
+private:
+  using FPB = FPBits<T>;
+  using intU_t = typename FPB::UIntType;
+
+public:
+  inline constexpr static intU_t execute(int exp_
diff , int sides_zeroes_count,
+                                         intU_t m_x, intU_t m_y) {
+    if (exp_
diff  > sides_zeroes_count) {
+      intU_t inv_hy = (cpp::NumericLimits<intU_t>::max() / m_y);
+      while (exp_
diff  > sides_zeroes_count) {
+        exp_
diff  -= sides_zeroes_count;
+        intU_t hd =
+            (m_x * inv_hy) >> (FPB::FloatProp::BIT_WIDTH - sides_zeroes_count);
+        m_x <<= sides_zeroes_count;
+        m_x -= hd * m_y;
+        while (unlikely(m_x > m_y))
+          m_x -= m_y;
+      }
+      intU_t hd = (m_x * inv_hy) >> (FPB::FloatProp::BIT_WIDTH - exp_
diff );
+      m_x <<= exp_
diff ;
+      m_x -= hd * m_y;
+      while (unlikely(m_x > m_y))
+        m_x -= m_y;
+    } else {
+      m_x <<= exp_
diff ;
+      m_x %= m_y;
+    }
+    return m_x;
+  }
+};
+
+template <typename T, class Wrapper = FModExceptionalInputHandler<T>,
+          class DivisionHelper = FModDivisionSimpleHelper<T>>
+class FMod {
+  static_assert(cpp::IsFloatingPointType<T>::Value,
+                "FMod instantiated with invalid type.");
+
+private:
+  using FPB = FPBits<T>;
+  using intU_t = typename FPB::UIntType;
+
+  inline static constexpr FPB eval_internal(FPB sx, FPB sy) {
+
+    if (likely(sx.uintval() <= sy.uintval())) {
+      if (sx.uintval() < sy.uintval())
+        return sx;        // |x|<|y| return x
+      return FPB::zero(); // |x|=|y| return 0.0
+    }
+
+    int e_x = sx.get_unbiased_exponent();
+    int e_y = sy.get_unbiased_exponent();
+
+    // Most common case where |y| is "very normal" and |x/y| < 2^EXPONENT_WIDTH
+    if (likely(e_y > int(FPB::FloatProp::MANTISSA_WIDTH) &&
+               e_x - e_y <= int(FPB::FloatProp::EXPONENT_WIDTH))) {
+      intU_t m_x = sx.get_explicit_mantissa();
+      intU_t m_y = sy.get_explicit_mantissa();
+      intU_t d = (e_x == e_y) ? (m_x - m_y) : (m_x << (e_x - e_y)) % m_y;
+      if (d == 0)
+        return FPB::zero();
+      // iy - 1 because of "zero power" for number with power 1
+      return FPB::make_value(d, e_y - 1);
+    }
+    /* Both subnormal special case. */
+    if (unlikely(e_x == 0 && e_y == 0)) {
+      FPB d;
+      d.set_mantissa(sx.uintval() % sy.uintval());
+      return d;
+    }
+
+    // Note that hx is not subnormal by conditions above.
+    intU_t m_x = sx.get_explicit_mantissa();
+    e_x--;
+
+    intU_t m_y = sy.get_explicit_mantissa();
+    int lead_zeros_m_y = FPB::FloatProp::EXPONENT_WIDTH;
+    if (likely(e_y > 0)) {
+      e_y--;
+    } else {
+      m_y = sy.get_mantissa();
+      lead_zeros_m_y = unsafe_clz(m_y);
+    }
+
+    // Assume hy != 0
+    int tail_zeros_m_y = unsafe_ctz(m_y);
+    int sides_zeroes_count = lead_zeros_m_y + tail_zeros_m_y;
+    // n > 0 by conditions above
+    int exp_
diff  = e_x - e_y;
+    {
+      // Shift hy right until the end or n = 0
+      int right_shift = exp_
diff  < tail_zeros_m_y ? exp_
diff  : tail_zeros_m_y;
+      m_y >>= right_shift;
+      exp_
diff  -= right_shift;
+      e_y += right_shift;
+    }
+
+    {
+      // Shift hx left until the end or n = 0
+      int left_shift = exp_
diff  < int(FPB::FloatProp::EXPONENT_WIDTH)
+                           ? exp_
diff 
+                           : FPB::FloatProp::EXPONENT_WIDTH;
+      m_x <<= left_shift;
+      exp_
diff  -= left_shift;
+    }
+
+    m_x %= m_y;
+    if (unlikely(m_x == 0))
+      return FPB::zero();
+
+    if (exp_
diff  == 0)
+      return FPB::make_value(m_x, e_y);
+
+    /* hx next can't be 0, because hx < hy, hy % 2 == 1 hx * 2^i % hy != 0 */
+    m_x = DivisionHelper::execute(exp_
diff , sides_zeroes_count, m_x, m_y);
+    return FPB::make_value(m_x, e_y);
+  }
+
+public:
+  static inline T eval(T x, T y) {
+    if (T out; Wrapper::PreCheck(x, y, out))
+      return out;
+    FPB sx(x), sy(y);
+    bool sign = sx.get_sign();
+    sx.set_sign(false);
+    sy.set_sign(false);
+    FPB result = eval_internal(sx, sy);
+    result.set_sign(sign);
+    return result.get_val();
+  }
+};
+
+} // namespace generic
+} // namespace fputil
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_SUPPORT_FPUTIL_GENERIC_FMOD_H

diff  --git a/libc/src/__support/FPUtil/generic/sqrt.h b/libc/src/__support/FPUtil/generic/sqrt.h
index 6bf1bfc600f13..ed6b624c6a0eb 100644
--- a/libc/src/__support/FPUtil/generic/sqrt.h
+++ b/libc/src/__support/FPUtil/generic/sqrt.h
@@ -36,8 +36,8 @@ template <> struct SpecialLongDouble<long double> {
 template <typename T>
 static inline void normalize(int &exponent,
                              typename FPBits<T>::UIntType &mantissa) {
-  const int shift =
-      clz(mantissa) - (8 * sizeof(mantissa) - 1 - MantissaWidth<T>::VALUE);
+  const int shift = unsafe_clz(mantissa) -
+                    (8 * sizeof(mantissa) - 1 - MantissaWidth<T>::VALUE);
   exponent -= shift;
   mantissa <<= shift;
 }

diff  --git a/libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h b/libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h
index 2700ba3e12774..c460b42ae9584 100644
--- a/libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h
+++ b/libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h
@@ -21,7 +21,7 @@ namespace x86 {
 
 inline void normalize(int &exponent, UInt128 &mantissa) {
   const int shift =
-      clz(static_cast<uint64_t>(mantissa)) -
+      unsafe_clz(static_cast<uint64_t>(mantissa)) -
       (8 * sizeof(uint64_t) - 1 - MantissaWidth<long double>::VALUE);
   exponent -= shift;
   mantissa <<= shift;

diff  --git a/libc/src/__support/str_to_float.h b/libc/src/__support/str_to_float.h
index a4c6b485a282d..1fff9373be2b3 100644
--- a/libc/src/__support/str_to_float.h
+++ b/libc/src/__support/str_to_float.h
@@ -52,11 +52,11 @@ template <class T> uint32_t inline leading_zeroes(T inputNumber) {
 }
 
 template <> uint32_t inline leading_zeroes<uint32_t>(uint32_t inputNumber) {
-  return inputNumber == 0 ? 32 : fputil::clz(inputNumber);
+  return fputil::safe_clz(inputNumber);
 }
 
 template <> uint32_t inline leading_zeroes<uint64_t>(uint64_t inputNumber) {
-  return inputNumber == 0 ? 64 : fputil::clz(inputNumber);
+  return fputil::safe_clz(inputNumber);
 }
 
 static inline uint64_t low64(const UInt128 &num) {

diff  --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt
index 60ccfb2159a1f..c3376310e9d46 100644
--- a/libc/src/math/CMakeLists.txt
+++ b/libc/src/math/CMakeLists.txt
@@ -103,6 +103,9 @@ add_math_entrypoint_object(fmin)
 add_math_entrypoint_object(fminf)
 add_math_entrypoint_object(fminl)
 
+add_math_entrypoint_object(fmod)
+add_math_entrypoint_object(fmodf)
+
 add_math_entrypoint_object(frexp)
 add_math_entrypoint_object(frexpf)
 add_math_entrypoint_object(frexpl)

diff  --git a/libc/src/math/fmod.h b/libc/src/math/fmod.h
new file mode 100644
index 0000000000000..a79ff018ec641
--- /dev/null
+++ b/libc/src/math/fmod.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for fmod --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_FMOD_H
+#define LLVM_LIBC_SRC_MATH_FMOD_H
+
+namespace __llvm_libc {
+
+double fmod(double x, double y);
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_MATH_FMOD_H

diff  --git a/libc/src/math/fmodf.h b/libc/src/math/fmodf.h
new file mode 100644
index 0000000000000..ab9c4aee61178
--- /dev/null
+++ b/libc/src/math/fmodf.h
@@ -0,0 +1,18 @@
+//===-- Implementation header for fmodf -------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_FMODF_H
+#define LLVM_LIBC_SRC_MATH_FMODF_H
+
+namespace __llvm_libc {
+
+float fmodf(float x, float y);
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_MATH_FMODF_H

diff  --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 72b8e6b7a1aed..1a693e6d5c2c9 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -1090,3 +1090,29 @@ add_object_library(
   COMPILE_OPTIONS
    -O3
 )
+
+add_entrypoint_object(
+  fmod
+  SRCS
+    fmod.cpp
+  HDRS
+    ../fmod.h
+  DEPENDS
+    libc.include.math
+    libc.src.__support.FPUtil.generic.fmod
+  COMPILE_OPTIONS
+    -O3
+)
+
+add_entrypoint_object(
+  fmodf
+  SRCS
+    fmodf.cpp
+  HDRS
+    ../fmodf.h
+  DEPENDS
+    libc.include.math
+    libc.src.__support.FPUtil.generic.fmod
+  COMPILE_OPTIONS
+    -O3
+)
\ No newline at end of file

diff  --git a/libc/src/math/generic/fmod.cpp b/libc/src/math/generic/fmod.cpp
new file mode 100644
index 0000000000000..563a1644aa75d
--- /dev/null
+++ b/libc/src/math/generic/fmod.cpp
@@ -0,0 +1,19 @@
+//===-- Double-precision fmod function ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/fmod.h"
+#include "src/__support/FPUtil/generic/FMod.h"
+#include "src/__support/common.h"
+
+namespace __llvm_libc {
+
+LLVM_LIBC_FUNCTION(double, fmod, (double x, double y)) {
+  return fputil::generic::FMod<double>::eval(x, y);
+}
+
+} // namespace __llvm_libc

diff  --git a/libc/src/math/generic/fmodf.cpp b/libc/src/math/generic/fmodf.cpp
new file mode 100644
index 0000000000000..dca476c183b0c
--- /dev/null
+++ b/libc/src/math/generic/fmodf.cpp
@@ -0,0 +1,19 @@
+//===-- Single-precision fmodf function -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/math/fmodf.h"
+#include "src/__support/FPUtil/generic/FMod.h"
+#include "src/__support/common.h"
+
+namespace __llvm_libc {
+
+LLVM_LIBC_FUNCTION(float, fmodf, (float x, float y)) {
+  return fputil::generic::FMod<float>::eval(x, y);
+}
+
+} // namespace __llvm_libc

diff  --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt
index ddc02b2fc7786..971a73d3d84b0 100644
--- a/libc/test/src/math/CMakeLists.txt
+++ b/libc/test/src/math/CMakeLists.txt
@@ -1271,6 +1271,34 @@ add_fp_unittest(
     libc.src.__support.FPUtil.fputil
 )
 
+add_fp_unittest(
+  fmodf_test
+  SUITE
+    libc_math_unittests
+  SRCS
+    fmodf_test.cpp
+  HDRS
+    FModTest.h
+  DEPENDS
+    libc.include.math
+    libc.src.math.fmodf
+    libc.src.__support.FPUtil.fputil
+)
+
+add_fp_unittest(
+  fmod_test
+  SUITE
+    libc_math_unittests
+  SRCS
+    fmod_test.cpp
+  HDRS
+    FModTest.h
+  DEPENDS
+    libc.include.math
+    libc.src.math.fmod
+    libc.src.__support.FPUtil.fputil
+)
+
 add_subdirectory(generic)
 add_subdirectory(exhaustive)
 add_subdirectory(
diff erential_testing)

diff  --git a/libc/test/src/math/FModTest.h b/libc/test/src/math/FModTest.h
new file mode 100644
index 0000000000000..4962194d939f4
--- /dev/null
+++ b/libc/test/src/math/FModTest.h
@@ -0,0 +1,270 @@
+//===-- Utility class to test fmod special numbers ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_TEST_SRC_MATH_FMODTEST_H
+#define LLVM_LIBC_TEST_SRC_MATH_FMODTEST_H
+
+#include "src/__support/FPUtil/BasicOperations.h"
+#include "src/__support/FPUtil/NearestIntegerOperations.h"
+#include "utils/UnitTest/FPMatcher.h"
+#include "utils/UnitTest/Test.h"
+
+#include <limits>
+#include <math.h>
+
+#define TEST_SPECIAL(x, y, expected, dom_err, expected_exception)              \
+  EXPECT_FP_EQ(expected, f(x, y));                                             \
+  EXPECT_MATH_ERRNO((dom_err) ? EDOM : 0);                                     \
+  EXPECT_FP_EXCEPTION(expected_exception);                                     \
+  __llvm_libc::fputil::clear_except(FE_ALL_EXCEPT)
+
+#define TEST_REGULAR(x, y, expected) TEST_SPECIAL(x, y, expected, false, 0)
+
+template <typename T> class FmodTest : public __llvm_libc::testing::Test {
+
+  DECLARE_SPECIAL_CONSTANTS(T)
+
+public:
+  typedef T (*FModFunc)(T, T);
+
+  void testSpecialNumbers(FModFunc f) {
+    using nl = std::numeric_limits<T>;
+
+    // fmod (+0, y) == +0 for y != 0.
+    TEST_SPECIAL(0.0, 3.0, 0.0, false, 0);
+    TEST_SPECIAL(0.0, nl::denorm_min(), 0.0, false, 0);
+    TEST_SPECIAL(0.0, -nl::denorm_min(), 0.0, false, 0);
+    TEST_SPECIAL(0.0, nl::min(), 0.0, false, 0);
+    TEST_SPECIAL(0.0, -nl::min(), 0.0, false, 0);
+    TEST_SPECIAL(0.0, nl::max(), 0.0, false, 0);
+    TEST_SPECIAL(0.0, -nl::max(), 0.0, false, 0);
+
+    // fmod (-0, y) == -0 for y != 0.
+    TEST_SPECIAL(neg_zero, 3.0, neg_zero, false, 0);
+    TEST_SPECIAL(neg_zero, nl::denorm_min(), neg_zero, false, 0);
+    TEST_SPECIAL(neg_zero, -nl::denorm_min(), neg_zero, false, 0);
+    TEST_SPECIAL(neg_zero, nl::min(), neg_zero, false, 0);
+    TEST_SPECIAL(neg_zero, -nl::min(), neg_zero, false, 0);
+    TEST_SPECIAL(neg_zero, nl::max(), neg_zero, false, 0);
+    TEST_SPECIAL(neg_zero, -nl::max(), neg_zero, false, 0);
+
+    // fmod (+inf, y) == nl::quiet_NaN() plus invalid exception.
+    TEST_SPECIAL(inf, 3.0, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(inf, -1.1L, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(inf, 0.0, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(inf, neg_zero, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(inf, nl::denorm_min(), nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(inf, nl::min(), nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(inf, nl::max(), nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(inf, inf, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(inf, neg_inf, nl::quiet_NaN(), true, FE_INVALID);
+
+    // fmod (-inf, y) == nl::quiet_NaN() plus invalid exception.
+    TEST_SPECIAL(neg_inf, 3.0, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(neg_inf, -1.1L, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(neg_inf, 0.0, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(neg_inf, neg_zero, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(neg_inf, nl::denorm_min(), nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(neg_inf, nl::min(), nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(neg_inf, nl::max(), nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(neg_inf, inf, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(neg_inf, neg_inf, nl::quiet_NaN(), true, FE_INVALID);
+
+    // fmod (x, +0) == nl::quiet_NaN() plus invalid exception.
+    TEST_SPECIAL(3.0, 0.0, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(-1.1L, 0.0, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(0.0, 0.0, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(neg_zero, 0.0, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(nl::denorm_min(), 0.0, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(nl::min(), 0.0, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(nl::max(), 0.0, nl::quiet_NaN(), true, FE_INVALID);
+
+    // fmod (x, -0) == nl::quiet_NaN() plus invalid exception.
+    TEST_SPECIAL(3.0, neg_zero, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(-1.1L, neg_zero, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(0.0, neg_zero, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(neg_zero, neg_zero, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(nl::denorm_min(), neg_zero, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(nl::min(), neg_zero, nl::quiet_NaN(), true, FE_INVALID);
+    TEST_SPECIAL(nl::max(), neg_zero, nl::quiet_NaN(), true, FE_INVALID);
+
+    // fmod (x, +inf) == x for x not infinite.
+    TEST_SPECIAL(0.0, inf, 0.0, false, 0);
+    TEST_SPECIAL(neg_zero, inf, neg_zero, false, 0);
+    TEST_SPECIAL(nl::denorm_min(), inf, nl::denorm_min(), false, 0);
+    TEST_SPECIAL(nl::min(), inf, nl::min(), false, 0);
+    TEST_SPECIAL(nl::max(), inf, nl::max(), false, 0);
+    TEST_SPECIAL(3.0, inf, 3.0, false, 0);
+    // fmod (x, -inf) == x for x not infinite.
+    TEST_SPECIAL(0.0, neg_inf, 0.0, false, 0);
+    TEST_SPECIAL(neg_zero, neg_inf, neg_zero, false, 0);
+    TEST_SPECIAL(nl::denorm_min(), neg_inf, nl::denorm_min(), false, 0);
+    TEST_SPECIAL(nl::min(), neg_inf, nl::min(), false, 0);
+    TEST_SPECIAL(nl::max(), neg_inf, nl::max(), false, 0);
+    TEST_SPECIAL(3.0, neg_inf, 3.0, false, 0);
+
+    TEST_SPECIAL(0.0, nl::quiet_NaN(), nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(0.0, -nl::quiet_NaN(), nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(neg_zero, nl::quiet_NaN(), nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(neg_zero, -nl::quiet_NaN(), nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(1.0, nl::quiet_NaN(), nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(1.0, -nl::quiet_NaN(), nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(inf, nl::quiet_NaN(), nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(inf, -nl::quiet_NaN(), nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(neg_inf, nl::quiet_NaN(), nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(neg_inf, -nl::quiet_NaN(), nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(0.0, nl::signaling_NaN(), nl::quiet_NaN(), false, FE_INVALID);
+    TEST_SPECIAL(0.0, -nl::signaling_NaN(), nl::quiet_NaN(), false, FE_INVALID);
+    TEST_SPECIAL(neg_zero, nl::signaling_NaN(), nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(neg_zero, -nl::signaling_NaN(), nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(1.0, nl::signaling_NaN(), nl::quiet_NaN(), false, FE_INVALID);
+    TEST_SPECIAL(1.0, -nl::signaling_NaN(), nl::quiet_NaN(), false, FE_INVALID);
+    TEST_SPECIAL(inf, nl::signaling_NaN(), nl::quiet_NaN(), false, FE_INVALID);
+    TEST_SPECIAL(inf, -nl::signaling_NaN(), nl::quiet_NaN(), false, FE_INVALID);
+    TEST_SPECIAL(neg_inf, nl::signaling_NaN(), nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(neg_inf, -nl::signaling_NaN(), nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(nl::quiet_NaN(), 0.0, nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(-nl::quiet_NaN(), 0.0, nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(nl::quiet_NaN(), neg_zero, nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(-nl::quiet_NaN(), neg_zero, nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(nl::quiet_NaN(), 1.0, nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(-nl::quiet_NaN(), 1.0, nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(nl::quiet_NaN(), inf, nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(-nl::quiet_NaN(), inf, nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(nl::quiet_NaN(), neg_inf, nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(-nl::quiet_NaN(), neg_inf, nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(nl::signaling_NaN(), 0.0, nl::quiet_NaN(), false, FE_INVALID);
+    TEST_SPECIAL(-nl::signaling_NaN(), 0.0, nl::quiet_NaN(), false, FE_INVALID);
+    TEST_SPECIAL(nl::signaling_NaN(), neg_zero, nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(-nl::signaling_NaN(), neg_zero, nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(nl::signaling_NaN(), 1.0, nl::quiet_NaN(), false, FE_INVALID);
+    TEST_SPECIAL(-nl::signaling_NaN(), 1.0, nl::quiet_NaN(), false, FE_INVALID);
+    TEST_SPECIAL(nl::signaling_NaN(), inf, nl::quiet_NaN(), false, FE_INVALID);
+    TEST_SPECIAL(-nl::signaling_NaN(), inf, nl::quiet_NaN(), false, FE_INVALID);
+    TEST_SPECIAL(nl::signaling_NaN(), neg_inf, nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(-nl::signaling_NaN(), neg_inf, nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(nl::quiet_NaN(), nl::quiet_NaN(), nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(nl::quiet_NaN(), -nl::quiet_NaN(), nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(-nl::quiet_NaN(), nl::quiet_NaN(), nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(-nl::quiet_NaN(), -nl::quiet_NaN(), nl::quiet_NaN(), false, 0);
+    TEST_SPECIAL(nl::quiet_NaN(), nl::signaling_NaN(), nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(nl::quiet_NaN(), -nl::signaling_NaN(), nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(-nl::quiet_NaN(), nl::signaling_NaN(), nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(-nl::quiet_NaN(), -nl::signaling_NaN(), nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(nl::signaling_NaN(), nl::quiet_NaN(), nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(nl::signaling_NaN(), -nl::quiet_NaN(), nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(-nl::signaling_NaN(), nl::quiet_NaN(), nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(-nl::signaling_NaN(), -nl::quiet_NaN(), nl::quiet_NaN(), false,
+                 FE_INVALID);
+    TEST_SPECIAL(nl::signaling_NaN(), nl::signaling_NaN(), nl::quiet_NaN(),
+                 false, FE_INVALID);
+    TEST_SPECIAL(nl::signaling_NaN(), -nl::signaling_NaN(), nl::quiet_NaN(),
+                 false, FE_INVALID);
+    TEST_SPECIAL(-nl::signaling_NaN(), nl::signaling_NaN(), nl::quiet_NaN(),
+                 false, FE_INVALID);
+    TEST_SPECIAL(-nl::signaling_NaN(), -nl::signaling_NaN(), nl::quiet_NaN(),
+                 false, FE_INVALID);
+
+    TEST_SPECIAL(6.5, 2.25L, 2.0L, false, 0);
+    TEST_SPECIAL(-6.5, 2.25L, -2.0L, false, 0);
+    TEST_SPECIAL(6.5, -2.25L, 2.0L, false, 0);
+    TEST_SPECIAL(-6.5, -2.25L, -2.0L, false, 0);
+
+    TEST_SPECIAL(nl::max(), nl::max(), 0.0, false, 0);
+    TEST_SPECIAL(nl::max(), -nl::max(), 0.0, false, 0);
+    TEST_SPECIAL(nl::max(), nl::min(), 0.0, false, 0);
+    TEST_SPECIAL(nl::max(), -nl::min(), 0.0, false, 0);
+    TEST_SPECIAL(nl::max(), nl::denorm_min(), 0.0, false, 0);
+    TEST_SPECIAL(nl::max(), -nl::denorm_min(), 0.0, false, 0);
+    TEST_SPECIAL(-nl::max(), nl::max(), neg_zero, false, 0);
+    TEST_SPECIAL(-nl::max(), -nl::max(), neg_zero, false, 0);
+    TEST_SPECIAL(-nl::max(), nl::min(), neg_zero, false, 0);
+    TEST_SPECIAL(-nl::max(), -nl::min(), neg_zero, false, 0);
+    TEST_SPECIAL(-nl::max(), nl::denorm_min(), neg_zero, false, 0);
+    TEST_SPECIAL(-nl::max(), -nl::denorm_min(), neg_zero, false, 0);
+
+    TEST_SPECIAL(nl::min(), nl::max(), nl::min(), false, 0);
+    TEST_SPECIAL(nl::min(), -nl::max(), nl::min(), false, 0);
+    TEST_SPECIAL(nl::min(), nl::min(), 0.0, false, 0);
+    TEST_SPECIAL(nl::min(), -nl::min(), 0.0, false, 0);
+    TEST_SPECIAL(nl::min(), nl::denorm_min(), 0.0, false, 0);
+    TEST_SPECIAL(nl::min(), -nl::denorm_min(), 0.0, false, 0);
+    TEST_SPECIAL(-nl::min(), nl::max(), -nl::min(), false, 0);
+    TEST_SPECIAL(-nl::min(), -nl::max(), -nl::min(), false, 0);
+    TEST_SPECIAL(-nl::min(), nl::min(), neg_zero, false, 0);
+    TEST_SPECIAL(-nl::min(), -nl::min(), neg_zero, false, 0);
+    TEST_SPECIAL(-nl::min(), nl::denorm_min(), neg_zero, false, 0);
+    TEST_SPECIAL(-nl::min(), -nl::denorm_min(), neg_zero, false, 0);
+
+    TEST_SPECIAL(nl::denorm_min(), nl::max(), nl::denorm_min(), false, 0);
+    TEST_SPECIAL(nl::denorm_min(), -nl::max(), nl::denorm_min(), false, 0);
+    TEST_SPECIAL(nl::denorm_min(), nl::min(), nl::denorm_min(), false, 0);
+    TEST_SPECIAL(nl::denorm_min(), -nl::min(), nl::denorm_min(), false, 0);
+    TEST_SPECIAL(nl::denorm_min(), nl::denorm_min(), 0.0, false, 0);
+    TEST_SPECIAL(nl::denorm_min(), -nl::denorm_min(), 0.0, false, 0);
+    TEST_SPECIAL(-nl::denorm_min(), nl::max(), -nl::denorm_min(), false, 0);
+    TEST_SPECIAL(-nl::denorm_min(), -nl::max(), -nl::denorm_min(), false, 0);
+    TEST_SPECIAL(-nl::denorm_min(), nl::min(), -nl::denorm_min(), false, 0);
+    TEST_SPECIAL(-nl::denorm_min(), -nl::min(), -nl::denorm_min(), false, 0);
+    TEST_SPECIAL(-nl::denorm_min(), nl::denorm_min(), neg_zero, false, 0);
+    TEST_SPECIAL(-nl::denorm_min(), -nl::denorm_min(), neg_zero, false, 0);
+  }
+
+  void testRegularExtreme(FModFunc f) {
+
+    TEST_REGULAR(0x1p127L, 0x3p-149L, 0x1p-149L);
+    TEST_REGULAR(0x1p127L, -0x3p-149L, 0x1p-149L);
+    TEST_REGULAR(0x1p127L, 0x3p-148L, 0x1p-147L);
+    TEST_REGULAR(0x1p127L, -0x3p-148L, 0x1p-147L);
+    TEST_REGULAR(0x1p127L, 0x3p-126L, 0x1p-125L);
+    TEST_REGULAR(0x1p127L, -0x3p-126L, 0x1p-125L);
+    TEST_REGULAR(-0x1p127L, 0x3p-149L, -0x1p-149L);
+    TEST_REGULAR(-0x1p127L, -0x3p-149L, -0x1p-149L);
+    TEST_REGULAR(-0x1p127L, 0x3p-148L, -0x1p-147L);
+    TEST_REGULAR(-0x1p127L, -0x3p-148L, -0x1p-147L);
+    TEST_REGULAR(-0x1p127L, 0x3p-126L, -0x1p-125L);
+    TEST_REGULAR(-0x1p127L, -0x3p-126L, -0x1p-125L);
+
+    if constexpr (sizeof(T) >= sizeof(double)) {
+      TEST_REGULAR(0x1p1023L, 0x3p-1074L, 0x1p-1073L);
+      TEST_REGULAR(0x1p1023L, -0x3p-1074L, 0x1p-1073L);
+      TEST_REGULAR(0x1p1023L, 0x3p-1073L, 0x1p-1073L);
+      TEST_REGULAR(0x1p1023L, -0x3p-1073L, 0x1p-1073L);
+      TEST_REGULAR(0x1p1023L, 0x3p-1022L, 0x1p-1021L);
+      TEST_REGULAR(0x1p1023L, -0x3p-1022L, 0x1p-1021L);
+      TEST_REGULAR(-0x1p1023L, 0x3p-1074L, -0x1p-1073L);
+      TEST_REGULAR(-0x1p1023L, -0x3p-1074L, -0x1p-1073L);
+      TEST_REGULAR(-0x1p1023L, 0x3p-1073L, -0x1p-1073L);
+      TEST_REGULAR(-0x1p1023L, -0x3p-1073L, -0x1p-1073L);
+      TEST_REGULAR(-0x1p1023L, 0x3p-1022L, -0x1p-1021L);
+      TEST_REGULAR(-0x1p1023L, -0x3p-1022L, -0x1p-1021L);
+    }
+  }
+};
+
+#define LIST_FMOD_TESTS(T, func)                                               \
+  using LlvmLibcFmodTest = FmodTest<T>;                                        \
+  TEST_F(LlvmLibcFmodTest, SpecialNumbers) { testSpecialNumbers(&func); }      \
+  TEST_F(LlvmLibcFmodTest, RegularExtreme) { testRegularExtreme(&func); }
+
+#endif // LLVM_LIBC_TEST_SRC_MATH_FMODTEST_H

diff  --git a/libc/test/src/math/
diff erential_testing/CMakeLists.txt b/libc/test/src/math/
diff erential_testing/CMakeLists.txt
index 208e9979a028c..7eb2fbf26a976 100644
--- a/libc/test/src/math/
diff erential_testing/CMakeLists.txt
+++ b/libc/test/src/math/
diff erential_testing/CMakeLists.txt
@@ -470,3 +470,43 @@ add_
diff _binary(
   COMPILE_OPTIONS
     -fno-builtin
 )
+
+add_
diff _binary(
+  fmodf_
diff 
+  SRCS
+    fmodf_
diff .cpp
+  DEPENDS
+    .single_input_single_output_
diff 
+    libc.src.math.fmodf
+)
+
+add_
diff _binary(
+  fmodf_perf
+  SRCS
+    fmodf_perf.cpp
+  DEPENDS
+    .single_input_single_output_
diff 
+    libc.src.math.fmodf
+  COMPILE_OPTIONS
+    -fno-builtin
+)
+
+add_
diff _binary(
+  fmod_
diff 
+  SRCS
+    fmod_
diff .cpp
+  DEPENDS
+    .single_input_single_output_
diff 
+    libc.src.math.fmod
+)
+
+add_
diff _binary(
+  fmod_perf
+  SRCS
+    fmod_perf.cpp
+  DEPENDS
+    .single_input_single_output_
diff 
+    libc.src.math.fmod
+  COMPILE_OPTIONS
+    -fno-builtin
+)

diff  --git a/libc/test/src/math/
diff erential_testing/fmod_
diff .cpp b/libc/test/src/math/
diff erential_testing/fmod_
diff .cpp
new file mode 100644
index 0000000000000..c20a7c5140f00
--- /dev/null
+++ b/libc/test/src/math/
diff erential_testing/fmod_
diff .cpp
@@ -0,0 +1,15 @@
+//===-- Differential test for fmod ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "BinaryOpSingleOutputDiff.h"
+
+#include "src/math/fmod.h"
+
+#include <math.h>
+
+BINARY_OP_SINGLE_OUTPUT_DIFF(double, __llvm_libc::fmod, ::fmod, "fmod_
diff .log")

diff  --git a/libc/test/src/math/
diff erential_testing/fmod_perf.cpp b/libc/test/src/math/
diff erential_testing/fmod_perf.cpp
new file mode 100644
index 0000000000000..37878bee99c3a
--- /dev/null
+++ b/libc/test/src/math/
diff erential_testing/fmod_perf.cpp
@@ -0,0 +1,15 @@
+//===-- Differential test for fmod ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "BinaryOpSingleOutputDiff.h"
+
+#include "src/math/fmod.h"
+
+#include <math.h>
+
+BINARY_OP_SINGLE_OUTPUT_PERF(double, __llvm_libc::fmod, ::fmod, "fmod_perf.log")

diff  --git a/libc/test/src/math/
diff erential_testing/fmodf_
diff .cpp b/libc/test/src/math/
diff erential_testing/fmodf_
diff .cpp
new file mode 100644
index 0000000000000..634c6399877b0
--- /dev/null
+++ b/libc/test/src/math/
diff erential_testing/fmodf_
diff .cpp
@@ -0,0 +1,16 @@
+//===-- Differential test for fmodf ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "BinaryOpSingleOutputDiff.h"
+
+#include "src/math/fmodf.h"
+
+#include <math.h>
+
+BINARY_OP_SINGLE_OUTPUT_DIFF(float, __llvm_libc::fmodf, ::fmodf,
+                             "fmodf_
diff .log")

diff  --git a/libc/test/src/math/
diff erential_testing/fmodf_perf.cpp b/libc/test/src/math/
diff erential_testing/fmodf_perf.cpp
new file mode 100644
index 0000000000000..36d0fe56d964a
--- /dev/null
+++ b/libc/test/src/math/
diff erential_testing/fmodf_perf.cpp
@@ -0,0 +1,16 @@
+//===-- Differential test for fmodf ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "BinaryOpSingleOutputDiff.h"
+
+#include "src/math/fmodf.h"
+
+#include <math.h>
+
+BINARY_OP_SINGLE_OUTPUT_PERF(float, __llvm_libc::fmodf, ::fmodf,
+                             "fmodf_perf.log")

diff  --git a/libc/test/src/math/exhaustive/CMakeLists.txt b/libc/test/src/math/exhaustive/CMakeLists.txt
index 34f8a38772570..a59724819ed55 100644
--- a/libc/test/src/math/exhaustive/CMakeLists.txt
+++ b/libc/test/src/math/exhaustive/CMakeLists.txt
@@ -184,3 +184,16 @@ add_fp_unittest(
   LINK_LIBRARIES
     -lpthread
 )
+
+add_fp_unittest(
+  fmod_generic_impl_test
+  NO_RUN_POSTBUILD
+  NEED_MPFR
+  SUITE
+    libc_math_exhaustive_tests
+  SRCS
+    fmod_generic_impl_test.cpp
+  DEPENDS
+    libc.src.__support.FPUtil.fputil
+    libc.src.__support.FPUtil.generic.fmod
+)

diff  --git a/libc/test/src/math/exhaustive/fmod_generic_impl_test.cpp b/libc/test/src/math/exhaustive/fmod_generic_impl_test.cpp
new file mode 100644
index 0000000000000..9af7e87ec8f07
--- /dev/null
+++ b/libc/test/src/math/exhaustive/fmod_generic_impl_test.cpp
@@ -0,0 +1,78 @@
+//===-- Utility class to test FMod generic implementation -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include "src/__support/CPP/TypeTraits.h"
+#include "src/__support/FPUtil/generic/FMod.h"
+#include "utils/MPFRWrapper/MPFRUtils.h"
+#include "utils/UnitTest/FPMatcher.h"
+#include "utils/UnitTest/Test.h"
+
+#include <array>
+#include <limits>
+
+namespace mpfr = __llvm_libc::testing::mpfr;
+
+template <typename T, bool InverseMultiplication>
+class LlvmLibcFModTest : public __llvm_libc::testing::Test {
+
+  using DivisionHelper = __llvm_libc::cpp::ConditionalType<
+      InverseMultiplication,
+      __llvm_libc::fputil::generic::FModDivisionInvMultHelper<T>,
+      __llvm_libc::fputil::generic::FModDivisionSimpleHelper<T>>;
+
+  static constexpr std::array<T, 11> test_bases = {
+      T(0.0),
+      T(1.0),
+      T(3.0),
+      T(27.0),
+      T(11.0 / 8.0),
+      T(2.764443),
+      T(1.0) - std::numeric_limits<T>::epsilon(),
+      T(1.0) + std::numeric_limits<T>::epsilon(),
+      T(M_PI),
+      T(M_SQRT2),
+      T(M_E)};
+
+public:
+  void testExtensive() {
+    using FMod = __llvm_libc::fputil::generic::FMod<
+        T, __llvm_libc::fputil::generic::FModFastMathWrapper<T>,
+        DivisionHelper>;
+    using nl = std::numeric_limits<T>;
+    int min2 = nl::min_exponent - nl::digits - 5;
+    int max2 = nl::max_exponent + 3;
+    for (T by : test_bases) {
+      for (int iy = min2; iy < max2; iy++) {
+        T y = by * std::ldexp(2, iy);
+        if (y == 0 || !std::isfinite(y))
+          continue;
+        for (T bx : test_bases) {
+          for (int ix = min2; ix < max2; ix++) {
+            T x = bx * std::ldexp(2, ix);
+            if (!std::isfinite(x))
+              continue;
+            T result = FMod::eval(x, y);
+            mpfr::BinaryInput<T> input{x, y};
+            EXPECT_MPFR_MATCH(mpfr::Operation::Fmod, input, result, 0.0);
+          }
+        }
+      }
+    }
+  }
+};
+
+using LlvmLibcFModFloatTest = LlvmLibcFModTest<float, false>;
+TEST_F(LlvmLibcFModFloatTest, ExtensiveTest) { testExtensive(); }
+
+using LlvmLibcFModFloatInvTest = LlvmLibcFModTest<float, true>;
+TEST_F(LlvmLibcFModFloatInvTest, ExtensiveTest) { testExtensive(); }
+
+using LlvmLibcFModDoubleTest = LlvmLibcFModTest<double, false>;
+TEST_F(LlvmLibcFModDoubleTest, ExtensiveTest) { testExtensive(); }
+
+using LlvmLibcFModDoubleInvTest = LlvmLibcFModTest<double, true>;
+TEST_F(LlvmLibcFModDoubleInvTest, ExtensiveTest) { testExtensive(); }

diff  --git a/libc/test/src/math/fmod_test.cpp b/libc/test/src/math/fmod_test.cpp
new file mode 100644
index 0000000000000..03790e4941e19
--- /dev/null
+++ b/libc/test/src/math/fmod_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for fmod ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FModTest.h"
+
+#include "src/math/fmod.h"
+
+LIST_FMOD_TESTS(double, __llvm_libc::fmod)

diff  --git a/libc/test/src/math/fmodf_test.cpp b/libc/test/src/math/fmodf_test.cpp
new file mode 100644
index 0000000000000..2b13379eba252
--- /dev/null
+++ b/libc/test/src/math/fmodf_test.cpp
@@ -0,0 +1,13 @@
+//===-- Unittests for fmodf -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FModTest.h"
+
+#include "src/math/fmodf.h"
+
+LIST_FMOD_TESTS(float, __llvm_libc::fmodf)

diff  --git a/libc/utils/MPFRWrapper/MPFRUtils.cpp b/libc/utils/MPFRWrapper/MPFRUtils.cpp
index 502a15f6c090b..61052b99dfadc 100644
--- a/libc/utils/MPFRWrapper/MPFRUtils.cpp
+++ b/libc/utils/MPFRWrapper/MPFRUtils.cpp
@@ -247,6 +247,12 @@ class MPFRNumber {
     return result;
   }
 
+  MPFRNumber fmod(const MPFRNumber &b) {
+    MPFRNumber result(*this);
+    mpfr_fmod(result.value, value, b.value, mpfr_rounding);
+    return result;
+  }
+
   MPFRNumber frexp(int &exp) {
     MPFRNumber result(*this);
     mpfr_exp_t resultExp;
@@ -561,6 +567,8 @@ binary_operation_one_output(Operation op, InputType x, InputType y,
   MPFRNumber inputX(x, precision, rounding);
   MPFRNumber inputY(y, precision, rounding);
   switch (op) {
+  case Operation::Fmod:
+    return inputX.fmod(inputY);
   case Operation::Hypot:
     return inputX.hypot(inputY);
   default:

diff  --git a/libc/utils/MPFRWrapper/MPFRUtils.h b/libc/utils/MPFRWrapper/MPFRUtils.h
index 2896973a33c2f..f1caa41ca1d01 100644
--- a/libc/utils/MPFRWrapper/MPFRUtils.h
+++ b/libc/utils/MPFRWrapper/MPFRUtils.h
@@ -56,6 +56,7 @@ enum class Operation : int {
   // input and produce a single floating point number of the same type as
   // output.
   BeginBinaryOperationsSingleOutput,
+  Fmod,
   Hypot,
   EndBinaryOperationsSingleOutput,