[libc-commits] [libc] [libc] Provide more fine-grained control of FMA instruction for ARM targets. (PR #130700)
via libc-commits
libc-commits at lists.llvm.org
Mon Mar 10 20:00:30 PDT 2025
https://github.com/lntue updated https://github.com/llvm/llvm-project/pull/130700
>From d8704b4a84883e70b87d982d147166a006b1ff26 Mon Sep 17 00:00:00 2001
From: Tue Ly <lntue.h at gmail.com>
Date: Tue, 11 Mar 2025 02:39:11 +0000
Subject: [PATCH 1/2] [libc] Provide more fine-grained control of FMA
instruction for ARM targets.
---
libc/src/__support/FPUtil/FMA.h | 5 +++
libc/src/__support/FPUtil/double_double.h | 36 ++++++++++++++-----
libc/src/__support/FPUtil/multiply_add.h | 4 +++
.../macros/properties/cpu_features.h | 15 ++++++++
libc/src/math/generic/asinf.cpp | 4 +--
libc/src/math/generic/atan2f.cpp | 4 +--
libc/src/math/generic/atanf.cpp | 4 +--
libc/src/math/generic/cbrt.cpp | 2 +-
libc/src/math/generic/cos.cpp | 4 +--
libc/src/math/generic/cosf.cpp | 4 +--
libc/src/math/generic/cospif.cpp | 4 +--
libc/src/math/generic/exp10f16.cpp | 4 +--
libc/src/math/generic/exp10m1f16.cpp | 4 +--
libc/src/math/generic/exp2.cpp | 4 +--
libc/src/math/generic/exp2m1f16.cpp | 8 ++---
libc/src/math/generic/expm1f.cpp | 10 +++---
libc/src/math/generic/expm1f16.cpp | 4 +--
libc/src/math/generic/fmul.cpp | 2 +-
libc/src/math/generic/hypotf.cpp | 4 +--
libc/src/math/generic/log.cpp | 4 +--
libc/src/math/generic/log10.cpp | 4 +--
libc/src/math/generic/log10f.cpp | 8 ++---
libc/src/math/generic/log10f16.cpp | 6 ++--
libc/src/math/generic/log1p.cpp | 4 +--
libc/src/math/generic/log1pf.cpp | 4 +--
libc/src/math/generic/log2.cpp | 4 +--
libc/src/math/generic/log2f.cpp | 4 +--
libc/src/math/generic/log2f16.cpp | 8 ++---
libc/src/math/generic/logf.cpp | 8 ++---
libc/src/math/generic/logf16.cpp | 8 ++---
libc/src/math/generic/pow.cpp | 4 +--
libc/src/math/generic/powf.cpp | 8 ++---
.../generic/range_reduction_double_common.h | 4 +--
libc/src/math/generic/sin.cpp | 8 ++---
libc/src/math/generic/sincos.cpp | 8 ++---
libc/src/math/generic/sincos_eval.h | 4 +--
libc/src/math/generic/sincosf.cpp | 4 +--
libc/src/math/generic/sincosf_utils.h | 4 +--
libc/src/math/generic/sinf.cpp | 6 ++--
libc/src/math/generic/tan.cpp | 8 ++---
libc/src/math/generic/tanf.cpp | 4 +--
41 files changed, 147 insertions(+), 103 deletions(-)
diff --git a/libc/src/__support/FPUtil/FMA.h b/libc/src/__support/FPUtil/FMA.h
index 1e40d06dc1462..2cafb4c0974e3 100644
--- a/libc/src/__support/FPUtil/FMA.h
+++ b/libc/src/__support/FPUtil/FMA.h
@@ -24,6 +24,8 @@ LIBC_INLINE OutType fma(InType x, InType y, InType z) {
}
#ifdef LIBC_TARGET_CPU_HAS_FMA
+
+#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
template <> LIBC_INLINE float fma(float x, float y, float z) {
#if __has_builtin(__builtin_elementwise_fma)
return __builtin_elementwise_fma(x, y, z);
@@ -31,7 +33,9 @@ template <> LIBC_INLINE float fma(float x, float y, float z) {
return __builtin_fmaf(x, y, z);
#endif
}
+#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
template <> LIBC_INLINE double fma(double x, double y, double z) {
#if __has_builtin(__builtin_elementwise_fma)
return __builtin_elementwise_fma(x, y, z);
@@ -39,6 +43,7 @@ template <> LIBC_INLINE double fma(double x, double y, double z) {
return __builtin_fma(x, y, z);
#endif
}
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
#endif // LIBC_TARGET_CPU_HAS_FMA
} // namespace fputil
diff --git a/libc/src/__support/FPUtil/double_double.h b/libc/src/__support/FPUtil/double_double.h
index b24ffd4aa456f..c27885aadc028 100644
--- a/libc/src/__support/FPUtil/double_double.h
+++ b/libc/src/__support/FPUtil/double_double.h
@@ -100,6 +100,26 @@ LIBC_INLINE NumberPair<T> exact_mult(const NumberPair<T> &as, T a, T b) {
return r;
}
+// The templated exact multiplication needs a template version of the
+// LIBC_TARGET_CPU_HAS_FMA_* macros to correctly select the implementation.
+// These can be moved to "src/__support/macros/properties/cpu_features.h" if
+// other parts of libc need them.
+template <typename T> struct TargetHasFmaInstruction {
+ static constexpr bool VALUE = false;
+};
+
+#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
+template <> struct TargetHasFmaInstruction<float> {
+ static constexpr bool VALUE = true;
+};
+#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
+
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
+template <> struct TargetHasFmaInstruction<double> {
+ static constexpr bool VALUE = true;
+};
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
+
// Note: When FMA instruction is not available, the `exact_mult` function is
// only correct for round-to-nearest mode. See:
// Zimmermann, P., "Note on the Veltkamp/Dekker Algorithms with Directed
@@ -111,15 +131,15 @@ template <typename T = double, size_t SPLIT_B = DefaultSplit<T>::VALUE>
LIBC_INLINE NumberPair<T> exact_mult(T a, T b) {
NumberPair<T> r{0.0, 0.0};
-#ifdef LIBC_TARGET_CPU_HAS_FMA
- r.hi = a * b;
- r.lo = fputil::multiply_add(a, b, -r.hi);
-#else
- // Dekker's Product.
- NumberPair<T> as = split(a);
+ if constexpr (TargetHasFmaInstruction<T>::VALUE) {
+ r.hi = a * b;
+ r.lo = fputil::multiply_add(a, b, -r.hi);
+ } else {
+ // Dekker's Product.
+ NumberPair<T> as = split(a);
- r = exact_mult<T, SPLIT_B>(as, a, b);
-#endif // LIBC_TARGET_CPU_HAS_FMA
+ r = exact_mult<T, SPLIT_B>(as, a, b);
+ }
return r;
}
diff --git a/libc/src/__support/FPUtil/multiply_add.h b/libc/src/__support/FPUtil/multiply_add.h
index ae00e08673d08..8260702e2c9f4 100644
--- a/libc/src/__support/FPUtil/multiply_add.h
+++ b/libc/src/__support/FPUtil/multiply_add.h
@@ -46,6 +46,7 @@ multiply_add(T x, T y, T z) {
namespace LIBC_NAMESPACE_DECL {
namespace fputil {
+#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
LIBC_INLINE float multiply_add(float x, float y, float z) {
#if __has_builtin(__builtin_elementwise_fma)
return __builtin_elementwise_fma(x, y, z);
@@ -53,7 +54,9 @@ LIBC_INLINE float multiply_add(float x, float y, float z) {
return __builtin_fmaf(x, y, z);
#endif
}
+#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
LIBC_INLINE double multiply_add(double x, double y, double z) {
#if __has_builtin(__builtin_elementwise_fma)
return __builtin_elementwise_fma(x, y, z);
@@ -61,6 +64,7 @@ LIBC_INLINE double multiply_add(double x, double y, double z) {
return __builtin_fma(x, y, z);
#endif
}
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
} // namespace fputil
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/__support/macros/properties/cpu_features.h b/libc/src/__support/macros/properties/cpu_features.h
index d2cea367516db..1714775ca334d 100644
--- a/libc/src/__support/macros/properties/cpu_features.h
+++ b/libc/src/__support/macros/properties/cpu_features.h
@@ -45,6 +45,21 @@
#if defined(__ARM_FEATURE_FMA) || (defined(__AVX2__) && defined(__FMA__)) || \
defined(__NVPTX__) || defined(__AMDGPU__) || defined(__LIBC_RISCV_USE_FMA)
#define LIBC_TARGET_CPU_HAS_FMA
+// Provide more fine-grained control of FMA instructions for ARM targets.
+#if defined(__ARM_FP)
+#if (__ARM_FP & 0x2)
+#define LIBC_TARGET_CPU_HAS_FMA_HALF
+#endif // LIBC_TARGET_CPU_HAS_FMA_HALF
+#if (__ARM_FP & 0x4)
+#define LIBC_TARGET_CPU_HAS_FMA_FLOAT
+#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
+#if (__ARM_FP & 0x8)
+#define LIBC_TARGET_CPU_HAS_FMA_DOUBLE
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
+#else
+#define LIBC_TARGET_CPU_HAS_FMA_FLOAT
+#define LIBC_TARGET_CPU_HAS_FMA_DOUBLE
+#endif
#endif
#if defined(LIBC_TARGET_ARCH_IS_AARCH64) || \
diff --git a/libc/src/math/generic/asinf.cpp b/libc/src/math/generic/asinf.cpp
index 3a89def8f6e0c..b54a9e7b2b00b 100644
--- a/libc/src/math/generic/asinf.cpp
+++ b/libc/src/math/generic/asinf.cpp
@@ -74,12 +74,12 @@ LLVM_LIBC_FUNCTION(float, asinf, (float x)) {
// |x| < 2^-125. For targets without FMA instructions, we simply use
// double for intermediate results as it is more efficient than using an
// emulated version of FMA.
-#if defined(LIBC_TARGET_CPU_HAS_FMA)
+#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT)
return fputil::multiply_add(x, 0x1.0p-25f, x);
#else
double xd = static_cast<double>(x);
return static_cast<float>(fputil::multiply_add(xd, 0x1.0p-25, xd));
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
}
// Check for exceptional values
diff --git a/libc/src/math/generic/atan2f.cpp b/libc/src/math/generic/atan2f.cpp
index 5ac2b29438ea9..726cae9c8462b 100644
--- a/libc/src/math/generic/atan2f.cpp
+++ b/libc/src/math/generic/atan2f.cpp
@@ -131,7 +131,7 @@ float atan2f_double_double(double num_d, double den_d, double q_d, int idx,
num_r = num_d;
den_r = den_d;
}
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
q.lo = fputil::multiply_add(q.hi, -den_r, num_r) / den_r;
#else
// Compute `(num_r - q.hi * den_r) / den_r` accurately without FMA
@@ -140,7 +140,7 @@ float atan2f_double_double(double num_d, double den_d, double q_d, int idx,
double t1 = fputil::multiply_add(q_hi_dd.hi, -den_r, num_r); // Exact
double t2 = fputil::multiply_add(q_hi_dd.lo, -den_r, t1);
q.lo = t2 / den_r;
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
// Taylor polynomial, evaluating using Horner's scheme:
// P = x - x^3/3 + x^5/5 -x^7/7 + x^9/9 - x^11/11 + x^13/13 - x^15/15
diff --git a/libc/src/math/generic/atanf.cpp b/libc/src/math/generic/atanf.cpp
index 5e0788efbeb88..46196dbe4162c 100644
--- a/libc/src/math/generic/atanf.cpp
+++ b/libc/src/math/generic/atanf.cpp
@@ -52,12 +52,12 @@ LLVM_LIBC_FUNCTION(float, atanf, (float x)) {
return x;
// x <= 2^-12;
if (LIBC_UNLIKELY(x_abs < 0x3980'0000)) {
-#if defined(LIBC_TARGET_CPU_HAS_FMA)
+#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT)
return fputil::multiply_add(x, -0x1.0p-25f, x);
#else
double x_d = static_cast<double>(x);
return static_cast<float>(fputil::multiply_add(x_d, -0x1.0p-25, x_d));
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
}
// Use Taylor polynomial:
// atan(x) ~ x * (1 - x^2 / 3 + x^4 / 5 - x^6 / 7 + x^8 / 9 - x^10 / 11).
diff --git a/libc/src/math/generic/cbrt.cpp b/libc/src/math/generic/cbrt.cpp
index ee7d69b2c211f..ce227e6650c84 100644
--- a/libc/src/math/generic/cbrt.cpp
+++ b/libc/src/math/generic/cbrt.cpp
@@ -58,7 +58,7 @@ double intial_approximation(double x) {
// Get the error term for Newton iteration:
// h(x) = x^3 * a^2 - 1,
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
double get_error(const DoubleDouble &x_3, const DoubleDouble &a_sq) {
return fputil::multiply_add(x_3.hi, a_sq.hi, -1.0) +
fputil::multiply_add(x_3.lo, a_sq.hi, x_3.hi * a_sq.lo);
diff --git a/libc/src/math/generic/cos.cpp b/libc/src/math/generic/cos.cpp
index 568b1254c6f02..b60082bf9c308 100644
--- a/libc/src/math/generic/cos.cpp
+++ b/libc/src/math/generic/cos.cpp
@@ -20,11 +20,11 @@
#include "src/math/generic/range_reduction_double_common.h"
#include "src/math/generic/sincos_eval.h"
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
#include "range_reduction_double_fma.h"
#else
#include "range_reduction_double_nofma.h"
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/math/generic/cosf.cpp b/libc/src/math/generic/cosf.cpp
index 972ffa923aedf..23e3db067e669 100644
--- a/libc/src/math/generic/cosf.cpp
+++ b/libc/src/math/generic/cosf.cpp
@@ -101,11 +101,11 @@ LLVM_LIBC_FUNCTION(float, cosf, (float x)) {
// |x| < 2^-125. For targets without FMA instructions, we simply use
// double for intermediate results as it is more efficient than using an
// emulated version of FMA.
-#if defined(LIBC_TARGET_CPU_HAS_FMA)
+#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT)
return fputil::multiply_add(xbits.get_val(), -0x1.0p-25f, 1.0f);
#else
return static_cast<float>(fputil::multiply_add(xd, -0x1.0p-25, 1.0));
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
}
if (auto r = COSF_EXCEPTS.lookup(x_abs); LIBC_UNLIKELY(r.has_value()))
diff --git a/libc/src/math/generic/cospif.cpp b/libc/src/math/generic/cospif.cpp
index 4ef1539539921..29566f4fceacf 100644
--- a/libc/src/math/generic/cospif.cpp
+++ b/libc/src/math/generic/cospif.cpp
@@ -50,11 +50,11 @@ LLVM_LIBC_FUNCTION(float, cospif, (float x)) {
// The exhautive test passes for smaller values
if (LIBC_UNLIKELY(x_abs < 0x38A2'F984U)) {
-#if defined(LIBC_TARGET_CPU_HAS_FMA)
+#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT)
return fputil::multiply_add(xbits.get_val(), -0x1.0p-25f, 1.0f);
#else
return static_cast<float>(fputil::multiply_add(xd, -0x1.0p-25, 1.0));
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
}
// Numbers greater or equal to 2^23 are always integers or NaN
diff --git a/libc/src/math/generic/exp10f16.cpp b/libc/src/math/generic/exp10f16.cpp
index 006dd5c554428..f2002e9f146c0 100644
--- a/libc/src/math/generic/exp10f16.cpp
+++ b/libc/src/math/generic/exp10f16.cpp
@@ -26,7 +26,7 @@
namespace LIBC_NAMESPACE_DECL {
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
static constexpr size_t N_EXP10F16_EXCEPTS = 5;
#else
static constexpr size_t N_EXP10F16_EXCEPTS = 8;
@@ -44,7 +44,7 @@ static constexpr fputil::ExceptValues<float16, N_EXP10F16_EXCEPTS>
{0xbf0aU, 0x2473U, 1U, 0U, 0U},
// x = -0x1.e1cp+1, exp10f16(x) = 0x1.694p-13 (RZ)
{0xc387U, 0x09a5U, 1U, 0U, 0U},
-#ifndef LIBC_TARGET_CPU_HAS_FMA
+#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT
// x = 0x1.0cp+1, exp10f16(x) = 0x1.f04p+6 (RZ)
{0x4030U, 0x57c1U, 1U, 0U, 1U},
// x = 0x1.1b8p+1, exp10f16(x) = 0x1.47cp+7 (RZ)
diff --git a/libc/src/math/generic/exp10m1f16.cpp b/libc/src/math/generic/exp10m1f16.cpp
index 449aedf254ca5..41e2c2bb14b04 100644
--- a/libc/src/math/generic/exp10m1f16.cpp
+++ b/libc/src/math/generic/exp10m1f16.cpp
@@ -34,7 +34,7 @@ static constexpr fputil::ExceptValues<float16, 3> EXP10M1F16_EXCEPTS_LO = {{
{0x9788U, 0x9c53U, 0U, 1U, 0U},
}};
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
static constexpr size_t N_EXP10M1F16_EXCEPTS_HI = 3;
#else
static constexpr size_t N_EXP10M1F16_EXCEPTS_HI = 6;
@@ -49,7 +49,7 @@ static constexpr fputil::ExceptValues<float16, N_EXP10M1F16_EXCEPTS_HI>
{0x3657U, 0x3df6U, 1U, 0U, 0U},
// x = 0x1.d04p-2, exp10m1f16(x) = 0x1.d7p+0 (RZ)
{0x3741U, 0x3f5cU, 1U, 0U, 1U},
-#ifndef LIBC_TARGET_CPU_HAS_FMA
+#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT
// x = 0x1.0cp+1, exp10m1f16(x) = 0x1.ec4p+6 (RZ)
{0x4030U, 0x57b1U, 1U, 0U, 1U},
// x = 0x1.1b8p+1, exp10m1f16(x) = 0x1.45cp+7 (RZ)
diff --git a/libc/src/math/generic/exp2.cpp b/libc/src/math/generic/exp2.cpp
index 2c612777c9cb5..726f88b6457fc 100644
--- a/libc/src/math/generic/exp2.cpp
+++ b/libc/src/math/generic/exp2.cpp
@@ -35,11 +35,11 @@ using LIBC_NAMESPACE::operator""_u128;
// Error bounds:
// Errors when using double precision.
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
constexpr double ERR_D = 0x1.0p-63;
#else
constexpr double ERR_D = 0x1.8p-63;
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
#ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS
// Errors when using double-double precision.
diff --git a/libc/src/math/generic/exp2m1f16.cpp b/libc/src/math/generic/exp2m1f16.cpp
index 6a1cd2328a050..eceb76f1893e2 100644
--- a/libc/src/math/generic/exp2m1f16.cpp
+++ b/libc/src/math/generic/exp2m1f16.cpp
@@ -40,7 +40,7 @@ static constexpr fputil::ExceptValues<float16, 6> EXP2M1F16_EXCEPTS_LO = {{
{0x973fU, 0x9505U, 0U, 1U, 0U},
}};
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
static constexpr size_t N_EXP2M1F16_EXCEPTS_HI = 6;
#else
static constexpr size_t N_EXP2M1F16_EXCEPTS_HI = 7;
@@ -51,13 +51,13 @@ static constexpr fputil::ExceptValues<float16, N_EXP2M1F16_EXCEPTS_HI>
// (input, RZ output, RU offset, RD offset, RN offset)
// x = 0x1.e58p-3, exp2m1f16(x) = 0x1.6dcp-3 (RZ)
{0x3396U, 0x31b7U, 1U, 0U, 0U},
-#ifndef LIBC_TARGET_CPU_HAS_FMA
+#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT
// x = 0x1.2e8p-2, exp2m1f16(x) = 0x1.d14p-3 (RZ)
{0x34baU, 0x3345U, 1U, 0U, 0U},
#endif
// x = 0x1.ad8p-2, exp2m1f16(x) = 0x1.598p-2 (RZ)
{0x36b6U, 0x3566U, 1U, 0U, 0U},
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
// x = 0x1.edcp-2, exp2m1f16(x) = 0x1.964p-2 (RZ)
{0x37b7U, 0x3659U, 1U, 0U, 1U},
#endif
@@ -67,7 +67,7 @@ static constexpr fputil::ExceptValues<float16, N_EXP2M1F16_EXCEPTS_HI>
{0xb3ccU, 0xb0f9U, 0U, 1U, 0U},
// x = -0x1.294p-1, exp2m1f16(x) = -0x1.53p-2 (RZ)
{0xb8a5U, 0xb54cU, 0U, 1U, 1U},
-#ifndef LIBC_TARGET_CPU_HAS_FMA
+#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT
// x = -0x1.a34p-1, exp2m1f16(x) = -0x1.bb4p-2 (RZ)
{0xba8dU, 0xb6edU, 0U, 1U, 1U},
#endif
diff --git a/libc/src/math/generic/expm1f.cpp b/libc/src/math/generic/expm1f.cpp
index d5e9e85ed4bd3..655f0e6246676 100644
--- a/libc/src/math/generic/expm1f.cpp
+++ b/libc/src/math/generic/expm1f.cpp
@@ -38,14 +38,14 @@ LLVM_LIBC_FUNCTION(float, expm1f, (float x)) {
return 0x1.8dbe62p-3f;
}
-#if !defined(LIBC_TARGET_CPU_HAS_FMA)
+#if !defined(LIBC_TARGET_CPU_HAS_FMA_DOUBLE)
if (LIBC_UNLIKELY(x_u == 0xbdc1'c6cbU)) { // x = -0x1.838d96p-4f
int round_mode = fputil::quick_get_round();
if (round_mode == FE_TONEAREST || round_mode == FE_DOWNWARD)
return -0x1.71c884p-4f;
return -0x1.71c882p-4f;
}
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
// When |x| > 25*log(2), or nan
if (LIBC_UNLIKELY(x_abs >= 0x418a'a123U)) {
@@ -102,12 +102,12 @@ LLVM_LIBC_FUNCTION(float, expm1f, (float x)) {
// 2^-76. For targets without FMA instructions, we simply use double for
// intermediate results as it is more efficient than using an emulated
// version of FMA.
-#if defined(LIBC_TARGET_CPU_HAS_FMA)
- return fputil::fma<float>(x, x, x);
+#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT)
+ return fputil::multiply_add<float>(x, x, x);
#else
double xd = x;
return static_cast<float>(fputil::multiply_add(xd, xd, xd));
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
}
constexpr double COEFFS[] = {0x1p-1,
diff --git a/libc/src/math/generic/expm1f16.cpp b/libc/src/math/generic/expm1f16.cpp
index 4ce0efd1f461b..bfd263eaa9cb0 100644
--- a/libc/src/math/generic/expm1f16.cpp
+++ b/libc/src/math/generic/expm1f16.cpp
@@ -29,7 +29,7 @@ static constexpr fputil::ExceptValues<float16, 1> EXPM1F16_EXCEPTS_LO = {{
{0x2959U, 0x2975U, 1U, 0U, 1U},
}};
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
static constexpr size_t N_EXPM1F16_EXCEPTS_HI = 2;
#else
static constexpr size_t N_EXPM1F16_EXCEPTS_HI = 3;
@@ -42,7 +42,7 @@ static constexpr fputil::ExceptValues<float16, N_EXPM1F16_EXCEPTS_HI>
{0x3f0dU, 0x44d3U, 1U, 0U, 1U},
// x = -0x1.e28p-3, expm1f16(x) = -0x1.adcp-3 (RZ)
{0xb38aU, 0xb2b7U, 0U, 1U, 1U},
-#ifndef LIBC_TARGET_CPU_HAS_FMA
+#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT
// x = 0x1.a08p-3, exp10m1f(x) = 0x1.cdcp-3 (RZ)
{0x3282U, 0x3337U, 1U, 0U, 0U},
#endif
diff --git a/libc/src/math/generic/fmul.cpp b/libc/src/math/generic/fmul.cpp
index e759e48cd6989..daad64873f27a 100644
--- a/libc/src/math/generic/fmul.cpp
+++ b/libc/src/math/generic/fmul.cpp
@@ -21,7 +21,7 @@ LLVM_LIBC_FUNCTION(float, fmul, (double x, double y)) {
// correctly rounded for all rounding modes, so we fall
// back to the generic `fmul` implementation
-#ifndef LIBC_TARGET_CPU_HAS_FMA
+#ifndef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
return fputil::generic::mul<float>(x, y);
#else
fputil::DoubleDouble prod = fputil::exact_mult(x, y);
diff --git a/libc/src/math/generic/hypotf.cpp b/libc/src/math/generic/hypotf.cpp
index 959c0420ae149..ec48f62163a48 100644
--- a/libc/src/math/generic/hypotf.cpp
+++ b/libc/src/math/generic/hypotf.cpp
@@ -55,7 +55,7 @@ LLVM_LIBC_FUNCTION(float, hypotf, (float x, float y)) {
// These squares are exact.
double a_sq = ad * ad;
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
double sum_sq = fputil::multiply_add(bd, bd, a_sq);
#else
double b_sq = bd * bd;
@@ -72,7 +72,7 @@ LLVM_LIBC_FUNCTION(float, hypotf, (float x, float y)) {
double r_d = result.get_val();
// Perform rounding correction.
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
double sum_sq_lo = fputil::multiply_add(bd, bd, a_sq - sum_sq);
double err = sum_sq_lo - fputil::multiply_add(r_d, r_d, -sum_sq);
#else
diff --git a/libc/src/math/generic/log.cpp b/libc/src/math/generic/log.cpp
index 04eebab975cd5..0cd4424ee0baf 100644
--- a/libc/src/math/generic/log.cpp
+++ b/libc/src/math/generic/log.cpp
@@ -800,13 +800,13 @@ LLVM_LIBC_FUNCTION(double, log, (double x)) {
fputil::DoubleDouble r1;
// Perform exact range reduction
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
u = fputil::multiply_add(r, m, -1.0); // exact
#else
uint64_t c_m = x_m & 0x3FFF'E000'0000'0000ULL;
double c = FPBits_t(c_m).get_val();
u = fputil::multiply_add(r, m - c, CD[index]); // exact
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
// Exact sum:
// r1.hi + r1.lo = e_x * log(2)_hi - log(r)_hi + u
diff --git a/libc/src/math/generic/log10.cpp b/libc/src/math/generic/log10.cpp
index fd8d5a8aae938..1c4e559ba083c 100644
--- a/libc/src/math/generic/log10.cpp
+++ b/libc/src/math/generic/log10.cpp
@@ -802,13 +802,13 @@ LLVM_LIBC_FUNCTION(double, log10, (double x)) {
fputil::DoubleDouble r1;
// Perform exact range reduction
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
u = fputil::multiply_add(r, m, -1.0); // exact
#else
uint64_t c_m = x_m & 0x3FFF'E000'0000'0000ULL;
double c = FPBits_t(c_m).get_val();
u = fputil::multiply_add(r, m - c, CD[index]); // exact
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
// Error of u_sq = ulp(u^2);
u_sq = u * u;
diff --git a/libc/src/math/generic/log10f.cpp b/libc/src/math/generic/log10f.cpp
index c635fa4ef9b63..73ca26374e4a3 100644
--- a/libc/src/math/generic/log10f.cpp
+++ b/libc/src/math/generic/log10f.cpp
@@ -145,7 +145,7 @@ LLVM_LIBC_FUNCTION(float, log10f, (float x)) {
return fputil::round_result_slightly_up(-0x1.dd2c6ep-5f);
case 0x3f80'70d8U: // x = 0x1.00e1bp0f
return fputil::round_result_slightly_up(0x1.8762c4p-10f);
-#ifndef LIBC_TARGET_CPU_HAS_FMA
+#ifndef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
case 0x08ae'a356U: // x = 0x1.5d46acp-110f
return fputil::round_result_slightly_up(-0x1.07d3b4p+5f);
case 0x120b'93dcU: // x = 0x1.1727b8p-91f
@@ -156,7 +156,7 @@ LLVM_LIBC_FUNCTION(float, log10f, (float x)) {
return fputil::round_result_slightly_down(0x1.2c9314p+3f);
case 0x7956'ba5eU: // x = 69683218960000541503257137270226944.0
return fputil::round_result_slightly_up(0x1.16bebap+5f);
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
}
}
@@ -194,12 +194,12 @@ LLVM_LIBC_FUNCTION(float, log10f, (float x)) {
float u = xbits.get_val();
double v;
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
v = static_cast<double>(fputil::multiply_add(u, R[index], -1.0f)); // Exact.
#else
v = fputil::multiply_add(static_cast<double>(u),
static_cast<double>(R[index]), -1.0); // Exact
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
// Degree-5 polynomial approximation of log10 generated by:
// > P = fpminimax(log10(1 + x)/x, 4, [|D...|], [-2^-8, 2^-7]);
diff --git a/libc/src/math/generic/log10f16.cpp b/libc/src/math/generic/log10f16.cpp
index 990bcabaf6871..c7cb99e1d4691 100644
--- a/libc/src/math/generic/log10f16.cpp
+++ b/libc/src/math/generic/log10f16.cpp
@@ -23,7 +23,7 @@
namespace LIBC_NAMESPACE_DECL {
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
static constexpr size_t N_LOG10F16_EXCEPTS = 11;
#else
static constexpr size_t N_LOG10F16_EXCEPTS = 17;
@@ -36,7 +36,7 @@ static constexpr fputil::ExceptValues<float16, N_LOG10F16_EXCEPTS>
{0x338fU, 0xb903U, 0U, 1U, 0U},
// x = 0x1.fep-3, log10f16(x) = -0x1.35p-1 (RZ)
{0x33f8U, 0xb8d4U, 0U, 1U, 1U},
-#ifndef LIBC_TARGET_CPU_HAS_FMA
+#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT
// x = 0x1.394p-1, log10f16(x) = -0x1.b4cp-3 (RZ)
{0x38e5U, 0xb2d3U, 0U, 1U, 1U},
#endif
@@ -47,7 +47,7 @@ static constexpr fputil::ExceptValues<float16, N_LOG10F16_EXCEPTS>
// x = 0x1.f3p-1, log10f16(x) = -0x1.6dcp-7 (RZ)
{0x3bccU, 0xa1b7U, 0U, 1U, 1U},
// x = 0x1.f38p-1, log10f16(x) = -0x1.5f8p-7 (RZ)
-#ifndef LIBC_TARGET_CPU_HAS_FMA
+#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT
{0x3bceU, 0xa17eU, 0U, 1U, 1U},
// x = 0x1.fd8p-1, log10f16(x) = -0x1.168p-9 (RZ)
{0x3bf6U, 0x985aU, 0U, 1U, 1U},
diff --git a/libc/src/math/generic/log1p.cpp b/libc/src/math/generic/log1p.cpp
index b1f02164b6a28..058409fed081d 100644
--- a/libc/src/math/generic/log1p.cpp
+++ b/libc/src/math/generic/log1p.cpp
@@ -1009,7 +1009,7 @@ LLVM_LIBC_FUNCTION(double, log1p, (double x)) {
fputil::DoubleDouble v_lo = fputil::exact_mult(m_dd.lo, r);
// Perform exact range reduction
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
v_hi = fputil::multiply_add(r, m_dd.hi, -1.0); // Exact.
#else
// c = 1 + idx * 2^-7.
@@ -1017,7 +1017,7 @@ LLVM_LIBC_FUNCTION(double, log1p, (double x)) {
uint64_t(0x3FF0'0000'0000'0000ULL))
.get_val();
v_hi = fputil::multiply_add(r, m_dd.hi - c, RCM1[idx]); // Exact
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
// Range reduction output:
// -0x1.69000000000edp-8 < v_hi + v_lo < 0x1.7f00000000081p-8
diff --git a/libc/src/math/generic/log1pf.cpp b/libc/src/math/generic/log1pf.cpp
index 869cb077cc434..442b00144104b 100644
--- a/libc/src/math/generic/log1pf.cpp
+++ b/libc/src/math/generic/log1pf.cpp
@@ -108,7 +108,7 @@ LLVM_LIBC_FUNCTION(float, log1pf, (float x)) {
fputil::set_errno_if_required(ERANGE);
fputil::raise_except_if_required(FE_DIVBYZERO);
return FPBits::inf(Sign::NEG).get_val();
-#ifndef LIBC_TARGET_CPU_HAS_FMA
+#ifndef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
case 0x4cc1c80bU: // x = 0x1.839016p+26f
return fputil::round_result_slightly_down(0x1.26fc04p+4f);
case 0x5ee8984eU: // x = 0x1.d1309cp+62f
@@ -117,7 +117,7 @@ LLVM_LIBC_FUNCTION(float, log1pf, (float x)) {
return fputil::round_result_slightly_up(0x1.af66cp+5f);
case 0x79e7ec37U: // x = 0x1.cfd86ep+116f
return fputil::round_result_slightly_up(0x1.43ff6ep+6f);
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
}
return internal::log(xd + 1.0);
diff --git a/libc/src/math/generic/log2.cpp b/libc/src/math/generic/log2.cpp
index f46ff724a4f37..27ca2fc350f17 100644
--- a/libc/src/math/generic/log2.cpp
+++ b/libc/src/math/generic/log2.cpp
@@ -915,13 +915,13 @@ LLVM_LIBC_FUNCTION(double, log2, (double x)) {
fputil::DoubleDouble r1;
// Perform exact range reduction
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
u = fputil::multiply_add(r, m, -1.0); // exact
#else
uint64_t c_m = x_m & 0x3FFF'E000'0000'0000ULL;
double c = FPBits_t(c_m).get_val();
u = fputil::multiply_add(r, m - c, CD[index]); // exact
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
// Exact sum:
// r1.hi + r1.lo = e_x * log(2)_hi - log(r)_hi + u
diff --git a/libc/src/math/generic/log2f.cpp b/libc/src/math/generic/log2f.cpp
index 111f3f130bcab..b25ec41f277b6 100644
--- a/libc/src/math/generic/log2f.cpp
+++ b/libc/src/math/generic/log2f.cpp
@@ -97,11 +97,11 @@ LLVM_LIBC_FUNCTION(float, log2f, (float x)) {
float u = xbits.get_val();
double v;
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
v = static_cast<double>(fputil::multiply_add(u, R[index], -1.0f)); // Exact.
#else
v = fputil::multiply_add(static_cast<double>(u), RD[index], -1.0); // Exact
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
double extra_factor = static_cast<double>(m) + LOG2_R[index];
diff --git a/libc/src/math/generic/log2f16.cpp b/libc/src/math/generic/log2f16.cpp
index ff4e0268b53d0..70d592c1976d7 100644
--- a/libc/src/math/generic/log2f16.cpp
+++ b/libc/src/math/generic/log2f16.cpp
@@ -23,7 +23,7 @@
namespace LIBC_NAMESPACE_DECL {
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
static constexpr size_t N_LOG2F16_EXCEPTS = 2;
#else
static constexpr size_t N_LOG2F16_EXCEPTS = 9;
@@ -32,7 +32,7 @@ static constexpr size_t N_LOG2F16_EXCEPTS = 9;
static constexpr fputil::ExceptValues<float16, N_LOG2F16_EXCEPTS>
LOG2F16_EXCEPTS = {{
// (input, RZ output, RU offset, RD offset, RN offset)
-#ifndef LIBC_TARGET_CPU_HAS_FMA
+#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT
// x = 0x1.224p-1, log2f16(x) = -0x1.a34p-1 (RZ)
{0x3889U, 0xba8dU, 0U, 1U, 0U},
// x = 0x1.e34p-1, log2f16(x) = -0x1.558p-4 (RZ)
@@ -40,7 +40,7 @@ static constexpr fputil::ExceptValues<float16, N_LOG2F16_EXCEPTS>
#endif
// x = 0x1.e8cp-1, log2f16(x) = -0x1.128p-4 (RZ)
{0x3ba3U, 0xac4aU, 0U, 1U, 0U},
-#ifndef LIBC_TARGET_CPU_HAS_FMA
+#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT
// x = 0x1.f98p-1, log2f16(x) = -0x1.2ep-6 (RZ)
{0x3be6U, 0xa4b8U, 0U, 1U, 0U},
// x = 0x1.facp-1, log2f16(x) = -0x1.e7p-7 (RZ)
@@ -48,7 +48,7 @@ static constexpr fputil::ExceptValues<float16, N_LOG2F16_EXCEPTS>
#endif
// x = 0x1.fb4p-1, log2f16(x) = -0x1.b88p-7 (RZ)
{0x3bedU, 0xa2e2U, 0U, 1U, 1U},
-#ifndef LIBC_TARGET_CPU_HAS_FMA
+#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT
// x = 0x1.fecp-1, log2f16(x) = -0x1.cep-9 (RZ)
{0x3bfbU, 0x9b38U, 0U, 1U, 1U},
// x = 0x1.ffcp-1, log2f16(x) = -0x1.714p-11 (RZ)
diff --git a/libc/src/math/generic/logf.cpp b/libc/src/math/generic/logf.cpp
index 30c00edafe21d..9ed44cdc04226 100644
--- a/libc/src/math/generic/logf.cpp
+++ b/libc/src/math/generic/logf.cpp
@@ -104,14 +104,14 @@ LLVM_LIBC_FUNCTION(float, logf, (float x)) {
return round_result_slightly_down(0x1.08b512p+6f);
case 0x7a17f30aU: // x = 0x1.2fe614p+117f
return round_result_slightly_up(0x1.451436p+6f);
-#ifndef LIBC_TARGET_CPU_HAS_FMA
+#ifndef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
case 0x500ffb03U: // x = 0x1.1ff606p+33f
return round_result_slightly_up(0x1.6fdd34p+4f);
case 0x5cd69e88U: // x = 0x1.ad3d1p+58f
return round_result_slightly_up(0x1.45c146p+5f);
case 0x5ee8984eU: // x = 0x1.d1309cp+62f;
return round_result_slightly_up(0x1.5c9442p+5f);
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
}
// Exceptional inputs.
if (LIBC_UNLIKELY(x_u > FPBits::max_normal().uintval())) {
@@ -152,11 +152,11 @@ LLVM_LIBC_FUNCTION(float, logf, (float x)) {
float u = xbits.get_val();
double v;
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
v = static_cast<double>(fputil::multiply_add(u, R[index], -1.0f)); // Exact.
#else
v = fputil::multiply_add(static_cast<double>(u), RD[index], -1.0); // Exact
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
// Degree-5 polynomial approximation of log generated by Sollya with:
// > P = fpminimax(log(1 + x)/x, 4, [|1, D...|], [-2^-8, 2^-7]);
diff --git a/libc/src/math/generic/logf16.cpp b/libc/src/math/generic/logf16.cpp
index 802225a810550..dd08e34270eef 100644
--- a/libc/src/math/generic/logf16.cpp
+++ b/libc/src/math/generic/logf16.cpp
@@ -23,7 +23,7 @@
namespace LIBC_NAMESPACE_DECL {
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
static constexpr size_t N_LOGF16_EXCEPTS = 5;
#else
static constexpr size_t N_LOGF16_EXCEPTS = 11;
@@ -32,7 +32,7 @@ static constexpr size_t N_LOGF16_EXCEPTS = 11;
static constexpr fputil::ExceptValues<float16, N_LOGF16_EXCEPTS>
LOGF16_EXCEPTS = {{
// (input, RZ output, RU offset, RD offset, RN offset)
-#ifndef LIBC_TARGET_CPU_HAS_FMA
+#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT
// x = 0x1.61cp-13, logf16(x) = -0x1.16p+3 (RZ)
{0x0987U, 0xc858U, 0U, 1U, 0U},
// x = 0x1.f2p-12, logf16(x) = -0x1.e98p+2 (RZ)
@@ -42,7 +42,7 @@ static constexpr fputil::ExceptValues<float16, N_LOGF16_EXCEPTS>
{0x1935U, 0xc5f9U, 0U, 1U, 0U},
// x = 0x1.5ep-8, logf16(x) = -0x1.4ecp+2 (RZ)
{0x1d78U, 0xc53bU, 0U, 1U, 0U},
-#ifndef LIBC_TARGET_CPU_HAS_FMA
+#ifndef LIBC_TARGET_CPU_HAS_FMA_FLOAT
// x = 0x1.fdp-1, logf16(x) = -0x1.81p-8 (RZ)
{0x3bf4U, 0x9e04U, 0U, 1U, 1U},
// x = 0x1.fep-1, logf16(x) = -0x1.008p-8 (RZ)
@@ -52,7 +52,7 @@ static constexpr fputil::ExceptValues<float16, N_LOGF16_EXCEPTS>
{0x3bfcU, 0x9801U, 0U, 1U, 0U},
// x = 0x1.ff8p-1, logf16(x) = -0x1p-10 (RZ)
{0x3bfeU, 0x9400U, 0U, 1U, 1U},
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
// x = 0x1.4c4p+1, logf16(x) = 0x1.e84p-1 (RZ)
{0x4131U, 0x3ba1U, 1U, 0U, 1U},
#else
diff --git a/libc/src/math/generic/pow.cpp b/libc/src/math/generic/pow.cpp
index a2a0bb698f81a..8a12934f6c4ba 100644
--- a/libc/src/math/generic/pow.cpp
+++ b/libc/src/math/generic/pow.cpp
@@ -394,14 +394,14 @@ LLVM_LIBC_FUNCTION(double, pow, (double x, double y)) {
DoubleDouble dx_c0;
// Perform exact range reduction and exact product dx * c0.
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
dx = fputil::multiply_add(RD[idx_x], m_x.get_val(), -1.0); // Exact
dx_c0 = fputil::exact_mult(COEFFS[0], dx);
#else
double c = FPBits(m_x.uintval() & 0x3fff'e000'0000'0000).get_val();
dx = fputil::multiply_add(RD[idx_x], m_x.get_val() - c, CD[idx_x]); // Exact
dx_c0 = fputil::exact_mult<double, 28>(dx, COEFFS[0]); // Exact
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
double dx2 = dx * dx;
double c0 = fputil::multiply_add(dx, COEFFS[2], COEFFS[1]);
diff --git a/libc/src/math/generic/powf.cpp b/libc/src/math/generic/powf.cpp
index 7f4417d275702..2d7deca3c77bb 100644
--- a/libc/src/math/generic/powf.cpp
+++ b/libc/src/math/generic/powf.cpp
@@ -165,11 +165,11 @@ alignas(16) constexpr DoubleDouble LOG2_R_DD[128] = {
};
#else
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
constexpr uint64_t ERR = 64;
#else
constexpr uint64_t ERR = 128;
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
// We choose the precision of the high part to be 53 - 24 - 8, so that when
// y * (e_x + LOG2_R_DD[i].hi) is exact.
@@ -851,11 +851,11 @@ LLVM_LIBC_FUNCTION(float, powf, (float x, float y)) {
// log2(m_x) = log2( (1 + dx) / r )
// = log2(1 + dx) - log2(r).
double dx;
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_FLOAT
dx = static_cast<double>(fputil::multiply_add(m_x, R[idx_x], -1.0f)); // Exact
#else
dx = fputil::multiply_add(static_cast<double>(m_x), RD[idx_x], -1.0); // Exact
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
// Degree-5 polynomial approximation:
// dx * P(dx) ~ log2(1 + dx)
diff --git a/libc/src/math/generic/range_reduction_double_common.h b/libc/src/math/generic/range_reduction_double_common.h
index 711a12219c847..f3dcdb937333c 100644
--- a/libc/src/math/generic/range_reduction_double_common.h
+++ b/libc/src/math/generic/range_reduction_double_common.h
@@ -20,14 +20,14 @@
namespace LIBC_NAMESPACE_DECL {
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
static constexpr unsigned SPLIT = fputil::DefaultSplit<double>::VALUE;
#else
// When there is no-FMA instructions, in order to have exact product of 2 double
// precision with directional roundings, we need to lower the precision of the
// constants by at least 1 bit, and use a different splitting constant.
static constexpr unsigned SPLIT = 28;
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
using LIBC_NAMESPACE::fputil::DoubleDouble;
using Float128 = LIBC_NAMESPACE::fputil::DyadicFloat<128>;
diff --git a/libc/src/math/generic/sin.cpp b/libc/src/math/generic/sin.cpp
index b32486dff487c..4a58dcf4b173f 100644
--- a/libc/src/math/generic/sin.cpp
+++ b/libc/src/math/generic/sin.cpp
@@ -21,11 +21,11 @@
#include "src/math/generic/range_reduction_double_common.h"
#include "src/math/generic/sincos_eval.h"
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
#include "range_reduction_double_fma.h"
#else
#include "range_reduction_double_nofma.h"
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
namespace LIBC_NAMESPACE_DECL {
@@ -52,7 +52,7 @@ LLVM_LIBC_FUNCTION(double, sin, (double x)) {
if (LIBC_UNLIKELY(x == 0.0))
return x + x; // Make sure it works with FTZ/DAZ.
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
return fputil::multiply_add(x, -0x1.0p-54, x);
#else
if (LIBC_UNLIKELY(x_e < 4)) {
@@ -63,7 +63,7 @@ LLVM_LIBC_FUNCTION(double, sin, (double x)) {
return FPBits(xbits.uintval() - 1).get_val();
}
return fputil::multiply_add(x, -0x1.0p-54, x);
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
}
// No range reduction needed.
k = 0;
diff --git a/libc/src/math/generic/sincos.cpp b/libc/src/math/generic/sincos.cpp
index 166ce46603140..0ac2f7f997527 100644
--- a/libc/src/math/generic/sincos.cpp
+++ b/libc/src/math/generic/sincos.cpp
@@ -22,11 +22,11 @@
#include "src/math/generic/range_reduction_double_common.h"
#include "src/math/generic/sincos_eval.h"
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
#include "range_reduction_double_fma.h"
#else
#include "range_reduction_double_nofma.h"
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
namespace LIBC_NAMESPACE_DECL {
@@ -57,7 +57,7 @@ LLVM_LIBC_FUNCTION(void, sincos, (double x, double *sin_x, double *cos_x)) {
}
// For |x| < 2^-27, max(|sin(x) - x|, |cos(x) - 1|) < ulp(x)/2.
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
*sin_x = fputil::multiply_add(x, -0x1.0p-54, x);
*cos_x = fputil::multiply_add(x, -x, 1.0);
#else
@@ -71,7 +71,7 @@ LLVM_LIBC_FUNCTION(void, sincos, (double x, double *sin_x, double *cos_x)) {
*sin_x = FPBits(xbits.uintval() - 1).get_val();
}
*sin_x = fputil::multiply_add(x, -0x1.0p-54, x);
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
return;
}
// No range reduction needed.
diff --git a/libc/src/math/generic/sincos_eval.h b/libc/src/math/generic/sincos_eval.h
index 6cd1da4721bf5..41a4c75849ff4 100644
--- a/libc/src/math/generic/sincos_eval.h
+++ b/libc/src/math/generic/sincos_eval.h
@@ -65,7 +65,7 @@ LIBC_INLINE double sincos_eval(const DoubleDouble &u, DoubleDouble &sin_u,
double u_hi_neg_half = (-0.5) * u.hi;
DoubleDouble v;
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
v.hi = fputil::multiply_add(u.hi, u_hi_neg_half, 1.0);
v.lo = 1.0 - v.hi; // Exact
v.lo = fputil::multiply_add(u.hi, u_hi_neg_half, v.lo);
@@ -73,7 +73,7 @@ LIBC_INLINE double sincos_eval(const DoubleDouble &u, DoubleDouble &sin_u,
DoubleDouble u_hi_sq_neg_half = fputil::exact_mult(u.hi, u_hi_neg_half);
v = fputil::exact_add(1.0, u_hi_sq_neg_half.hi);
v.lo += u_hi_sq_neg_half.lo;
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
// r1 ~ -1/720 + u_hi^2 / 40320
double r1 = fputil::multiply_add(u_hi_sq, 0x1.a01a01a01a01ap-16,
diff --git a/libc/src/math/generic/sincosf.cpp b/libc/src/math/generic/sincosf.cpp
index ccaa29c10c4c6..898c8bd0f0ae9 100644
--- a/libc/src/math/generic/sincosf.cpp
+++ b/libc/src/math/generic/sincosf.cpp
@@ -130,14 +130,14 @@ LLVM_LIBC_FUNCTION(void, sincosf, (float x, float *sinp, float *cosp)) {
// |x| < 2^-125. For targets without FMA instructions, we simply use
// double for intermediate results as it is more efficient than using an
// emulated version of FMA.
-#if defined(LIBC_TARGET_CPU_HAS_FMA)
+#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT)
*sinp = fputil::multiply_add(x, -0x1.0p-25f, x);
*cosp = fputil::multiply_add(FPBits(x_abs).get_val(), -0x1.0p-25f, 1.0f);
#else
*sinp = static_cast<float>(fputil::multiply_add(xd, -0x1.0p-25, xd));
*cosp = static_cast<float>(fputil::multiply_add(
static_cast<double>(FPBits(x_abs).get_val()), -0x1.0p-25, 1.0));
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
return;
}
diff --git a/libc/src/math/generic/sincosf_utils.h b/libc/src/math/generic/sincosf_utils.h
index 726a5ab9b64be..45abbe6aa5a83 100644
--- a/libc/src/math/generic/sincosf_utils.h
+++ b/libc/src/math/generic/sincosf_utils.h
@@ -14,7 +14,7 @@
#include "src/__support/macros/config.h"
#include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA
-#if defined(LIBC_TARGET_CPU_HAS_FMA)
+#if defined(LIBC_TARGET_CPU_HAS_FMA_DOUBLE)
#include "range_reduction_fma.h"
// using namespace LIBC_NAMESPACE::fma;
using LIBC_NAMESPACE::fma::FAST_PASS_BOUND;
@@ -27,7 +27,7 @@ using LIBC_NAMESPACE::fma::small_range_reduction;
using LIBC_NAMESPACE::generic::FAST_PASS_BOUND;
using LIBC_NAMESPACE::generic::large_range_reduction;
using LIBC_NAMESPACE::generic::small_range_reduction;
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
namespace LIBC_NAMESPACE_DECL {
diff --git a/libc/src/math/generic/sinf.cpp b/libc/src/math/generic/sinf.cpp
index cea267d4c683e..da188e5df557e 100644
--- a/libc/src/math/generic/sinf.cpp
+++ b/libc/src/math/generic/sinf.cpp
@@ -19,7 +19,7 @@
#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
#include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA
-#if defined(LIBC_TARGET_CPU_HAS_FMA)
+#if defined(LIBC_TARGET_CPU_HAS_FMA_DOUBLE)
#include "range_reduction_fma.h"
#else
#include "range_reduction.h"
@@ -101,11 +101,11 @@ LLVM_LIBC_FUNCTION(float, sinf, (float x)) {
// |x| < 2^-125. For targets without FMA instructions, we simply use
// double for intermediate results as it is more efficient than using an
// emulated version of FMA.
-#if defined(LIBC_TARGET_CPU_HAS_FMA)
+#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT)
return fputil::multiply_add(x, -0x1.0p-25f, x);
#else
return static_cast<float>(fputil::multiply_add(xd, -0x1.0p-25, xd));
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
}
// |x| < pi/16.
diff --git a/libc/src/math/generic/tan.cpp b/libc/src/math/generic/tan.cpp
index 19d31a8441efb..a899a2128d384 100644
--- a/libc/src/math/generic/tan.cpp
+++ b/libc/src/math/generic/tan.cpp
@@ -22,11 +22,11 @@
#include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA
#include "src/math/generic/range_reduction_double_common.h"
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
#include "range_reduction_double_fma.h"
#else
#include "range_reduction_double_nofma.h"
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
namespace LIBC_NAMESPACE_DECL {
@@ -140,7 +140,7 @@ LLVM_LIBC_FUNCTION(double, tan, (double x)) {
if (LIBC_UNLIKELY(x == 0.0))
return x + x; // Make sure it works with FTZ/DAZ.
-#ifdef LIBC_TARGET_CPU_HAS_FMA
+#ifdef LIBC_TARGET_CPU_HAS_FMA_DOUBLE
return fputil::multiply_add(x, 0x1.0p-54, x);
#else
if (LIBC_UNLIKELY(x_e < 4)) {
@@ -150,7 +150,7 @@ LLVM_LIBC_FUNCTION(double, tan, (double x)) {
return FPBits(xbits.uintval() + 1).get_val();
}
return fputil::multiply_add(x, 0x1.0p-54, x);
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_DOUBLE
}
// No range reduction needed.
k = 0;
diff --git a/libc/src/math/generic/tanf.cpp b/libc/src/math/generic/tanf.cpp
index 6fd5f9a103676..f4f7e08838d81 100644
--- a/libc/src/math/generic/tanf.cpp
+++ b/libc/src/math/generic/tanf.cpp
@@ -74,11 +74,11 @@ LLVM_LIBC_FUNCTION(float, tanf, (float x)) {
// |x| < 2^-125. For targets without FMA instructions, we simply use
// double for intermediate results as it is more efficient than using an
// emulated version of FMA.
-#if defined(LIBC_TARGET_CPU_HAS_FMA)
+#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT)
return fputil::multiply_add(x, 0x1.0p-25f, x);
#else
return static_cast<float>(fputil::multiply_add(xd, 0x1.0p-25, xd));
-#endif // LIBC_TARGET_CPU_HAS_FMA
+#endif // LIBC_TARGET_CPU_HAS_FMA_FLOAT
}
// |x| < pi/32
>From 7f5f01603c132b38d493cafb89c564416d29299e Mon Sep 17 00:00:00 2001
From: Tue Ly <lntue.h at gmail.com>
Date: Tue, 11 Mar 2025 02:59:43 +0000
Subject: [PATCH 2/2] Fix few multiply_add calls with templated versions.
---
libc/src/math/generic/expm1f.cpp | 2 +-
libc/src/math/generic/sincosf16_utils.h | 2 +-
libc/src/math/generic/sincosf_utils.h | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/libc/src/math/generic/expm1f.cpp b/libc/src/math/generic/expm1f.cpp
index 655f0e6246676..1e44e943d9258 100644
--- a/libc/src/math/generic/expm1f.cpp
+++ b/libc/src/math/generic/expm1f.cpp
@@ -103,7 +103,7 @@ LLVM_LIBC_FUNCTION(float, expm1f, (float x)) {
// intermediate results as it is more efficient than using an emulated
// version of FMA.
#if defined(LIBC_TARGET_CPU_HAS_FMA_FLOAT)
- return fputil::multiply_add<float>(x, x, x);
+ return fputil::multiply_add(x, x, x);
#else
double xd = x;
return static_cast<float>(fputil::multiply_add(xd, xd, xd));
diff --git a/libc/src/math/generic/sincosf16_utils.h b/libc/src/math/generic/sincosf16_utils.h
index 133896b5de7a3..05cab09d2089b 100644
--- a/libc/src/math/generic/sincosf16_utils.h
+++ b/libc/src/math/generic/sincosf16_utils.h
@@ -40,7 +40,7 @@ constexpr float SIN_K_PI_OVER_32[64] = {
LIBC_INLINE int32_t range_reduction_sincospif16(float x, float &y) {
float kf = fputil::nearest_integer(x * 32);
- y = fputil::multiply_add<float>(x, 32.0, -kf);
+ y = fputil::multiply_add(x, 32.0f, -kf);
return static_cast<int32_t>(kf);
}
diff --git a/libc/src/math/generic/sincosf_utils.h b/libc/src/math/generic/sincosf_utils.h
index 45abbe6aa5a83..6eaf820e5c1b0 100644
--- a/libc/src/math/generic/sincosf_utils.h
+++ b/libc/src/math/generic/sincosf_utils.h
@@ -108,7 +108,7 @@ LIBC_INLINE void sincosf_eval(double xd, uint32_t x_abs, double &sin_k,
// => pi * x = (k + y) * pi / 32
static LIBC_INLINE int64_t range_reduction_sincospi(double x, double &y) {
double kd = fputil::nearest_integer(x * 32);
- y = fputil::multiply_add<double>(x, 32.0, -kd);
+ y = fputil::multiply_add(x, 32.0, -kd);
return static_cast<int64_t>(kd);
}
More information about the libc-commits
mailing list