[libc-commits] [libc] c5f8a0a - [libc] Add support for x86-64 targets that do not have FMA instructions.
Tue Ly via libc-commits
libc-commits at lists.llvm.org
Fri Apr 8 11:12:40 PDT 2022
Author: Tue Ly
Date: 2022-04-08T14:12:24-04:00
New Revision: c5f8a0a1e929ea41bd3e0d781c6b394a3f619427
URL: https://github.com/llvm/llvm-project/commit/c5f8a0a1e929ea41bd3e0d781c6b394a3f619427
DIFF: https://github.com/llvm/llvm-project/commit/c5f8a0a1e929ea41bd3e0d781c6b394a3f619427.diff
LOG: [libc] Add support for x86-64 targets that do not have FMA instructions.
Make FMA flag checks more accurate for x86-64 targets, and refactor
polyeval to use multiply and add instead when FMA instructions are not
available.
Reviewed By: michaelrj, sivachandra
Differential Revision: https://reviews.llvm.org/D123335
Added:
libc/src/__support/FPUtil/multiply_add.h
Modified:
libc/src/__support/FPUtil/CMakeLists.txt
libc/src/__support/FPUtil/FMA.h
libc/src/__support/FPUtil/PolyEval.h
libc/src/__support/FPUtil/aarch64/FMA.h
libc/src/__support/FPUtil/generic/CMakeLists.txt
libc/src/__support/FPUtil/generic/FMA.h
libc/src/__support/FPUtil/x86_64/FMA.h
libc/src/__support/architectures.h
libc/src/math/CMakeLists.txt
libc/src/math/generic/CMakeLists.txt
libc/src/math/generic/expm1f.cpp
libc/src/math/generic/log10f.cpp
libc/src/math/generic/log1pf.cpp
libc/src/math/generic/logf.cpp
utils/bazel/llvm-project-overlay/libc/BUILD.bazel
Removed:
################################################################################
diff --git a/libc/src/__support/FPUtil/CMakeLists.txt b/libc/src/__support/FPUtil/CMakeLists.txt
index f1cd0b587d963..7f1cecc25f038 100644
--- a/libc/src/__support/FPUtil/CMakeLists.txt
+++ b/libc/src/__support/FPUtil/CMakeLists.txt
@@ -12,7 +12,6 @@ add_header_library(
NearestIntegerOperations.h
NormalFloat.h
PlatformDefs.h
- PolyEval.h
UInt.h
XFloat.h
DEPENDS
@@ -34,4 +33,29 @@ add_header_library(
libc.src.__support.FPUtil.generic.sqrt
)
+add_header_library(
+ fma
+ HDRS
+ FMA.h
+ DEPENDS
+ .fputil
+ libc.src.__support.FPUtil.generic.fma
+)
+
+add_header_library(
+ multiply_add
+ HDRS
+ multiply_add.h
+ DEPENDS
+ .fma
+)
+
+add_header_library(
+ polyeval
+ HDRS
+ PolyEval.h
+ DEPENDS
+ .multiply_add
+)
+
add_subdirectory(generic)
diff --git a/libc/src/__support/FPUtil/FMA.h b/libc/src/__support/FPUtil/FMA.h
index c735c069fd4da..6823dd0f897b5 100644
--- a/libc/src/__support/FPUtil/FMA.h
+++ b/libc/src/__support/FPUtil/FMA.h
@@ -11,11 +11,16 @@
#include "src/__support/architectures.h"
+#if defined(LIBC_TARGET_HAS_FMA)
+
#if defined(LLVM_LIBC_ARCH_X86_64)
#include "x86_64/FMA.h"
#elif defined(LLVM_LIBC_ARCH_AARCH64)
#include "aarch64/FMA.h"
+#endif
+
#else
+// FMA instructions are not available
#include "generic/FMA.h"
#include "src/__support/CPP/TypeTraits.h"
diff --git a/libc/src/__support/FPUtil/PolyEval.h b/libc/src/__support/FPUtil/PolyEval.h
index 368ee3848ddc3..c9e818accd1b1 100644
--- a/libc/src/__support/FPUtil/PolyEval.h
+++ b/libc/src/__support/FPUtil/PolyEval.h
@@ -9,19 +9,15 @@
#ifndef LLVM_LIBC_SRC_SUPPORT_FPUTIL_POLYEVAL_H
#define LLVM_LIBC_SRC_SUPPORT_FPUTIL_POLYEVAL_H
-#include "src/__support/CPP/TypeTraits.h"
-#include "src/__support/architectures.h"
+#include "multiply_add.h"
// Evaluate polynomial using Horner's Scheme:
// With polyeval(x, a_0, a_1, ..., a_n) = a_n * x^n + ... + a_1 * x + a_0, we
// evaluated it as: a_0 + x * (a_1 + x * ( ... (a_(n-1) + x * a_n) ... ) ) ).
-// We will use fma instructions if available.
+// We will use FMA instructions if available.
// Example: to evaluate x^3 + 2*x^2 + 3*x + 4, call
// polyeval( x, 4.0, 3.0, 2.0, 1.0 )
-#if defined(LLVM_LIBC_ARCH_X86_64) || defined(LLVM_LIBC_ARCH_AARCH64)
-#include "FMA.h"
-
namespace __llvm_libc {
namespace fputil {
@@ -29,35 +25,10 @@ template <typename T> static inline T polyeval(T x, T a0) { return a0; }
template <typename T, typename... Ts>
INLINE_FMA static inline T polyeval(T x, T a0, Ts... a) {
- return fma(x, polyeval(x, a...), a0);
+ return multiply_add(x, polyeval(x, a...), a0);
}
} // namespace fputil
} // namespace __llvm_libc
-#ifdef LLVM_LIBC_ARCH_X86_64
-
-// [DISABLED] There is a regression with using vectorized version for polyeval
-// compared to the naive Horner's scheme with fma. Need further investigation
-// #include "x86_64/PolyEval.h"
-
-#endif // LLVM_LIBC_ARCH_X86_64
-
-#else
-
-namespace __llvm_libc {
-namespace fputil {
-
-template <typename T> static inline T polyeval(T x, T a0) { return a0; }
-
-template <typename T, typename... Ts>
-static inline T polyeval(T x, T a0, Ts... a) {
- return x * polyeval(x, a...) + a0;
-}
-
-} // namespace fputil
-} // namespace __llvm_libc
-
-#endif
-
-#endif // LLVM_LIBC_SRC_SUPPORT_FPUTIL_FMA_H
+#endif // LLVM_LIBC_SRC_SUPPORT_FPUTIL_POLYEVAL_H
diff --git a/libc/src/__support/FPUtil/aarch64/FMA.h b/libc/src/__support/FPUtil/aarch64/FMA.h
index c236c9a90b741..ed637c848658e 100644
--- a/libc/src/__support/FPUtil/aarch64/FMA.h
+++ b/libc/src/__support/FPUtil/aarch64/FMA.h
@@ -15,6 +15,10 @@
#error "Invalid include"
#endif
+#if !defined(LIBC_TARGET_HAS_FMA)
+#error "FMA instructions are not supported"
+#endif
+
#include "src/__support/CPP/TypeTraits.h"
namespace __llvm_libc {
diff --git a/libc/src/__support/FPUtil/generic/CMakeLists.txt b/libc/src/__support/FPUtil/generic/CMakeLists.txt
index bf69e7dd961cd..a755e7670ce68 100644
--- a/libc/src/__support/FPUtil/generic/CMakeLists.txt
+++ b/libc/src/__support/FPUtil/generic/CMakeLists.txt
@@ -4,3 +4,9 @@ add_header_library(
sqrt.h
sqrt_80_bit_long_double.h
)
+
+add_header_library(
+ fma
+ HDRS
+ FMA.h
+)
diff --git a/libc/src/__support/FPUtil/generic/FMA.h b/libc/src/__support/FPUtil/generic/FMA.h
index efdd8b7129753..78b640c2c1a13 100644
--- a/libc/src/__support/FPUtil/generic/FMA.h
+++ b/libc/src/__support/FPUtil/generic/FMA.h
@@ -10,6 +10,7 @@
#define LLVM_LIBC_SRC_SUPPORT_FPUTIL_GENERIC_FMA_H
#include "src/__support/CPP/TypeTraits.h"
+#include "src/__support/FPUtil/FPBits.h"
namespace __llvm_libc {
namespace fputil {
diff --git a/libc/src/__support/FPUtil/multiply_add.h b/libc/src/__support/FPUtil/multiply_add.h
new file mode 100644
index 0000000000000..8f5da22a53cb1
--- /dev/null
+++ b/libc/src/__support/FPUtil/multiply_add.h
@@ -0,0 +1,48 @@
+//===-- Common header for multiply-add implementations ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_SUPPORT_FPUTIL_MULTIPLY_ADD_H
+#define LLVM_LIBC_SRC_SUPPORT_FPUTIL_MULTIPLY_ADD_H
+
+#include "src/__support/architectures.h"
+
+// FMA.h must be included at file scope: it opens namespace
+// __llvm_libc::fputil itself and pulls in further headers, so including it
+// from inside the namespace below would nest all of those declarations.
+#if defined(LIBC_TARGET_HAS_FMA)
+#include "FMA.h"
+#endif // LIBC_TARGET_HAS_FMA
+
+namespace __llvm_libc {
+namespace fputil {
+
+// Implement a simple wrapper for multiply-add operation:
+//   multiply_add(x, y, z) = x*y + z
+// which uses FMA instructions to speed up if available.
+
+// Generic fallback, used for every type T without a specialization below.
+template <typename T> static inline T multiply_add(T x, T y, T z) {
+  return x * y + z;
+}
+
+#if defined(LIBC_TARGET_HAS_FMA)
+// FMA instructions are available: route float and double through fma().
+
+template <> inline float multiply_add<float>(float x, float y, float z) {
+  return fma(x, y, z);
+}
+
+template <> inline double multiply_add<double>(double x, double y, double z) {
+  return fma(x, y, z);
+}
+#endif // LIBC_TARGET_HAS_FMA
+
+} // namespace fputil
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_SUPPORT_FPUTIL_MULTIPLY_ADD_H
diff --git a/libc/src/__support/FPUtil/x86_64/FMA.h b/libc/src/__support/FPUtil/x86_64/FMA.h
index 70ebe382e841a..08de6da344107 100644
--- a/libc/src/__support/FPUtil/x86_64/FMA.h
+++ b/libc/src/__support/FPUtil/x86_64/FMA.h
@@ -15,6 +15,10 @@
#error "Invalid include"
#endif
+#if !defined(LIBC_TARGET_HAS_FMA)
+#error "FMA instructions are not supported"
+#endif
+
#include "src/__support/CPP/TypeTraits.h"
#include <immintrin.h>
diff --git a/libc/src/__support/architectures.h b/libc/src/__support/architectures.h
index 14eb1a586463f..70eeb99107b2b 100644
--- a/libc/src/__support/architectures.h
+++ b/libc/src/__support/architectures.h
@@ -37,7 +37,15 @@
#define LLVM_LIBC_ARCH_ANY_ARM
#endif
-#if defined(LLVM_LIBC_ARCH_X86_64)
+#if defined(LLVM_LIBC_ARCH_AARCH64)
+#define LIBC_TARGET_HAS_FMA
+#elif defined(LLVM_LIBC_ARCH_X86_64)
+#if (defined(__AVX2__) || defined(__FMA__))
+#define LIBC_TARGET_HAS_FMA
+#endif
+#endif
+
+#if (defined(LLVM_LIBC_ARCH_X86_64) && defined(LIBC_TARGET_HAS_FMA))
#define INLINE_FMA __attribute__((target("fma")))
#else
#define INLINE_FMA
diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt
index 9737f4a362dc0..8fc550d1f9682 100644
--- a/libc/src/math/CMakeLists.txt
+++ b/libc/src/math/CMakeLists.txt
@@ -48,8 +48,9 @@ add_entrypoint_object(
fmaf.h
DEPENDS
libc.src.__support.FPUtil.fputil
+ libc.src.__support.FPUtil.fma
COMPILE_OPTIONS
- -O2
+ -O3
-mfma
)
@@ -61,8 +62,9 @@ add_entrypoint_object(
fma.h
DEPENDS
libc.src.__support.FPUtil.fputil
+ libc.src.__support.FPUtil.fma
COMPILE_OPTIONS
- -O2
+ -O3
-mfma
)
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 73957cf789104..6a96b55108045 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -478,6 +478,7 @@ add_entrypoint_object(
DEPENDS
.common_constants
libc.src.__support.FPUtil.fputil
+ libc.src.__support.FPUtil.polyeval
libc.include.math
COMPILE_OPTIONS
-O3
@@ -492,6 +493,7 @@ add_entrypoint_object(
../exp2f.h
DEPENDS
libc.src.__support.FPUtil.fputil
+ libc.src.__support.FPUtil.polyeval
libc.include.math
COMPILE_OPTIONS
-O3
@@ -507,6 +509,8 @@ add_entrypoint_object(
DEPENDS
.common_constants
libc.src.__support.FPUtil.fputil
+ libc.src.__support.FPUtil.multiply_add
+ libc.src.__support.FPUtil.polyeval
libc.include.math
COMPILE_OPTIONS
-O3
@@ -674,6 +678,8 @@ add_entrypoint_object(
DEPENDS
.common_constants
libc.src.__support.FPUtil.fputil
+ libc.src.__support.FPUtil.multiply_add
+ libc.src.__support.FPUtil.polyeval
COMPILE_OPTIONS
-O3
-mfma
@@ -688,6 +694,8 @@ add_entrypoint_object(
DEPENDS
.common_constants
libc.src.__support.FPUtil.fputil
+ libc.src.__support.FPUtil.multiply_add
+ libc.src.__support.FPUtil.polyeval
COMPILE_OPTIONS
-O3
-mfma
@@ -702,6 +710,7 @@ add_entrypoint_object(
DEPENDS
.common_constants
libc.src.__support.FPUtil.fputil
+ libc.src.__support.FPUtil.polyeval
COMPILE_OPTIONS
-O3
-mfma
@@ -716,6 +725,8 @@ add_entrypoint_object(
DEPENDS
.common_constants
libc.src.__support.FPUtil.fputil
+ libc.src.__support.FPUtil.multiply_add
+ libc.src.__support.FPUtil.polyeval
COMPILE_OPTIONS
-O3
-mfma
diff --git a/libc/src/math/generic/expm1f.cpp b/libc/src/math/generic/expm1f.cpp
index b0544b76e09ac..76232d6ab6a95 100644
--- a/libc/src/math/generic/expm1f.cpp
+++ b/libc/src/math/generic/expm1f.cpp
@@ -83,7 +83,7 @@ LLVM_LIBC_FUNCTION(float, expm1f, (float x)) {
// = x otherwise.
// To simplify the rounding decision and make it more efficient, we use
// fma(x, x, x) ~ x + x^2 instead.
- return fputil::fma(x, x, x);
+ return fputil::multiply_add(x, x, x);
}
// 2^-25 <= |x| < 2^-4
@@ -96,7 +96,7 @@ LLVM_LIBC_FUNCTION(float, expm1f, (float x)) {
fputil::polyeval(xd, 0x1p-1, 0x1.55555555557ddp-3, 0x1.55555555552fap-5,
0x1.111110fcd58b7p-7, 0x1.6c16c1717660bp-10,
0x1.a0241f0006d62p-13, 0x1.a01e3f8d3c06p-16);
- return static_cast<float>(fputil::fma(r, xsq, xd));
+ return static_cast<float>(fputil::multiply_add(r, xsq, xd));
}
// For -18 < x < 89, to compute expm1(x), we perform the following range
@@ -132,7 +132,7 @@ LLVM_LIBC_FUNCTION(float, expm1f, (float x)) {
double exp_lo =
fputil::polyeval(xd, 0x1.0p0, 0x1.ffffffffff777p-1, 0x1.000000000071cp-1,
0x1.555566668e5e7p-3, 0x1.55555555ef243p-5);
- return static_cast<float>(fputil::fma(exp_hi_mid, exp_lo, -1.0));
+ return static_cast<float>(fputil::multiply_add(exp_hi_mid, exp_lo, -1.0));
}
} // namespace __llvm_libc
diff --git a/libc/src/math/generic/log10f.cpp b/libc/src/math/generic/log10f.cpp
index 59ca6590b640a..878ae68f85eb7 100644
--- a/libc/src/math/generic/log10f.cpp
+++ b/libc/src/math/generic/log10f.cpp
@@ -170,7 +170,7 @@ LLVM_LIBC_FUNCTION(float, log10f, (float x)) {
double d = static_cast<float>(xbits) - static_cast<float>(f);
d *= ONE_OVER_F[f_index];
- double extra_factor = fputil::fma(m, LOG10_2, LOG10_F[f_index]);
+ double extra_factor = fputil::multiply_add(m, LOG10_2, LOG10_F[f_index]);
double r = fputil::polyeval(d, extra_factor, 0x1.bcb7b1526e4c5p-2,
-0x1.bcb7b1518a5e9p-3, 0x1.287a72a6f716p-3,
diff --git a/libc/src/math/generic/log1pf.cpp b/libc/src/math/generic/log1pf.cpp
index 7d1e71e919902..6e8c6781e6ef3 100644
--- a/libc/src/math/generic/log1pf.cpp
+++ b/libc/src/math/generic/log1pf.cpp
@@ -66,7 +66,7 @@ INLINE_FMA static inline float log(double x) {
double d = static_cast<double>(xbits) - static_cast<double>(f);
d *= ONE_OVER_F[f_index];
- double extra_factor = fputil::fma(m, LOG_2, LOG_F[f_index]);
+ double extra_factor = fputil::multiply_add(m, LOG_2, LOG_F[f_index]);
double r = fputil::polyeval(d, extra_factor, 0x1.fffffffffffacp-1,
-0x1.fffffffef9cb2p-2, 0x1.5555513bc679ap-2,
@@ -161,7 +161,7 @@ LLVM_LIBC_FUNCTION(float, log1pf, (float x)) {
// > fpminimax(log(1 + x)/x, 5, [|D...|], [-2^-8; 2^-8]);
r = fputil::polyeval(xd, -0x1p-1, 0x1.5555555515551p-2, -0x1.ffffffff82bdap-3,
0x1.999b33348d3aep-3, -0x1.5556cae3adcc3p-3);
- return static_cast<float>(fputil::fma(r, xd * xd, xd));
+ return static_cast<float>(fputil::multiply_add(r, xd * xd, xd));
}
} // namespace __llvm_libc
diff --git a/libc/src/math/generic/logf.cpp b/libc/src/math/generic/logf.cpp
index 3e712378b64c3..747f8c73c27c5 100644
--- a/libc/src/math/generic/logf.cpp
+++ b/libc/src/math/generic/logf.cpp
@@ -120,7 +120,7 @@ LLVM_LIBC_FUNCTION(float, logf, (float x)) {
d *= ONE_OVER_F[f_index];
double extra_factor =
- fputil::fma(static_cast<double>(m), LOG_2, LOG_F[f_index]);
+ fputil::multiply_add(static_cast<double>(m), LOG_2, LOG_F[f_index]);
double r = __llvm_libc::fputil::polyeval(
d, extra_factor, 0x1.fffffffffffacp-1, -0x1.fffffffef9cb2p-2,
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index ad8bcd0f476c1..93c45b93ad476 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -196,6 +196,54 @@ cc_library(
],
)
+fma_common_hdrs = [
+ "src/__support/FPUtil/FMA.h",
+ "src/__support/FPUtil/generic/FMA.h",
+]
+
+fma_hdrs = selects.with_or({
+ "//conditions:default": fma_common_hdrs,
+ PLATFORM_CPU_X86_64: fma_common_hdrs + [
+ "src/__support/FPUtil/x86_64/FMA.h",
+ ],
+ PLATFORM_CPU_ARM64: fma_common_hdrs + [
+ "src/__support/FPUtil/aarch64/FMA.h",
+ ],
+})
+
+cc_library(
+ name = "__support_fputil_fma",
+ hdrs = fma_hdrs,
+ deps = [
+ ":__support_common",
+ ":__support_cpp_bit",
+ ":__support_cpp_type_traits",
+ ":__support_fputil",
+ ":libc_root",
+ ],
+)
+
+cc_library(
+ name = "__support_fputil_multiply_add",
+ hdrs = [
+ "src/__support/FPUtil/multiply_add.h",
+ ],
+ deps = [
+ ":__support_common",
+ ":__support_fputil_fma",
+ ],
+)
+
+cc_library(
+ name = "__support_fputil_polyeval",
+ hdrs = [
+ "src/__support/FPUtil/PolyEval.h",
+ ],
+ deps = [
+ ":__support_fputil_multiply_add",
+ ],
+)
+
################################ fenv targets ################################
libc_function(
More information about the libc-commits mailing list