[libc-commits] [libc] 5dbd511 - [libc][math] Improve tanhf performance.
Tue Ly via libc-commits
libc-commits at lists.llvm.org
Tue Jun 20 06:25:17 PDT 2023
Author: Tue Ly
Date: 2023-06-20T09:25:07-04:00
New Revision: 5dbd5118ec5435269602de30ebd4982dd8b273dc
URL: https://github.com/llvm/llvm-project/commit/5dbd5118ec5435269602de30ebd4982dd8b273dc
DIFF: https://github.com/llvm/llvm-project/commit/5dbd5118ec5435269602de30ebd4982dd8b273dc.diff
LOG: [libc][math] Improve tanhf performance.
Re-order exceptional branches and slightly adjust the evaluation.
Performance tested with the CORE-MATH project on AMD EPYC 7B12 (clocks/op)
Reciprocal throughputs:
```
--- BEFORE ---
$ CORE_MATH_PERF_MODE=rdtsc ./perf.sh tanhf
[####################] 100 % (with -mavx2 -mfma)
Ntrial = 20 ; Min = 7.794 + 0.102 clc/call; Median-Min = 0.066 clc/call; Max = 8.267 clc/call;
[####################] 100 %. (with -msse4.2)
Ntrial = 20 ; Min = 10.783 + 0.172 clc/call; Median-Min = 0.144 clc/call; Max = 11.446 clc/call;
[####################] 100 %. (SSE2)
Ntrial = 20 ; Min = 18.926 + 0.381 clc/call; Median-Min = 0.342 clc/call; Max = 19.623 clc/call;
--- AFTER ---
$ CORE_MATH_PERF_MODE=rdtsc ./perf.sh tanhf
[####################] 100 % (with -mavx2 -mfma)
Ntrial = 20 ; Min = 6.598 + 0.085 clc/call; Median-Min = 0.052 clc/call; Max = 6.868 clc/call;
[####################] 100 % (with -msse4.2)
Ntrial = 20 ; Min = 9.245 + 0.304 clc/call; Median-Min = 0.248 clc/call; Max = 10.675 clc/call;
[####################] 100 %. (SSE2)
Ntrial = 20 ; Min = 11.724 + 0.440 clc/call; Median-Min = 0.444 clc/call; Max = 12.262 clc/call;
```
Latency:
```
--- BEFORE ---
$ PERF_ARGS="--latency" CORE_MATH_PERF_MODE=rdtsc ./perf.sh tanhf
[####################] 100 % (with -mavx2 -mfma)
Ntrial = 20 ; Min = 38.821 + 0.157 clc/call; Median-Min = 0.122 clc/call; Max = 39.539 clc/call;
[####################] 100 %. (with -msse4.2)
Ntrial = 20 ; Min = 44.767 + 0.766 clc/call; Median-Min = 0.681 clc/call; Max = 45.951 clc/call;
[####################] 100 %. (SSE2)
Ntrial = 20 ; Min = 55.055 + 1.512 clc/call; Median-Min = 1.571 clc/call; Max = 57.039 clc/call;
--- AFTER ---
$ PERF_ARGS="--latency" CORE_MATH_PERF_MODE=rdtsc ./perf.sh tanhf
[####################] 100 % (with -mavx2 -mfma)
Ntrial = 20 ; Min = 36.147 + 0.194 clc/call; Median-Min = 0.181 clc/call; Max = 36.536 clc/call;
[####################] 100 % (with -msse4.2)
Ntrial = 20 ; Min = 40.904 + 0.728 clc/call; Median-Min = 0.557 clc/call; Max = 42.231 clc/call;
[####################] 100 %. (SSE2)
Ntrial = 20 ; Min = 55.776 + 0.557 clc/call; Median-Min = 0.542 clc/call; Max = 56.551 clc/call;
```
Reviewed By: michaelrj
Differential Revision: https://reviews.llvm.org/D153026
Added:
Modified:
libc/src/__support/macros/properties/CMakeLists.txt
libc/src/__support/macros/properties/cpu_features.h
libc/src/math/generic/CMakeLists.txt
libc/src/math/generic/tanhf.cpp
libc/test/src/math/tanhf_test.cpp
utils/bazel/llvm-project-overlay/libc/BUILD.bazel
Removed:
################################################################################
diff --git a/libc/src/__support/macros/properties/CMakeLists.txt b/libc/src/__support/macros/properties/CMakeLists.txt
index 8ddd16a1e3669..f6c88c34ebb5a 100644
--- a/libc/src/__support/macros/properties/CMakeLists.txt
+++ b/libc/src/__support/macros/properties/CMakeLists.txt
@@ -1,17 +1,19 @@
add_header_library(
- architectures
- HDRS
+ architectures
+ HDRS
architectures.h
)
add_header_library(
- compiler
- HDRS
+ compiler
+ HDRS
compiler.h
)
add_header_library(
- cpu_features
- HDRS
+ cpu_features
+ HDRS
cpu_features.h
+ DEPENDS
+ .architectures
)
diff --git a/libc/src/__support/macros/properties/cpu_features.h b/libc/src/__support/macros/properties/cpu_features.h
index 086b216e27573..493d9f446d374 100644
--- a/libc/src/__support/macros/properties/cpu_features.h
+++ b/libc/src/__support/macros/properties/cpu_features.h
@@ -12,6 +12,8 @@
#ifndef LLVM_LIBC_SRC_SUPPORT_MACROS_PROPERTIES_CPU_FEATURES_H
#define LLVM_LIBC_SRC_SUPPORT_MACROS_PROPERTIES_CPU_FEATURES_H
+#include "architectures.h"
+
#if defined(__SSE2__)
#define LIBC_TARGET_CPU_HAS_SSE2
#endif
@@ -41,4 +43,10 @@
#define LIBC_TARGET_CPU_HAS_FMA
#endif
+#if defined(LIBC_TARGET_ARCH_IS_AARCH64) || \
+ (defined(LIBC_TARGET_ARCH_IS_X86_64) && \
+ defined(LIBC_TARGET_CPU_HAS_SSE4_2))
+#define LIBC_TARGET_CPU_HAS_NEAREST_INT
+#endif
+
#endif // LLVM_LIBC_SRC_SUPPORT_MACROS_PROPERTIES_CPU_FEATURES_H
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 3d5273fa90cc1..098b69817fb20 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -1374,6 +1374,8 @@ add_entrypoint_object(
.explogxf
libc.src.__support.FPUtil.fp_bits
libc.src.__support.FPUtil.rounding_mode
+ libc.src.__support.FPUtil.multiply_add
+ libc.src.__support.FPUtil.polyeval
libc.src.__support.macros.optimization
COMPILE_OPTIONS
-O3
diff --git a/libc/src/math/generic/tanhf.cpp b/libc/src/math/generic/tanhf.cpp
index eb6b50a64b014..21c3ed8c8c333 100644
--- a/libc/src/math/generic/tanhf.cpp
+++ b/libc/src/math/generic/tanhf.cpp
@@ -8,66 +8,114 @@
#include "src/math/tanhf.h"
#include "src/__support/FPUtil/FPBits.h"
-#include "src/__support/FPUtil/rounding_mode.h"
-#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
-#include "src/__support/macros/properties/cpu_features.h" // LIBC_TARGET_CPU_HAS_FMA
+#include "src/__support/FPUtil/PolyEval.h"
+#include "src/__support/FPUtil/multiply_add.h"
+#include "src/__support/FPUtil/nearest_integer.h"
+#include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
+#include "src/__support/macros/properties/cpu_features.h"
#include "src/math/generic/explogxf.h"
namespace __llvm_libc {
+// 2^6 * log2(e)
+constexpr double LOG2_E_EXP2_6 = ExpBase::LOG2_B * 2.0;
+
LLVM_LIBC_FUNCTION(float, tanhf, (float x)) {
using FPBits = typename fputil::FPBits<float>;
FPBits xbits(x);
- bool sign = xbits.get_sign();
- uint32_t x_abs = xbits.uintval() & FPBits::FloatProp::EXP_MANT_MASK;
+ uint32_t x_u = xbits.uintval();
+ uint32_t x_abs = x_u & FPBits::FloatProp::EXP_MANT_MASK;
- // |x| <= 2^-26
- if (LIBC_UNLIKELY(x_abs <= 0x3280'0000U)) {
- return static_cast<float>(
- LIBC_UNLIKELY(x_abs == 0) ? x : (x - 0x1.5555555555555p-2 * x * x * x));
- }
+ // When |x| >= 15, or x is inf or nan, or |x| <= 0.078125
+ if (LIBC_UNLIKELY((x_abs >= 0x4170'0000U) || (x_abs <= 0x3da0'0000U))) {
+ if (x_abs <= 0x3da0'0000U) {
+ // |x| <= 0.078125
+ if (LIBC_UNLIKELY(x_abs <= 0x3280'0000U)) {
+ // |x| <= 2^-26
+ return (x_abs != 0)
+ ? static_cast<float>(x - 0x1.5555555555555p-2 * x * x * x)
+ : x;
+ }
+
+ const double TAYLOR[] = {-0x1.5555555555555p-2, 0x1.1111111111111p-3,
+ -0x1.ba1ba1ba1ba1cp-5, 0x1.664f4882c10fap-6,
+ -0x1.226e355e6c23dp-7};
+ double xdbl = x;
+ double x2 = xdbl * xdbl;
+ // Taylor polynomial.
+ double x4 = x2 * x2;
+ double c0 = x2 * TAYLOR[0];
+ double c1 = fputil::multiply_add(x2, TAYLOR[2], TAYLOR[1]);
+ double c2 = fputil::multiply_add(x2, TAYLOR[4], TAYLOR[3]);
+ double pe = fputil::polyeval(x4, c0, c1, c2);
- // When |x| >= 15, or x is inf or nan
- if (LIBC_UNLIKELY(x_abs >= 0x4170'0000U)) {
- if (xbits.is_nan())
+ return static_cast<float>(fputil::multiply_add(xdbl, pe, xdbl));
+ }
+
+ // |x| >= 15
+ if (LIBC_UNLIKELY(xbits.is_nan()))
return x + 1.0f; // sNaN to qNaN + signal
- if (xbits.is_inf())
- return sign ? -1.0f : 1.0f;
+ const double SIGNS[2][2] = {{1.0f, -0x1.0p-25f}, {-1.0f, 0x1.0p-25f}};
- if (sign) {
- return -1.0f + opt_barrier(FPBits(FPBits::MIN_NORMAL).get_val());
- } else
- return 1.0f - opt_barrier(FPBits(FPBits::MIN_NORMAL).get_val());
- }
+ bool sign = xbits.get_sign();
+ int idx = static_cast<int>(sign);
- // |x| <= 0.078125
- if (LIBC_UNLIKELY(x_abs <= 0x3da0'0000U)) {
- double xdbl = x;
- double x2 = xdbl * xdbl;
- // Pure Taylor series.
- double pe = fputil::polyeval(x2, 0.0, -0x1.5555555555555p-2,
- 0x1.1111111111111p-3, -0x1.ba1ba1ba1ba1cp-5,
- 0x1.664f4882c10fap-6, -0x1.226e355e6c23dp-7);
- return static_cast<float>(fputil::multiply_add(xdbl, pe, xdbl));
- }
+ if (LIBC_UNLIKELY(xbits.is_inf()))
+ return SIGNS[idx][0];
- if (LIBC_UNLIKELY(xbits.bits == 0x4058'e0a3U)) {
- if (fputil::fenv_is_round_down())
- return FPBits(0x3f7f'6ad9U).get_val();
+ return SIGNS[idx][0] + SIGNS[idx][1];
}
- // Range reduction: e^(2x) = 2^(mid + hi) * e^lo
- auto ep = exp_b_range_reduc<ExpBase>(2.0f * x); // exp(2 * x)
- double r = ExpBase::powb_lo(ep.lo);
- // tanh(x) = (exp(2x) - 1) / (exp(2x) + 1)
-#if defined(LIBC_TARGET_CPU_HAS_FMA)
- return static_cast<float>(fputil::multiply_add(ep.mh, r, -1.0) /
- fputil::multiply_add(ep.mh, r, 1.0));
+ // Range reduction: e^(2x) = 2^(hi + mid) * e^lo
+ // Let k = round( x * 2^6 * log2(e)),
+ // So k = (hi + mid) * 2^5
+ // Then lo = 2x - (hi + mid) * log(2) = 2x - k * 2^-5 * log(2).
+
+ double xd = static_cast<double>(x);
+ // k = round( x* 2^6 * log2(e) )
+ double k;
+ // mk = -k
+ int mk;
+#ifdef LIBC_TARGET_CPU_HAS_NEAREST_INT
+ k = fputil::nearest_integer(xd * LOG2_E_EXP2_6);
+ mk = -static_cast<int>(k);
#else
- double exp_x = ep.mh * r;
- return static_cast<float>((exp_x - 1.0) / (exp_x + 1.0));
-#endif // LIBC_TARGET_CPU_HAS_FMA
+ constexpr double HALF_WAY[2] = {-0.5, 0.5};
+
+ mk = static_cast<int>(
+ fputil::multiply_add(xd, -LOG2_E_EXP2_6, HALF_WAY[xbits.get_sign()]));
+ k = static_cast<double>(-mk);
+#endif // LIBC_TARGET_CPU_HAS_NEAREST_INT
+ // -hi = floor(-k * 2^(-MID_BITS))
+ // exp_mhi = shift -hi to the exponent field of double precision.
+ int64_t exp_mhi = static_cast<int64_t>(mk >> ExpBase::MID_BITS)
+ << fputil::FloatProperties<double>::MANTISSA_WIDTH;
+ // mh = 2^(-hi - mid)
+ int64_t mh_bits = ExpBase::EXP_2_MID[mk & ExpBase::MID_MASK] + exp_mhi;
+ double mh = fputil::FPBits<double>(uint64_t(mh_bits)).get_val();
+ // dx = lo/2 = x - (hi + mid) * log(2)/2 = x - k * 2^-6 * log(2)
+ double dx = fputil::multiply_add(
+ k, ExpBase::M_LOGB_2_LO * 0.5,
+ fputil::multiply_add(k, ExpBase::M_LOGB_2_HI * 0.5, xd));
+
+ // > P = fpminimax(expm1(2*x)/x, 4, [|D...|], [-log(2)/128, log(2)/128]);
+ constexpr double COEFFS[] = {0x1.ffffffffe5bc8p0, 0x1.555555555cd67p0,
+ 0x1.5555c2a9b48b4p-1, 0x1.11112a0e34bdbp-2};
+
+ double dx2 = dx * dx;
+ double c0 = fputil::multiply_add(dx, 2.0, 1.0);
+ double c1 = fputil::multiply_add(dx, COEFFS[1], COEFFS[0]);
+ double c2 = fputil::multiply_add(dx, COEFFS[3], COEFFS[2]);
+ double r = fputil::polyeval(dx2, c0, c1, c2);
+
+ // tanh(x) = sinh(x) / cosh(x)
+ // = (e^x - e^(-x)) / (e^x + e^(-x))
+ // = (e^(2x) - 1) / (e^(2x) + 1)
+ // = (2^(hi + mid) * e^lo - 1) / (2^(hi + mid) * e^lo + 1)
+ // = (e^lo - 2^(-hi - mid)) / (e^lo + 2^(-hi - mid))
+ // = (r - mh) / (r + mh)
+ return static_cast<float>((r - mh) / (r + mh));
}
} // namespace __llvm_libc
diff --git a/libc/test/src/math/tanhf_test.cpp b/libc/test/src/math/tanhf_test.cpp
index f4a4b72dd162a..07afe14a263d0 100644
--- a/libc/test/src/math/tanhf_test.cpp
+++ b/libc/test/src/math/tanhf_test.cpp
@@ -43,35 +43,31 @@ TEST(LlvmLibcTanhfTest, SpecialNumbers) {
}
TEST(LlvmLibcTanhfTest, InFloatRange) {
- constexpr uint32_t COUNT = 100'000;
+ constexpr uint32_t COUNT = 100'001;
constexpr uint32_t STEP = UINT32_MAX / COUNT;
for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) {
float x = float(FPBits(v));
if (isnan(x) || isinf(x))
continue;
- ASSERT_MPFR_MATCH(mpfr::Operation::Tanh, x, __llvm_libc::tanhf(x), 0.5);
+ ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Tanh, x,
+ __llvm_libc::tanhf(x), 0.5);
}
}
-// For small values, tanh(x) is x.
-TEST(LlvmLibcTanhfTest, SmallValues) {
- float x = float(FPBits(uint32_t(0x17800000)));
- float result = __llvm_libc::tanhf(x);
- EXPECT_MPFR_MATCH(mpfr::Operation::Tanh, x, result, 0.5);
- EXPECT_FP_EQ(x, result);
-
- x = float(FPBits(uint32_t(0x00400000)));
- result = __llvm_libc::tanhf(x);
- EXPECT_MPFR_MATCH(mpfr::Operation::Tanh, x, result, 0.5);
- EXPECT_FP_EQ(x, result);
-}
-
TEST(LlvmLibcTanhfTest, ExceptionalValues) {
- float x = float(FPBits(uint32_t(0x3a12'85ffU)));
- EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Tanh, x,
- __llvm_libc::tanhf(x), 0.5);
-
- x = -float(FPBits(uint32_t(0x3a12'85ffU)));
- EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Tanh, x,
- __llvm_libc::tanhf(x), 0.5);
+ constexpr int N = 4;
+ constexpr uint32_t INPUTS[N] = {
+ 0x0040'0000,
+ 0x1780'0000,
+ 0x3a12'85ff,
+ 0x4058'e0a3,
+ };
+
+ for (int i = 0; i < N; ++i) {
+ float x = float(FPBits(INPUTS[i]));
+ EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Tanh, x,
+ __llvm_libc::tanhf(x), 0.5);
+ EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Tanh, -x,
+ __llvm_libc::tanhf(-x), 0.5);
+ }
}
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
index 58aa99d007743..eb7a3c87fc0f7 100644
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -81,7 +81,10 @@ libc_support_library(
libc_support_library(
name = "__support_macros_properties_cpu_features",
hdrs = ["src/__support/macros/properties/cpu_features.h"],
- deps = [":libc_root"],
+ deps = [
+ "__support_macros_properties_architectures",
+ ":libc_root",
+ ],
)
libc_support_library(
More information about the libc-commits
mailing list