[libc-commits] [libc] a205a85 - [libc][math] Improve fmul performance by using double-double arithmetic. (#107517)
via libc-commits
libc-commits at lists.llvm.org
Sat Sep 14 14:32:26 PDT 2024
Author: Job Henandez Lara
Date: 2024-09-14T17:32:22-04:00
New Revision: a205a854e06d36c1d0def3e3bc3743defdb6abc1
URL: https://github.com/llvm/llvm-project/commit/a205a854e06d36c1d0def3e3bc3743defdb6abc1
DIFF: https://github.com/llvm/llvm-project/commit/a205a854e06d36c1d0def3e3bc3743defdb6abc1.diff
LOG: [libc][math] Improve fmul performance by using double-double arithmetic. (#107517)
```
Performance tests with inputs in denormal range:
-- My function --
Total time : 2731072304 ns
Average runtime : 68.2767 ns/op
Ops per second : 14646276 op/s
-- Other function --
Total time : 3259744268 ns
Average runtime : 81.4935 ns/op
Ops per second : 12270913 op/s
-- Average runtime ratio --
Mine / Other's : 0.837818
Performance tests with inputs in normal range:
-- My function --
Total time : 93467258 ns
Average runtime : 2.33668 ns/op
Ops per second : 427957777 op/s
-- Other function --
Total time : 637295452 ns
Average runtime : 15.9324 ns/op
Ops per second : 62765299 op/s
-- Average runtime ratio --
Mine / Other's : 0.146662
Performance tests with inputs in normal range with exponents close to each other:
-- My function --
Total time : 95764894 ns
Average runtime : 2.39412 ns/op
Ops per second : 417690014 op/s
-- Other function --
Total time : 639866770 ns
Average runtime : 15.9967 ns/op
Ops per second : 62513075 op/s
-- Average runtime ratio --
Mine / Other's : 0.149664
```
---------
Co-authored-by: Tue Ly <lntue at google.com>
Added:
Modified:
libc/src/math/generic/CMakeLists.txt
libc/src/math/generic/fmul.cpp
libc/test/src/math/fmul_test.cpp
libc/test/src/math/performance_testing/CMakeLists.txt
libc/test/src/math/performance_testing/fmul_perf.cpp
libc/test/src/math/smoke/fmul_test.cpp
Removed:
################################################################################
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 350072f4b9649d..5a1ee3b8b83c77 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -2958,7 +2958,9 @@ add_entrypoint_object(
HDRS
../fmul.h
DEPENDS
- libc.src.__support.FPUtil.generic.mul
+ libc.hdr.errno_macros
+ libc.hdr.fenv_macros
+ libc.src.__support.FPUtil.double_double
COMPILE_OPTIONS
-O3
)
diff --git a/libc/src/math/generic/fmul.cpp b/libc/src/math/generic/fmul.cpp
index 64c27d6e2f9564..e759e48cd6989a 100644
--- a/libc/src/math/generic/fmul.cpp
+++ b/libc/src/math/generic/fmul.cpp
@@ -5,8 +5,10 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-
#include "src/math/fmul.h"
+#include "hdr/errno_macros.h"
+#include "hdr/fenv_macros.h"
+#include "src/__support/FPUtil/double_double.h"
#include "src/__support/FPUtil/generic/mul.h"
#include "src/__support/common.h"
#include "src/__support/macros/config.h"
@@ -14,7 +16,104 @@
namespace LIBC_NAMESPACE_DECL {
LLVM_LIBC_FUNCTION(float, fmul, (double x, double y)) {
+
+ // Without FMA instructions, fputil::exact_mult is not
+ // correctly rounded for all rounding modes, so we fall
+ // back to the generic `fmul` implementation
+
+#ifndef LIBC_TARGET_CPU_HAS_FMA
return fputil::generic::mul<float>(x, y);
-}
+#else
+ fputil::DoubleDouble prod = fputil::exact_mult(x, y);
+ using DoubleBits = fputil::FPBits<double>;
+ using DoubleStorageType = typename DoubleBits::StorageType;
+ using FloatBits = fputil::FPBits<float>;
+ using FloatStorageType = typename FloatBits::StorageType;
+ DoubleBits x_bits(x);
+ DoubleBits y_bits(y);
+
+ Sign result_sign = x_bits.sign() == y_bits.sign() ? Sign::POS : Sign::NEG;
+ double result = prod.hi;
+ DoubleBits hi_bits(prod.hi), lo_bits(prod.lo);
+ // Check for cases where we need to propagate the sticky bits:
+ constexpr uint64_t STICKY_MASK = 0xFFF'FFF; // Lower (52 - 23 - 1 = 28 bits)
+ uint64_t sticky_bits = (hi_bits.uintval() & STICKY_MASK);
+ if (LIBC_UNLIKELY(sticky_bits == 0)) {
+ // Might need to propagate sticky bits:
+ if (!(lo_bits.is_inf_or_nan() || lo_bits.is_zero())) {
+ // Now prod.lo is nonzero and finite, we need to propagate sticky bits.
+ if (lo_bits.sign() != hi_bits.sign())
+ result = DoubleBits(hi_bits.uintval() - 1).get_val();
+ else
+ result = DoubleBits(hi_bits.uintval() | 1).get_val();
+ }
+ }
+
+ float result_f = static_cast<float>(result);
+ FloatBits rf_bits(result_f);
+ uint32_t rf_exp = rf_bits.get_biased_exponent();
+ if (LIBC_LIKELY(rf_exp > 0 && rf_exp < 2 * FloatBits::EXP_BIAS + 1)) {
+ return result_f;
+ }
+
+ // Now result_f is either inf/nan/zero/denormal.
+ if (x_bits.is_nan() || y_bits.is_nan()) {
+ if (x_bits.is_signaling_nan() || y_bits.is_signaling_nan())
+ fputil::raise_except_if_required(FE_INVALID);
+
+ if (x_bits.is_quiet_nan()) {
+ DoubleStorageType x_payload = x_bits.get_mantissa();
+ x_payload >>= DoubleBits::FRACTION_LEN - FloatBits::FRACTION_LEN;
+ return FloatBits::quiet_nan(x_bits.sign(),
+ static_cast<FloatStorageType>(x_payload))
+ .get_val();
+ }
+
+ if (y_bits.is_quiet_nan()) {
+ DoubleStorageType y_payload = y_bits.get_mantissa();
+ y_payload >>= DoubleBits::FRACTION_LEN - FloatBits::FRACTION_LEN;
+ return FloatBits::quiet_nan(y_bits.sign(),
+ static_cast<FloatStorageType>(y_payload))
+ .get_val();
+ }
+
+ return FloatBits::quiet_nan().get_val();
+ }
+ if (x_bits.is_inf()) {
+ if (y_bits.is_zero()) {
+ fputil::set_errno_if_required(EDOM);
+ fputil::raise_except_if_required(FE_INVALID);
+
+ return FloatBits::quiet_nan().get_val();
+ }
+
+ return FloatBits::inf(result_sign).get_val();
+ }
+
+ if (y_bits.is_inf()) {
+ if (x_bits.is_zero()) {
+ fputil::set_errno_if_required(EDOM);
+ fputil::raise_except_if_required(FE_INVALID);
+ return FloatBits::quiet_nan().get_val();
+ }
+
+ return FloatBits::inf(result_sign).get_val();
+ }
+
+ // Now either x or y is zero, and the other one is finite.
+ if (rf_bits.is_inf()) {
+ fputil::set_errno_if_required(ERANGE);
+ return FloatBits::inf(result_sign).get_val();
+ }
+
+ if (x_bits.is_zero() || y_bits.is_zero())
+ return FloatBits::zero(result_sign).get_val();
+
+ fputil::set_errno_if_required(ERANGE);
+ fputil::raise_except_if_required(FE_UNDERFLOW);
+ return result_f;
+
+#endif
+}
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/test/src/math/fmul_test.cpp b/libc/test/src/math/fmul_test.cpp
index 3f6df66456bac5..488e087a325205 100644
--- a/libc/test/src/math/fmul_test.cpp
+++ b/libc/test/src/math/fmul_test.cpp
@@ -10,4 +10,27 @@
#include "src/math/fmul.h"
+#include "test/UnitTest/Test.h"
+#include "utils/MPFRWrapper/MPFRUtils.h"
+
LIST_MUL_TESTS(float, double, LIBC_NAMESPACE::fmul)
+
+TEST_F(LlvmLibcMulTest, SpecialInputs) {
+ namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
+ double INPUTS[][2] = {
+ {0x1.0100010002p8, 0x1.fffcp14},
+ {0x1.000000b92144p-7, 0x1.62p7},
+ };
+
+ for (size_t i = 0; i < 2; ++i) {
+ double a = INPUTS[i][0];
+
+ for (int j = 0; j < 180; ++j) {
+ a *= 0.5;
+ mpfr::BinaryInput<double> input{a, INPUTS[i][1]};
+ ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Mul, input,
+ LIBC_NAMESPACE::fmul(a, INPUTS[i][1]),
+ 0.5);
+ }
+ }
+}
diff --git a/libc/test/src/math/performance_testing/CMakeLists.txt b/libc/test/src/math/performance_testing/CMakeLists.txt
index ed1b03f3493c7d..60c074a248f72a 100644
--- a/libc/test/src/math/performance_testing/CMakeLists.txt
+++ b/libc/test/src/math/performance_testing/CMakeLists.txt
@@ -484,6 +484,8 @@ add_perf_binary(
DEPENDS
.binary_op_single_output_
diff
libc.src.math.fmul
+ libc.src.__support.FPUtil.generic.mul
+ libc.src.__support.FPUtil.fp_bits
COMPILE_OPTIONS
-fno-builtin
)
diff --git a/libc/test/src/math/performance_testing/fmul_perf.cpp b/libc/test/src/math/performance_testing/fmul_perf.cpp
index a215405eb6aa5d..f15cfafbf29451 100644
--- a/libc/test/src/math/performance_testing/fmul_perf.cpp
+++ b/libc/test/src/math/performance_testing/fmul_perf.cpp
@@ -7,12 +7,13 @@
//===----------------------------------------------------------------------===//
#include "BinaryOpSingleOutputPerf.h"
+#include "src/__support/FPUtil/generic/mul.h"
#include "src/math/fmul.h"
static constexpr size_t DOUBLE_ROUNDS = 40;
float fmul_placeholder_binary(double x, double y) {
- return static_cast<float>(x * y);
+ return LIBC_NAMESPACE::fputil::generic::mul<float>(x, y);
}
int main() {
diff --git a/libc/test/src/math/smoke/fmul_test.cpp b/libc/test/src/math/smoke/fmul_test.cpp
index 3f6df66456bac5..3fcf514bcd9f05 100644
--- a/libc/test/src/math/smoke/fmul_test.cpp
+++ b/libc/test/src/math/smoke/fmul_test.cpp
@@ -11,3 +11,22 @@
#include "src/math/fmul.h"
LIST_MUL_TESTS(float, double, LIBC_NAMESPACE::fmul)
+
+TEST_F(LlvmLibcMulTest, SpecialInputs) {
+ constexpr double INPUTS[][2] = {
+ {0x1.0100010002p8, 0x1.fffcp14},
+ {0x1.000000b92144p-7, 0x1.62p7},
+ };
+
+ constexpr float RESULTS[] = {
+ 0x1.00fdfep+23f,
+ 0x1.620002p0f,
+ };
+
+ constexpr size_t N = sizeof(RESULTS) / sizeof(RESULTS[0]);
+
+ for (size_t i = 0; i < N; ++i) {
+ float result = LIBC_NAMESPACE::fmul(INPUTS[i][0], INPUTS[i][1]);
+ EXPECT_FP_EQ(RESULTS[i], result);
+ }
+}
More information about the libc-commits
mailing list