[libc-commits] [libc] d680258 - [libc] Implement a high-precision floating point class.
Tue Ly via libc-commits
libc-commits at lists.llvm.org
Tue Dec 13 21:37:44 PST 2022
Author: Tue Ly
Date: 2022-12-14T00:37:14-05:00
New Revision: d6802581700bdf65fc6d002d3d5667295044f0b3
URL: https://github.com/llvm/llvm-project/commit/d6802581700bdf65fc6d002d3d5667295044f0b3
DIFF: https://github.com/llvm/llvm-project/commit/d6802581700bdf65fc6d002d3d5667295044f0b3.diff
LOG: [libc] Implement a high-precision floating point class.
Implement a high-precision floating point class using UInt<> as its
mantissa. This will be used in accurate pass for double precision math
functions.
Reviewed By: sivachandra
Differential Revision: https://reviews.llvm.org/D136799
Added:
libc/src/__support/FPUtil/dyadic_float.h
libc/test/src/__support/FPUtil/CMakeLists.txt
libc/test/src/__support/FPUtil/dyadic_float_test.cpp
Modified:
libc/src/__support/FPUtil/CMakeLists.txt
libc/src/__support/UInt.h
libc/test/src/__support/CMakeLists.txt
libc/utils/UnitTest/CMakeLists.txt
libc/utils/UnitTest/FPMatcher.h
Removed:
################################################################################
diff --git a/libc/src/__support/FPUtil/CMakeLists.txt b/libc/src/__support/FPUtil/CMakeLists.txt
index 3377fab423457..0686c01e7feef 100644
--- a/libc/src/__support/FPUtil/CMakeLists.txt
+++ b/libc/src/__support/FPUtil/CMakeLists.txt
@@ -178,4 +178,16 @@ add_header_library(
ROUND_OPT
)
+add_header_library(
+ dyadic_float
+ HDRS
+ dyadic_float.h
+ DEPENDS
+ .float_properties
+ .fp_bits
+ .multiply_add
+ libc.src.__support.common
+ libc.src.__support.uint
+)
+
add_subdirectory(generic)
diff --git a/libc/src/__support/FPUtil/dyadic_float.h b/libc/src/__support/FPUtil/dyadic_float.h
new file mode 100644
index 0000000000000..8b76725364152
--- /dev/null
+++ b/libc/src/__support/FPUtil/dyadic_float.h
@@ -0,0 +1,212 @@
+//===-- A class to store high precision floating point numbers --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_SUPPORT_FPUTIL_DYADIC_FLOAT_H
+#define LLVM_LIBC_SRC_SUPPORT_FPUTIL_DYADIC_FLOAT_H
+
+#include "FPBits.h"
+#include "FloatProperties.h"
+#include "multiply_add.h"
+#include "src/__support/CPP/type_traits.h"
+#include "src/__support/UInt.h"
+
+#include <stddef.h>
+
+namespace __llvm_libc::fputil {
+
+// A generic class to perform comuptations of high precision floating points.
+// We store the value in dyadic format, including 3 fields:
+// sign : boolean value - false means positive, true means negative
+// exponent: the exponent value of the least significant bit of the mantissa.
+// mantissa: unsigned integer of length `Bits`.
+// So the real value that is stored is:
+// real value = (-1)^sign * 2^exponent * (mantissa as unsigned integer)
+// The stored data is normal if for non-zero mantissa, the leading bit is 1.
+// The outputs of the constructors and most functions will be normalized.
+// To simplify and improve the efficiency, many functions will assume that the
+// inputs are normal.
+template <size_t Bits> struct DyadicFloat {
+ using MantissaType = __llvm_libc::cpp::UInt<Bits>;
+
+ bool sign = false;
+ int exponent = 0;
+ MantissaType mantissa = MantissaType(0);
+
+ DyadicFloat() = default;
+
+ template <typename T,
+ cpp::enable_if_t<cpp::is_floating_point_v<T> &&
+ (FloatProperties<T>::MANTISSA_WIDTH < Bits),
+ int> = 0>
+ DyadicFloat(T x) {
+ FPBits<T> x_bits(x);
+ sign = x_bits.get_sign();
+ exponent = x_bits.get_exponent() - FloatProperties<T>::MANTISSA_WIDTH;
+ mantissa = MantissaType(x_bits.get_explicit_mantissa());
+ normalize();
+ }
+
+ DyadicFloat(bool s, int e, MantissaType m)
+ : sign(s), exponent(e), mantissa(m) {
+ normalize();
+ };
+
+ // Normalizing the mantissa, bringing the leading 1 bit to the most
+ // significant bit.
+ DyadicFloat &normalize() {
+ if (!mantissa.is_zero()) {
+ int shift_length = static_cast<int>(mantissa.clz());
+ exponent -= shift_length;
+ mantissa.shift_left(static_cast<size_t>(shift_length));
+ }
+ return *this;
+ }
+
+ // Used for aligning exponents. Output might not be normalized.
+ DyadicFloat &shift_left(int shift_length) {
+ exponent -= shift_length;
+ mantissa <<= static_cast<size_t>(shift_length);
+ return *this;
+ }
+
+ // Used for aligning exponents. Output might not be normalized.
+ DyadicFloat &shift_right(int shift_length) {
+ exponent += shift_length;
+ mantissa >>= static_cast<size_t>(shift_length);
+ return *this;
+ }
+
+ // Assume that it is already normalized and output is also normal.
+ // Output is rounded correctly with respect to the current rounding mode.
+ // TODO(lntue): Test or add support for denormal output.
+ // TODO(lntue): Test or add specialization for x86 long double.
+ template <typename T, typename = cpp::enable_if_t<
+ cpp::is_floating_point_v<T> &&
+ (FloatProperties<T>::MANTISSA_WIDTH < Bits),
+ void>>
+ explicit operator T() const {
+ // TODO(lntue): Do we need to treat signed zeros properly?
+ if (mantissa.is_zero())
+ return 0.0;
+
+ // Assume that it is normalized, and output is also normal.
+ constexpr size_t PRECISION = FloatProperties<T>::MANTISSA_WIDTH + 1;
+ using output_bits_t = typename FPBits<T>::UIntType;
+
+ MantissaType m_hi(mantissa >> (Bits - PRECISION));
+ auto d_hi = FPBits<T>::create_value(
+ sign, exponent + (Bits - 1) + FloatProperties<T>::EXPONENT_BIAS,
+ output_bits_t(m_hi) & FloatProperties<T>::MANTISSA_MASK);
+
+ const MantissaType ROUND_MASK = MantissaType(1) << (Bits - PRECISION - 1);
+ const MantissaType STICKY_MASK = ROUND_MASK - MantissaType(1);
+
+ bool round_bit = !(mantissa & ROUND_MASK).is_zero();
+ bool sticky_bit = !(mantissa & STICKY_MASK).is_zero();
+ int round_and_sticky = int(round_bit) * 2 + int(sticky_bit);
+ auto d_lo = FPBits<T>::create_value(sign,
+ exponent + (Bits - PRECISION - 2) +
+ FloatProperties<T>::EXPONENT_BIAS,
+ output_bits_t(0));
+
+ // Still correct without FMA instructions if `d_lo` is not underflow.
+ return multiply_add(d_lo.get_val(), T(round_and_sticky), d_hi.get_val());
+ }
+};
+
+// Quick add - Add 2 dyadic floats with rounding toward 0 and then normalize the
+// output:
+// - Align the exponents so that:
+// new a.exponent = new b.exponent = max(a.exponent, b.exponent)
+// - Add or subtract the mantissas depending on the signs.
+// - Normalize the result.
+// The absolute errors compared to the mathematical sum is bounded by:
+// | quick_add(a, b) - (a + b) | < MSB(a + b) * 2^(-Bits + 2),
+// i.e., errors are up to 2 ULPs.
+// Assume inputs are normalized (by constructors or other functions) so that we
+// don't need to normalize the inputs again in this function. If the inputs are
+// not normalized, the results might lose precision significantly.
+template <size_t Bits>
+constexpr DyadicFloat<Bits> quick_add(DyadicFloat<Bits> a,
+ DyadicFloat<Bits> b) {
+ if (unlikely(a.mantissa.is_zero()))
+ return b;
+ if (unlikely(b.mantissa.is_zero()))
+ return a;
+
+ // Align exponents
+ if (a.exponent > b.exponent)
+ b.shift_right(a.exponent - b.exponent);
+ else if (b.exponent > a.exponent)
+ a.shift_right(b.exponent - a.exponent);
+
+ DyadicFloat<Bits> result;
+
+ if (a.sign == b.sign) {
+ // Addition
+ result.sign = a.sign;
+ result.exponent = a.exponent;
+ result.mantissa = a.mantissa;
+ if (result.mantissa.add(b.mantissa)) {
+ // Mantissa addition overflow.
+ result.shift_right(1);
+ result.mantissa.val[DyadicFloat<Bits>::MantissaType::WordCount - 1] |=
+ (uint64_t(1) << 63);
+ }
+ // Result is already normalized.
+ return result;
+ }
+
+ // Subtraction
+ if (a.mantissa >= b.mantissa) {
+ result.sign = a.sign;
+ result.exponent = a.exponent;
+ result.mantissa = a.mantissa - b.mantissa;
+ } else {
+ result.sign = b.sign;
+ result.exponent = b.exponent;
+ result.mantissa = b.mantissa - a.mantissa;
+ }
+
+ return result.normalize();
+}
+
+// Quick Mul - Slightly less accurate but efficient multiplication of 2 dyadic
+// floats with rounding toward 0 and then normalize the output:
+// result.exponent = a.exponent + b.exponent + Bits,
+// result.mantissa = quick_mul_hi(a.mantissa + b.mantissa)
+// ~ (full product a.mantissa * b.mantissa) >> Bits.
+// The errors compared to the mathematical product is bounded by:
+// 2 * errors of quick_mul_hi = 2 * (UInt<Bits>::WordCount - 1) in ULPs.
+// Assume inputs are normalized (by constructors or other functions) so that we
+// don't need to normalize the inputs again in this function. If the inputs are
+// not normalized, the results might lose precision significantly.
+template <size_t Bits>
+constexpr DyadicFloat<Bits> quick_mul(DyadicFloat<Bits> a,
+ DyadicFloat<Bits> b) {
+ DyadicFloat<Bits> result;
+ result.sign = (a.sign != b.sign);
+ result.exponent = a.exponent + b.exponent + int(Bits);
+
+ if (!(a.mantissa.is_zero() || b.mantissa.is_zero())) {
+ result.mantissa = a.mantissa.quick_mul_hi(b.mantissa);
+ // Check the leading bit directly, should be faster than using clz in
+ // normalize().
+ if (result.mantissa.val[DyadicFloat<Bits>::MantissaType::WordCount - 1] >>
+ 63 ==
+ 0)
+ result.shift_left(1);
+ } else {
+ result.mantissa = (typename DyadicFloat<Bits>::MantissaType)(0);
+ }
+ return result;
+}
+
+} // namespace __llvm_libc::fputil
+
+#endif // LLVM_LIBC_SRC_SUPPORT_FPUTIL_DYADIC_FLOAT_H
diff --git a/libc/src/__support/UInt.h b/libc/src/__support/UInt.h
index 6068ab6decfd9..e4d8a8f580ec0 100644
--- a/libc/src/__support/UInt.h
+++ b/libc/src/__support/UInt.h
@@ -14,6 +14,7 @@
#include "src/__support/CPP/optional.h"
#include "src/__support/CPP/type_traits.h"
#include "src/__support/builtin_wrappers.h"
+#include "src/__support/common.h"
#include "src/__support/integer_utils.h"
#include "src/__support/number_pair.h"
@@ -95,6 +96,14 @@ template <size_t Bits> struct UInt {
return *this;
}
+ constexpr bool is_zero() const {
+ for (size_t i = 0; i < WordCount; ++i) {
+ if (val[i] != 0)
+ return false;
+ }
+ return true;
+ }
+
// Add x to this number and store the result in this number.
// Returns the carry value produced by the addition operation.
constexpr uint64_t add(const UInt<Bits> &x) {
@@ -356,6 +365,9 @@ template <size_t Bits> struct UInt {
return;
}
#endif // __SIZEOF_INT128__
+ if (unlikely(s == 0))
+ return;
+
const size_t drop = s / 64; // Number of words to drop
const size_t shift = s % 64; // Bits to shift in the remaining words.
size_t i = WordCount;
@@ -402,6 +414,8 @@ template <size_t Bits> struct UInt {
}
#endif // __SIZEOF_INT128__
+ if (unlikely(s == 0))
+ return;
const size_t drop = s / 64; // Number of words to drop
const size_t shift = s % 64; // Bit shift in the remaining words.
diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt
index c1b04266267b5..b835812f6e1b8 100644
--- a/libc/test/src/__support/CMakeLists.txt
+++ b/libc/test/src/__support/CMakeLists.txt
@@ -111,3 +111,4 @@ add_custom_command(TARGET libc_str_to_float_comparison_test
add_subdirectory(CPP)
add_subdirectory(File)
add_subdirectory(OSUtil)
+add_subdirectory(FPUtil)
diff --git a/libc/test/src/__support/FPUtil/CMakeLists.txt b/libc/test/src/__support/FPUtil/CMakeLists.txt
new file mode 100644
index 0000000000000..47d88b169c588
--- /dev/null
+++ b/libc/test/src/__support/FPUtil/CMakeLists.txt
@@ -0,0 +1,12 @@
+add_libc_testsuite(libc_fputil_unittests)
+
+add_fp_unittest(
+ dyadic_float_test
+ NEED_MPFR
+ SUITE
+ libc_fputil_unittests
+ SRCS
+ dyadic_float_test.cpp
+ DEPENDS
+ libc.src.__support.FPUtil.dyadic_float
+)
diff --git a/libc/test/src/__support/FPUtil/dyadic_float_test.cpp b/libc/test/src/__support/FPUtil/dyadic_float_test.cpp
new file mode 100644
index 0000000000000..530b1b160a0fd
--- /dev/null
+++ b/libc/test/src/__support/FPUtil/dyadic_float_test.cpp
@@ -0,0 +1,67 @@
+//===-- Unittests for the DyadicFloat class -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/FPUtil/dyadic_float.h"
+#include "src/__support/UInt.h"
+#include "utils/MPFRWrapper/MPFRUtils.h"
+#include "utils/UnitTest/FPMatcher.h"
+#include "utils/UnitTest/Test.h"
+
+using Float128 = __llvm_libc::fputil::DyadicFloat<128>;
+using Float192 = __llvm_libc::fputil::DyadicFloat<192>;
+using Float256 = __llvm_libc::fputil::DyadicFloat<256>;
+
+TEST(LlvmLibcDyadicFloatTest, BasicConversions) {
+ Float128 x(/*sign*/ false, /*exponent*/ 0,
+ /*mantissa*/ Float128::MantissaType(1));
+ volatile float xf = float(x);
+ volatile double xd = double(x);
+ ASSERT_FP_EQ(1.0f, xf);
+ ASSERT_FP_EQ(1.0, xd);
+
+ Float128 y(0x1.0p-53);
+ volatile float yf = float(y);
+ volatile double yd = double(y);
+ ASSERT_FP_EQ(0x1.0p-53f, yf);
+ ASSERT_FP_EQ(0x1.0p-53, yd);
+
+ Float128 z = quick_add(x, y);
+
+ EXPECT_FP_EQ_ALL_ROUNDING(xf + yf, float(z));
+ EXPECT_FP_EQ_ALL_ROUNDING(xd + yd, double(z));
+}
+
+TEST(LlvmLibcDyadicFloatTest, QuickAdd) {
+ Float192 x(/*sign*/ false, /*exponent*/ 0,
+ /*mantissa*/ Float192::MantissaType(0x123456));
+ volatile double xd = double(x);
+ ASSERT_FP_EQ(0x1.23456p20, xd);
+
+ Float192 y(0x1.abcdefp-20);
+ volatile double yd = double(y);
+ ASSERT_FP_EQ(0x1.abcdefp-20, yd);
+
+ Float192 z = quick_add(x, y);
+
+ EXPECT_FP_EQ_ALL_ROUNDING(xd + yd, (volatile double)(z));
+}
+
+TEST(LlvmLibcDyadicFloatTest, QuickMul) {
+ Float256 x(/*sign*/ false, /*exponent*/ 0,
+ /*mantissa*/ Float256::MantissaType(0x123456));
+ volatile double xd = double(x);
+ ASSERT_FP_EQ(0x1.23456p20, xd);
+
+ Float256 y(0x1.abcdefp-25);
+ volatile double yd = double(y);
+ ASSERT_FP_EQ(0x1.abcdefp-25, yd);
+
+ Float256 z = quick_mul(x, y);
+
+ EXPECT_FP_EQ_ALL_ROUNDING(xd * yd, double(z));
+}
diff --git a/libc/utils/UnitTest/CMakeLists.txt b/libc/utils/UnitTest/CMakeLists.txt
index f2ec4e0ae6937..43039304907ed 100644
--- a/libc/utils/UnitTest/CMakeLists.txt
+++ b/libc/utils/UnitTest/CMakeLists.txt
@@ -33,7 +33,7 @@ add_library(
FPMatcher.h
)
target_include_directories(LibcFPTestHelpers PUBLIC ${LIBC_SOURCE_DIR})
-target_link_libraries(LibcFPTestHelpers LibcUnitTest)
+target_link_libraries(LibcFPTestHelpers LibcUnitTest libc_test_utils)
add_dependencies(
LibcFPTestHelpers
LibcUnitTest
diff --git a/libc/utils/UnitTest/FPMatcher.h b/libc/utils/UnitTest/FPMatcher.h
index 1c2b540675631..a7c0971839a3c 100644
--- a/libc/utils/UnitTest/FPMatcher.h
+++ b/libc/utils/UnitTest/FPMatcher.h
@@ -1,4 +1,4 @@
-//===-- TestMatchers.h ------------------------------------------*- C++ -*-===//
+//===-- FPMatchers.h --------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -12,6 +12,7 @@
#include "src/__support/FPUtil/FEnvImpl.h"
#include "src/__support/FPUtil/FPBits.h"
#include "utils/UnitTest/Test.h"
+#include "utils/testutils/RoundingModeUtils.h"
#include <errno.h>
#include <math.h>
@@ -132,4 +133,17 @@ FPMatcher<T, C> getMatcher(T expectedValue) {
} \
} while (0)
+#define EXPECT_FP_EQ_ALL_ROUNDING(expected, actual) \
+ do { \
+ using namespace __llvm_libc::testutils; \
+ ForceRoundingMode __r1(RoundingMode::Nearest); \
+ EXPECT_FP_EQ((expected), (actual)); \
+ ForceRoundingMode __r2(RoundingMode::Upward); \
+ EXPECT_FP_EQ((expected), (actual)); \
+ ForceRoundingMode __r3(RoundingMode::Downward); \
+ EXPECT_FP_EQ((expected), (actual)); \
+ ForceRoundingMode __r4(RoundingMode::TowardZero); \
+ EXPECT_FP_EQ((expected), (actual)); \
+ } while (0)
+
#endif // LLVM_LIBC_UTILS_UNITTEST_FPMATCHER_H
More information about the libc-commits
mailing list