[libcxx-commits] [libc] [libcxx] [llvm] [libcxx][libc] Hand in Hand PoC with from_chars (PR #91651)

Tue Sep 3 13:14:32 PDT 2024

================
@@ -0,0 +1,491 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP_SRC_INCLUDE_FROM_CHARS_FLOATING_POINT_H
+#define _LIBCPP_SRC_INCLUDE_FROM_CHARS_FLOATING_POINT_H
+
+// These headers are in the shared LLVM-libc header library.
+#include "shared/fp_bits.h"
+#include "shared/str_to_float.h"
+#include "shared/str_to_integer.h"
+
+#include <__assert>
+#include <__config>
+#include <cctype>
+#include <charconv>
+#include <concepts>
+#include <limits>
+#include <optional>
+#include <type_traits>
+
+// Included for the _Floating_type_traits class
+#include "to_chars_floating_point.h"
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+// Parses an infinity string.
+// Valid strings are case insensitive and are either INF or INFINITY.
+//
+// - __first is the first argument of std::from_chars. When the string is
+//   invalid this value is returned as ptr in the result.
+// - __last is the last argument of std::from_chars.
+// - __value is the value argument of std::from_chars.
+// - __ptr is the current position in the input string. It points just beyond
+//   the initial I character.
+// - __negative whether a valid string represents -inf or +inf.
+//
+// Returns {__first, errc::invalid_argument} when the text after the initial I
+// does not start with NF. Otherwise returns the position just past the
+// consumed INF or INFINITY; ec is errc{} when _Fp has an infinity and
+// errc::result_out_of_range (leaving __value untouched) when it does not.
+template <floating_point _Fp>
+from_chars_result __from_chars_floating_point_inf(
+    const char* const __first, const char* __last, _Fp& __value, const char* __ptr, bool __negative) {
+  // The <cctype> functions have undefined behaviour when passed a negative
+  // value other than EOF, so convert plain char to unsigned char first.
+  auto __lower = [](char __c) { return std::tolower(static_cast<unsigned char>(__c)); };
+
+  if (__last - __ptr < 2) [[unlikely]]
+    return {__first, errc::invalid_argument};
+
+  if (__lower(__ptr[0]) != 'n' || __lower(__ptr[1]) != 'f') [[unlikely]]
+    return {__first, errc::invalid_argument};
+
+  __ptr += 2;
+
+  // At this point the result is valid and contains INF.
+  // When the remaining part contains INITY this will be consumed. Otherwise
+  // only INF is consumed. For example INFINITZ will consume INF and ignore
+  // INITZ.
+
+  if (__last - __ptr >= 5        //
+      && __lower(__ptr[0]) == 'i' //
+      && __lower(__ptr[1]) == 'n' //
+      && __lower(__ptr[2]) == 'i' //
+      && __lower(__ptr[3]) == 't' //
+      && __lower(__ptr[4]) == 'y')
+    __ptr += 5;
+
+  if constexpr (std::numeric_limits<_Fp>::has_infinity) {
+    if (__negative)
+      __value = -std::numeric_limits<_Fp>::infinity();
+    else
+      __value = std::numeric_limits<_Fp>::infinity();
+
+    return {__ptr, std::errc{}};
+  } else {
+    return {__ptr, errc::result_out_of_range};
+  }
+}
+
+// Parses a nan string.
+// Valid strings are case insensitive and contain NAN, optionally followed by
+// ( n-char-sequence_opt ), where the n-char-sequence consists of
+// alphanumeric characters and underscores.
+//
+// - __first is the first argument of std::from_chars. When the string is
+//   invalid this value is returned as ptr in the result.
+// - __last is the last argument of std::from_chars.
+// - __value is the value argument of std::from_chars.
+// - __ptr is the current position in the input string. It points just beyond
+//   the initial N character.
+// - __negative whether a valid string represents -nan or +nan.
+//
+// Returns {__first, errc::invalid_argument} when the text after the initial N
+// does not start with AN. Otherwise stores a quiet NaN (negated when
+// __negative is set) in __value and returns the position just past the
+// consumed text with errc{}.
+template <floating_point _Fp>
+from_chars_result __from_chars_floating_point_nan(
+    const char* const __first, const char* __last, _Fp& __value, const char* __ptr, bool __negative) {
+  // The <cctype> functions have undefined behaviour when passed a negative
+  // value other than EOF, so convert plain char to unsigned char first.
+  auto __lower    = [](char __c) { return std::tolower(static_cast<unsigned char>(__c)); };
+  auto __is_nchar = [](char __c) { return __c == '_' || std::isalnum(static_cast<unsigned char>(__c)); };
+
+  if (__last - __ptr < 2) [[unlikely]]
+    return {__first, errc::invalid_argument};
+
+  if (__lower(__ptr[0]) != 'a' || __lower(__ptr[1]) != 'n') [[unlikely]]
+    return {__first, errc::invalid_argument};
+
+  __ptr += 2;
+
+  // At this point the result is valid and contains NAN. When the remaining
+  // part contains ( n-char-sequence_opt ) this will be consumed. Otherwise
+  // only NAN is consumed. For example NAN(abcd will consume NAN and ignore
+  // (abcd.
+  if (__last - __ptr >= 2 && __ptr[0] == '(') {
+    // At least one character follows the '(' so reading __ptr[1] is safe.
+    size_t __offset = 1;
+    do {
+      if (__ptr[__offset] == ')') {
+        // A complete ( n-char-sequence_opt ) was found; consume it including
+        // both parentheses.
+        __ptr += __offset + 1;
+        break;
+      }
+      if (!__is_nchar(__ptr[__offset]))
+        break;
+      ++__offset;
+    } while (__ptr + __offset != __last);
+  }
+
+  if (__negative)
+    __value = -std::numeric_limits<_Fp>::quiet_NaN();
+  else
+    __value = std::numeric_limits<_Fp>::quiet_NaN();
+
+  return {__ptr, std::errc{}};
+}
+
+// The outcome of parsing the digit sequence of a floating-point constant.
+// _Tp is the unsigned integer type used to accumulate the mantissa.
+template <class _Tp>
+struct __fractional_constant_result {
+  // Index in the input at which parsing stopped; size_t(-1) until it is set.
+  size_t __offset = static_cast<size_t>(-1);
+  // The digits that fitted in _Tp, accumulated as an integer.
+  _Tp __mantissa = _Tp{0};
+  // Binary exponent adjustment that belongs to __mantissa.
+  int __exponent = 0;
+  // Whether a non-zero digit was dropped because __mantissa was full.
+  bool __truncated = false;
+  // Whether at least one digit was seen.
+  bool __valid = false;
+};
+
+// Parses the digits of a hexadecimal floating-point constant: a run of
+// hexadecimal digits containing at most one decimal point.
+//
+// - __input is the start of the character buffer.
+// - __n is the number of characters in the buffer.
+// - __offset is the index in __input at which parsing starts.
+//
+// Parsing stops at the end of the buffer, at a second decimal point, or at the
+// first character that is neither a hexadecimal digit nor a decimal point. The
+// index where parsing stopped is stored in the __offset member of the result.
+// The result is only __valid when at least one hexadecimal digit was seen.
+template <class _Tp>
+__fractional_constant_result<_Tp> __parse_fractional_hex_constant(const char* __input, size_t __n, size_t __offset) {
+  __fractional_constant_result<_Tp> __result;
+
+  // Once the mantissa reaches this value another hexadecimal digit might not
+  // fit, so further digits are dropped.
+  const _Tp __mantissa_truncate_threshold = std::numeric_limits<_Tp>::max() / 16;
+  bool __fraction                         = false;
+  for (; __offset < __n; ++__offset) {
+    // The <cctype> functions have undefined behaviour when passed a negative
+    // value other than EOF, so convert plain char to unsigned char first.
+    const unsigned char __c = static_cast<unsigned char>(__input[__offset]);
+
+    if (std::isxdigit(__c)) {
+      __result.__valid = true;
+
+      // Correct for '0'-'9'; the letters are handled by the switch below.
+      uint32_t __digit = __c - '0';
+      switch (std::tolower(__c)) {
+      case 'a':
+        __digit = 10;
+        break;
+      case 'b':
+        __digit = 11;
+        break;
+      case 'c':
+        __digit = 12;
+        break;
+      case 'd':
+        __digit = 13;
+        break;
+      case 'e':
+        __digit = 14;
+        break;
+      case 'f':
+        __digit = 15;
+        break;
+      }
+
+      if (__result.__mantissa < __mantissa_truncate_threshold) {
+        __result.__mantissa = (__result.__mantissa * 16) + __digit;
+        // Every stored digit after the decimal point scales the value by 2^-4.
+        if (__fraction)
+          __result.__exponent -= 4;
+      } else {
+        // The mantissa is full; remember whether precision was lost.
+        if (__digit > 0)
+          __result.__truncated = true;
+        // Every dropped digit before the decimal point scales the value by 2^4.
+        if (!__fraction)
+          __result.__exponent += 4;
+      }
+    } else if (__c == '.') {
+      if (__fraction)
+        break; // this means that __input[__offset] points to a second decimal point, ending the number.
+
+      __fraction = true;
+    } else
+      break;
+  }
+
+  __result.__offset = __offset;
+  return __result;
+}
+
+// Adds the exponent adjustment that results from parsing the digits
+// (__fractional) to the explicitly written exponent (__exponent) and clamps
+// the sum to the range [-__max_biased_exponent, __max_biased_exponent].
+//
+// Here we do this operation as int64 to avoid overflow. The clamped result is
+// bounded by an int, so it is returned as a 32-bit value.
+//
+// The function is inline since it is a non-template function defined in a
+// header; without it every translation unit including this header would emit
+// its own definition.
+inline int32_t __merge_exponents(int64_t __fractional, int64_t __exponent, int __max_biased_exponent) {
+  int64_t __sum = __fractional + __exponent;
+
+  if (__sum > __max_biased_exponent)
+    return __max_biased_exponent;
+
+  if (__sum < -__max_biased_exponent)
+    return -__max_biased_exponent;
+
+  return static_cast<int32_t>(__sum);
+}
+
+// Parses a hexadecimal floating-point number: hexadecimal digits with an
+// optional decimal point, optionally followed by P and a decimal exponent.
+//
+// - __first is the first argument of std::from_chars. When the string is
+//   invalid this value is returned as ptr in the result.
+// - __last is the last argument of std::from_chars.
+// - __value is the value argument of std::from_chars.
+// - __ptr is the current position in the input string, where the digits are
+//   expected to start.
+// - __negative whether the parsed value needs to be negated.
+//
+// Returns {__first, errc::invalid_argument} when no hexadecimal digit is
+// found. Otherwise stores the converted value in __value and returns the
+// position just past the consumed text; ec is errc::result_out_of_range when
+// the conversion reported ERANGE for a value that is not a non-zero subnormal
+// and errc{} otherwise.
+template <floating_point _Fp>
+from_chars_result __from_chars_floating_point_hex(
+    const char* const __first, const char* __last, _Fp& __value, const char* __ptr, bool __negative) {
+  using _FPBits = LIBC_NAMESPACE::shared::FPBits<_Fp>;
+
+  size_t __n      = __last - __first;
+  size_t __offset = __ptr - __first;
+
+  auto __fractional =
+      __parse_fractional_hex_constant<typename _Floating_type_traits<_Fp>::_Uint_type>(__first, __n, __offset);
+  if (!__fractional.__valid)
+    return {__first, errc::invalid_argument};
+
+  __offset = __fractional.__offset;
+
+  // The exponent written in the input; stays 0 when there is no valid exponent.
+  int64_t __parsed_exponent = 0;
+  // The <cctype> functions have undefined behaviour when passed a negative
+  // value other than EOF, so convert plain char to unsigned char first.
+  if (__offset + 1 < __n && // an exponent always needs at least one digit.
+      std::tolower(static_cast<unsigned char>(__first[__offset])) == 'p' &&
+      // strtol-style integer parsers accept leading whitespace, the pattern
+      // used by std::from_chars does not.
+      !std::isspace(static_cast<unsigned char>(__first[__offset + 1]))) {
+    LIBC_NAMESPACE::shared::StrToNumResult<int32_t> __e =
+        LIBC_NAMESPACE::shared::strtointeger<int32_t>(__first + __offset + 1, 10, __n - __offset - 1);
+    // __e.error contains the errno value, 0 or ERANGE these are not interesting.
+    // If the number of characters parsed is 0 it means there was no number. In
+    // that case the exponent character is not consumed either.
+    if (__e.parsed_len != 0) {
+      __offset += 1 + __e.parsed_len;
+      __parsed_exponent = __e.value;
+    }
+  }
+
+  int __exponent = __merge_exponents(__fractional.__exponent, __parsed_exponent, _FPBits::MAX_BIASED_EXPONENT);
+
+  LIBC_NAMESPACE::shared::ExpandedFloat<_Fp> __expanded_float = {0, 0};
+  errc __status{};
+  // A zero mantissa needs no conversion, the zero-initialized value is correct.
+  if (__fractional.__mantissa != 0) {
+    auto __converted = LIBC_NAMESPACE::shared::binary_exp_to_float<_Fp>(
+        {__fractional.__mantissa, __exponent},
+        __fractional.__truncated,
+        LIBC_NAMESPACE::shared::RoundDirection::Nearest);
+    __expanded_float = __converted.num;
+    if (__converted.error == ERANGE)
+      __status = errc::result_out_of_range;
+  }
+
+  auto __result = _FPBits();
+  __result.set_mantissa(__expanded_float.mantissa);
+  __result.set_biased_exponent(__expanded_float.exponent);
+
+  // C17 7.12.1/6
+  // The result underflows if the magnitude of the mathematical result is so
+  // small that the mathematical result cannot be represented, without
+  // extraordinary roundoff error, in an object of the specified type. If
+  // the result underflows, the function returns an implementation-defined
+  // value whose magnitude is no greater than the smallest normalized positive
+  // number in the specified type; if the integer expression math_errhandling
+  // & MATH_ERRNO is nonzero, whether errno acquires the value ERANGE is
+  // implementation-defined; if the integer expression math_errhandling &
+  // MATH_ERREXCEPT is nonzero, whether the "underflow" floating-point
+  // exception is raised is implementation-defined.
+  //
+  // LLVM-LIBC sets ERANGE for subnormal values
+  //
+  // [charconv.from.chars]/1
+  //   ... If the parsed value is not in the range representable by the type of
+  //   value, value is unmodified and the member ec of the return value is
+  //   equal to errc::result_out_of_range. ...
+  //
+  // Undo the ERANGE for subnormal values.
+  if (__status == errc::result_out_of_range && __result.is_subnormal() && !__result.is_zero())
+    __status = errc{};
+
+  if (__negative)
+    __value = -__result.get_val();
+  else
+    __value = __result.get_val();
+
+  return {__first + __offset, __status};
+}
+
+template <floating_point _Fp>
+from_chars_result __from_chars_floating_point_decimal(
+    const char* const __first,
+    const char* __last,
+    _Fp& __value,
+    chars_format __fmt,
+    const char* __ptr,
+    bool __negative) {
+  using _Traits    = _Floating_type_traits<_Fp>;
+  using _Uint_type = typename _Traits::_Uint_type;
+
+  const char* src  = __ptr; // rename to match the libc code copied for this section.
+  ptrdiff_t length = __last - src;
+  _LIBCPP_ASSERT_INTERNAL(length > 0, "Last must be after start");
+
+  _Uint_type mantissa            = 0;
+  int exponent                   = 0;
+  bool truncated                 = false;
+  bool seen_digit                = false;
+  bool has_valid_exponent        = false;
+  bool after_decimal             = false;
+  size_t index                   = 0;
+  const size_t BASE              = 10;
+  constexpr char EXPONENT_MARKER = 'e';
+  constexpr char DECIMAL_POINT   = '.';
+
+  // The loop fills the mantissa with as many digits as it can hold
+  const _Uint_type bitstype_max_div_by_base = numeric_limits<_Uint_type>::max() / BASE;
+
+  while (index < static_cast<size_t>(length)) {
+    if (std::isdigit(src[index])) {
+      uint32_t digit = src[index] - '0';
+      seen_digit     = true;
+
+      if (mantissa < bitstype_max_div_by_base) {
+        mantissa = (mantissa * BASE) + digit;
+        if (after_decimal) {
+          --exponent;
+        }
+      } else {
+        if (digit > 0)
+          truncated = true;
+        if (!after_decimal)
+          ++exponent;
+      }
+
+      ++index;
+      continue;
+    }
+    if (src[index] == DECIMAL_POINT) {
+      if (after_decimal) {
+        break; // this means that src[index] points to a second decimal point, ending the number.
+      }
+      after_decimal = true;
+      ++index;
+      continue;
+    }
+    // The character is neither a digit nor a decimal point.
+    break;
+  }
+
+  if (!seen_digit)
+    return {__first, errc::invalid_argument};
+
+  // LWG3456 Pattern used by std::from_chars is underspecified
+  // This changes fixed to ignore a possible exponent instead of making its
+  // existence an error.
+  if (__fmt != chars_format::fixed && index < static_cast<size_t>(length) &&
+      std::tolower(src[index]) == EXPONENT_MARKER) {
+    bool has_sign = false;
+    if (index + 1 < static_cast<size_t>(length) && (src[index + 1] == '+' || src[index + 1] == '-')) {
+      has_sign = true;
+    }
+    if (index + 1 + static_cast<size_t>(has_sign) < static_cast<size_t>(length) &&
+        std::isdigit(src[index + 1 + static_cast<size_t>(has_sign)])) {
+      has_valid_exponent = true;
+      ++index;
+      auto result = LIBC_NAMESPACE::shared::strtointeger<int32_t>(src + index, 10, static_cast<size_t>(length) - index);
+      // if (result.has_error())
+      //   output.error = result.error;
+      int32_t add_to_exponent = result.value;
+      index += result.parsed_len;
+
+      // Here we do this operation as int64 to avoid overflow.
+      int64_t temp_exponent = static_cast<int64_t>(exponent) + static_cast<int64_t>(add_to_exponent);
+
+      // If the result is in the valid range, then we use it. The valid range is
+      // also within the int32 range, so this prevents overflow issues.
+      if (temp_exponent > LIBC_NAMESPACE::shared::FPBits<_Fp>::MAX_BIASED_EXPONENT) {
+        exponent = LIBC_NAMESPACE::shared::FPBits<_Fp>::MAX_BIASED_EXPONENT;
+      } else if (temp_exponent < -LIBC_NAMESPACE::shared::FPBits<_Fp>::MAX_BIASED_EXPONENT) {
+        exponent = -LIBC_NAMESPACE::shared::FPBits<_Fp>::MAX_BIASED_EXPONENT;
+      } else {
+        exponent = static_cast<int32_t>(temp_exponent);
+      }
----------------
michaelrj-google wrote:

This should probably use `__merge_exponents`

https://github.com/llvm/llvm-project/pull/91651