[libcxx-commits] [libcxx] [libc++][chrono] Adds the sys_info class. (PR #85619)

Louis Dionne via libcxx-commits libcxx-commits at lists.llvm.org
Tue Apr 9 09:20:50 PDT 2024


================
@@ -8,14 +8,712 @@
 
 // For information see https://libcxx.llvm.org/DesignDocs/TimeZone.html
 
+// TODO TZDB look at optimizations
+//
+// The current algorithm is correct but not efficient. For example, in a named
+// rule based continuation finding the next rule does quite a bit of work,
+// returns the next rule and "forgets" its state. This could be better.
+//
+// It would be possible to cache lookups. If a time for a zone is calculated its
+// sys_info could be kept and the next lookup could test whether the time is in
+// a "known" sys_info. The wording in the Standard hints at this slowness by
+// "suggesting" this could be implemented on the user's side.
+
+// TODO TZDB look at removing quirks
+//
+// The code has some special rules to adjust the timing at the continuation
+// switches. This works correctly, but some of the places feel odd. It would be
+// good to investigate this further and see whether all quirks are needed or
+// whether there are better fixes.
+//
+// These quirks often use a 12h interval; this is the scan interval of zdump,
+// which implies there are no sys_info objects with a duration of less than 12h.
+
+#include <algorithm>
+#include <cctype>
 #include <chrono>
+#include <expected>
+#include <map>
+#include <ranges>
 
 #include "include/tzdb/time_zone_private.h"
+#include "include/tzdb/tzdb_list_private.h"
+
+// TODO TZDB remove debug printing
+#ifdef PRINT
+#  include <print>
+#endif
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
+#ifdef PRINT
+// Debug-only formatter (compiled when PRINT is defined). Writes a sys_info as
+// "[begin, end) offset save abbrev"; offset and save are printed with their
+// unit suffix.
+template <>
+struct formatter<chrono::sys_info, char> {
+  // No format-spec is parsed; the start of the parse range is returned as-is.
+  template <class _ParseCtx>
+  constexpr typename _ParseCtx::iterator parse(_ParseCtx& __parse_ctx) {
+    return __parse_ctx.begin();
+  }
+
+  template <class _FormatCtx>
+  typename _FormatCtx::iterator format(const chrono::sys_info& __value, _FormatCtx& __format_ctx) const {
+    return std::format_to(
+        __format_ctx.out(),
+        "[{}, {}) {:%Q%q} {:%Q%q} {}",
+        __value.begin,
+        __value.end,
+        __value.offset,
+        __value.save,
+        __value.abbrev);
+  }
+};
+#endif
+
 namespace chrono {
 
+//===----------------------------------------------------------------------===//
+//                           Details
+//===----------------------------------------------------------------------===//
+
+struct __sys_info {
+  sys_info __info;
+  bool __can_merge; // Can the returned sys_info object be merged with the next sys_info object?
+};
+
+// Return type for helper function to get a sys_info.
+// - The expected result returns the "best" sys_info object. This object can be
+//   before the requested time. Sometimes sys_info objects from different
+//   continuations share their offset, save, and abbrev and these objects are
+//   merged to one sys_info object. The __can_merge flag determines whether the
+//   current result can be merged with the next result.
+// - The unexpected result means no sys_info object was found and the time is
+//   the time to be used for the next search iteration.
+using __sys_info_result = expected<__sys_info, sys_seconds>;
+
+// Binary search returning an iterator to the element equivalent to __value, or
+// the end iterator when no such element exists. The range is searched with
+// ranges::lower_bound using __comp and __proj.
+template <ranges::forward_range _Range,
+          class _Type,
+          class _Proj                                                                                  = identity,
+          indirect_strict_weak_order<const _Type*, projected<ranges::iterator_t<_Range>, _Proj>> _Comp = ranges::less>
+[[nodiscard]] static ranges::borrowed_iterator_t<_Range>
+__binary_find(_Range&& __r, const _Type& __value, _Comp __comp = {}, _Proj __proj = {}) {
+  auto __last = ranges::end(__r);
+  auto __pos  = ranges::lower_bound(ranges::begin(__r), __last, __value, __comp, __proj);
+
+  // lower_bound yields the first element that is not ordered before __value.
+  // That element only is a match when __value is not ordered before it
+  // either; otherwise the value is absent.
+  if (__pos != __last && std::invoke(__comp, __value, std::invoke(__proj, *__pos)))
+    return __last;
+
+  return __pos;
+}
+
+// Format based on https://data.iana.org/time-zones/tz-how-to.html
+//
+// 1  a time zone abbreviation that is a string of three or more characters that
+//    are either ASCII alphanumerics, "+", or "-"
+// 2  the string "%z", in which case the "%z" will be replaced by a numeric time
+//    zone abbreviation
+// 3  a pair of time zone abbreviations separated by a slash ('/'), in which
+//    case the first string is the abbreviation for the standard time name and
+//    the second string is the abbreviation for the daylight saving time name
+// 4  a string containing "%s", in which case the "%s" will be replaced by the
+//    text in the appropriate Rule's LETTER column, and the resulting string
+//    should be a time zone abbreviation
+//
+// Rule 1 is not strictly validated since America/Barbados uses a two letter
+// abbreviation AT.
+// Expands the FORMAT field of __continuation to a time zone abbreviation.
+//
+// - __letters replaces every "%s" in the field.
+// - __save selects the part of an "A/B" pair: "A" when __save is 0s, else "B".
+//   It is also added to the continuation's STDOFF to produce the numeric
+//   abbreviation for "%z".
+//
+// Reports a corrupt field (invalid character, invalid or dangling escape
+// sequence, "%z" that is not the entire field, or an empty result) through
+// std::__throw_runtime_error.
+[[nodiscard]] static string
+__format(const __tz::__continuation& __continuation, const string& __letters, seconds __save) {
+  bool __shift = false; // Set when the previous character was the escape '%'.
+  string __result;
+  for (char __c : __continuation.__format) {
+    if (__shift) {
+      switch (__c) {
+      case 's':
+        std::ranges::copy(__letters, std::back_inserter(__result));
+        break;
+
+      case 'z': {
+        if (__continuation.__format.size() != 2)
+          std::__throw_runtime_error(
+              std::format("corrupt tzdb FORMAT field: %z should be the entire contents, instead contains '{}'",
+                          __continuation.__format)
+                  .c_str());
+        chrono::hh_mm_ss __offset{__continuation.__stdoff + __save};
+        if (__offset.is_negative()) {
+          __result += '-';
+          // Format the magnitude; the sign has already been written.
+          __offset = chrono::hh_mm_ss{-(__continuation.__stdoff + __save)};
+        } else
+          __result += '+';
+
+        // The minutes are only written when they are not zero.
+        if (__offset.minutes() != 0min)
+          std::format_to(std::back_inserter(__result), "{:%H%M}", __offset);
+        else
+          std::format_to(std::back_inserter(__result), "{:%H}", __offset);
+      } break;
+
+      default:
+        std::__throw_runtime_error(
+            std::format("corrupt tzdb FORMAT field: invalid sequence '%{}' found, expected %s or %z", __c).c_str());
+      }
+      __shift = false;
+
+    } else if (__c == '/') {
+      // "STANDARD/DAYLIGHT": with a non-zero SAVE drop the standard name
+      // collected so far and continue with the daylight saving name;
+      // otherwise the standard name is complete.
+      if (__save != 0s)
+        __result.clear();
+      else
+        break;
+
+    } else if (__c == '%') {
+      __shift = true;
+      // The cast is required: passing a negative char value to std::isalnum
+      // is undefined behaviour.
+    } else if (__c == '+' || __c == '-' || std::isalnum(static_cast<unsigned char>(__c))) {
+      __result.push_back(__c);
+    } else {
+      std::__throw_runtime_error(
+          std::format(
+              "corrupt tzdb FORMAT field: invalid character '{}' found, expected +, -, or an alphanumeric value", __c)
+              .c_str());
+    }
+  }
+
+  if (__shift)
+    std::__throw_runtime_error("corrupt tzdb FORMAT field: input ended with the start of the escape sequence '%'");
+
+  if (__result.empty())
+    std::__throw_runtime_error("corrupt tzdb FORMAT field: result is empty");
+
+  return __result;
+}
+
+// Converts a calendar date plus a number of seconds since the start of that
+// day to a sys_seconds time point.
+[[nodiscard]] static sys_seconds __to_sys_seconds(year_month_day __ymd, seconds __seconds) {
+  const sys_days __date = static_cast<sys_days>(__ymd);
+  return sys_seconds{__date.time_since_epoch() + __seconds};
+}
+
+// Returns the AT field of the continuation's UNTIL adjusted for the clock it
+// is expressed in:
+// - universal: the time is used unmodified,
+// - standard:  STDOFF is subtracted,
+// - local:     STDOFF and the continuation's own SAVE are subtracted.
+[[nodiscard]] static seconds __at_to_sys_seconds(const __tz::__continuation& __continuation) {
+  const auto& __at = __continuation.__at;
+  switch (__at.__clock) {
+  case __tz::__clock::__universal:
+    return __at.__time;
+
+  case __tz::__clock::__standard:
+    return __at.__time - __continuation.__stdoff;
+
+  case __tz::__clock::__local: {
+    const seconds __save = std::visit(
+        [](const auto& __value) {
+          using _Tp = decay_t<decltype(__value)>;
+          if constexpr (same_as<_Tp, __tz::__save>)
+            return chrono::duration_cast<seconds>(__value.__time);
+          else if constexpr (same_as<_Tp, monostate> || same_as<_Tp, std::string>)
+            // Without rules there is no SAVE. For a named rule based
+            // continuation the SAVE depends on the RULE active at the end;
+            // that should be determined separately.
+            return chrono::seconds{0};
+          else
+            static_assert(sizeof(_Tp) == 0); // TODO TZDB static_assert(false); after dropping clang-16 support
+
+          std::__libcpp_unreachable();
+        },
+        __continuation.__rules);
+    return __at.__time - __continuation.__stdoff - __save;
+  }
+  }
+  std::__libcpp_unreachable();
+}
+
+// Resolves the ON field for the given year and month to a calendar date. The
+// field holds either a day, a "last weekday", or a constrained weekday.
+[[nodiscard]] static year_month_day __to_year_month_day(year __year, month __month, __tz::__on __on) {
+  auto __resolve = [__year, __month](const auto& __value) {
+    using _Tp = decay_t<decltype(__value)>;
+    if constexpr (same_as<_Tp, __tz::__constrained_weekday>)
+      return __value(__year, __month);
+    else if constexpr (same_as<_Tp, weekday_last>)
+      return year_month_day{static_cast<sys_days>(year_month_weekday_last{__year, __month, __value})};
+    else if constexpr (same_as<_Tp, chrono::day>)
+      return year_month_day{__year, __month, __value};
+    else
+      static_assert(sizeof(_Tp) == 0); // TODO TZDB static_assert(false); after dropping clang-16 support
+
+    std::__libcpp_unreachable();
+  };
+  return std::visit(__resolve, __on);
+}
+
+// Returns the UNTIL of the continuation as a sys_seconds time point.
+[[nodiscard]] static sys_seconds __until_to_sys_seconds(const __tz::__continuation& __continuation) {
+  // The last continuation has no UNTIL; this is stored as the magic value
+  // year::min() and maps to the largest representable time.
+  const bool __is_last_continuation = __continuation.__year == chrono::year::min();
+  if (__is_last_continuation)
+    return sys_seconds::max();
+
+  return chrono::__to_sys_seconds(
+      chrono::__to_year_month_day(__continuation.__year, __continuation.__in, __continuation.__on),
+      chrono::__at_to_sys_seconds(__continuation));
+}
+
+// Holds the UNTIL time for a continuation with a named rule.
+//
+// Unlike continuations with a fixed SAVE, named rules have a variable SAVE.
+// This means when the UNTIL uses the local wall time the actual UNTIL value can
+// only be determined when the SAVE is known. This class holds that abstraction.
+class __named_rule_until {
+public:
+  explicit __named_rule_until(const __tz::__continuation& __continuation)
+      : __until_{chrono::__until_to_sys_seconds(__continuation)},
+        // Only an UNTIL expressed in local time depends on the SAVE.
+        //
+        // The last continuation of a ZONE has no UNTIL, which basically means
+        // until the end of _local_ time. This value is expressed by
+        // sys_seconds::max(). Subtracting the SAVE leaves a large value.
+        // However, SAVE can be negative, which would add a value to the
+        // maximum, leading to undefined behaviour. In practice this often
+        // results in an overflow to a very small value. Therefore the maximum
+        // is never adjusted.
+        __needs_adjustment_{__continuation.__at.__clock == __tz::__clock::__local && __until_ != sys_seconds::max()} {}
+
+  // Returns the UNTIL without any SAVE adjustment; useful when the SAVE is
+  // not known at all.
+  sys_seconds __until() const noexcept { return __until_; }
+
+  // Whether the UNTIL depends on the SAVE.
+  bool __needs_adjustment() const noexcept { return __needs_adjustment_; }
+
+  // Returns the UNTIL adjusted for SAVE.
+  sys_seconds operator()(seconds __save) const noexcept {
+    return __needs_adjustment_ ? __until_ - __save : __until_;
+  }
+
+private:
+  sys_seconds __until_;
+  bool __needs_adjustment_;
+};
+
+// Returns the AT field of the rule adjusted for STDOFF only; the SAVE is not
+// taken into account.
+[[nodiscard]] static seconds __at_to_seconds(seconds __stdoff, const __tz::__rule& __rule) {
+  switch (__rule.__at.__clock) {
+  case __tz::__clock::__universal:
+    return __rule.__at.__time;
+
+  case __tz::__clock::__local:
+    // Local time is handled like standard time here. This is not correct;
+    // local time needs to be adjusted for the current saved time. To know the
+    // saved time the rules need to be known and sorted, which in turn needs a
+    // time. To avoid this chicken and egg problem the saving of the local
+    // time is adjusted later.
+  case __tz::__clock::__standard:
+    return __rule.__at.__time - __stdoff;
+  }
+  std::__libcpp_unreachable();
+}
+
+// Returns the time the rule takes effect in the given year, adjusted for
+// STDOFF but not for SAVE.
+[[nodiscard]] static sys_seconds __from_to_sys_seconds(seconds __stdoff, const __tz::__rule& __rule, year __year) {
+  return chrono::__to_sys_seconds(
+      chrono::__to_year_month_day(__year, __rule.__in, __rule.__on), chrono::__at_to_seconds(__stdoff, __rule));
+}
+
+// Returns the time the rule takes effect in its FROM year.
+[[nodiscard]] static sys_seconds __from_to_sys_seconds(seconds __stdoff, const __tz::__rule& __rule) {
+  const auto& __first_year = __rule.__from;
+  return chrono::__from_to_sys_seconds(__stdoff, __rule, __first_year);
+}
+
+// Returns the rules stored under __rule_name in __rules_db.
+//
+// The lookup is a binary search on the name of each entry. When the name is
+// not present the database is considered corrupt and this is reported through
+// std::__throw_runtime_error.
+[[nodiscard]] static const vector<__tz::__rule>&
+__get_rules(const __tz::__rules_storage_type& __rules_db, const string& __rule_name) {
+  auto __result = chrono::__binary_find(__rules_db, __rule_name, {}, [](const auto& __p) { return __p.first; });
+  if (__result == std::end(__rules_db))
+    std::__throw_runtime_error(("corrupt tzdb: rule '" + __rule_name + "' does not exist").c_str());
+
+  return __result->second;
+}
+
+// Returns the letters field for a time before the first rule.
+//
+// Per https://data.iana.org/time-zones/tz-how-to.html
+// One wrinkle, not fully explained in zic.8.txt, is what happens when switching
+// to a named rule. To what values should the SAVE and LETTER data be
+// initialized?
+//
+// 1 If at least one transition has happened, use the SAVE and LETTER data from
+//   the most recent.
+// 2 If switching to a named rule before any transition has happened, assume
+//   standard time (SAVE zero), and use the LETTER data from the earliest
+//   transition with a SAVE of zero.
+//
+// This function implements case 2.
+[[nodiscard]] static string __letters_before_first_rule(const vector<__tz::__rule>& __rules) {
+  // The wanted entry is the first rule, in storage order, with a SAVE of zero.
+  auto __first_standard =
+      ranges::find_if(__rules, [](const __tz::__rule& __rule) { return __rule.__save.__time == 0s; });
+
+  if (__first_standard == __rules.end())
+    std::__throw_runtime_error("corrupt tzdb: rule has zero entries");
+
+  return __first_standard->__letters;
+}
+
+// Determines the information based on the continuation and the rules.
+//
+// There are several special cases to take into account
+//
+// === Entries before the first rule becomes active ===
+// Asia/Hong_Kong
+//   9 - JST 1945 N 18 2        // (1)
+//   8 HK HK%sT                 // (2)
+//   R HK 1946 o - Ap 21 0 1 S  // (3)
+// There (1) is active until November 18th 1945 at 02:00, after this time
+// (2) becomes active. The first rule entry for HK (3) becomes active
+// from April 21st 1946 at 01:00. In the period between (2) is active.
+// This entry has an offset.
+// This entry has no save, letters, or dst flag. So in the period
+// after (1) and until (3) no rule entry is associated with the time.
+
+// Builds the sys_info for [__begin, __end) when no rule entry is active yet:
+// the offset is the continuation's STDOFF, the save is zero, and the
+// abbreviation uses the letters of the earliest rule with a SAVE of zero.
+[[nodiscard]] static sys_info __get_sys_info_before_first_rule(
+    sys_seconds __begin,
+    sys_seconds __end,
+    const __tz::__continuation& __continuation,
+    const vector<__tz::__rule>& __rules) {
+  string __abbrev = chrono::__format(__continuation, __letters_before_first_rule(__rules), 0s);
+  return sys_info{__begin, __end, __continuation.__stdoff, chrono::minutes(0), std::move(__abbrev)};
+}
+
+// Returns the sys_info object for a time before the first rule.
+// When this first rule has a SAVE of 0s the sys_info for the time before the
+// first rule and for the first rule are identical and will be merged.
+[[nodiscard]] static sys_info __get_sys_info_before_first_rule(
+    sys_seconds __begin,
+    sys_seconds __rule_end, // The end used when SAVE != 0s
+    sys_seconds __next_end, // The end used when SAVE == 0s the times are merged
+    const __tz::__continuation& __continuation,
+    const vector<__tz::__rule>& __rules,
+    vector<__tz::__rule>::const_iterator __rule) {
+  // With a SAVE of 0s the period before the first rule and the first rule
+  // itself are identical; return the merged result using the rule's letters.
+  if (__rule->__save.__time == 0s)
+    return sys_info{
+        __begin, __next_end, __continuation.__stdoff, 0min, chrono::__format(__continuation, __rule->__letters, 0s)};
+
+  // Otherwise the period ends where the first rule starts.
+  return __get_sys_info_before_first_rule(__begin, __rule_end, __continuation, __rules);
+}
+
+// Returns the AT field of the rule adjusted for the clock it is expressed in:
+// - universal: the time is used unmodified,
+// - standard:  __stdoff is subtracted,
+// - local:     __stdoff and __save are subtracted.
+[[nodiscard]] static seconds __at_to_seconds(seconds __stdoff, seconds __save, const __tz::__rule& __rule) {
+  const auto& __at = __rule.__at;
+  switch (__at.__clock) {
+  case __tz::__clock::__universal:
+    return __at.__time;
+
+  case __tz::__clock::__standard:
+    return __at.__time - __stdoff;
+
+  case __tz::__clock::__local:
+    return __at.__time - __stdoff - __save;
+  }
+  std::__libcpp_unreachable();
+}
+
+// Returns the time the rule takes effect in the given year, adjusted for both
+// STDOFF and SAVE.
+[[nodiscard]] static sys_seconds
+__rule_to_sys_seconds(seconds __stdoff, seconds __save, const __tz::__rule& __rule, year __year) {
+  return chrono::__to_sys_seconds(
+      chrono::__to_year_month_day(__year, __rule.__in, __rule.__on),
+      chrono::__at_to_seconds(__stdoff, __save, __rule));
+}
+
+// Returns the first rule after __time.
+// Note that a rule can be "active" in multiple years, this may result in an
+// infinite loop where the same rule is returned every time, use __current to
+// guard against that.
+//
+// When no next rule exists the returned time will be sys_seconds::max(). This
+// can happen in practice. For example,
+//
+//   R So 1945 o - May 24 2 2 M
+//   R So 1945 o - S 24 3 1 S
+//   R So 1945 o - N 18 2s 0 -
+//
+// Has 3 rules that are all only active in 1945.
+[[nodiscard]] static pair<sys_seconds, vector<__tz::__rule>::const_iterator>
+__next_rule(sys_seconds __time,
+            seconds __stdoff,
+            seconds __save,
+            const vector<__tz::__rule>& __rules,
+            vector<__tz::__rule>::const_iterator __current) {
+  year __year = year_month_day{chrono::floor<days>(__time)}.year();
+
+  // Note it would probably be better to store the pairs in a vector and then
+  // use min() to get the smallest element
+  map<sys_seconds, vector<__tz::__rule>::const_iterator> __candidates;
+  // Note this evaluates all rules which is a waste of effort; when the entries
+  // are beyond the current year's "next year" (where "next year" is not always
+  // year + 1) the algorithm should end.
+  for (auto __it = __rules.begin(); __it != __rules.end(); ++__it) {
+    for (year __y = __it->__from; __y <= __it->__to; ++__y) {
+      // Adding the current entry for the current year may lead to infinite
+      // loops due to the SAVE adjustment. Skip these entries.
+      if (__y == __year && __it == __current)
+        continue;
+
+      sys_seconds __t = chrono::__rule_to_sys_seconds(__stdoff, __save, *__it, __y);
+      if (__t <= __time)
+        continue;
+
+      _LIBCPP_ASSERT_INTERNAL(!__candidates.contains(__t), "duplicated rule");
----------------
ldionne wrote:

I would argue this should be `_LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN`, since the tzdb is kind of like an "argument" to the library, and it being malformed is another way of saying "it's not within its domain". At the very least I feel like this isn't an internal assertion. This also applies to the other assertions, probably.

https://github.com/llvm/llvm-project/pull/85619


More information about the libcxx-commits mailing list