[libc-commits] [libc] [libc][wctype] Add perfect hash map for conversion functions (PR #187670)

Michael Jones via libc-commits libc-commits at lists.llvm.org
Wed Apr 29 11:46:55 PDT 2026


================
@@ -0,0 +1,880 @@
+//===-- Perfect hash map for conversion functions ---------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_WCTYPE_PERFECT_HASH_MAP_H
+#define LLVM_LIBC_SRC___SUPPORT_WCTYPE_PERFECT_HASH_MAP_H
+
+#define LIBC_ENABLE_CONSTEXPR 1
+
+#include "hdr/types/size_t.h"
+#include "hdr/types/wint_t.h"
+#include "src/__support/CPP/array.h"
+#include "src/__support/CPP/expected.h"
+#include "src/__support/CPP/optional.h"
+#include "src/__support/CPP/span.h"
+#include "src/__support/CPP/string.h"
+#include "src/__support/CPP/tuple.h"
+#include "src/__support/CPP/type_traits.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/math/ceil.h"
+#include "src/__support/math/log.h"
+#include "src/__support/uint128.h"
+
+#ifdef DEBUGDEBUG
+#include "src/__support/OSUtil/io.h"
+#endif
+
+#undef LIBC_ENABLE_CONSTEXPR
+
+namespace LIBC_NAMESPACE_DECL {
+namespace wctype_internal {
+
+namespace ptrhash {
+
+// Number of shards the parts are grouped into. get_parts() always returns a
+// multiple of this value, and PtrhashConfig divides PARTS by it to obtain
+// PARTS_PER_SHARD.
+LIBC_INLINE_VAR constexpr size_t SHARDS = 1;
+
+// Small deterministic 64-bit pseudo-random generator usable in constant
+// expressions. The whole state is a single 64-bit seed that is advanced on
+// every call to gen().
+class FastRand {
+public:
+  // This seed value is very important for different inputs. Bad values are
+  // known to cause compilation errors and/or incorrect computations in some
+  // cases. Defaulted to 0xEF6F79ED30BA75A in the original implementation, but
+  // this is not sufficient. 0x64a727ea04c46a32 is another viable seed.
+  LIBC_INLINE constexpr FastRand() : seed(0xeec13c9f1362aa74) {}
+
+  // Returns the low 8 bits of the next 64-bit output; advances the state.
+  LIBC_INLINE constexpr uint8_t gen_byte() {
+    return static_cast<uint8_t>(this->gen());
+  }
+
+  // Advances the state by WY_CONST_0 (modulo 2^64), multiplies the new state
+  // by (state ^ WY_CONST_1) as a 128-bit product and returns the xor of the
+  // product's low and high 64-bit halves.
+  LIBC_INLINE constexpr uint64_t gen() {
+    constexpr uint64_t WY_CONST_0 = 0x2d35'8dcc'aa6c'78a5;
+    constexpr uint64_t WY_CONST_1 = 0x8bb8'4b93'962e'acc9;
+
+    const uint64_t s = wrapping_add(seed, WY_CONST_0);
+    seed = s;
+    const UInt128 t =
+        static_cast<UInt128>(s) * static_cast<UInt128>(s ^ WY_CONST_1);
+    return static_cast<uint64_t>(t) ^ static_cast<uint64_t>(t >> 64);
+  }
+
+private:
+  // Returns (a + b) modulo 2^N, where N is the width of T. Unsigned arithmetic
+  // is defined to wrap, so a plain addition is sufficient; the cast back to T
+  // discards the extra bits when a narrow T has been promoted to int.
+  template <typename T> LIBC_INLINE static constexpr T wrapping_add(T a, T b) {
+    static_assert(cpp::is_unsigned_v<T>,
+                  "wrapping_add requires unsigned type");
+    return static_cast<T>(a + b);
+  }
+
+  uint64_t seed;
+};
+
+// All-ones uint32_t value.
+// NOTE(review): presumably a sentinel meaning "no bucket index" -- confirm
+// against its users.
+LIBC_INLINE_VAR constexpr auto BUCKET_IDX_NONE = ~static_cast<uint32_t>(0);
+
+// Fixed-capacity max-priority container of (size_t, uint32_t) entries that is
+// usable in constant expressions. Entries live unordered in an inline array;
+// the largest one is located with a linear scan on every pop()/peek().
+// Entries are ordered lexicographically: first element, then second element.
+template <size_t MaxSize = 5> class BinaryHeap {
+public:
+  LIBC_INLINE constexpr BinaryHeap() = default;
+
+  // Appends `value`. The call is silently ignored once MaxSize entries are
+  // stored.
+  LIBC_INLINE constexpr void push(const cpp::tuple<size_t, uint32_t> &value) {
+    if (current_size < MaxSize) {
+      data[current_size] = value;
+      ++current_size;
+    }
+  }
+
+  // Removes and returns the largest entry, or a value-initialized tuple when
+  // the container is empty.
+  LIBC_INLINE constexpr cpp::tuple<size_t, uint32_t> pop() {
+    if (empty())
+      return {};
+    const size_t largest = index_of_largest();
+    auto result = data[largest];
+    // Fill the hole with the last stored entry; order is not maintained.
+    --current_size;
+    data[largest] = data[current_size];
+    return result;
+  }
+
+  // Returns a reference to the largest entry without removing it. On an empty
+  // container this refers to slot 0 of the backing array.
+  LIBC_INLINE constexpr const cpp::tuple<size_t, uint32_t> &peek() const {
+    return data[index_of_largest()];
+  }
+
+  LIBC_INLINE constexpr bool empty() const { return current_size == 0; }
+
+private:
+  // Index of the largest stored entry. Among equal entries the lowest index
+  // wins; returns 0 when nothing is stored.
+  LIBC_INLINE constexpr size_t index_of_largest() const {
+    size_t best = 0;
+    for (size_t candidate = 1; candidate < current_size; ++candidate) {
+      if (cmp(data[best], data[candidate]))
+        best = candidate;
+    }
+    return best;
+  }
+
+  // Strict lexicographic "less than" on (first, second).
+  LIBC_INLINE static constexpr bool cmp(cpp::tuple<size_t, uint32_t> x,
+                                        cpp::tuple<size_t, uint32_t> y) {
+    if (cpp::get<0>(x) != cpp::get<0>(y))
+      return cpp::get<0>(x) < cpp::get<0>(y);
+    return cpp::get<1>(x) < cpp::get<1>(y);
+  }
+
+  cpp::array<cpp::tuple<size_t, uint32_t>, MaxSize> data{};
+  size_t current_size{};
+};
+
+// Returns true when `x` has exactly one bit set. Zero is not a power of two.
+// Only unsigned types are accepted.
+template <typename T> LIBC_INLINE constexpr bool is_power_of_two(T x) {
+  static_assert(cpp::is_unsigned_v<T>,
+                "is_power_of_two requires unsigned type");
+  if (x == 0)
+    return false;
+  // Clearing the lowest set bit leaves zero only if it was the only set bit.
+  return (x & (x - 1)) == 0;
+}
+
+// Formula of Vigna, eps-cost-sharding: https://arxiv.org/abs/2503.18397
+// eps is (1-alpha)/2, so that on average we still have some room to play with.
+// The target number of parts is x / ln(x), with x = n * eps^2 / 2.
+//
+// Returns the number of parts used for `n` keys. The result is always a
+// non-zero multiple of SHARDS.
+LIBC_INLINE constexpr size_t get_parts(size_t n) {
+  // alpha here is 0.99 for linear configuration.
+  constexpr double EPS = 0.01 / 2.0;
+  const double x = static_cast<double>(n) * EPS * EPS / 2.0;
+  // The quotient is only usable for x > 1: log(1) == 0 makes it infinite and
+  // log(x) < 0 for x < 1 makes it negative. Converting an infinite value or a
+  // value <= -1 to size_t is undefined behavior (and a hard error during
+  // constant evaluation), so such inputs request zero parts, which is then
+  // raised to the minimum below.
+  // NOTE(review): for x only slightly above 1 the quotient is very large
+  // (x / ln(x) has its minimum at x == e) -- confirm this is intended.
+  const size_t target_parts =
+      (x > 1.0) ? static_cast<size_t>(x / math::log(x)) : 0;
+  // Integer division: target_parts and SHARDS are both size_t.
+  const size_t parts_per_shard = target_parts / SHARDS;
+  return ((parts_per_shard > 1) ? parts_per_shard : 1) * SHARDS;
+}
+
+// Returns the number of slots for a part holding `keys_per_part` keys:
+// keys_per_part / 0.99 truncated to an integer, bumped by one whenever that
+// value is an exact power of two.
+LIBC_INLINE constexpr size_t get_slots_per_part(size_t keys_per_part) {
+  const size_t scaled =
+      static_cast<size_t>(static_cast<double>(keys_per_part) / 0.99);
+  return is_power_of_two(scaled) ? scaled + 1 : scaled;
+}
+
+// Compile-time sizing parameters derived from the number of keys `n`.
+template <size_t n> class PtrhashConfig {
+public:
+  // Number of parts, as chosen by get_parts().
+  static constexpr size_t PARTS = get_parts(n);
+  // Keys per part (integer division, remainder dropped).
+  static constexpr size_t KEYS_PER_PART = n / PARTS;
+  static constexpr size_t PARTS_PER_SHARD = PARTS / SHARDS;
+  // Slots per part, as chosen by get_slots_per_part().
+  static constexpr size_t SLOTS_PER_PART = get_slots_per_part(KEYS_PER_PART);
+  static constexpr size_t SLOTS_TOTAL = PARTS * SLOTS_PER_PART;
+  // ceil(KEYS_PER_PART / 3) + 3 buckets per part.
+  static constexpr size_t BUCKETS_PER_PART =
+      static_cast<size_t>(math::ceil(KEYS_PER_PART / 3.0) + 3);
+  static constexpr size_t BUCKETS_TOTAL = PARTS * BUCKETS_PER_PART;
+};
+
+// fxhash algorithm constant used in hashing numbers. Chosen for good randomness
+// properties and to ensure stable, deterministic, and well-distributed outputs.
+LIBC_INLINE_VAR constexpr uint64_t FXHASH_SEED = 0x517cc1b727220a95;
+
+template <size_t n_, typename Key = wint_t,
+          size_t parts_ = PtrhashConfig<n_>::PARTS,
+          size_t parts_per_shard_ = PtrhashConfig<n_>::PARTS_PER_SHARD,
+          size_t slots_total_ = PtrhashConfig<n_>::SLOTS_TOTAL,
+          size_t buckets_total_ = PtrhashConfig<n_>::BUCKETS_TOTAL,
+          size_t slots_ = PtrhashConfig<n_>::SLOTS_PER_PART,
+          size_t buckets_ = PtrhashConfig<n_>::BUCKETS_PER_PART,
+          typename F = cpp::array<uint32_t, slots_total_ - n_>,
+          typename PilotsTypeV = cpp::array<uint8_t, buckets_total_>>
+class PtrHash {
+public:
+  static_assert(
+      cpp::is_same_v<PilotsTypeV, cpp::span<uint8_t>> ||
+          cpp::is_same_v<PilotsTypeV, cpp::array<uint8_t, buckets_total_>>,
+      "V must be a byte slice or byte vector");
+
+  LIBC_INLINE constexpr PtrHash(uint64_t seed_, PilotsTypeV pilots_, F remap_)
+      : seed(seed_), pilots(pilots_), remap(remap_) {}
+
+  LIBC_INLINE constexpr PtrHash() = default;
+  LIBC_INLINE constexpr PtrHash(const PtrHash &) = default;
+  LIBC_INLINE constexpr PtrHash(PtrHash &&) = default;
+
+  LIBC_INLINE constexpr PtrHash &operator=(const PtrHash &) = default;
+  LIBC_INLINE constexpr PtrHash &operator=(PtrHash &&) = default;
+
+  LIBC_INLINE constexpr size_t index(Key key) const {
+    auto slot = this->index_no_remap(key);
+
+    if (slot < n_) {
+      return slot;
+    }
+
+    return this->remap[slot - n_];
+  }
+
+  LIBC_INLINE constexpr size_t index_no_remap(Key key) const {
+    auto hx = this->hash_key(key);
+    auto b = this->bucket(hx);
+    auto pilot = this->pilots[b];
+    return this->slot(hx, pilot);
+  }
+
+  LIBC_INLINE constexpr size_t slot(uint64_t hx, uint64_t pilot) const {
+    return (this->part(hx) * slots_) + this->slot_in_part(hx, pilot);
+  }
+
+  LIBC_INLINE constexpr size_t slot_in_part(uint64_t hx, uint64_t pilot) const {
+    return this->slot_in_part_hp(hx, this->hash_pilot(pilot));
+  }
+
+  LIBC_INLINE constexpr cpp::optional<cpp::tuple<uint64_t, PilotsTypeV, F>>
+  compute_pilots(const cpp::array<Key, n_> &keys) {
+    cpp::array<cpp::array<bool, slots_>, parts_> taken{};
+    for (cpp::array<bool, slots_> &t : taken) {
+      for (size_t i = 0; i < slots_; i++) {
+        t[i] = 0;
+      }
+    }
----------------
michaelrj-google wrote:

nit: match below, remove extra braces

https://github.com/llvm/llvm-project/pull/187670


More information about the libc-commits mailing list