[libc-commits] [libc] [libc] [search] implement hcreate(_r)/hsearch(_r)/hdestroy(_r) (PR #73469)

Nick Desaulniers via libc-commits libc-commits at lists.llvm.org
Mon Nov 27 13:24:34 PST 2023


================
@@ -0,0 +1,235 @@
+//===-- Fixed-size Monotonic HashTable --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_HASHTABLE_TABLE_H
+#define LLVM_LIBC_SRC___SUPPORT_HASHTABLE_TABLE_H
+
+#include "include/llvm-libc-types/ENTRY.h"
+#include "src/__support/CPP/new.h"
+#include "src/__support/CPP/type_traits.h"
+#include "src/__support/HashTable/bitmask.h"
+#include "src/__support/bit.h"
+#include "src/__support/hash.h"
+#include "src/__support/macros/attributes.h"
+#include "src/__support/macros/optimization.h"
+#include "src/__support/memory_size.h"
+#include "src/string/memset.h"
+#include "src/string/strcmp.h"
+#include "src/string/strlen.h"
+#include <stddef.h>
+#include <stdint.h>
+
+namespace LIBC_NAMESPACE {
+namespace internal {
+
+LIBC_INLINE uint8_t secondary_hash(uint64_t hash) {
+  // top 7 bits of the hash.
+  return static_cast<uint8_t>((hash >> 57) & 0x7f);
+}
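+
+// Note: available control bytes are initialized to 0x80 (see the memset in
+// allocate() below), which has its top bit set, while a stored secondary hash
+// is masked to 7 bits; a live control byte therefore never collides with the
+// "available" marker.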
+
+// Probe sequence based on triangular numbers, which is guaranteed (since our
+// table size is a power of two) to visit every group of elements exactly once.
+//
+// A triangular probe has us jump by 1 more group every time. So first we
+// jump by 1 group (meaning we just continue our linear scan), then 2 groups
+// (skipping over 1 group), then 3 groups (skipping over 2 groups), and so on.
+//
+// If we set sizeof(Group) to be one unit:
+//               T[k] = sum {1 + 2 + ... + k} = k * (k + 1) / 2
+// It is provable that T[k] mod 2^m generates a permutation of
+//                0, 1, 2, 3, ..., 2^m - 2, 2^m - 1
+// Detailed proof is available at:
+// https://fgiesen.wordpress.com/2015/02/22/triangular-numbers-mod-2n/
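+//
+// For example, with 2^m = 4 groups the first four probe offsets are
+// T[0..3] mod 4 = {0, 1, 3, 2}, i.e. every group is visited exactly once.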
+struct ProbeSequence {
+  size_t position;
+  size_t stride;
+  size_t entries_mask;
+
+  LIBC_INLINE size_t next() {
+    position += stride;
+    position &= entries_mask;
+    stride += sizeof(Group);
+    return position;
+  }
+};
+
+// The number of entries is at least the group width: we do not
+// need to do the fixup when we set the control bytes.
+// The number of entries is at least 8: we don't have to worry
+// about special sizes when checking the fullness of the table.
+LIBC_INLINE size_t capacity_to_entries(size_t cap) {
+  if (8 >= sizeof(Group) && cap < 8)
+    return 8;
+  if (16 >= sizeof(Group) && cap < 15)
+    return 16;
+  if (cap < sizeof(Group))
+    cap = sizeof(Group);
+  // overflow is always checked in allocate()
+  return next_power_of_two(cap * 8 / 7);
+}
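+
+// For example, a requested capacity of 100 maps to
+// next_power_of_two(100 * 8 / 7) = next_power_of_two(114) = 128 entries, of
+// which 128 / 8 * 7 = 112 slots are usable (see allocate() below), so the
+// 7/8 load factor still covers the requested capacity.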
+
+// The heap memory layout for N buckets HashTable is as follows:
+//
+//             =======================
+//             |   N * Entry         |
+//             ======================= <- align boundary
+//             |   Header            |
+//             =======================
+//             |   (N + 1) * Byte    |
+//             =======================
+//
+// The trailing group part is to make sure we can always load
+// a whole group of control bytes.
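+//
+// For example (illustrative; assuming 8-byte pointers, so sizeof(ENTRY) == 16,
+// and a 16-byte Group): a table with N = 16 buckets starts with 256 bytes of
+// entries, followed by the Header at the alignment boundary, followed by
+// 16 + 16 control bytes so that a whole Group can be loaded starting from any
+// bucket index.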
+
+struct HashTable {
+  HashState state;
+  size_t entries_mask;    // number of buckets - 1
+  size_t available_slots; // slots still available for insertion (at most 7/8 of the entries)
+private:
+  // The number of entries (buckets) in the table.
+  LIBC_INLINE size_t num_of_entries() const { return entries_mask + 1; }
+
+  LIBC_INLINE bool is_full() const { return available_slots == 0; }
+
+  LIBC_INLINE size_t offset_from_entries() const {
+    size_t entries_size = num_of_entries() * sizeof(ENTRY);
+    return entries_size + offset_to(entries_size, table_alignment());
+  }
+
+  LIBC_INLINE constexpr static size_t table_alignment() {
+    return alignof(HashTable) > alignof(ENTRY) ? alignof(HashTable)
+                                               : alignof(ENTRY);
+  }
+
+  LIBC_INLINE constexpr static size_t offset_to_groups() {
+    return sizeof(HashTable);
+  }
+
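+  // Bucket i lives below the header (see the layout diagram above), i.e. at a
+  // negative offset of (i + 1) ENTRY-sized slots from `this`.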
+  LIBC_INLINE ENTRY &entry(size_t i) {
+    return reinterpret_cast<ENTRY *>(this)[-i - 1];
+  }
+
+  LIBC_INLINE uint8_t &control(size_t i) {
+    uint8_t *ptr = reinterpret_cast<uint8_t *>(this) + offset_to_groups();
+    return ptr[i];
+  }
+
+  // We duplicate a group of control bytes to the end. Thus, it is possible that
+  // we need to set two control bytes at the same time.
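+  // For example (illustrative): with 16 entries and a 16-byte Group,
+  // set_ctrl(3, v) writes control(3) and control(((3 - 16) & 15) + 16), i.e.
+  // control(19), the mirrored byte inside the trailing group, so that
+  // Group::load(&control(i)) never reads uninitialized bytes.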
+  LIBC_INLINE void set_ctrl(size_t index, uint8_t value) {
+    size_t index2 = ((index - sizeof(Group)) & entries_mask) + sizeof(Group);
+    control(index) = value;
+    control(index2) = value;
+  }
+
+public:
+  LIBC_INLINE static void deallocate(HashTable *table) {
+    if (table) {
+      void *ptr =
+          reinterpret_cast<uint8_t *>(table) - table->offset_from_entries();
+      operator delete(ptr, std::align_val_t{table_alignment()});
+    }
+  }
+  LIBC_INLINE static HashTable *allocate(size_t capacity, uint64_t randomness) {
+    // check if capacity_to_entries overflows MAX_MEM_SIZE
+    if (capacity > size_t{1} << (8 * sizeof(size_t) - 1 - 3))
+      return nullptr;
+    SafeMemSize entries{capacity_to_entries(capacity)};
+    SafeMemSize entries_size = entries * SafeMemSize{sizeof(ENTRY)};
+    SafeMemSize align_boundary = entries_size.align_up(table_alignment());
+    SafeMemSize ctrl_sizes = entries + SafeMemSize{sizeof(Group)};
+    SafeMemSize header_size{offset_to_groups()};
+    SafeMemSize total_size =
+        (align_boundary + header_size + ctrl_sizes).align_up(table_alignment());
+    if (!total_size.valid())
+      return nullptr;
+    AllocChecker ac;
+
+    void *mem = operator new(total_size, std::align_val_t{table_alignment()},
+                             ac);
+
+    // Bail out early if the allocation failed. Otherwise the pointer
+    // arithmetic below would turn a null `mem` into a bogus non-null table
+    // pointer that callers could mistake for a valid table.
+    if (!ac)
+      return nullptr;
+
+    HashTable *table = reinterpret_cast<HashTable *>(
+        static_cast<uint8_t *>(mem) + align_boundary);
+    table->entries_mask = entries - 1u;
+    table->available_slots = entries / 8 * 7;
+    table->state = HashState{randomness};
+    memset(&table->control(0), 0x80, ctrl_sizes);
+    memset(mem, 0, table->offset_from_entries());
+    return table;
+  }
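+
+  // Sketch of the intended usage (illustrative only; the actual call sites
+  // are the hcreate_r/hsearch_r wrappers added elsewhere in this patch):
+  //   HashTable *table = HashTable::allocate(capacity, randomness);
+  //   if (table == nullptr)
+  //     return failure; // e.g. report ENOMEM to the caller
+  //   ...
+  //   HashTable::deallocate(table);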
+
+private:
+  LIBC_INLINE size_t find(const char *key, uint64_t primary) {
+    uint8_t secondary = secondary_hash(primary);
+    ProbeSequence sequence{static_cast<size_t>(primary), 0, entries_mask};
+    while (true) {
+      size_t pos = sequence.next();
+      Group ctrls = Group::load(&control(pos));
+      IteratableBitMask masks = ctrls.match_byte(secondary);
+      for (size_t i : masks) {
+        size_t index = (pos + i) & entries_mask;
+        ENTRY &entry = this->entry(index);
+        if (LIBC_LIKELY(entry.key != nullptr && strcmp(entry.key, key) == 0))
+          return index;
+      }
+      BitMask available = ctrls.mask_available();
+      // Since there is no deletion, the first time we find an available slot
+      // it is also ready to be used as an insertion point. Therefore, we also
+      // return the first available slot we find. If that slot is empty, its
+      // key will be nullptr.
+      if (LIBC_LIKELY(available.any_bit_set())) {
+        size_t index =
+            (pos + available.lowest_set_bit_nonzero()) & entries_mask;
+        return index;
+      }
+    }
+  }
+
+private:
+  LIBC_INLINE ENTRY *insert(ENTRY item, uint64_t primary) {
+    auto index = find(item.key, primary);
+    auto slot = &this->entry(index);
+    // SVr4 and POSIX.1-2001 specify that action is significant only for
+    // unsuccessful searches, so that an ENTER should not do anything
+    // for a successful search.
+    if (slot->key != nullptr) {
+      return slot;
+    }
----------------
nickdesaulniers wrote:

unnecessary braces

https://github.com/llvm/llvm-project/pull/73469


More information about the libc-commits mailing list