[libc-commits] [libc] [libc] [search] improve hsearch robustness (PR #73896)

Schrodinger ZHU Yifan via libc-commits libc-commits at lists.llvm.org
Wed Nov 29 20:45:59 PST 2023


https://github.com/SchrodingerZhu updated https://github.com/llvm/llvm-project/pull/73896

>From 4eecafa6714f7ad1c62f085618f3c20c6a32a086 Mon Sep 17 00:00:00 2001
From: Schrodinger ZHU Yifan <yifanzhu at rochester.edu>
Date: Wed, 29 Nov 2023 16:50:15 -0500
Subject: [PATCH 1/2] [libc] [search] improve hsearch robustness

According to FreeBSD implementation (https://android.googlesource.com/platform/bionic/+/refs/heads/main/libc/upstream-freebsd/lib/libc/stdlib/hcreate.c), `hsearch` is able to handle the cases where
the global table is not properly initialized. To do this, FreeBSD actually allows hash table to be dynamically resized. If the global table is uninitialized at the first call, the table will be initialized with a minimal size; hence subsequent insertion will be reasonable as the table grows automatically.

This patch mimic such behaviors. More precisely, this patch introduces:

1. a full table iterator that scans each element in the table,
2. a resize routine that is automatically triggered whenever the load factor is reached where
   it iterates the old table and insert the entries into a new one,
3. more tests that cover the growth of the table.
---
 libc/src/__support/HashTable/bitmask.h        |   4 +-
 .../HashTable/generic/bitmask_impl.inc        |  12 +-
 .../__support/HashTable/sse2/bitmask_impl.inc |  11 +-
 libc/src/__support/HashTable/table.h          | 229 ++++++++++++++----
 libc/src/search/CMakeLists.txt                |   1 +
 libc/src/search/hcreate.cpp                   |   6 +
 libc/src/search/hdestroy.cpp                  |   3 +-
 libc/src/search/hsearch.cpp                   |  20 +-
 libc/src/search/hsearch_r.cpp                 |   3 +-
 .../src/__support/HashTable/table_test.cpp    |  75 +++++-
 libc/test/src/search/hsearch_test.cpp         |  25 +-
 11 files changed, 302 insertions(+), 87 deletions(-)

diff --git a/libc/src/__support/HashTable/bitmask.h b/libc/src/__support/HashTable/bitmask.h
index 8247161c449bdd1..956293fe9ede5a0 100644
--- a/libc/src/__support/HashTable/bitmask.h
+++ b/libc/src/__support/HashTable/bitmask.h
@@ -31,9 +31,7 @@ namespace internal {
 // |  Available  | 0b1xxx'xxxx |
 // |  Occupied   | 0b0xxx'xxxx |
 // =============================
-template <typename T, T WORD_MASK, size_t WORD_STRIDE> struct BitMaskAdaptor {
-  // A masked constant whose bits are all set.
-  LIBC_INLINE_VAR constexpr static T MASK = WORD_MASK;
+template <typename T, size_t WORD_STRIDE> struct BitMaskAdaptor {
   // A stride in the bitmask may use multiple bits.
   LIBC_INLINE_VAR constexpr static size_t STRIDE = WORD_STRIDE;
 
diff --git a/libc/src/__support/HashTable/generic/bitmask_impl.inc b/libc/src/__support/HashTable/generic/bitmask_impl.inc
index b8d2bfc7a6ff275..85a359442066832 100644
--- a/libc/src/__support/HashTable/generic/bitmask_impl.inc
+++ b/libc/src/__support/HashTable/generic/bitmask_impl.inc
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "src/__support/HashTable/bitmask.h"
 #include "src/__support/common.h"
 #include "src/__support/endian.h"
 
@@ -34,7 +35,7 @@ LIBC_INLINE constexpr bitmask_t repeat_byte(bitmask_t byte) {
   return byte;
 }
 
-using BitMask = BitMaskAdaptor<bitmask_t, repeat_byte(0x80), 0x8ull>;
+using BitMask = BitMaskAdaptor<bitmask_t, 0x8ull>;
 using IteratableBitMask = IteratableBitMaskAdaptor<BitMask>;
 
 struct Group {
@@ -51,6 +52,11 @@ struct Group {
     return {data.value};
   }
 
+  // Load a group of control words from an arbitary address.
+  LIBC_INLINE static Group load_aligned(const void *__restrict addr) {
+    return *static_cast<const Group *>(addr);
+  }
+
   // Find out the lanes equal to the given byte and return the bitmask
   // with corresponding bits set.
   LIBC_INLINE IteratableBitMask match_byte(uint8_t byte) const {
@@ -106,6 +112,10 @@ struct Group {
   LIBC_INLINE BitMask mask_available() const {
     return {LIBC_NAMESPACE::Endian::to_little_endian(data) & repeat_byte(0x80)};
   }
+
+  LIBC_INLINE IteratableBitMask occupied() const {
+    return {static_cast<bitmask_t>(mask_available().word ^ repeat_byte(0x80))};
+  }
 };
 } // namespace internal
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/__support/HashTable/sse2/bitmask_impl.inc b/libc/src/__support/HashTable/sse2/bitmask_impl.inc
index 6308f2fed6661c2..de07d22f46cb6af 100644
--- a/libc/src/__support/HashTable/sse2/bitmask_impl.inc
+++ b/libc/src/__support/HashTable/sse2/bitmask_impl.inc
@@ -12,7 +12,7 @@ namespace internal {
 // With SSE2, every bitmask is iteratable as
 // we use single bit to encode the data.
 
-using BitMask = BitMaskAdaptor<uint16_t, 0xffffu, 0x1u>;
+using BitMask = BitMaskAdaptor<uint16_t, 0x1u>;
 using IteratableBitMask = IteratableBitMaskAdaptor<BitMask>;
 
 struct Group {
@@ -23,6 +23,11 @@ struct Group {
     return {_mm_loadu_si128(static_cast<const __m128i *>(addr))};
   }
 
+  // Load a group of control words from an arbitary address.
+  LIBC_INLINE static Group load_aligned(const void *__restrict addr) {
+    return {_mm_load_si128(static_cast<const __m128i *>(addr))};
+  }
+
   // Find out the lanes equal to the given byte and return the bitmask
   // with corresponding bits set.
   LIBC_INLINE IteratableBitMask match_byte(uint8_t byte) const {
@@ -35,6 +40,10 @@ struct Group {
     auto bitmask = static_cast<uint16_t>(_mm_movemask_epi8(data));
     return {bitmask};
   }
+
+  LIBC_INLINE IteratableBitMask occupied() const {
+    return {static_cast<uint16_t>(~mask_available().word)};
+  }
 };
 } // namespace internal
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/__support/HashTable/table.h b/libc/src/__support/HashTable/table.h
index ec0ec78869ad582..9a7f185e227d568 100644
--- a/libc/src/__support/HashTable/table.h
+++ b/libc/src/__support/HashTable/table.h
@@ -79,7 +79,7 @@ LIBC_INLINE size_t capacity_to_entries(size_t cap) {
 //             |   N * Entry         |
 //             ======================= <- align boundary
 //             |   Header            |
-//             =======================
+//             ======================= <- align boundary (for fast resize)
 //             |   (N + 1) * Byte    |
 //             =======================
 //
@@ -94,6 +94,20 @@ struct HashTable {
   // How many entries are there in the table.
   LIBC_INLINE size_t num_of_entries() const { return entries_mask + 1; }
 
+  // How many entries can we store in the table before resizing.
+  LIBC_INLINE size_t full_capacity() const { return num_of_entries() / 8 * 7; }
+
+  // The alignment of the whole memory area is the maximum of the alignment
+  // among the following types:
+  // - HashTable
+  // - ENTRY
+  // - Group
+  LIBC_INLINE constexpr static size_t table_alignment() {
+    size_t left_align = alignof(HashTable) > alignof(ENTRY) ? alignof(HashTable)
+                                                            : alignof(ENTRY);
+    return left_align > alignof(Group) ? left_align : alignof(Group);
+  }
+
   LIBC_INLINE bool is_full() const { return available_slots == 0; }
 
   LIBC_INLINE size_t offset_from_entries() const {
@@ -101,24 +115,30 @@ struct HashTable {
     return entries_size + offset_to(entries_size, table_alignment());
   }
 
-  LIBC_INLINE constexpr static size_t table_alignment() {
-    return alignof(HashTable) > alignof(ENTRY) ? alignof(HashTable)
-                                               : alignof(ENTRY);
-  }
-
   LIBC_INLINE constexpr static size_t offset_to_groups() {
-    return sizeof(HashTable);
+    size_t header_size = sizeof(HashTable);
+    return header_size + offset_to(header_size, table_alignment());
   }
 
   LIBC_INLINE ENTRY &entry(size_t i) {
     return reinterpret_cast<ENTRY *>(this)[-i - 1];
   }
 
+  LIBC_INLINE const ENTRY &entry(size_t i) const {
+    return reinterpret_cast<const ENTRY *>(this)[-i - 1];
+  }
+
   LIBC_INLINE uint8_t &control(size_t i) {
     uint8_t *ptr = reinterpret_cast<uint8_t *>(this) + offset_to_groups();
     return ptr[i];
   }
 
+  LIBC_INLINE const uint8_t &control(size_t i) const {
+    const uint8_t *ptr =
+        reinterpret_cast<const uint8_t *>(this) + offset_to_groups();
+    return ptr[i];
+  }
+
   // We duplicate a group of control bytes to the end. Thus, it is possible that
   // we need to set two control bytes at the same time.
   LIBC_INLINE void set_ctrl(size_t index, uint8_t value) {
@@ -127,6 +147,109 @@ struct HashTable {
     control(index2) = value;
   }
 
+  LIBC_INLINE size_t find(const char *key, uint64_t primary) {
+    uint8_t secondary = secondary_hash(primary);
+    ProbeSequence sequence{static_cast<size_t>(primary), 0, entries_mask};
+    while (true) {
+      size_t pos = sequence.next();
+      Group ctrls = Group::load(&control(pos));
+      IteratableBitMask masks = ctrls.match_byte(secondary);
+      for (size_t i : masks) {
+        size_t index = (pos + i) & entries_mask;
+        ENTRY &entry = this->entry(index);
+        if (LIBC_LIKELY(entry.key != nullptr && strcmp(entry.key, key) == 0))
+          return index;
+      }
+      BitMask available = ctrls.mask_available();
+      // Since there is no deletion, the first time we find an available slot
+      // it is also ready to be used as an insertion point. Therefore, we also
+      // return the first available slot we find. If such entry is empty, the
+      // key will be nullptr.
+      if (LIBC_LIKELY(available.any_bit_set())) {
+        size_t index =
+            (pos + available.lowest_set_bit_nonzero()) & entries_mask;
+        return index;
+      }
+    }
+  }
+
+  LIBC_INLINE uint64_t oneshot_hash(const char *key) const {
+    LIBC_NAMESPACE::internal::HashState hasher = state;
+    hasher.update(key, strlen(key));
+    return hasher.finish();
+  }
+
+  // A fast insertion routine without checking if a key already exists.
+  // Nor does the routine check if the table is full.
+  // This is only to be used in grow() where we insert all existing entries
+  // into a new table. Hence, the requirements are naturally satisfied.
+  LIBC_INLINE void unsafe_insert(ENTRY item) {
+    uint64_t primary = oneshot_hash(item.key);
+    uint8_t secondary = secondary_hash(primary);
+    ProbeSequence sequence{static_cast<size_t>(primary), 0, entries_mask};
+    while (true) {
+      size_t pos = sequence.next();
+      Group ctrls = Group::load(&control(pos));
+      BitMask available = ctrls.mask_available();
+      if (available.any_bit_set()) {
+        size_t index =
+            (pos + available.lowest_set_bit_nonzero()) & entries_mask;
+        set_ctrl(index, secondary);
+        entry(index).key = item.key;
+        entry(index).data = item.data;
+        available_slots--;
+        return;
+      }
+    }
+  }
+
+  LIBC_INLINE HashTable *grow() const {
+    size_t hint = full_capacity() + 1;
+    HashState state = this->state;
+    // migrate to a new random state
+    state.update(&hint, sizeof(hint));
+    HashTable *new_table = allocate(hint, state.finish());
+    // It is safe to call unsafe_insert() because we know that:
+    // - the new table has enough capacity to hold all the entries
+    // - there is no duplicate key in the old table
+    if (new_table != nullptr)
+      for (ENTRY e : *this)
+        new_table->unsafe_insert(e);
+    return new_table;
+  }
+
+  LIBC_INLINE static ENTRY *insert(HashTable *&table, ENTRY item,
+                                   uint64_t primary) {
+    auto index = table->find(item.key, primary);
+    auto slot = &table->entry(index);
+    // SVr4 and POSIX.1-2001 specify that action is significant only for
+    // unsuccessful searches, so that an ENTER should not do anything
+    // for a successful search.
+    if (slot->key != nullptr)
+      return slot;
+
+    // if table of full, we try to grow the table
+    if (table->is_full()) {
+      HashTable *new_table = table->grow();
+      // allocation failed, return nullptr to indicate failure
+      if (new_table == nullptr)
+        return nullptr;
+      // resized sccuessfully: clean up the old table and use the new one
+      deallocate(table);
+      table = new_table;
+      // hash also need to be recomputed since the hash state is updated
+      primary = table->oneshot_hash(item.key);
+      index = table->find(item.key, primary);
+      slot = &table->entry(index);
+    }
+
+    table->set_ctrl(index, secondary_hash(primary));
+    slot->key = item.key;
+    slot->data = item.data;
+    table->available_slots--;
+    return slot;
+  }
+
 public:
   LIBC_INLINE static void deallocate(HashTable *table) {
     if (table) {
@@ -135,6 +258,7 @@ struct HashTable {
       operator delete(ptr, std::align_val_t{table_alignment()});
     }
   }
+
   LIBC_INLINE static HashTable *allocate(size_t capacity, uint64_t randomness) {
     // check if capacity_to_entries overflows MAX_MEM_SIZE
     if (capacity > size_t{1} << (8 * sizeof(size_t) - 1 - 3))
@@ -165,54 +289,54 @@ struct HashTable {
     return table;
   }
 
-private:
-  LIBC_INLINE size_t find(const char *key, uint64_t primary) {
-    uint8_t secondary = secondary_hash(primary);
-    ProbeSequence sequence{static_cast<size_t>(primary), 0, entries_mask};
-    while (true) {
-      size_t pos = sequence.next();
-      Group ctrls = Group::load(&control(pos));
-      IteratableBitMask masks = ctrls.match_byte(secondary);
-      for (size_t i : masks) {
-        size_t index = (pos + i) & entries_mask;
-        ENTRY &entry = this->entry(index);
-        if (LIBC_LIKELY(entry.key != nullptr && strcmp(entry.key, key) == 0))
-          return index;
-      }
-      BitMask available = ctrls.mask_available();
-      // Since there is no deletion, the first time we find an available slot
-      // it is also ready to be used as an insertion point. Therefore, we also
-      // return the first available slot we find. If such entry is empty, the
-      // key will be nullptr.
-      if (LIBC_LIKELY(available.any_bit_set())) {
-        size_t index =
-            (pos + available.lowest_set_bit_nonzero()) & entries_mask;
-        return index;
-      }
+  struct FullTableIterator {
+    size_t current_offset;
+    size_t remaining;
+    IteratableBitMask current_mask;
+    const HashTable &table;
+
+    // It is fine to use remaining to represent the iterator:
+    // - this comparison only happens with the same table
+    // - hashtable will not be mutated during the iteration
+    LIBC_INLINE bool operator==(const FullTableIterator &other) const {
+      return remaining == other.remaining;
+    }
+    LIBC_INLINE bool operator!=(const FullTableIterator &other) const {
+      return remaining != other.remaining;
     }
-  }
 
-private:
-  LIBC_INLINE ENTRY *insert(ENTRY item, uint64_t primary) {
-    auto index = find(item.key, primary);
-    auto slot = &this->entry(index);
-    // SVr4 and POSIX.1-2001 specify that action is significant only for
-    // unsuccessful searches, so that an ENTER should not do anything
-    // for a successful search.
-    if (slot->key != nullptr)
-      return slot;
+    LIBC_INLINE FullTableIterator &operator++() {
+      this->ensure_valid_group();
+      current_mask.remove_lowest_bit();
+      remaining--;
+      return *this;
+    }
+    LIBC_INLINE const ENTRY &operator*() {
+      this->ensure_valid_group();
+      return table.entry(
+          (current_offset + current_mask.lowest_set_bit_nonzero()) &
+          table.entries_mask);
+    }
 
-    if (!is_full()) {
-      set_ctrl(index, secondary_hash(primary));
-      slot->key = item.key;
-      slot->data = item.data;
-      available_slots--;
-      return slot;
+  private:
+    LIBC_INLINE void ensure_valid_group() {
+      while (!current_mask.any_bit_set()) {
+        current_offset += sizeof(Group);
+        // It is ensured that the load will only happen at aligned boundaries.
+        current_mask =
+            Group::load_aligned(&table.control(current_offset)).occupied();
+      }
     }
-    return nullptr;
+  };
+
+  using value_type = ENTRY;
+  using iterator = FullTableIterator;
+  iterator begin() const {
+    return {0, full_capacity() - available_slots,
+            Group::load_aligned(&control(0)).occupied(), *this};
   }
+  iterator end() const { return {0, 0, {0}, *this}; }
 
-public:
   LIBC_INLINE ENTRY *find(const char *key) {
     LIBC_NAMESPACE::internal::HashState hasher = state;
     hasher.update(key, strlen(key));
@@ -222,11 +346,12 @@ struct HashTable {
       return nullptr;
     return &entry;
   }
-  LIBC_INLINE ENTRY *insert(ENTRY item) {
-    LIBC_NAMESPACE::internal::HashState hasher = state;
+
+  LIBC_INLINE static ENTRY *insert(HashTable *&table, ENTRY item) {
+    LIBC_NAMESPACE::internal::HashState hasher = table->state;
     hasher.update(item.key, strlen(item.key));
     uint64_t primary = hasher.finish();
-    return insert(item, primary);
+    return insert(table, item, primary);
   }
 };
 } // namespace internal
diff --git a/libc/src/search/CMakeLists.txt b/libc/src/search/CMakeLists.txt
index 4ae5274a3ba9811..29cbe41c2565d2a 100644
--- a/libc/src/search/CMakeLists.txt
+++ b/libc/src/search/CMakeLists.txt
@@ -36,6 +36,7 @@ add_entrypoint_object(
   DEPENDS
     libc.src.search.hsearch.global
     libc.src.__support.HashTable.table
+    libc.src.__support.HashTable.randomness
     libc.src.__support.libc_assert
     libc.src.errno.errno
     libc.include.search
diff --git a/libc/src/search/hcreate.cpp b/libc/src/search/hcreate.cpp
index 9c05e317a2d05f3..8d638a2c4aa13db 100644
--- a/libc/src/search/hcreate.cpp
+++ b/libc/src/search/hcreate.cpp
@@ -14,6 +14,12 @@
 
 namespace LIBC_NAMESPACE {
 LLVM_LIBC_FUNCTION(int, hcreate, (size_t capacity)) {
+  // We follow FreeBSD's implementation here. If the global_hash_table is
+  // already initialized, this function will do nothing and return 1.
+  // https://android.googlesource.com/platform/bionic/+/refs/heads/main/libc/upstream-freebsd/lib/libc/stdlib/hcreate.c
+  if (internal::global_hash_table != nullptr)
+    return 1;
+
   uint64_t randomness = internal::randomness::next_random_seed();
   internal::HashTable *table =
       internal::HashTable::allocate(capacity, randomness);
diff --git a/libc/src/search/hdestroy.cpp b/libc/src/search/hdestroy.cpp
index 1af64f195e326e3..7446bd627af77a3 100644
--- a/libc/src/search/hdestroy.cpp
+++ b/libc/src/search/hdestroy.cpp
@@ -13,7 +13,8 @@
 
 namespace LIBC_NAMESPACE {
 LLVM_LIBC_FUNCTION(void, hdestroy, (void)) {
-  LIBC_ASSERT(internal::global_hash_table != nullptr);
+  // HashTable::deallocate will check for nullptr. It will be a no-op if
+  // global_hash_table is null.
   internal::HashTable::deallocate(internal::global_hash_table);
   internal::global_hash_table = nullptr;
 }
diff --git a/libc/src/search/hsearch.cpp b/libc/src/search/hsearch.cpp
index 3a0d09aae835b06..4236648f5325a8c 100644
--- a/libc/src/search/hsearch.cpp
+++ b/libc/src/search/hsearch.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/search/hsearch.h"
+#include "src/__support/HashTable/randomness.h"
 #include "src/__support/HashTable/table.h"
 #include "src/__support/libc_assert.h"
 #include "src/errno/libc_errno.h"
@@ -15,16 +16,29 @@
 namespace LIBC_NAMESPACE {
 LLVM_LIBC_FUNCTION(ENTRY *, hsearch, (ENTRY item, ACTION action)) {
   ENTRY *result;
-  LIBC_ASSERT(internal::global_hash_table != nullptr);
+  if (internal::global_hash_table == nullptr) {
+    // If global_hash_table is null, we create a new hash table with a minimal
+    // capacity. Such hashtable will be expanded as needed.
+    uint64_t randomness = internal::randomness::next_random_seed();
+    internal::global_hash_table = internal::HashTable::allocate(0, randomness);
+  }
+
+  // In rare cases, the global hashtable may still fail to allocate. We treat it
+  // as ESRCH or ENOMEM depending on the action.
   switch (action) {
   case FIND:
-    result = internal::global_hash_table->find(item.key);
+    result = internal::global_hash_table
+                 ? internal::global_hash_table->find(item.key)
+                 : nullptr;
     if (result == nullptr) {
       libc_errno = ESRCH;
     }
     break;
   case ENTER:
-    result = internal::global_hash_table->insert(item);
+    result =
+        internal::global_hash_table
+            ? internal::HashTable::insert(internal::global_hash_table, item)
+            : nullptr;
     if (result == nullptr) {
       libc_errno = ENOMEM;
     }
diff --git a/libc/src/search/hsearch_r.cpp b/libc/src/search/hsearch_r.cpp
index 958fba7c00d0d46..a2c3a86eded6e21 100644
--- a/libc/src/search/hsearch_r.cpp
+++ b/libc/src/search/hsearch_r.cpp
@@ -29,7 +29,8 @@ LLVM_LIBC_FUNCTION(int, hsearch_r,
     }
     break;
   case ENTER:
-    *retval = table->insert(item);
+    *retval = internal::HashTable::insert(table, item);
+    htab->__opaque = table;
     if (*retval == nullptr) {
       libc_errno = ENOMEM;
       return 0;
diff --git a/libc/test/src/__support/HashTable/table_test.cpp b/libc/test/src/__support/HashTable/table_test.cpp
index f0aa82f2d5c768b..b76f7a7a1231441 100644
--- a/libc/test/src/__support/HashTable/table_test.cpp
+++ b/libc/test/src/__support/HashTable/table_test.cpp
@@ -29,13 +29,63 @@ TEST(LlvmLibcTableTest, AllocationAndDeallocation) {
   HashTable::deallocate(nullptr);
 }
 
+TEST(LlvmLibcTableTest, Iteration) {
+  constexpr size_t TEST_SIZE = 512;
+  size_t counter[TEST_SIZE];
+  struct key {
+    uint8_t bytes[3];
+  } keys[TEST_SIZE];
+  HashTable *table = HashTable::allocate(0, 0x7f7f7f7f7f7f7f7f);
+  ASSERT_NE(table, static_cast<HashTable *>(nullptr));
+  for (size_t i = 0; i < TEST_SIZE; ++i) {
+    counter[i] = 0;
+    if (i >= 256) {
+      keys[i].bytes[0] = 2;
+      keys[i].bytes[1] = i % 256;
+      keys[i].bytes[2] = 0;
+    } else {
+      keys[i].bytes[0] = 1;
+      keys[i].bytes[1] = i;
+      keys[i].bytes[2] = 0;
+    }
+    HashTable::insert(table, {reinterpret_cast<char *>(keys[i].bytes),
+                              reinterpret_cast<void *>((size_t)i)});
+  }
+
+  size_t count = 0;
+  for (const ENTRY &e : *table) {
+    size_t data = reinterpret_cast<size_t>(e.data);
+    ++counter[data];
+    ++count;
+  }
+  ASSERT_EQ(count, TEST_SIZE);
+  for (size_t i = 0; i < TEST_SIZE; ++i) {
+    ASSERT_EQ(counter[i], static_cast<size_t>(1));
+  }
+  HashTable::deallocate(table);
+}
+
+// Check if resize works correctly. This test actually covers two things:
+// - The sizes are indeed growing.
+// - The sizes are growing rapidly enough to reach the upper bound.
+TEST(LlvmLibcTableTest, GrowthSequence) {
+  size_t cap = capacity_to_entries(0);
+  // right shift 4 to avoid overflow ssize_t.
+  while (cap < static_cast<size_t>(-1) >> 4u) {
+    size_t hint = cap / 8 * 7 + 1;
+    size_t new_cap = capacity_to_entries(hint);
+    ASSERT_GT(new_cap, cap);
+    cap = new_cap;
+  }
+}
+
 TEST(LlvmLibcTableTest, Insertion) {
   union key {
-    uint64_t value;
-    char bytes[8];
+    char bytes[2];
   } keys[256];
   for (size_t k = 0; k < 256; ++k) {
-    keys[k].value = LIBC_NAMESPACE::Endian::to_little_endian(k);
+    keys[k].bytes[0] = static_cast<char>(k);
+    keys[k].bytes[1] = 0;
   }
   constexpr size_t CAP = next_power_of_two((sizeof(Group) + 1) * 8 / 7) / 8 * 7;
   static_assert(CAP + 1 < 256, "CAP is too large for this test.");
@@ -45,27 +95,28 @@ TEST(LlvmLibcTableTest, Insertion) {
 
   // insert to full capacity.
   for (size_t i = 0; i < CAP; ++i) {
-    ASSERT_NE(table->insert({keys[i].bytes, keys[i].bytes}),
+    ASSERT_NE(HashTable::insert(table, {keys[i].bytes, keys[i].bytes}),
               static_cast<ENTRY *>(nullptr));
   }
 
-  // one more insert should fail.
-  ASSERT_EQ(table->insert({keys[CAP + 1].bytes, keys[CAP + 1].bytes}),
+  // one more insert should grow successfully.
+  ASSERT_NE(HashTable::insert(table, {keys[CAP].bytes, keys[CAP].bytes}),
             static_cast<ENTRY *>(nullptr));
 
-  for (size_t i = 0; i < CAP; ++i) {
+  for (size_t i = 0; i <= CAP; ++i) {
     ASSERT_EQ(strcmp(table->find(keys[i].bytes)->key, keys[i].bytes), 0);
   }
-  for (size_t i = CAP; i < 256; ++i) {
+  for (size_t i = CAP + 1; i < 256; ++i) {
     ASSERT_EQ(table->find(keys[i].bytes), static_cast<ENTRY *>(nullptr));
   }
 
   // do not replace old value
-  for (size_t i = 0; i < CAP; ++i) {
-    ASSERT_NE(table->insert({keys[i].bytes, reinterpret_cast<void *>(i)}),
-              static_cast<ENTRY *>(nullptr));
+  for (size_t i = 0; i <= CAP; ++i) {
+    ASSERT_NE(
+        HashTable::insert(table, {keys[i].bytes, reinterpret_cast<void *>(i)}),
+        static_cast<ENTRY *>(nullptr));
   }
-  for (size_t i = 0; i < CAP; ++i) {
+  for (size_t i = 0; i <= CAP; ++i) {
     ASSERT_EQ(table->find(keys[i].bytes)->data,
               reinterpret_cast<void *>(keys[i].bytes));
   }
diff --git a/libc/test/src/search/hsearch_test.cpp b/libc/test/src/search/hsearch_test.cpp
index bc9dea748758ac0..fcd6660220e0833 100644
--- a/libc/test/src/search/hsearch_test.cpp
+++ b/libc/test/src/search/hsearch_test.cpp
@@ -51,17 +51,16 @@ constexpr size_t CAP =
     LIBC_NAMESPACE::next_power_of_two((GROUP_SIZE + 1) * 8 / 7) / 8 * 7;
 static_assert(CAP < sizeof(search_data), "CAP too large");
 
-TEST(LlvmLibcHSearchTest, InsertTooMany) {
+TEST(LlvmLibcHSearchTest, GrowFromZero) {
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
-  ASSERT_GT(LIBC_NAMESPACE::hcreate(GROUP_SIZE + 1), 0);
-
-  for (size_t i = 0; i < CAP; ++i) {
-    ASSERT_EQ(LIBC_NAMESPACE::hsearch({&search_data[i], nullptr}, ENTER)->key,
-              &search_data[i]);
+  ASSERT_GT(LIBC_NAMESPACE::hcreate(0), 0);
+  for (size_t i = 0; i < sizeof(search_data) - 1; ++i) {
+    ENTRY *inserted = LIBC_NAMESPACE::hsearch(
+        {&search_data[i], reinterpret_cast<void *>(i)}, ENTER);
+    ASSERT_NE(inserted, static_cast<ENTRY *>(nullptr));
+    ASSERT_EQ(inserted->key, &search_data[i]);
   }
-  ASSERT_THAT(static_cast<void *>(
-                  LIBC_NAMESPACE::hsearch({search_data2, nullptr}, ENTER)),
-              Fails(ENOMEM, static_cast<void *>(nullptr)));
+
   LIBC_NAMESPACE::hdestroy();
 }
 
@@ -85,10 +84,10 @@ TEST(LlvmLibcHSearchTest, Found) {
   using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails;
   ASSERT_GT(LIBC_NAMESPACE::hcreate(GROUP_SIZE + 1), 0);
   for (size_t i = 0; i < CAP; ++i) {
-    ASSERT_EQ(LIBC_NAMESPACE::hsearch(
-                  {&search_data[i], reinterpret_cast<void *>(i)}, ENTER)
-                  ->key,
-              &search_data[i]);
+    ENTRY *inserted = LIBC_NAMESPACE::hsearch(
+        {&search_data[i], reinterpret_cast<void *>(i)}, ENTER);
+    ASSERT_NE(inserted, static_cast<ENTRY *>(nullptr));
+    ASSERT_EQ(inserted->key, &search_data[i]);
   }
   for (size_t i = 0; i < CAP; ++i) {
     ASSERT_EQ(LIBC_NAMESPACE::hsearch({&search_data[i], nullptr}, FIND)->data,

>From c7ed1bc3e5fe9b6e55f962cb70f598dd4e31f7e1 Mon Sep 17 00:00:00 2001
From: Schrodinger ZHU Yifan <yifanzhu at rochester.edu>
Date: Wed, 29 Nov 2023 23:45:42 -0500
Subject: [PATCH 2/2] fix build errors

---
 libc/src/__support/HashTable/table.h               | 8 ++------
 libc/test/src/__support/HashTable/bitmask_test.cpp | 6 +++---
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/libc/src/__support/HashTable/table.h b/libc/src/__support/HashTable/table.h
index 9a7f185e227d568..706e1b0bdbf5dd7 100644
--- a/libc/src/__support/HashTable/table.h
+++ b/libc/src/__support/HashTable/table.h
@@ -338,9 +338,7 @@ struct HashTable {
   iterator end() const { return {0, 0, {0}, *this}; }
 
   LIBC_INLINE ENTRY *find(const char *key) {
-    LIBC_NAMESPACE::internal::HashState hasher = state;
-    hasher.update(key, strlen(key));
-    uint64_t primary = hasher.finish();
+    uint64_t primary = oneshot_hash(key);
     ENTRY &entry = this->entry(find(key, primary));
     if (entry.key == nullptr)
       return nullptr;
@@ -348,9 +346,7 @@ struct HashTable {
   }
 
   LIBC_INLINE static ENTRY *insert(HashTable *&table, ENTRY item) {
-    LIBC_NAMESPACE::internal::HashState hasher = table->state;
-    hasher.update(item.key, strlen(item.key));
-    uint64_t primary = hasher.finish();
+    uint64_t primary = table->oneshot_hash(item.key);
     return insert(table, item, primary);
   }
 };
diff --git a/libc/test/src/__support/HashTable/bitmask_test.cpp b/libc/test/src/__support/HashTable/bitmask_test.cpp
index c816c5d10638897..5203220e9b5cfa8 100644
--- a/libc/test/src/__support/HashTable/bitmask_test.cpp
+++ b/libc/test/src/__support/HashTable/bitmask_test.cpp
@@ -11,8 +11,8 @@
 namespace LIBC_NAMESPACE {
 namespace internal {
 
-using ShortBitMask = BitMaskAdaptor<uint16_t, 0xffff, 1>;
-using LargeBitMask = BitMaskAdaptor<uint64_t, 0x80'80'80'80'80'80'80'80, 8>;
+using ShortBitMask = BitMaskAdaptor<uint16_t, 1>;
+using LargeBitMask = BitMaskAdaptor<uint64_t, 8>;
 
 TEST(LlvmLibcHashTableBitMaskTest, SingleBitStrideLowestSetBit) {
   uint16_t data = 0xffff;
@@ -53,7 +53,7 @@ TEST(LlvmLibcHashTableBitMaskTest, SingleBitStrideIteration) {
 
 TEST(LlvmLibcHashTableBitMaskTest, MultiBitStrideIteration) {
   using Iter = IteratableBitMaskAdaptor<LargeBitMask>;
-  uint64_t data = Iter::MASK;
+  uint64_t data = 0x8080808080808080ul;
   for (size_t i = 0; i < 8; ++i) {
     Iter iter = {data};
     size_t j = i;



More information about the libc-commits mailing list