[libc-commits] [libc] [libc] Add regex AST and ExprPool (PR #198728)

Alexey Samsonov via libc-commits libc-commits at lists.llvm.org
Thu Jun 18 22:22:35 PDT 2026


================
@@ -0,0 +1,163 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Pool for Regular Expression AST nodes (Implementation).
+///
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/regex/regex_expr_pool.h"
+#include "hdr/regex_macros.h"
+#include "src/__support/CPP/new.h"
+#include "src/__support/alloc-checker.h"
+#include "src/__support/hash.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/null_check.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace {
+
+// Produce the structural hash of an Expr node, used as the hash-consing key.
+//
+// The node kind is always folded in first. Literal nodes then contribute their
+// character; Concat and Alt nodes contribute the pointer values of both
+// children. Every other kind is hashed by its kind alone.
+uint64_t hash_expr(const Expr &e) {
+  // A fixed seed keeps the result deterministic, which hash-consing relies
+  // on. The value 0x12345678 itself is an arbitrary placeholder: HashState
+  // mixes it with high-entropy constants (derived from aHash) right away.
+  internal::HashState state(0x12345678);
+
+  const uint64_t kind_tag = static_cast<uint64_t>(e.kind);
+  state.update(&kind_tag, sizeof(kind_tag));
+
+  if (e.kind == ExprKind::Literal) {
+    state.update(&e.ch, sizeof(e.ch));
+  } else if (e.kind == ExprKind::Concat || e.kind == ExprKind::Alt) {
+    // Children are identified by address, not by content.
+    state.update(&e.bin.left, sizeof(e.bin.left));
+    state.update(&e.bin.right, sizeof(e.bin.right));
+  }
+
+  return state.finish();
+}
+
+} // namespace
+
+ExprPool::Block::Block() : next(nullptr), used(0) {}
+
+// Construct an empty pool: no arena blocks yet, a node count of zero, and a
+// freshly allocated hash table of HASH_TABLE_SIZE slots, all cleared to
+// nullptr. If the table allocation fails, the slots are left untouched;
+// intern() checks `hashtable` before using it.
+ExprPool::ExprPool() : head(nullptr), current(nullptr), node_count(0) {
+  AllocChecker checker;
+  hashtable = new (checker) Expr *[HASH_TABLE_SIZE];
+  if (!checker)
+    return;
+
+  // Mark every bucket as vacant.
+  for (Expr **slot = hashtable; slot != hashtable + HASH_TABLE_SIZE; ++slot)
+    *slot = nullptr;
+}
+
+// Release the hash table (if one was allocated) and then every arena block,
+// following the `next` links starting from `head`.
+ExprPool::~ExprPool() {
+  if (hashtable)
+    delete[] hashtable;
+
+  for (Block *blk = head; blk != nullptr;) {
+    // Read the successor before the current block is destroyed.
+    Block *following = blk->next;
+    delete blk;
+    blk = following;
+  }
+}
+
+cpp::expected<Expr *, int> ExprPool::intern(const Expr &e) {
+  if (!hashtable)
+    return cpp::unexpected(REG_ESPACE);
+
+  // 1. Calculate the initial bucket for the given structural definition.
+  uint64_t h = hash_expr(e);
+  size_t idx = h % HASH_TABLE_SIZE;
+
+  // 2. Linear Probing: Search for an existing node with identical content.
+  //    Because pointers are unique, O(1) comparison is guaranteed if
+  //    sub-expressions are already interned.
+  size_t start_idx = idx;
+  while (hashtable[idx]) {
+    if (*hashtable[idx] == e)
+      return hashtable[idx];
+    idx = (idx + 1) % HASH_TABLE_SIZE;
+    if (idx == start_idx) {
+      // Table full (invariant check: HASH_TABLE_SIZE >> MAX_NODE_LIMIT)
+      return cpp::unexpected(REG_ESPACE);
+    }
+  }
+
+  // 3. Admission Control: Check the hard limit on AST nodes.
+  if (node_count >= MAX_NODE_LIMIT)
+    return cpp::unexpected(REG_ESPACE);
+
+  // 4. Arena Allocation: If no matching node found, allocate a stable slot.
+  if (!current || current->used == Block::BLOCK_SIZE) {
+    // New blocks are allocated on demand using AllocChecker.
+    AllocChecker ac;
+    Block *new_block = new (ac) Block();
+    if (!ac)
+      return cpp::unexpected(REG_ESPACE);
+    if (!head)
+      head = new_block;
+    if (current)
+      current->next = new_block;
+    current = new_block;
+  }
+
+  // 5. Node Initialisation: Copy the structural definition into the arena.
+  Expr *new_node = &current->nodes[current->used];
+  ++current->used;
+  LIBC_CRASH_ON_NULLPTR(new_node);
----------------
vonosmas wrote:

This check is confusing / extraneous? If `current` was `nullptr` and you failed to allocate it, you'll already have failed with `REG_ESPACE` above. But, more importantly, you already write to the `current` block in the previous line, so even if a null dereference were possible, it would've happened before this check. I don't see why you can't have:

https://github.com/llvm/llvm-project/pull/198728


More information about the libc-commits mailing list