[libc-commits] [libc] [libc] Add regex AST and ExprPool (PR #198728)
Alexey Samsonov via libc-commits
libc-commits at lists.llvm.org
Thu Jun 18 22:22:35 PDT 2026
================
@@ -0,0 +1,163 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Pool for Regular Expression AST nodes (Implementation).
+///
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/regex/regex_expr_pool.h"
+#include "hdr/regex_macros.h"
+#include "src/__support/CPP/new.h"
+#include "src/__support/alloc-checker.h"
+#include "src/__support/hash.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/null_check.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace {
+
+// Structural hash of an Expr, used as the bucket key for hash-consing.
+//
+// The node kind is always mixed in. Literal nodes additionally contribute
+// their character, and Concat/Alt nodes contribute the pointer values of both
+// operands. Every other kind is hashed by its kind alone.
+uint64_t hash_expr(const Expr &e) {
+  // The seed (0x12345678) is an arbitrary fixed value: using a constant keeps
+  // the result deterministic, so equal definitions always probe from the same
+  // bucket.
+  internal::HashState state(0x12345678);
+
+  // Widen the kind to a fixed-size integer so a known number of bytes is fed
+  // to the hasher.
+  const uint64_t kind_bits = static_cast<uint64_t>(e.kind);
+  state.update(&kind_bits, sizeof(kind_bits));
+
+  if (e.kind == ExprKind::Literal) {
+    state.update(&e.ch, sizeof(e.ch));
+  } else if (e.kind == ExprKind::Concat || e.kind == ExprKind::Alt) {
+    // Operands are hashed by address, not by content.
+    state.update(&e.bin.left, sizeof(e.bin.left));
+    state.update(&e.bin.right, sizeof(e.bin.right));
+  }
+
+  return state.finish();
+}
+
+} // namespace
+
+ExprPool::Block::Block() : next(nullptr), used(0) {}
+
+// Creates an empty pool: no arena blocks, zero counted nodes, and a bucket
+// array of HASH_TABLE_SIZE entries that are all cleared to nullptr.
+ExprPool::ExprPool() : head(nullptr), current(nullptr), node_count(0) {
+  AllocChecker alloc_ok;
+  hashtable = new (alloc_ok) Expr *[HASH_TABLE_SIZE];
+
+  // On allocation failure there is nothing to clear. intern() tests hashtable
+  // for null before using it.
+  if (!alloc_ok)
+    return;
+
+  for (size_t slot = 0; slot < HASH_TABLE_SIZE; ++slot)
+    hashtable[slot] = nullptr;
+}
+
+// Frees the bucket array, then every arena block reachable from head.
+ExprPool::~ExprPool() {
+  // delete[] on a null pointer is a no-op, so no explicit guard is needed.
+  delete[] hashtable;
+
+  // Read each block's successor before freeing it; the link lives inside the
+  // block being destroyed.
+  for (Block *blk = head; blk != nullptr;) {
+    Block *following = blk->next;
+    delete blk;
+    blk = following;
+  }
+}
+
+cpp::expected<Expr *, int> ExprPool::intern(const Expr &e) {
+ if (!hashtable)
+ return cpp::unexpected(REG_ESPACE);
+
+ // 1. Calculate the initial bucket for the given structural definition.
+ uint64_t h = hash_expr(e);
+ size_t idx = h % HASH_TABLE_SIZE;
+
+ // 2. Linear Probing: Search for an existing node with identical content.
+ // Because pointers are unique, O(1) comparison is guaranteed if
+ // sub-expressions are already interned.
+ size_t start_idx = idx;
+ while (hashtable[idx]) {
+ if (*hashtable[idx] == e)
+ return hashtable[idx];
+ idx = (idx + 1) % HASH_TABLE_SIZE;
+ if (idx == start_idx) {
+ // Table full (invariant check: HASH_TABLE_SIZE >> MAX_NODE_LIMIT)
+ return cpp::unexpected(REG_ESPACE);
+ }
+ }
+
+ // 3. Admission Control: Check the hard limit on AST nodes.
+ if (node_count >= MAX_NODE_LIMIT)
+ return cpp::unexpected(REG_ESPACE);
+
+ // 4. Arena Allocation: If no matching node found, allocate a stable slot.
+ if (!current || current->used == Block::BLOCK_SIZE) {
+ // New blocks are allocated on demand using AllocChecker.
+ AllocChecker ac;
+ Block *new_block = new (ac) Block();
+ if (!ac)
+ return cpp::unexpected(REG_ESPACE);
+ if (!head)
+ head = new_block;
+ if (current)
+ current->next = new_block;
+ current = new_block;
+ }
+
+ // 5. Node Initialisation: Copy the structural definition into the arena.
+ Expr *new_node = &current->nodes[current->used];
+ ++current->used;
+ LIBC_CRASH_ON_NULLPTR(new_node);
----------------
vonosmas wrote:
This check is confusing / extraneous? If `current` was `nullptr` and allocating a new block failed, you would already have returned `REG_ESPACE` above. More importantly, you already write to the `current` block on the previous line, so even if a null dereference were possible, it would have happened before this check. I don't see why you can't have:
https://github.com/llvm/llvm-project/pull/198728
More information about the libc-commits mailing list