[libc-commits] [libc] [libc] Add regex AST and ExprPool (PR #198728)
Jeff Bailey via libc-commits
libc-commits at lists.llvm.org
Fri May 22 10:21:47 PDT 2026
https://github.com/kaladron updated https://github.com/llvm/llvm-project/pull/198728
>From 869445621a4b06b7acd899195fe2b21dc6d76770 Mon Sep 17 00:00:00 2001
From: Jeff Bailey <jbailey at raspberryginger.com>
Date: Fri, 22 May 2026 18:21:32 +0100
Subject: [PATCH] [libc] Add regex AST and ExprPool with staged testing
Implemented the core AST nodes and the ExprPool arena-based allocator.
Integrated internal unit tests for AST algebraic identities and pool
memory limits. Adopted the hdr/ proxy pattern for regex macros and
standardised file documentation.
Assisted-by: Automated tooling, human reviewed.
---
libc/hdr/CMakeLists.txt | 1 +
libc/hdr/regex_macros.h | 4 +
libc/src/__support/regex/CMakeLists.txt | 46 ++++++
libc/src/__support/regex/regex_ast.h | 82 ++++++++++
libc/src/__support/regex/regex_expr_pool.cpp | 151 +++++++++++++++++++
libc/src/__support/regex/regex_expr_pool.h | 116 ++++++++++++++
libc/test/src/__support/CMakeLists.txt | 1 +
libc/test/src/__support/regex/CMakeLists.txt | 10 ++
libc/test/src/__support/regex/expr_test.cpp | 88 +++++++++++
9 files changed, 499 insertions(+)
create mode 100644 libc/src/__support/regex/CMakeLists.txt
create mode 100644 libc/src/__support/regex/regex_ast.h
create mode 100644 libc/src/__support/regex/regex_expr_pool.cpp
create mode 100644 libc/src/__support/regex/regex_expr_pool.h
create mode 100644 libc/test/src/__support/regex/CMakeLists.txt
create mode 100644 libc/test/src/__support/regex/expr_test.cpp
diff --git a/libc/hdr/CMakeLists.txt b/libc/hdr/CMakeLists.txt
index 8da1e2a2a1872..eb79298ae84cf 100644
--- a/libc/hdr/CMakeLists.txt
+++ b/libc/hdr/CMakeLists.txt
@@ -382,6 +382,7 @@ add_proxy_header_library(
regex_macros.h
FULL_BUILD_DEPENDS
libc.include.llvm-libc-macros.regex_macros
+ libc.include.regex
)
add_subdirectory(types)
diff --git a/libc/hdr/regex_macros.h b/libc/hdr/regex_macros.h
index 74b5d4be20ff5..c188e0fa2588d 100644
--- a/libc/hdr/regex_macros.h
+++ b/libc/hdr/regex_macros.h
@@ -18,6 +18,10 @@
#include "include/llvm-libc-macros/regex-macros.h"
+#else // Overlay mode
+
+#include <regex.h>
+
#endif // LIBC_FULL_BUILD
#endif // LLVM_LIBC_HDR_REGEX_MACROS_H
diff --git a/libc/src/__support/regex/CMakeLists.txt b/libc/src/__support/regex/CMakeLists.txt
new file mode 100644
index 0000000000000..92a3bff52f5fd
--- /dev/null
+++ b/libc/src/__support/regex/CMakeLists.txt
@@ -0,0 +1,46 @@
+# Arena allocator and hash-consing table for regex AST nodes.
+add_object_library(
+  regex_expr_pool
+  SRCS
+    regex_expr_pool.cpp
+  HDRS
+    regex_expr_pool.h
+    regex_ast.h
+  DEPENDS
+    # One target per project header included by regex_expr_pool.{h,cpp}.
+    # NOTE(review): src/__support/alloc-checker.h is included as well; add its
+    # target here if it has a dedicated one.
+    libc.hdr.regex_macros
+    libc.src.__support.CPP.expected
+    libc.src.__support.CPP.new
+    libc.src.__support.hash
+    libc.src.__support.macros.config
+    libc.src.__support.macros.null_check
+)
+
+# Object library for the regex parser; depends on the regex_expr_pool target
+# from this directory.
+# NOTE(review): regex_parser.cpp and regex_parser.h are not among the files
+# created by this patch - confirm they exist or land together with it.
+add_object_library(
+ regex_parser
+ SRCS
+ regex_parser.cpp
+ HDRS
+ regex_parser.h
+ DEPENDS
+ .regex_expr_pool
+ libc.src.__support.CPP.expected
+ libc.src.__support.macros.config
+ libc.include.llvm-libc-macros.regex_macros
+)
+
+# Object library for the regex matcher; depends on the regex_expr_pool target
+# from this directory.
+# NOTE(review): regex_matcher.cpp and regex_matcher.h are not among the files
+# created by this patch - confirm they exist or land together with it.
+add_object_library(
+ regex_matcher
+ SRCS
+ regex_matcher.cpp
+ HDRS
+ regex_matcher.h
+ DEPENDS
+ .regex_expr_pool
+ libc.src.__support.macros.config
+ libc.include.llvm-libc-macros.regex_macros
+)
+
+# Header-only library exposing regex_internal.h; depends on the
+# regex_expr_pool target from this directory.
+# NOTE(review): regex_internal.h is not among the files created by this
+# patch - confirm it exists or lands together with it.
+add_header_library(
+ regex_internal
+ HDRS
+ regex_internal.h
+ DEPENDS
+ .regex_expr_pool
+ libc.src.__support.macros.config
+)
diff --git a/libc/src/__support/regex/regex_ast.h b/libc/src/__support/regex/regex_ast.h
new file mode 100644
index 0000000000000..db8485876ad64
--- /dev/null
+++ b/libc/src/__support/regex/regex_ast.h
@@ -0,0 +1,82 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// AST nodes for Regular Expressions (Class Definitions).
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_REGEX_REGEX_AST_H
+#define LLVM_LIBC_SRC___SUPPORT_REGEX_REGEX_AST_H
+
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+/// Discriminator identifying which form of regular expression an Expr holds.
+enum class ExprKind {
+  /// The empty language: an expression that matches no input at all.
+  EmptySet,
+  /// The expression that matches only the empty string.
+  EmptyStr,
+  /// A match of exactly one specific character.
+  Literal,
+  /// Sequencing: the left operand followed by the right operand.
+  Concat,
+  /// Choice: either the left operand or the right operand.
+  Alt,
+};
+
+/// A node in the Regular Expression Abstract Syntax Tree.
+///
+/// Expressions are represented as a hash-consed DAG to enable efficient
+/// derivative-based matching. This structure is intended to be managed by
+/// an ExprPool.
+///
+/// `kind` selects which union member is meaningful: `ch` for Literal nodes,
+/// `bin` for Concat and Alt nodes, and neither for EmptySet and EmptyStr.
+struct Expr {
+  /// The type of this expression node.
+  ExprKind kind;
+  union {
+    /// Character value for Literal nodes.
+    char ch;
+    /// Sub-expressions for Concat and Alt nodes.
+    struct {
+      Expr *left;
+      Expr *right;
+    } bin;
+  };
+
+  /// Default constructor creates an EmptySet node.
+  constexpr Expr() : kind(ExprKind::EmptySet), ch('\0') {}
+  /// Create a node of a specific kind with no data. Explicit so that an
+  /// ExprKind value never converts into a node implicitly.
+  constexpr explicit Expr(ExprKind k) : kind(k), ch('\0') {}
+  /// Create a Literal node. Explicit so that a plain char never converts
+  /// into a node implicitly (for example when compared against an Expr).
+  constexpr explicit Expr(char c) : kind(ExprKind::Literal), ch(c) {}
+  /// Create a binary node (Concat or Alt).
+  constexpr Expr(ExprKind k, Expr *l, Expr *r) : kind(k), bin{l, r} {}
+
+  /// Equivalence check for hash-consing.
+  ///
+  /// Only the payload selected by `kind` is compared. Binary nodes compare
+  /// their children by pointer, not by structure.
+  ///
+  /// \param other The node to compare against.
+  /// \returns true when both nodes have the same kind and the same payload.
+  bool operator==(const Expr &other) const {
+    if (kind != other.kind)
+      return false;
+    switch (kind) {
+    case ExprKind::EmptySet:
+    case ExprKind::EmptyStr:
+      return true;
+    case ExprKind::Literal:
+      return ch == other.ch;
+    case ExprKind::Concat:
+    case ExprKind::Alt:
+      return bin.left == other.bin.left && bin.right == other.bin.right;
+    }
+    // Only reachable if `kind` holds a value outside the enumeration.
+    return false;
+  }
+};
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_REGEX_REGEX_AST_H
diff --git a/libc/src/__support/regex/regex_expr_pool.cpp b/libc/src/__support/regex/regex_expr_pool.cpp
new file mode 100644
index 0000000000000..6ee315387788b
--- /dev/null
+++ b/libc/src/__support/regex/regex_expr_pool.cpp
@@ -0,0 +1,151 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Pool for Regular Expression AST nodes (Implementation).
+///
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/regex/regex_expr_pool.h"
+#include "hdr/regex_macros.h"
+#include "src/__support/CPP/new.h"
+#include "src/__support/alloc-checker.h"
+#include "src/__support/hash.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/null_check.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace {
+
+// Computes the hash-consing key for a node. The kind is always hashed,
+// followed only by the payload that is meaningful for that kind: the
+// character for a Literal, both child pointers for Concat and Alt.
+uint64_t hash_expr(const Expr &e) {
+  internal::HashState state(0x12345678);
+  uint64_t kind_bits = static_cast<uint64_t>(e.kind);
+  state.update(&kind_bits, sizeof(kind_bits));
+  if (e.kind == ExprKind::Literal) {
+    state.update(&e.ch, sizeof(e.ch));
+  } else if (e.kind == ExprKind::Concat || e.kind == ExprKind::Alt) {
+    state.update(&e.bin.left, sizeof(e.bin.left));
+    state.update(&e.bin.right, sizeof(e.bin.right));
+  }
+  return state.finish();
+}
+
+} // namespace
+
+ExprPool::Block::Block() : next(nullptr), used(0) {}
+
+ExprPool::ExprPool() : head(nullptr), current(nullptr), node_count(0) {
+ for (size_t i = 0; i < HASH_TABLE_SIZE; ++i)
+ hashtable[i] = nullptr;
+}
+
+// Releases every block in the chain, starting from the head.
+ExprPool::~ExprPool() {
+  // TODO: This manual traversal can be simplified once cpp::forward_list
+  // is available for block management.
+  for (Block *cursor = head; cursor != nullptr;) {
+    Block *doomed = cursor;
+    cursor = cursor->next; // Read the link before the block is destroyed.
+    delete doomed;
+  }
+}
+
+// Returns the unique pooled node equal to `e`, allocating it on first use.
+//
+// Fails with REG_ESPACE when the hash table has no free slot, when the pool
+// already holds MAX_NODE_LIMIT nodes, or when a new block cannot be allocated.
+cpp::expected<Expr *, int> ExprPool::intern(const Expr &e) {
+  // 1. Calculate the initial bucket for the given structural definition.
+  uint64_t h = hash_expr(e);
+  size_t idx = h & (HASH_TABLE_SIZE - 1);
+
+  // 2. Linear Probing: Search for an existing node with identical content.
+  //    Child nodes are compared by pointer, so each comparison is O(1)
+  //    provided the sub-expressions are already interned.
+  size_t start_idx = idx;
+  while (hashtable[idx]) {
+    if (*hashtable[idx] == e)
+      return hashtable[idx];
+    idx = (idx + 1) & (HASH_TABLE_SIZE - 1);
+    if (idx == start_idx) {
+      // Wrapped around without finding a free slot: the table is full. This
+      // cannot happen while MAX_NODE_LIMIT is smaller than HASH_TABLE_SIZE.
+      return cpp::unexpected(REG_ESPACE);
+    }
+  }
+
+  // 3. Admission Control: Check the hard limit on AST nodes.
+  if (node_count >= MAX_NODE_LIMIT)
+    return cpp::unexpected(REG_ESPACE);
+
+  // 4. Arena Allocation: If no matching node found, allocate a stable slot.
+  if (!current || current->used == Block::BLOCK_SIZE) {
+    // New blocks are allocated on demand using AllocChecker.
+    AllocChecker ac;
+    Block *new_block = new (ac) Block();
+    if (!ac)
+      return cpp::unexpected(REG_ESPACE);
+    if (!head)
+      head = new_block;
+    if (current)
+      current->next = new_block;
+    current = new_block;
+  }
+
+  // 5. Node Initialisation: Copy the structural definition into the arena.
+  //    `idx` still refers to the empty slot where probing stopped.
+  Expr *new_node = &current->nodes[current->used++];
+  LIBC_CRASH_ON_NULLPTR(new_node);
+  *new_node = e;
+  hashtable[idx] = new_node;
+  node_count++;
+  return new_node;
+}
+
+// Interns and returns the node that matches nothing.
+cpp::expected<Expr *, int> ExprPool::empty_set() {
+  const Expr proto(ExprKind::EmptySet);
+  return intern(proto);
+}
+// Interns and returns the node that matches only the empty string.
+cpp::expected<Expr *, int> ExprPool::empty_str() {
+  const Expr proto(ExprKind::EmptyStr);
+  return intern(proto);
+}
+// Interns and returns the Literal node for the character `c`.
+cpp::expected<Expr *, int> ExprPool::make_lit(char c) {
+  const Expr proto(c);
+  return intern(proto);
+}
+
+// Builds the concatenation `l · r`, simplifying where possible:
+//   Ø · R = R · Ø = Ø   (Ø annihilates concatenation)
+//   ε · R = R · ε = R   (ε is the identity for concatenation)
+// Returns REG_BADPAT if either operand is null.
+cpp::expected<Expr *, int> ExprPool::make_concat(Expr *l, Expr *r) {
+  if (l == nullptr || r == nullptr)
+    return cpp::unexpected(REG_BADPAT);
+
+  const bool left_is_empty_set = l->kind == ExprKind::EmptySet;
+  const bool right_is_empty_set = r->kind == ExprKind::EmptySet;
+  if (left_is_empty_set || right_is_empty_set)
+    return empty_set();
+
+  if (l->kind == ExprKind::EmptyStr)
+    return r;
+  if (r->kind == ExprKind::EmptyStr)
+    return l;
+
+  return intern(Expr(ExprKind::Concat, l, r));
+}
+// Builds the alternation `l | r`, simplifying where possible:
+//   Ø | R = R | Ø = R   (Ø is the identity for alternation)
+//   R | R = R           (idempotency, detected by pointer equality)
+// Returns REG_BADPAT if either operand is null.
+cpp::expected<Expr *, int> ExprPool::make_alt(Expr *l, Expr *r) {
+  if (l == nullptr || r == nullptr)
+    return cpp::unexpected(REG_BADPAT);
+
+  if (l->kind == ExprKind::EmptySet)
+    return r;
+  // Covers both `R | Ø` and `R | R`: the left operand is the result.
+  if (r->kind == ExprKind::EmptySet || l == r)
+    return l;
+
+  return intern(Expr(ExprKind::Alt, l, r));
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/__support/regex/regex_expr_pool.h b/libc/src/__support/regex/regex_expr_pool.h
new file mode 100644
index 0000000000000..8da207aa4e19e
--- /dev/null
+++ b/libc/src/__support/regex/regex_expr_pool.h
@@ -0,0 +1,116 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Pool for Regular Expression AST nodes (Class Definitions).
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_REGEX_REGEX_EXPR_POOL_H
+#define LLVM_LIBC_SRC___SUPPORT_REGEX_REGEX_EXPR_POOL_H
+
+#include "src/__support/CPP/expected.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/regex/regex_ast.h"
+#include <stddef.h>
+
+namespace LIBC_NAMESPACE_DECL {
+
+/// An arena-based pool for Regular Expression AST nodes.
+///
+/// This class manages the allocation and hash-consing of Expr nodes. All
+/// nodes created through this pool are owned by it and will be freed when
+/// the pool is destroyed. Hash-consing ensures that identical expressions
+/// are represented by the same pointer, enabling fast comparison and
+/// derivative normalization.
+///
+/// The pool owns its blocks through raw pointers, so it is neither copyable
+/// nor assignable.
+class ExprPool {
+  /// Internal storage block for AST nodes.
+  ///
+  /// Blocks are allocated on demand to avoid large contiguous allocations
+  /// and are linked together in a list for cleanup.
+  /// TODO: Consider adopting cpp::forward_list for block management once
+  /// it is available in LLVM-libc.
+  struct Block {
+    /// Number of Expr nodes stored in each block.
+    static constexpr size_t BLOCK_SIZE = 256;
+    /// The actual storage for Expr nodes.
+    Expr nodes[BLOCK_SIZE];
+    /// Pointer to the next block in the chain.
+    Block *next = nullptr;
+    /// Number of nodes currently used in this block.
+    size_t used = 0;
+
+    /// Initialises an empty block.
+    Block();
+  };
+
+  /// The first block in the allocation chain.
+  Block *head = nullptr;
+  /// The block currently being used for new node allocations.
+  Block *current = nullptr;
+  /// Total number of nodes allocated across all blocks.
+  size_t node_count = 0;
+
+  /// The size of the hash table used for hash-consing (interning) expression
+  /// nodes. Using a power of two (16,384) allows for efficient indexing using
+  /// bitwise AND instead of modulo. The table is larger than MAX_NODE_LIMIT
+  /// so that linear probing always finds a free slot and the load factor
+  /// stays bounded.
+  static constexpr size_t HASH_TABLE_SIZE = 0x4000; // 16,384
+  /// Hash table storing pointers to unique Expr nodes.
+  Expr *hashtable[HASH_TABLE_SIZE];
+
+  /// Core hash-consing function (Interning).
+  ///
+  /// Guarantees that for any two identical structural definitions of an Expr,
+  /// this function will return the same pointer. This enables O(1) structural
+  /// equality via pointer comparison.
+  ///
+  /// \param e A structural definition (proto-node) to intern.
+  /// \returns A pointer to the unique, stable instance in the arena,
+  ///          or REG_ESPACE on failure.
+  cpp::expected<Expr *, int> intern(const Expr &e);
+
+  /// The maximum number of nodes allowed in the pool to prevent memory
+  /// exhaustion during compilation of highly complex or maliciously crafted
+  /// regular expressions. A limit of 10,000 nodes provides a sufficient budget
+  /// for most practical regexes while keeping the peak memory footprint
+  /// manageable (approx. 320KB-500KB depending on architecture).
+  static constexpr size_t MAX_NODE_LIMIT = 10000;
+
+  // Invariants the interning code relies on, checked at compile time.
+  static_assert((HASH_TABLE_SIZE & (HASH_TABLE_SIZE - 1)) == 0,
+                "HASH_TABLE_SIZE must be a power of two for mask indexing");
+  static_assert(MAX_NODE_LIMIT < HASH_TABLE_SIZE,
+                "the hash table must always keep at least one free slot");
+
+public:
+  ExprPool();
+  ~ExprPool();
+
+  /// Copying is disabled: a copy would share the block chain with the
+  /// original and both destructors would free it.
+  ExprPool(const ExprPool &) = delete;
+  ExprPool &operator=(const ExprPool &) = delete;
+
+  // TODO: Use fluent interface (and_then, transform) for these factories once
+  // implemented in cpp::expected.
+
+  /// Returns an EmptySet node.
+  cpp::expected<Expr *, int> empty_set();
+  /// Returns an EmptyStr node.
+  cpp::expected<Expr *, int> empty_str();
+  /// Creates or returns an existing Literal node for the given character.
+  cpp::expected<Expr *, int> make_lit(char c);
+  /// Normalizing factory for Concatenation (L · R).
+  ///
+  /// Applies algebraic simplifications before interning:
+  /// - (Ø · R) or (R · Ø) => Ø
+  /// - (ε · R) or (R · ε) => R
+  ///
+  /// \returns The resulting node, REG_BADPAT if either operand is null, or
+  ///          REG_ESPACE if a node could not be allocated.
+  cpp::expected<Expr *, int> make_concat(Expr *l, Expr *r);
+
+  /// Normalizing factory for Alternation (L | R).
+  ///
+  /// Applies algebraic simplifications before interning:
+  /// - (Ø | R) or (R | Ø) => R
+  /// - (R | R) => R (Idempotency)
+  ///
+  /// \returns The resulting node, REG_BADPAT if either operand is null, or
+  ///          REG_ESPACE if a node could not be allocated.
+  cpp::expected<Expr *, int> make_alt(Expr *l, Expr *r);
+};
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_REGEX_REGEX_EXPR_POOL_H
diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt
index 6b9c1b4ac8cc7..65b3d72961311 100644
--- a/libc/test/src/__support/CMakeLists.txt
+++ b/libc/test/src/__support/CMakeLists.txt
@@ -313,3 +313,4 @@ add_subdirectory(time)
add_subdirectory(threads)
add_subdirectory(wchar)
add_subdirectory(wctype)
+add_subdirectory(regex)
diff --git a/libc/test/src/__support/regex/CMakeLists.txt b/libc/test/src/__support/regex/CMakeLists.txt
new file mode 100644
index 0000000000000..b46b3f6230846
--- /dev/null
+++ b/libc/test/src/__support/regex/CMakeLists.txt
@@ -0,0 +1,10 @@
+# Unit tests for ExprPool interning, algebraic identities and the node limit.
+add_libc_test(
+  expr_test
+  SUITE
+    libc-support-tests
+  SRCS
+    expr_test.cpp
+  DEPENDS
+    # expr_test.cpp includes hdr/regex_macros.h for REG_ESPACE.
+    libc.hdr.regex_macros
+    libc.src.__support.regex.regex_expr_pool
+    libc.include.llvm-libc-macros.regex_macros
+)
diff --git a/libc/test/src/__support/regex/expr_test.cpp b/libc/test/src/__support/regex/expr_test.cpp
new file mode 100644
index 0000000000000..efaf7f4fdb61a
--- /dev/null
+++ b/libc/test/src/__support/regex/expr_test.cpp
@@ -0,0 +1,88 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/regex/regex_expr_pool.h"
+#include "src/__support/regex/regex_ast.h"
+#include "hdr/regex_macros.h"
+#include "test/UnitTest/Test.h"
+
+using LIBC_NAMESPACE::Expr;
+using LIBC_NAMESPACE::ExprPool;
+using LIBC_NAMESPACE::ExprKind;
+
+// Interning the same structural definition twice must yield the same pointer,
+// while distinct definitions must yield distinct pointers.
+TEST(LlvmLibcRegexExprTest, Interning) {
+  ExprPool pool;
+  auto lit_a = pool.make_lit('a');
+  ASSERT_TRUE(lit_a.has_value());
+  auto lit_a_2 = pool.make_lit('a');
+  ASSERT_TRUE(lit_a_2.has_value());
+  // Hash-consing: same literal should return the same pointer.
+  EXPECT_EQ(lit_a.value(), lit_a_2.value());
+
+  auto lit_b = pool.make_lit('b');
+  ASSERT_TRUE(lit_b.has_value());
+  EXPECT_NE(lit_a.value(), lit_b.value());
+
+  // Check has_value() before every value() so that an allocation failure is
+  // reported as a test failure rather than an invalid access.
+  auto empty_set_1 = pool.empty_set();
+  ASSERT_TRUE(empty_set_1.has_value());
+  auto empty_set_2 = pool.empty_set();
+  ASSERT_TRUE(empty_set_2.has_value());
+  EXPECT_EQ(empty_set_1.value(), empty_set_2.value());
+
+  auto empty_str_1 = pool.empty_str();
+  ASSERT_TRUE(empty_str_1.has_value());
+  auto empty_str_2 = pool.empty_str();
+  ASSERT_TRUE(empty_str_2.has_value());
+  EXPECT_EQ(empty_str_1.value(), empty_str_2.value());
+}
+
+// make_concat must collapse operands involving Ø or ε instead of creating a
+// new Concat node.
+TEST(LlvmLibcRegexExprTest, AlgebraicIdentitiesConcat) {
+  ExprPool pool;
+  Expr *a = pool.make_lit('a').value();
+  Expr *nothing = pool.empty_set().value();
+  Expr *epsilon = pool.empty_str().value();
+
+  // Ø on either side yields Ø.
+  EXPECT_EQ(pool.make_concat(nothing, a).value(), nothing);
+  EXPECT_EQ(pool.make_concat(a, nothing).value(), nothing);
+
+  // ε on either side yields the other operand.
+  EXPECT_EQ(pool.make_concat(epsilon, a).value(), a);
+  EXPECT_EQ(pool.make_concat(a, epsilon).value(), a);
+}
+
+// make_alt must collapse operands involving Ø, and identical operands,
+// instead of creating a new Alt node.
+TEST(LlvmLibcRegexExprTest, AlgebraicIdentitiesAlt) {
+  ExprPool pool;
+  Expr *a = pool.make_lit('a').value();
+  Expr *nothing = pool.empty_set().value();
+
+  // Ø on either side yields the other operand.
+  EXPECT_EQ(pool.make_alt(nothing, a).value(), a);
+  EXPECT_EQ(pool.make_alt(a, nothing).value(), a);
+
+  // Alternation of a node with itself yields that node.
+  EXPECT_EQ(pool.make_alt(a, a).value(), a);
+}
+
+// The pool must refuse to create nodes once its node limit is reached.
+TEST(LlvmLibcRegexExprTest, MemoryLimits) {
+  ExprPool pool;
+  // The pool accepts at most 10000 nodes. There are too few distinct
+  // characters to reach that with literals alone, so the budget is consumed
+  // by an ever-deeper chain of concatenations, each structurally new.
+  constexpr size_t NODE_BUDGET = 10000;
+  Expr *lit_a = pool.make_lit('a').value();
+  Expr *chain = lit_a;
+  // The literal is node 1; every iteration interns exactly one more node.
+  for (size_t count = 1; count < NODE_BUDGET; ++count) {
+    auto grown = pool.make_concat(lit_a, chain);
+    ASSERT_TRUE(grown.has_value());
+    chain = grown.value();
+  }
+
+  // The pool is now full, so one more distinct node must be rejected.
+  auto overflow = pool.make_concat(lit_a, chain);
+  ASSERT_FALSE(overflow.has_value());
+  EXPECT_EQ(overflow.error(), REG_ESPACE);
+}
More information about the libc-commits
mailing list