[libc-commits] [libc] [libc] Add regex AST and ExprPool (PR #198728)

Jeff Bailey via libc-commits libc-commits at lists.llvm.org
Fri May 22 10:20:37 PDT 2026


https://github.com/kaladron updated https://github.com/llvm/llvm-project/pull/198728

>From 2873e14b60bf114d03a26411d4e6a6d61757eaf3 Mon Sep 17 00:00:00 2001
From: Jeff Bailey <jbailey at raspberryginger.com>
Date: Wed, 20 May 2026 09:14:55 +0100
Subject: [PATCH 1/2] [libc] Add regex AST and ExprPool

Implemented the core AST nodes and the ExprPool arena-based allocator.
Utilised AllocChecker for memory safety and enforced hardening at node
initialisation.

Assisted-by: Automated tooling, human reviewed.
---
 libc/src/__support/CMakeLists.txt            |   1 +
 libc/src/__support/regex/CMakeLists.txt      |  12 ++
 libc/src/__support/regex/regex_ast.h         |  82 ++++++++++
 libc/src/__support/regex/regex_expr_pool.cpp | 151 +++++++++++++++++++
 libc/src/__support/regex/regex_expr_pool.h   | 109 +++++++++++++
 5 files changed, 355 insertions(+)
 create mode 100644 libc/src/__support/regex/CMakeLists.txt
 create mode 100644 libc/src/__support/regex/regex_ast.h
 create mode 100644 libc/src/__support/regex/regex_expr_pool.cpp
 create mode 100644 libc/src/__support/regex/regex_expr_pool.h

diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt
index ada489046ef9e..4fd081c554e32 100644
--- a/libc/src/__support/CMakeLists.txt
+++ b/libc/src/__support/CMakeLists.txt
@@ -473,3 +473,4 @@ add_subdirectory(math)
 if(LIBC_COMPILER_HAS_EXT_VECTOR_TYPE)
   add_subdirectory(mathvec)
 endif()
+add_subdirectory(regex)
diff --git a/libc/src/__support/regex/CMakeLists.txt b/libc/src/__support/regex/CMakeLists.txt
new file mode 100644
index 0000000000000..2c90c1c9f0f03
--- /dev/null
+++ b/libc/src/__support/regex/CMakeLists.txt
@@ -0,0 +1,12 @@
+add_object_library(
+  regex_expr_pool
+  SRCS
+    regex_expr_pool.cpp
+  HDRS
+    regex_expr_pool.h
+    regex_ast.h
+  DEPENDS
+    libc.src.__support.CPP.new
+    libc.src.__support.hash
+    libc.src.__support.macros.config
+)
diff --git a/libc/src/__support/regex/regex_ast.h b/libc/src/__support/regex/regex_ast.h
new file mode 100644
index 0000000000000..b80d7a10f43f6
--- /dev/null
+++ b/libc/src/__support/regex/regex_ast.h
@@ -0,0 +1,82 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// AST nodes for Regular Expressions.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_REGEX_REGEX_AST_H
+#define LLVM_LIBC_SRC___SUPPORT_REGEX_REGEX_AST_H
+
+#include "src/__support/macros/config.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+/// Enumeration of Regular Expression AST node types.
+enum class ExprKind {
+  /// Represents the empty set (matches nothing).
+  EmptySet,
+  /// Represents the empty string (matches the empty string).
+  EmptyStr,
+  /// A literal character match.
+  Literal,
+  /// Concatenation of two expressions (left followed by right).
+  Concat,
+  /// Alternation between two expressions (left or right).
+  Alt,
+};
+
+/// A node in the Regular Expression Abstract Syntax Tree.
+///
+/// Expressions are represented as a hash-consed DAG to enable efficient
+/// derivative-based matching. This structure is intended to be managed by
+/// an ExprPool.
+struct Expr {
+  /// The type of this expression node; selects the active union member.
+  ExprKind kind;
+  union {
+    /// Character value for Literal nodes.
+    char ch;
+    /// Sub-expressions for Concat and Alt nodes.
+    struct {
+      Expr *left;
+      Expr *right;
+    } bin;
+  };
+
+  /// Default constructor creates an EmptySet node.
+  constexpr Expr() : kind(ExprKind::EmptySet), ch('\0') {}
+  /// Create a node of a specific kind with no data (no implicit conversion).
+  explicit constexpr Expr(ExprKind k) : kind(k), ch('\0') {}
+  /// Create a Literal node (explicit: a bare char must not become an Expr).
+  explicit constexpr Expr(char c) : kind(ExprKind::Literal), ch(c) {}
+  /// Create a binary node (Concat or Alt).
+  constexpr Expr(ExprKind k, Expr *l, Expr *r) : kind(k), bin{l, r} {}
+
+  /// Shallow structural equality for hash-consing (children by pointer).
+  bool operator==(const Expr &other) const {
+    if (kind != other.kind)
+      return false;
+    switch (kind) {
+    case ExprKind::EmptySet:
+    case ExprKind::EmptyStr:
+      return true;
+    case ExprKind::Literal:
+      return ch == other.ch;
+    case ExprKind::Concat:
+    case ExprKind::Alt:
+      return bin.left == other.bin.left && bin.right == other.bin.right;
+    }
+    return false;
+  }
+};
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_REGEX_REGEX_AST_H
diff --git a/libc/src/__support/regex/regex_expr_pool.cpp b/libc/src/__support/regex/regex_expr_pool.cpp
new file mode 100644
index 0000000000000..3cb4d5d6f52f9
--- /dev/null
+++ b/libc/src/__support/regex/regex_expr_pool.cpp
@@ -0,0 +1,151 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Pool for Regular Expression AST nodes.
+///
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/regex/regex_expr_pool.h"
+#include "include/llvm-libc-macros/regex-macros.h"
+#include "src/__support/CPP/new.h"
+#include "src/__support/alloc-checker.h"
+#include "src/__support/hash.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/null_check.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+namespace {
+
+// Hash an Expr node for hash-consing.
+uint64_t hash_expr(const Expr &e) {
+  internal::HashState hasher(0x12345678);
+  uint64_t kind = static_cast<uint64_t>(e.kind);
+  hasher.update(&kind, sizeof(kind));
+  switch (e.kind) {
+  case ExprKind::Literal:
+    hasher.update(&e.ch, sizeof(e.ch));
+    break;
+  case ExprKind::Concat:
+  case ExprKind::Alt:
+    hasher.update(&e.bin.left, sizeof(e.bin.left));
+    hasher.update(&e.bin.right, sizeof(e.bin.right));
+    break;
+  default:
+    break;
+  }
+  return hasher.finish();
+}
+
+} // namespace
+
+ExprPool::Block::Block() : next(nullptr), used(0) {}
+
+ExprPool::ExprPool() : head(nullptr), current(nullptr), node_count(0) {
+  for (size_t i = 0; i < HASH_SIZE; ++i)
+    hashtable[i] = nullptr;
+}
+
+ExprPool::~ExprPool() {
+  // TODO: This manual traversal can be simplified once cpp::forward_list
+  // is available for block management.
+  Block *b = head;
+  while (b) {
+    Block *next_b = b->next;
+    delete b;
+    b = next_b;
+  }
+}
+
+cpp::expected<Expr *, int> ExprPool::intern(const Expr &e) {
+  // 1. Calculate the initial bucket for the given structural definition.
+  uint64_t h = hash_expr(e);
+  size_t idx = h % HASH_SIZE;
+
+  // 2. Linear Probing: Search for an existing node with identical content.
+  //    Because pointers are unique, O(1) comparison is guaranteed if
+  //    sub-expressions are already interned.
+  size_t start_idx = idx;
+  while (hashtable[idx]) {
+    if (*hashtable[idx] == e)
+      return hashtable[idx];
+    idx = (idx + 1) % HASH_SIZE;
+    if (idx == start_idx) {
+      // Table full (invariant check: HASH_SIZE >> MAX_NODES)
+      return cpp::unexpected(REG_ESPACE);
+    }
+  }
+
+  // 3. Admission Control: Check the hard limit on AST nodes.
+  if (node_count >= MAX_NODES)
+    return cpp::unexpected(REG_ESPACE);
+
+  // 4. Arena Allocation: If no matching node found, allocate a stable slot.
+  if (!current || current->used == Block::SIZE) {
+    // New blocks are allocated on demand using AllocChecker.
+    AllocChecker ac;
+    Block *new_block = new (ac) Block();
+    if (!ac)
+      return cpp::unexpected(REG_ESPACE);
+    if (!head)
+      head = new_block;
+    if (current)
+      current->next = new_block;
+    current = new_block;
+  }
+
+  // 5. Node Initialization: Copy the structural definition into the arena.
+  Expr *new_node = &current->nodes[current->used++];
+  LIBC_CRASH_ON_NULLPTR(new_node);
+  *new_node = e;
+  hashtable[idx] = new_node;
+  node_count++;
+  return new_node;
+}
+
+cpp::expected<Expr *, int> ExprPool::empty_set() {
+  return intern(Expr(ExprKind::EmptySet));
+}
+cpp::expected<Expr *, int> ExprPool::empty_str() {
+  return intern(Expr(ExprKind::EmptyStr));
+}
+cpp::expected<Expr *, int> ExprPool::make_lit(char c) {
+  return intern(Expr(c));
+}
+
+cpp::expected<Expr *, int> ExprPool::make_concat(Expr *l, Expr *r) {
+  if (!l || !r)
+    return cpp::unexpected(REG_BADPAT);
+  // Apply basic algebraic identities for concatenation:
+  // 1. Ø · R = R · Ø = Ø (Annihilator: null set)
+  if (l->kind == ExprKind::EmptySet || r->kind == ExprKind::EmptySet)
+    return empty_set();
+  // 2. ε · R = R · ε = R (Identity: empty string)
+  if (l->kind == ExprKind::EmptyStr)
+    return r;
+  if (r->kind == ExprKind::EmptyStr)
+    return l;
+  return intern(Expr(ExprKind::Concat, l, r));
+}
+cpp::expected<Expr *, int> ExprPool::make_alt(Expr *l, Expr *r) {
+  if (!l || !r)
+    return cpp::unexpected(REG_BADPAT);
+  // Apply basic algebraic identities for alternation:
+  // 1. Ø | R = R | Ø = R (Identity: null set)
+  if (l->kind == ExprKind::EmptySet)
+    return r;
+  if (r->kind == ExprKind::EmptySet)
+    return l;
+  // 2. R | R = R (Idempotency)
+  if (l == r)
+    return l;
+  return intern(Expr(ExprKind::Alt, l, r));
+}
+
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/__support/regex/regex_expr_pool.h b/libc/src/__support/regex/regex_expr_pool.h
new file mode 100644
index 0000000000000..bf9c15f7756ce
--- /dev/null
+++ b/libc/src/__support/regex/regex_expr_pool.h
@@ -0,0 +1,109 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Pool for Regular Expression AST nodes.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC___SUPPORT_REGEX_REGEX_EXPR_POOL_H
+#define LLVM_LIBC_SRC___SUPPORT_REGEX_REGEX_EXPR_POOL_H
+
+#include "src/__support/CPP/expected.h"
+#include "src/__support/macros/config.h"
+#include "src/__support/regex/regex_ast.h"
+#include <stddef.h>
+
+namespace LIBC_NAMESPACE_DECL {
+
+/// An arena-based pool for Regular Expression AST nodes.
+///
+/// This class manages the allocation and hash-consing of Expr nodes. All
+/// nodes created through this pool are owned by it and will be freed when
+/// the pool is destroyed. Hash-consing ensures that identical expressions
+/// are represented by the same pointer, enabling fast comparison and
+/// derivative normalization.
+class ExprPool {
+  /// Internal storage block for AST nodes.
+  ///
+  /// Blocks are allocated on demand to avoid large contiguous allocations
+  /// and are linked together in a list for cleanup.
+  /// TODO: Consider adopting cpp::forward_list for block management once
+  /// it is available in LLVM-libc.
+  struct Block {
+    /// Number of Expr nodes stored in each block.
+    static constexpr size_t SIZE = 256;
+    /// The actual storage for Expr nodes.
+    Expr nodes[SIZE];
+    /// Pointer to the next block in the chain.
+    Block *next = nullptr;
+    /// Number of nodes currently used in this block.
+    size_t used = 0;
+
+    /// Initializes an empty block.
+    Block();
+  };
+
+  /// The first block in the allocation chain.
+  Block *head = nullptr;
+  /// The block currently being used for new node allocations.
+  Block *current = nullptr;
+  /// Total number of nodes allocated across all blocks.
+  size_t node_count = 0;
+
+  /// Size of the hash table used for hash-consing.
+  /// Must be significantly larger than MAX_NODES to maintain efficiency.
+  static constexpr size_t HASH_SIZE = 16384;
+  /// Hash table storing pointers to unique Expr nodes.
+  Expr *hashtable[HASH_SIZE];
+
+  /// Core hash-consing function (Interning).
+  ///
+  /// Guarantees that for any two identical structural definitions of an Expr,
+  /// this function will return the same pointer. This enables O(1) structural
+  /// equality via pointer comparison.
+  ///
+  /// \param e A structural definition (proto-node) to intern.
+  /// \returns A pointer to the unique, stable instance in the arena,
+  ///          or REG_ESPACE on failure.
+  cpp::expected<Expr *, int> intern(const Expr &e);
+
+  /// Maximum number of nodes allowed in the pool to prevent memory exhaustion.
+  static constexpr size_t MAX_NODES = 10000;
+
+public:
+  ExprPool();
+  ~ExprPool();
+
+  // TODO: Use fluent interface (and_then, transform) for these factories once
+  // implemented in cpp::expected.
+
+  /// Returns an EmptySet node.
+  cpp::expected<Expr *, int> empty_set();
+  /// Returns an EmptyStr node.
+  cpp::expected<Expr *, int> empty_str();
+  /// Creates or returns an existing Literal node for the given character.
+  cpp::expected<Expr *, int> make_lit(char c);
+  /// Normalizing factory for Concatenation (L · R).
+  ///
+  /// Applies algebraic simplifications before interning:
+  /// - (Ø · R) or (R · Ø) => Ø
+  /// - (ε · R) or (R · ε) => R
+  cpp::expected<Expr *, int> make_concat(Expr *l, Expr *r);
+
+  /// Normalizing factory for Alternation (L | R).
+  ///
+  /// Applies algebraic simplifications before interning:
+  /// - (Ø | R) or (R | Ø) => R
+  /// - (R | R) => R (Idempotency)
+  cpp::expected<Expr *, int> make_alt(Expr *l, Expr *r);
+};
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC___SUPPORT_REGEX_REGEX_EXPR_POOL_H

>From 60f3eb8e5d8355d62b2958f89de8a1c893df029b Mon Sep 17 00:00:00 2001
From: Jeff Bailey <jbailey at raspberryginger.com>
Date: Fri, 22 May 2026 16:57:06 +0100
Subject: [PATCH 2/2] [libc] Add regex AST and ExprPool with staged testing

Implemented the core AST nodes and the ExprPool arena-based allocator.
Integrated internal unit tests for AST algebraic identities and pool
memory limits. Adopted the hdr/ proxy pattern for regex macros and
standardised file documentation.

Assisted-by: Automated tooling, human reviewed.
---
 libc/hdr/CMakeLists.txt                      |  1 +
 libc/hdr/regex_macros.h                      |  4 +
 libc/src/__support/regex/CMakeLists.txt      |  3 +
 libc/src/__support/regex/regex_ast.h         |  2 +-
 libc/src/__support/regex/regex_expr_pool.cpp | 18 ++--
 libc/src/__support/regex/regex_expr_pool.h   | 27 +++---
 libc/test/src/__support/CMakeLists.txt       |  1 +
 libc/test/src/__support/regex/CMakeLists.txt | 10 +++
 libc/test/src/__support/regex/expr_test.cpp  | 88 ++++++++++++++++++++
 9 files changed, 134 insertions(+), 20 deletions(-)
 create mode 100644 libc/test/src/__support/regex/CMakeLists.txt
 create mode 100644 libc/test/src/__support/regex/expr_test.cpp

diff --git a/libc/hdr/CMakeLists.txt b/libc/hdr/CMakeLists.txt
index 8da1e2a2a1872..eb79298ae84cf 100644
--- a/libc/hdr/CMakeLists.txt
+++ b/libc/hdr/CMakeLists.txt
@@ -382,6 +382,7 @@ add_proxy_header_library(
     regex_macros.h
   FULL_BUILD_DEPENDS
     libc.include.llvm-libc-macros.regex_macros
+    libc.include.regex
 )
 
 add_subdirectory(types)
diff --git a/libc/hdr/regex_macros.h b/libc/hdr/regex_macros.h
index 74b5d4be20ff5..c188e0fa2588d 100644
--- a/libc/hdr/regex_macros.h
+++ b/libc/hdr/regex_macros.h
@@ -18,6 +18,10 @@
 
 #include "include/llvm-libc-macros/regex-macros.h"
 
+#else // Overlay mode
+
+#include <regex.h>
+
 #endif // LIBC_FULL_BUILD
 
 #endif // LLVM_LIBC_HDR_REGEX_MACROS_H
diff --git a/libc/src/__support/regex/CMakeLists.txt b/libc/src/__support/regex/CMakeLists.txt
index 2c90c1c9f0f03..4679b7b7f2ece 100644
--- a/libc/src/__support/regex/CMakeLists.txt
+++ b/libc/src/__support/regex/CMakeLists.txt
@@ -6,7 +6,10 @@ add_object_library(
     regex_expr_pool.h
     regex_ast.h
   DEPENDS
+    libc.src.__support.CPP.expected
     libc.src.__support.CPP.new
+    libc.src.__support.alloc_checker
     libc.src.__support.hash
     libc.src.__support.macros.config
+    libc.src.__support.macros.null_check
 )
diff --git a/libc/src/__support/regex/regex_ast.h b/libc/src/__support/regex/regex_ast.h
index b80d7a10f43f6..db8485876ad64 100644
--- a/libc/src/__support/regex/regex_ast.h
+++ b/libc/src/__support/regex/regex_ast.h
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 ///
 /// \file
-/// AST nodes for Regular Expressions.
+/// AST nodes for Regular Expressions (Class Definitions).
 ///
 //===----------------------------------------------------------------------===//
 
diff --git a/libc/src/__support/regex/regex_expr_pool.cpp b/libc/src/__support/regex/regex_expr_pool.cpp
index 3cb4d5d6f52f9..6ee315387788b 100644
--- a/libc/src/__support/regex/regex_expr_pool.cpp
+++ b/libc/src/__support/regex/regex_expr_pool.cpp
@@ -7,12 +7,12 @@
 //===----------------------------------------------------------------------===//
 ///
 /// \file
-/// Pool for Regular Expression AST nodes.
+/// Pool for Regular Expression AST nodes (Implementation).
 ///
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/regex/regex_expr_pool.h"
-#include "include/llvm-libc-macros/regex-macros.h"
+#include "hdr/regex_macros.h"
 #include "src/__support/CPP/new.h"
 #include "src/__support/alloc-checker.h"
 #include "src/__support/hash.h"
@@ -48,7 +48,7 @@ uint64_t hash_expr(const Expr &e) {
 ExprPool::Block::Block() : next(nullptr), used(0) {}
 
 ExprPool::ExprPool() : head(nullptr), current(nullptr), node_count(0) {
-  for (size_t i = 0; i < HASH_SIZE; ++i)
+  for (size_t i = 0; i < HASH_TABLE_SIZE; ++i)
     hashtable[i] = nullptr;
 }
 
@@ -66,7 +66,7 @@ ExprPool::~ExprPool() {
 cpp::expected<Expr *, int> ExprPool::intern(const Expr &e) {
   // 1. Calculate the initial bucket for the given structural definition.
   uint64_t h = hash_expr(e);
-  size_t idx = h % HASH_SIZE;
+  size_t idx = h & (HASH_TABLE_SIZE - 1);
 
   // 2. Linear Probing: Search for an existing node with identical content.
   //    Because pointers are unique, O(1) comparison is guaranteed if
@@ -75,19 +75,19 @@ cpp::expected<Expr *, int> ExprPool::intern(const Expr &e) {
   while (hashtable[idx]) {
     if (*hashtable[idx] == e)
       return hashtable[idx];
-    idx = (idx + 1) % HASH_SIZE;
+    idx = (idx + 1) & (HASH_TABLE_SIZE - 1);
     if (idx == start_idx) {
-      // Table full (invariant check: HASH_SIZE >> MAX_NODES)
+      // Table full (invariant check: HASH_TABLE_SIZE >> MAX_NODE_LIMIT)
       return cpp::unexpected(REG_ESPACE);
     }
   }
 
   // 3. Admission Control: Check the hard limit on AST nodes.
-  if (node_count >= MAX_NODES)
+  if (node_count >= MAX_NODE_LIMIT)
     return cpp::unexpected(REG_ESPACE);
 
   // 4. Arena Allocation: If no matching node found, allocate a stable slot.
-  if (!current || current->used == Block::SIZE) {
+  if (!current || current->used == Block::BLOCK_SIZE) {
     // New blocks are allocated on demand using AllocChecker.
     AllocChecker ac;
     Block *new_block = new (ac) Block();
@@ -100,7 +100,7 @@ cpp::expected<Expr *, int> ExprPool::intern(const Expr &e) {
     current = new_block;
   }
 
-  // 5. Node Initialization: Copy the structural definition into the arena.
+  // 5. Node Initialisation: Copy the structural definition into the arena.
   Expr *new_node = &current->nodes[current->used++];
   LIBC_CRASH_ON_NULLPTR(new_node);
   *new_node = e;
diff --git a/libc/src/__support/regex/regex_expr_pool.h b/libc/src/__support/regex/regex_expr_pool.h
index bf9c15f7756ce..8da207aa4e19e 100644
--- a/libc/src/__support/regex/regex_expr_pool.h
+++ b/libc/src/__support/regex/regex_expr_pool.h
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 ///
 /// \file
-/// Pool for Regular Expression AST nodes.
+/// Pool for Regular Expression AST nodes (Class Definitions).
 ///
 //===----------------------------------------------------------------------===//
 
@@ -37,15 +37,15 @@ class ExprPool {
   /// it is available in LLVM-libc.
   struct Block {
     /// Number of Expr nodes stored in each block.
-    static constexpr size_t SIZE = 256;
+    static constexpr size_t BLOCK_SIZE = 256;
     /// The actual storage for Expr nodes.
-    Expr nodes[SIZE];
+    Expr nodes[BLOCK_SIZE];
     /// Pointer to the next block in the chain.
     Block *next = nullptr;
     /// Number of nodes currently used in this block.
     size_t used = 0;
 
-    /// Initializes an empty block.
+    /// Initialises an empty block.
     Block();
   };
 
@@ -56,11 +56,14 @@ class ExprPool {
   /// Total number of nodes allocated across all blocks.
   size_t node_count = 0;
 
-  /// Size of the hash table used for hash-consing.
-  /// Must be significantly larger than MAX_NODES to maintain efficiency.
-  static constexpr size_t HASH_SIZE = 16384;
+  /// The size of the hash table used for hash-consing (interning) expression
+  /// nodes. Using a power of two (16,384) allows for efficient indexing using
+  /// bitwise AND instead of modulo. This size is significantly larger than
+  /// MAX_NODE_LIMIT to maintain a low load factor and reduce collisions,
+  /// ensuring O(1) average time for node interning.
+  static constexpr size_t HASH_TABLE_SIZE = 0x4000; // 16,384
   /// Hash table storing pointers to unique Expr nodes.
-  Expr *hashtable[HASH_SIZE];
+  Expr *hashtable[HASH_TABLE_SIZE];
 
   /// Core hash-consing function (Interning).
   ///
@@ -73,8 +76,12 @@ class ExprPool {
   ///          or REG_ESPACE on failure.
   cpp::expected<Expr *, int> intern(const Expr &e);
 
-  /// Maximum number of nodes allowed in the pool to prevent memory exhaustion.
-  static constexpr size_t MAX_NODES = 10000;
+  /// The maximum number of nodes allowed in the pool to prevent memory
+  /// exhaustion during compilation of highly complex or maliciously crafted
+  /// regular expressions. A limit of 10,000 nodes provides a sufficient budget
+  /// for most practical regexes while keeping the peak memory footprint
+  /// manageable (approx. 320KB-500KB depending on architecture).
+  static constexpr size_t MAX_NODE_LIMIT = 10000;
 
 public:
   ExprPool();
diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt
index 6b9c1b4ac8cc7..65b3d72961311 100644
--- a/libc/test/src/__support/CMakeLists.txt
+++ b/libc/test/src/__support/CMakeLists.txt
@@ -313,3 +313,4 @@ add_subdirectory(time)
 add_subdirectory(threads)
 add_subdirectory(wchar)
 add_subdirectory(wctype)
+add_subdirectory(regex)
diff --git a/libc/test/src/__support/regex/CMakeLists.txt b/libc/test/src/__support/regex/CMakeLists.txt
new file mode 100644
index 0000000000000..b46b3f6230846
--- /dev/null
+++ b/libc/test/src/__support/regex/CMakeLists.txt
@@ -0,0 +1,10 @@
+add_libc_test(
+  expr_test
+  SUITE
+    libc-support-tests
+  SRCS
+    expr_test.cpp
+  DEPENDS
+    libc.hdr.regex_macros # expr_test.cpp includes hdr/regex_macros.h
+    libc.src.__support.regex.regex_expr_pool
+)
diff --git a/libc/test/src/__support/regex/expr_test.cpp b/libc/test/src/__support/regex/expr_test.cpp
new file mode 100644
index 0000000000000..efaf7f4fdb61a
--- /dev/null
+++ b/libc/test/src/__support/regex/expr_test.cpp
@@ -0,0 +1,88 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/regex/regex_expr_pool.h"
+#include "src/__support/regex/regex_ast.h"
+#include "hdr/regex_macros.h"
+#include "test/UnitTest/Test.h"
+
+using LIBC_NAMESPACE::Expr;
+using LIBC_NAMESPACE::ExprPool;
+using LIBC_NAMESPACE::ExprKind;
+
+TEST(LlvmLibcRegexExprTest, Interning) {
+  ExprPool pool;
+  auto lit_a = pool.make_lit('a');
+  ASSERT_TRUE(lit_a.has_value());
+  auto lit_a_2 = pool.make_lit('a');
+  ASSERT_TRUE(lit_a_2.has_value());
+  // Hash-consing: same literal should return the same pointer.
+  EXPECT_EQ(lit_a.value(), lit_a_2.value());
+
+  auto lit_b = pool.make_lit('b');
+  ASSERT_TRUE(lit_b.has_value());
+  EXPECT_NE(lit_a.value(), lit_b.value());
+
+  auto empty_set_1 = pool.empty_set();
+  auto empty_set_2 = pool.empty_set();
+  EXPECT_EQ(empty_set_1.value(), empty_set_2.value());
+
+  auto empty_str_1 = pool.empty_str();
+  auto empty_str_2 = pool.empty_str();
+  EXPECT_EQ(empty_str_1.value(), empty_str_2.value());
+}
+
+TEST(LlvmLibcRegexExprTest, AlgebraicIdentitiesConcat) {
+  ExprPool pool;
+  auto lit_a = pool.make_lit('a').value();
+  auto empty_set = pool.empty_set().value();
+  auto empty_str = pool.empty_str().value();
+
+  // (Ø · R) or (R · Ø) => Ø
+  EXPECT_EQ(pool.make_concat(empty_set, lit_a).value(), empty_set);
+  EXPECT_EQ(pool.make_concat(lit_a, empty_set).value(), empty_set);
+
+  // (ε · R) or (R · ε) => R
+  EXPECT_EQ(pool.make_concat(empty_str, lit_a).value(), lit_a);
+  EXPECT_EQ(pool.make_concat(lit_a, empty_str).value(), lit_a);
+}
+
+TEST(LlvmLibcRegexExprTest, AlgebraicIdentitiesAlt) {
+  ExprPool pool;
+  auto lit_a = pool.make_lit('a').value();
+  auto empty_set = pool.empty_set().value();
+
+  // (Ø | R) or (R | Ø) => R
+  EXPECT_EQ(pool.make_alt(empty_set, lit_a).value(), lit_a);
+  EXPECT_EQ(pool.make_alt(lit_a, empty_set).value(), lit_a);
+
+  // (R | R) => R (Idempotency)
+  EXPECT_EQ(pool.make_alt(lit_a, lit_a).value(), lit_a);
+}
+
+TEST(LlvmLibcRegexExprTest, MemoryLimits) {
+  ExprPool pool;
+  // MAX_NODE_LIMIT is 10000; fill the pool to exactly that many nodes and
+  // check that the next allocation is rejected.
+  //
+  // Literals alone cannot reach the limit: there are only 256 distinct char
+  // values, so at most 256 unique Literal nodes can exist. Instead, build a
+  // chain of unique concatenations: 1 literal + 9999 concats = 10000 nodes.
+  auto lit_a = pool.make_lit('a').value();
+  auto current = lit_a;
+  for (size_t i = 0; i < 9999; ++i) {
+    auto next = pool.make_concat(lit_a, current);
+    ASSERT_TRUE(next.has_value());
+    current = next.value();
+  }
+
+  // Next one should fail.
+  auto fail = pool.make_concat(lit_a, current);
+  ASSERT_FALSE(fail.has_value());
+  EXPECT_EQ(fail.error(), REG_ESPACE);
+}



More information about the libc-commits mailing list