[llvm] [regex][FileCheck] Support back-references up to 20. (PR #174150)

Thomas Symalla via llvm-commits llvm-commits at lists.llvm.org
Fri Jan 2 03:07:27 PST 2026


https://github.com/tsymalla updated https://github.com/llvm/llvm-project/pull/174150

>From f2ced2104776a3b0c70bd38b5d1a6379ab62f42b Mon Sep 17 00:00:00 2001
From: Thomas Symalla <github at thomassymalla.de>
Date: Tue, 30 Dec 2025 08:50:46 +0100
Subject: [PATCH 1/2] [regex][FileCheck] Support back-references up to 20.

LLVM RegEx already supports the `\g<n>` syntax in `.sub()`. Support
it in `regcomp` as well by raising the limit and adding the necessary
parsing. Update the limit checks in FileCheck accordingly. The new limit
is 20, which could theoretically be lifted further by `realloc`ing the
storage, but I don't see a use case for that as of now.
Update a test that should now pass when using more than 9
back-references.
Add a new test that checks the error message explicitly.
---
 llvm/lib/FileCheck/FileCheck.cpp      | 19 ++++++++---
 llvm/lib/Support/Regex.cpp            |  2 +-
 llvm/lib/Support/regcomp.c            | 47 ++++++++++++++++++---------
 llvm/test/FileCheck/backref-limit.txt |  7 ++++
 llvm/test/FileCheck/capture-limit.txt |  8 ++---
 5 files changed, 57 insertions(+), 26 deletions(-)
 create mode 100644 llvm/test/FileCheck/backref-limit.txt

diff --git a/llvm/lib/FileCheck/FileCheck.cpp b/llvm/lib/FileCheck/FileCheck.cpp
index 9245db442611c..e77f82b569c8b 100644
--- a/llvm/lib/FileCheck/FileCheck.cpp
+++ b/llvm/lib/FileCheck/FileCheck.cpp
@@ -28,6 +28,10 @@
 
 using namespace llvm;
 
+namespace {
+  constexpr int BACKREF_LIMIT = 20;
+} // anonymous namespace
+
 StringRef ExpressionFormat::toString() const {
   switch (Value) {
   case Kind::NoFormat:
@@ -1054,10 +1058,11 @@ bool Pattern::parsePattern(StringRef PatternStr, StringRef Prefix,
         if (!IsNumBlock &&
             (It = VariableDefs.find(SubstStr)) != VariableDefs.end()) {
           unsigned CaptureParenGroup = It->second;
-          if (CaptureParenGroup < 1 || CaptureParenGroup > 9) {
+          if (CaptureParenGroup < 1 || CaptureParenGroup > BACKREF_LIMIT) {
             SM.PrintMessage(SMLoc::getFromPointer(SubstStr.data()),
                             SourceMgr::DK_Error,
-                            "Can't back-reference more than 9 variables");
+                            "Can't back-reference more than " +
+                                Twine(BACKREF_LIMIT) + " variables");
             return true;
           }
           AddBackrefToRegEx(CaptureParenGroup);
@@ -1108,8 +1113,14 @@ bool Pattern::AddRegExToRegEx(StringRef RS, unsigned &CurParen, SourceMgr &SM) {
 }
 
 void Pattern::AddBackrefToRegEx(unsigned BackrefNum) {
-  assert(BackrefNum >= 1 && BackrefNum <= 9 && "Invalid backref number");
-  std::string Backref = std::string("\\") + std::string(1, '0' + BackrefNum);
+  assert(BackrefNum >= 1 && BackrefNum <= BACKREF_LIMIT &&
+         "Invalid backref number");
+  std::string Backref;
+  if (BackrefNum >= 1 && BackrefNum <= 9)
+    Backref = std::string("\\") + std::string(1, '0' + BackrefNum);
+  else
+    Backref = std::string("\\g<") + std::to_string(BackrefNum) + '>';
+
   RegExStr += Backref;
 }
 
diff --git a/llvm/lib/Support/Regex.cpp b/llvm/lib/Support/Regex.cpp
index 5eedf95c48e37..5a96f1974341d 100644
--- a/llvm/lib/Support/Regex.cpp
+++ b/llvm/lib/Support/Regex.cpp
@@ -154,7 +154,7 @@ std::string Regex::sub(StringRef Repl, StringRef String,
     // Add the skipped substring.
     Res += Split.first;
 
-    // Check for terminimation and trailing backslash.
+    // Check for termination and trailing backslash.
     if (Split.second.empty()) {
       if (Repl.size() != Split.first.size() &&
           Error && Error->empty())
diff --git a/llvm/lib/Support/regcomp.c b/llvm/lib/Support/regcomp.c
index f5c47781f3d82..72095c31d12b3 100644
--- a/llvm/lib/Support/regcomp.c
+++ b/llvm/lib/Support/regcomp.c
@@ -192,7 +192,7 @@ struct parse {
   sopno slen;       /* malloced strip length (used) */
   int ncsalloc;     /* number of csets allocated */
   struct re_guts *g;
-#define NPAREN 10       /* we need to remember () 1-9 for back refs */
+#define NPAREN 21       /* we need to remember () 1-20 for back refs */
   sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
   sopno pend[NPAREN];   /* -> ) ([0] unused) */
 };
@@ -506,27 +506,42 @@ static void p_ere_exp(struct parse *p) {
        * least 4 matching groups specified in the pattern previously).
        */
       backrefnum = c - '0';
-      if (p->pend[backrefnum] == 0) {
-        SETERROR(REG_ESUBREG);
-        break;
-      }
-
-      /* Make sure everything checks out and emit the sequence
-       * that marks a back-reference to the parse structure.
+    } else if (c == 'g') {
+      /* Support back-references with an index greater than 9.
+       * These look like this: \g<n>.
+       * Extract the number inside the angle brackets.
        */
-      assert(backrefnum <= p->g->nsub);
-      EMIT(OBACK_, backrefnum);
-      assert(p->pbegin[backrefnum] != 0);
-      assert(OP(p->strip[p->pbegin[backrefnum]]) == OLPAREN);
-      assert(OP(p->strip[p->pend[backrefnum]]) == ORPAREN);
-      (void)dupl(p, p->pbegin[backrefnum] + 1, p->pend[backrefnum]);
-      EMIT(O_BACK, backrefnum);
-      p->g->backrefs = 1;
+      MUSTEAT('<', REG_BADRPT);
+
+      backrefnum = 0;
+      while (MORE() && isdigit(PEEK()) && PEEK() != '>') {
+        c = GETNEXT();
+        backrefnum = backrefnum * 10 + c - '0';
+      }
+      MUSTEAT('>', REG_BADRPT);
     } else {
       /* Other chars are simply themselves when escaped with a backslash.
        */
       ordinary(p, c);
+      break;
+    }
+
+    if (p->pend[backrefnum] == 0) {
+      SETERROR(REG_ESUBREG);
+      break;
     }
+
+    /* Make sure everything checks out and emit the sequence
+     * that marks a back-reference to the parse structure.
+     */
+    assert(backrefnum <= p->g->nsub);
+    EMIT(OBACK_, backrefnum);
+    assert(p->pbegin[backrefnum] != 0);
+    assert(OP(p->strip[p->pbegin[backrefnum]]) == OLPAREN);
+    assert(OP(p->strip[p->pend[backrefnum]]) == ORPAREN);
+    (void)dupl(p, p->pbegin[backrefnum] + 1, p->pend[backrefnum]);
+    EMIT(O_BACK, backrefnum);
+    p->g->backrefs = 1;
     break;
   case '{': /* okay as ordinary except if digit follows */
     REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
diff --git a/llvm/test/FileCheck/backref-limit.txt b/llvm/test/FileCheck/backref-limit.txt
new file mode 100644
index 0000000000000..890630e6e471a
--- /dev/null
+++ b/llvm/test/FileCheck/backref-limit.txt
@@ -0,0 +1,7 @@
+; RUN: not FileCheck -check-prefix=CHECK-BACKREF %s < /dev/null 2>&1 | FileCheck -check-prefix=ERR-CHECK-BACKREF %s
+
+; ERR-CHECK-BACKREF: error: Can't back-reference more than 20 variables
+
+r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 r13 r14 r15 r16 r17 r18 r19 r20 r21
+
+; CHECK-BACKREF: [[REG1:r0]] [[REG2:r1]] [[REG3:r2]] [[REG4:r3]] [[REG5:r4]] [[REG6:r5]] [[REG7:r6]] [[REG8:r7]] [[REG9:r8]] [[REG10:r9]] [[REG11:r10]] [[REG12:r11]] [[REG13:r12]] [[REG14:r13]] [[REG15:r14]] [[REG16:r15]] [[REG17:r16]] [[REG18:r17]] [[REG19:r18]] [[REG20:r19]] [[REG21:r20]] [[REG21]]
diff --git a/llvm/test/FileCheck/capture-limit.txt b/llvm/test/FileCheck/capture-limit.txt
index a727be0c781f1..e7e50992201d1 100644
--- a/llvm/test/FileCheck/capture-limit.txt
+++ b/llvm/test/FileCheck/capture-limit.txt
@@ -1,8 +1,6 @@
 ; RUN: FileCheck -input-file %s %s
-; XFAIL: *
+; XPASS: *
 
-; Trying to back-reference more than 9 variables is intended to fail.
+r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r9 r8
 
-r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r9
-
-; CHECK: [[REG1:r0]] [[REG2:r1]] [[REG3:r2]] [[REG4:r3]] [[REG5:r4]] [[REG6:r5]] [[REG7:r6]] [[REG8:r7]] [[REG9:r8]] [[REG10:r9]] [[REG10]]
+; CHECK: [[REG1:r0]] [[REG2:r1]] [[REG3:r2]] [[REG4:r3]] [[REG5:r4]] [[REG6:r5]] [[REG7:r6]] [[REG8:r7]] [[REG9:r8]] [[REG10:r9]] [[REG10]] [[REG9]]

>From 9a69659f214e9a1b7705cde68f539726b81886f0 Mon Sep 17 00:00:00 2001
From: Thomas Symalla <github at thomassymalla.de>
Date: Fri, 2 Jan 2026 12:07:00 +0100
Subject: [PATCH 2/2] [FileCheck] Re-run clang-format

---
 llvm/lib/FileCheck/FileCheck.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/FileCheck/FileCheck.cpp b/llvm/lib/FileCheck/FileCheck.cpp
index e77f82b569c8b..2bfef104c6a9c 100644
--- a/llvm/lib/FileCheck/FileCheck.cpp
+++ b/llvm/lib/FileCheck/FileCheck.cpp
@@ -29,7 +29,7 @@
 using namespace llvm;
 
 namespace {
-  constexpr int BACKREF_LIMIT = 20;
+constexpr int BACKREF_LIMIT = 20;
 } // anonymous namespace
 
 StringRef ExpressionFormat::toString() const {



More information about the llvm-commits mailing list