[llvm] [regex][FileCheck] Support back-references up to 20. (PR #174150)
Thomas Symalla via llvm-commits
llvm-commits at lists.llvm.org
Fri Jan 2 03:07:27 PST 2026
https://github.com/tsymalla updated https://github.com/llvm/llvm-project/pull/174150
>From f2ced2104776a3b0c70bd38b5d1a6379ab62f42b Mon Sep 17 00:00:00 2001
From: Thomas Symalla <github at thomassymalla.de>
Date: Tue, 30 Dec 2025 08:50:46 +0100
Subject: [PATCH 1/2] [regex][FileCheck] Support back-references up to 20.
LLVM RegEx already supports `\g<n>`-like syntax for `.sub()`. Support
this in `regcomp` as well by increasing the limit and adding additional
parsing. Update the limit checks in FileCheck. The new limit is 20,
which could theoretically be lifted by `realloc`ing the storage, but I
don't see a use case for that as of now.
Update a test that should now pass when using more than 9
back-references.
Add a new test that explicitly checks the error message.
---
llvm/lib/FileCheck/FileCheck.cpp | 19 ++++++++---
llvm/lib/Support/Regex.cpp | 2 +-
llvm/lib/Support/regcomp.c | 47 ++++++++++++++++++---------
llvm/test/FileCheck/backref-limit.txt | 7 ++++
llvm/test/FileCheck/capture-limit.txt | 8 ++---
5 files changed, 57 insertions(+), 26 deletions(-)
create mode 100644 llvm/test/FileCheck/backref-limit.txt
diff --git a/llvm/lib/FileCheck/FileCheck.cpp b/llvm/lib/FileCheck/FileCheck.cpp
index 9245db442611c..e77f82b569c8b 100644
--- a/llvm/lib/FileCheck/FileCheck.cpp
+++ b/llvm/lib/FileCheck/FileCheck.cpp
@@ -28,6 +28,10 @@
using namespace llvm;
+namespace {
+ constexpr int BACKREF_LIMIT = 20;
+} // anonymous namespace
+
StringRef ExpressionFormat::toString() const {
switch (Value) {
case Kind::NoFormat:
@@ -1054,10 +1058,11 @@ bool Pattern::parsePattern(StringRef PatternStr, StringRef Prefix,
if (!IsNumBlock &&
(It = VariableDefs.find(SubstStr)) != VariableDefs.end()) {
unsigned CaptureParenGroup = It->second;
- if (CaptureParenGroup < 1 || CaptureParenGroup > 9) {
+ if (CaptureParenGroup < 1 || CaptureParenGroup > BACKREF_LIMIT) {
SM.PrintMessage(SMLoc::getFromPointer(SubstStr.data()),
SourceMgr::DK_Error,
- "Can't back-reference more than 9 variables");
+ "Can't back-reference more than " +
+ Twine(BACKREF_LIMIT) + " variables");
return true;
}
AddBackrefToRegEx(CaptureParenGroup);
@@ -1108,8 +1113,14 @@ bool Pattern::AddRegExToRegEx(StringRef RS, unsigned &CurParen, SourceMgr &SM) {
}
void Pattern::AddBackrefToRegEx(unsigned BackrefNum) {
- assert(BackrefNum >= 1 && BackrefNum <= 9 && "Invalid backref number");
- std::string Backref = std::string("\\") + std::string(1, '0' + BackrefNum);
+ assert(BackrefNum >= 1 && BackrefNum <= BACKREF_LIMIT &&
+ "Invalid backref number");
+ std::string Backref;
+ if (BackrefNum >= 1 && BackrefNum <= 9)
+ Backref = std::string("\\") + std::string(1, '0' + BackrefNum);
+ else
+ Backref = std::string("\\g<") + std::to_string(BackrefNum) + '>';
+
RegExStr += Backref;
}
diff --git a/llvm/lib/Support/Regex.cpp b/llvm/lib/Support/Regex.cpp
index 5eedf95c48e37..5a96f1974341d 100644
--- a/llvm/lib/Support/Regex.cpp
+++ b/llvm/lib/Support/Regex.cpp
@@ -154,7 +154,7 @@ std::string Regex::sub(StringRef Repl, StringRef String,
// Add the skipped substring.
Res += Split.first;
- // Check for terminimation and trailing backslash.
+ // Check for termination and trailing backslash.
if (Split.second.empty()) {
if (Repl.size() != Split.first.size() &&
Error && Error->empty())
diff --git a/llvm/lib/Support/regcomp.c b/llvm/lib/Support/regcomp.c
index f5c47781f3d82..72095c31d12b3 100644
--- a/llvm/lib/Support/regcomp.c
+++ b/llvm/lib/Support/regcomp.c
@@ -192,7 +192,7 @@ struct parse {
sopno slen; /* malloced strip length (used) */
int ncsalloc; /* number of csets allocated */
struct re_guts *g;
-#define NPAREN 10 /* we need to remember () 1-9 for back refs */
+#define NPAREN 21 /* we need to remember () 1-20 for back refs */
sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
sopno pend[NPAREN]; /* -> ) ([0] unused) */
};
@@ -506,27 +506,42 @@ static void p_ere_exp(struct parse *p) {
* least 4 matching groups specified in the pattern previously).
*/
backrefnum = c - '0';
- if (p->pend[backrefnum] == 0) {
- SETERROR(REG_ESUBREG);
- break;
- }
-
- /* Make sure everything checks out and emit the sequence
- * that marks a back-reference to the parse structure.
+ } else if (c == 'g') {
+ /* Support back-references with index greater 9.
+ * These look like that: \g<n>.
+ * Extract the number inside the brackets.
*/
- assert(backrefnum <= p->g->nsub);
- EMIT(OBACK_, backrefnum);
- assert(p->pbegin[backrefnum] != 0);
- assert(OP(p->strip[p->pbegin[backrefnum]]) == OLPAREN);
- assert(OP(p->strip[p->pend[backrefnum]]) == ORPAREN);
- (void)dupl(p, p->pbegin[backrefnum] + 1, p->pend[backrefnum]);
- EMIT(O_BACK, backrefnum);
- p->g->backrefs = 1;
+ MUSTEAT('<', REG_BADRPT);
+
+ backrefnum = 0;
+ while (MORE() && isdigit(PEEK()) && PEEK() != '>') {
+ c = GETNEXT();
+ backrefnum = backrefnum * 10 + c - '0';
+ }
+ MUSTEAT('>', REG_BADRPT);
} else {
/* Other chars are simply themselves when escaped with a backslash.
*/
ordinary(p, c);
+ break;
+ }
+
+ if (p->pend[backrefnum] == 0) {
+ SETERROR(REG_ESUBREG);
+ break;
}
+
+ /* Make sure everything checks out and emit the sequence
+ * that marks a back-reference to the parse structure.
+ */
+ assert(backrefnum <= p->g->nsub);
+ EMIT(OBACK_, backrefnum);
+ assert(p->pbegin[backrefnum] != 0);
+ assert(OP(p->strip[p->pbegin[backrefnum]]) == OLPAREN);
+ assert(OP(p->strip[p->pend[backrefnum]]) == ORPAREN);
+ (void)dupl(p, p->pbegin[backrefnum] + 1, p->pend[backrefnum]);
+ EMIT(O_BACK, backrefnum);
+ p->g->backrefs = 1;
break;
case '{': /* okay as ordinary except if digit follows */
REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
diff --git a/llvm/test/FileCheck/backref-limit.txt b/llvm/test/FileCheck/backref-limit.txt
new file mode 100644
index 0000000000000..890630e6e471a
--- /dev/null
+++ b/llvm/test/FileCheck/backref-limit.txt
@@ -0,0 +1,7 @@
+; RUN: not FileCheck -check-prefix=CHECK-BACKREF %s < /dev/null 2>&1 | FileCheck -check-prefix=ERR-CHECK-BACKREF %s
+
+; ERR-CHECK-BACKREF: error: Can't back-reference more than 20 variables
+
+r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12 r13 r14 r15 r16 r17 r18 r19 r20 r21
+
+; CHECK-BACKREF: [[REG1:r0]] [[REG2:r1]] [[REG3:r2]] [[REG4:r3]] [[REG5:r4]] [[REG6:r5]] [[REG7:r6]] [[REG8:r7]] [[REG9:r8]] [[REG10:r9]] [[REG11:r10]] [[REG12:r11]] [[REG13:r12]] [[REG14:r13]] [[REG15:r14]] [[REG16:r15]] [[REG17:r16]] [[REG18:r17]] [[REG19:r18]] [[REG20:r19]] [[REG21:r20]] [[REG21]]
diff --git a/llvm/test/FileCheck/capture-limit.txt b/llvm/test/FileCheck/capture-limit.txt
index a727be0c781f1..e7e50992201d1 100644
--- a/llvm/test/FileCheck/capture-limit.txt
+++ b/llvm/test/FileCheck/capture-limit.txt
@@ -1,8 +1,6 @@
; RUN: FileCheck -input-file %s %s
-; XFAIL: *
+; XPASS: *
-; Trying to back-reference more than 9 variables is intended to fail.
+r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r9 r8
-r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r9
-
-; CHECK: [[REG1:r0]] [[REG2:r1]] [[REG3:r2]] [[REG4:r3]] [[REG5:r4]] [[REG6:r5]] [[REG7:r6]] [[REG8:r7]] [[REG9:r8]] [[REG10:r9]] [[REG10]]
+; CHECK: [[REG1:r0]] [[REG2:r1]] [[REG3:r2]] [[REG4:r3]] [[REG5:r4]] [[REG6:r5]] [[REG7:r6]] [[REG8:r7]] [[REG9:r8]] [[REG10:r9]] [[REG10]] [[REG9]]
>From 9a69659f214e9a1b7705cde68f539726b81886f0 Mon Sep 17 00:00:00 2001
From: Thomas Symalla <github at thomassymalla.de>
Date: Fri, 2 Jan 2026 12:07:00 +0100
Subject: [PATCH 2/2] [FileCheck] Re-run clang-format
---
llvm/lib/FileCheck/FileCheck.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/FileCheck/FileCheck.cpp b/llvm/lib/FileCheck/FileCheck.cpp
index e77f82b569c8b..2bfef104c6a9c 100644
--- a/llvm/lib/FileCheck/FileCheck.cpp
+++ b/llvm/lib/FileCheck/FileCheck.cpp
@@ -29,7 +29,7 @@
using namespace llvm;
namespace {
- constexpr int BACKREF_LIMIT = 20;
+constexpr int BACKREF_LIMIT = 20;
} // anonymous namespace
StringRef ExpressionFormat::toString() const {
More information about the llvm-commits
mailing list