[llvm] [TableGen] Add `!match` operator to do regex matching (PR #130759)
Pengcheng Wang via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 11 05:06:44 PDT 2025
https://github.com/wangpc-pp created https://github.com/llvm/llvm-project/pull/130759
The syntax is `!match(str, regex)`. This operator produces 1 if `str`
matches the regular expression `regex`, and 0 otherwise.
The format of `regex` is POSIX ERE (Extended Regular Expressions).
From cccf502aedc25727fbf0804187adb532e7697be7 Mon Sep 17 00:00:00 2001
From: Wang Pengcheng <wangpengcheng.pp at bytedance.com>
Date: Tue, 11 Mar 2025 19:59:20 +0800
Subject: [PATCH] [TableGen] Add `!match` operator to do regex matching
The syntax is `!match(str, regex)`. This operator produces 1 if `str`
matches the regular expression `regex`, and 0 otherwise.
The format of `regex` is POSIX ERE (Extended Regular Expressions).
---
llvm/docs/TableGen/ProgRef.rst | 14 ++++---
llvm/include/llvm/TableGen/Record.h | 33 +++++++++++++++++
llvm/lib/TableGen/Record.cpp | 57 +++++++++++++++++++++++++++++
llvm/lib/TableGen/TGLexer.cpp | 1 +
llvm/lib/TableGen/TGLexer.h | 1 +
llvm/lib/TableGen/TGParser.cpp | 43 ++++++++++++++++++++++
llvm/test/TableGen/match.td | 30 +++++++++++++++
7 files changed, 174 insertions(+), 5 deletions(-)
create mode 100644 llvm/test/TableGen/match.td
diff --git a/llvm/docs/TableGen/ProgRef.rst b/llvm/docs/TableGen/ProgRef.rst
index edb97109c9289..0983c6283f7e2 100644
--- a/llvm/docs/TableGen/ProgRef.rst
+++ b/llvm/docs/TableGen/ProgRef.rst
@@ -225,11 +225,11 @@ TableGen provides "bang operators" that have a wide variety of uses:
: !getdagname !getdagop !gt !head !if
: !initialized !interleave !isa !le !listconcat
: !listflatten !listremove !listsplat !logtwo !lt
- : !mul !ne !not !or !range
- : !repr !setdagarg !setdagname !setdagop !shl
- : !size !sra !srl !strconcat !sub
- : !subst !substr !tail !tolower !toupper
- : !xor
+ : !match !mul !ne !not !or
+ : !range !repr !setdagarg !setdagname !setdagop
+ : !shl !size !sra !srl !strconcat
+ : !sub !subst !substr !tail !tolower
+ : !toupper !xor
The ``!cond`` operator has a slightly different
syntax compared to other bang operators, so it is defined separately:
@@ -1878,6 +1878,10 @@ and non-0 as true.
This operator produces 1 if *a* is less than *b*; 0 otherwise.
The arguments must be ``bit``, ``bits``, ``int``, or ``string`` values.
+``!match(``\ *str*\ ``,`` *regex*\ ``)``
+ This operator produces 1 if *str* matches the regular expression *regex*
+ and 0 otherwise. *regex* uses POSIX Extended Regular Expression (ERE) syntax.
+
``!mul(``\ *a*\ ``,`` *b*\ ``, ...)``
This operator multiplies *a*, *b*, etc., and produces the product.
diff --git a/llvm/include/llvm/TableGen/Record.h b/llvm/include/llvm/TableGen/Record.h
index 334007524c954..98d5947a75e33 100644
--- a/llvm/include/llvm/TableGen/Record.h
+++ b/llvm/include/llvm/TableGen/Record.h
@@ -316,6 +316,7 @@ class Init {
IK_FoldOpInit,
IK_IsAOpInit,
IK_ExistsOpInit,
+ IK_MatchOpInit,
IK_AnonymousNameInit,
IK_StringInit,
IK_VarInit,
@@ -1191,6 +1192,38 @@ class ExistsOpInit final : public TypedInit, public FoldingSetNode {
std::string getAsString() const override;
};
+/// !match(str, regex) - This operator produces 1 if the `str` matches the
+/// regular expression `regex`.
+class MatchOpInit final : public TypedInit, public FoldingSetNode {
+private:
+ const Init *Str;
+ const Init *Regex;
+
+ MatchOpInit(const Init *Str, const Init *Regex)
+ : TypedInit(IK_MatchOpInit, BitRecTy::get(Str->getRecordKeeper())),
+ Str(Str), Regex(Regex) {}
+
+public:
+ MatchOpInit(const MatchOpInit &) = delete;
+ MatchOpInit &operator=(const MatchOpInit &) = delete;
+
+ static bool classof(const Init *I) { return I->getKind() == IK_MatchOpInit; }
+
+ static const MatchOpInit *get(const Init *Str, const Init *Regex);
+
+ void Profile(FoldingSetNodeID &ID) const;
+
+ const Init *Fold() const;
+
+ bool isComplete() const override { return false; }
+
+ const Init *resolveReferences(Resolver &R) const override;
+
+ const Init *getBit(unsigned Bit) const override;
+
+ std::string getAsString() const override;
+};
+
/// 'Opcode' - Represent a reference to an entire variable object.
class VarInit final : public TypedInit {
const Init *VarName;
diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp
index 590656786bc66..6dcf19f310914 100644
--- a/llvm/lib/TableGen/Record.cpp
+++ b/llvm/lib/TableGen/Record.cpp
@@ -25,6 +25,7 @@
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Regex.h"
#include "llvm/Support/SMLoc.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/TableGen/Error.h"
@@ -83,6 +84,7 @@ struct RecordKeeperImpl {
FoldingSet<FoldOpInit> TheFoldOpInitPool;
FoldingSet<IsAOpInit> TheIsAOpInitPool;
FoldingSet<ExistsOpInit> TheExistsOpInitPool;
+ FoldingSet<MatchOpInit> TheMatchOpInitPool;
DenseMap<std::pair<const RecTy *, const Init *>, VarInit *> TheVarInitPool;
DenseMap<std::pair<const TypedInit *, unsigned>, VarBitInit *>
TheVarBitInitPool;
@@ -2199,6 +2201,61 @@ std::string ExistsOpInit::getAsString() const {
.str();
}
+static void ProfileMatchOpInit(FoldingSetNodeID &ID, const Init *Str,
+ const Init *Regex) {
+ ID.AddPointer(Str);
+ ID.AddPointer(Regex);
+}
+
+const MatchOpInit *MatchOpInit::get(const Init *Str, const Init *Regex) {
+ FoldingSetNodeID ID;
+ ProfileMatchOpInit(ID, Str, Regex);
+
+ detail::RecordKeeperImpl &RK = Regex->getRecordKeeper().getImpl();
+ void *IP = nullptr;
+ if (const MatchOpInit *I = RK.TheMatchOpInitPool.FindNodeOrInsertPos(ID, IP))
+ return I;
+
+ MatchOpInit *I = new (RK.Allocator) MatchOpInit(Str, Regex);
+ RK.TheMatchOpInitPool.InsertNode(I, IP);
+ return I;
+}
+
+void MatchOpInit::Profile(FoldingSetNodeID &ID) const {
+ ProfileMatchOpInit(ID, Str, Regex);
+}
+
+const Init *MatchOpInit::Fold() const {
+ const auto *StrInit = dyn_cast<StringInit>(Str);
+ const auto *RegexInit = dyn_cast<StringInit>(Regex);
+ if (!(StrInit && RegexInit))
+ return this;
+
+ StringRef RegexStr = RegexInit->getValue();
+ llvm::Regex Matcher(RegexStr);
+ if (!Matcher.isValid())
+ PrintFatalError(Twine("invalid regex '") + RegexStr + Twine("'"));
+
+ return BitInit::get(Str->getRecordKeeper(),
+ Matcher.match(StrInit->getValue()));
+}
+
+const Init *MatchOpInit::resolveReferences(Resolver &R) const {
+ const Init *NewStr = Str->resolveReferences(R);
+ const Init *NewRegex = Regex->resolveReferences(R);
+ if (Str != NewStr || Regex != NewRegex)
+ return get(NewStr, NewRegex)->Fold();
+ return this;
+}
+
+const Init *MatchOpInit::getBit(unsigned Bit) const {
+ return VarBitInit::get(this, Bit);
+}
+
+std::string MatchOpInit::getAsString() const {
+ return "!match(" + Str->getAsString() + ", " + Regex->getAsString() + ")";
+}
+
const RecTy *TypedInit::getFieldType(const StringInit *FieldName) const {
if (const auto *RecordType = dyn_cast<RecordRecTy>(getType())) {
for (const Record *Rec : RecordType->getClasses()) {
diff --git a/llvm/lib/TableGen/TGLexer.cpp b/llvm/lib/TableGen/TGLexer.cpp
index 983242ade0fe5..0b2f927446b1e 100644
--- a/llvm/lib/TableGen/TGLexer.cpp
+++ b/llvm/lib/TableGen/TGLexer.cpp
@@ -644,6 +644,7 @@ tgtok::TokKind TGLexer::LexExclaim() {
.Case("tolower", tgtok::XToLower)
.Case("toupper", tgtok::XToUpper)
.Case("repr", tgtok::XRepr)
+ .Case("match", tgtok::XMatch)
.Default(tgtok::Error);
return Kind != tgtok::Error ? Kind
diff --git a/llvm/lib/TableGen/TGLexer.h b/llvm/lib/TableGen/TGLexer.h
index 6680915211205..ef9205197decf 100644
--- a/llvm/lib/TableGen/TGLexer.h
+++ b/llvm/lib/TableGen/TGLexer.h
@@ -126,6 +126,7 @@ enum TokKind {
XInterleave,
XSubstr,
XFind,
+ XMatch,
XCast,
XSubst,
XForEach,
diff --git a/llvm/lib/TableGen/TGParser.cpp b/llvm/lib/TableGen/TGParser.cpp
index 9a8301cffb930..9324760106657 100644
--- a/llvm/lib/TableGen/TGParser.cpp
+++ b/llvm/lib/TableGen/TGParser.cpp
@@ -1455,6 +1455,49 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) {
return (ExistsOpInit::get(Type, Expr))->Fold(CurRec);
}
+ case tgtok::XMatch: {
+ // Value ::= !match '(' Str ',' Regex ')'
+ Lex.Lex(); // eat the operation.
+
+ if (!consume(tgtok::l_paren)) {
+ TokError("expected '(' after !match");
+ return nullptr;
+ }
+
+ SMLoc StrLoc = Lex.getLoc();
+ const Init *Str = ParseValue(CurRec);
+ if (!Str)
+ return nullptr;
+
+ const auto *StrType = dyn_cast<TypedInit>(Str);
+ if (!StrType || !isa<StringRecTy>(StrType->getType())) {
+ Error(StrLoc, "expected string type argument in !match operator");
+ return nullptr;
+ }
+
+ // eat the comma.
+ if (!consume(tgtok::comma))
+ return nullptr;
+
+ SMLoc RegexLoc = Lex.getLoc();
+ const Init *Regex = ParseValue(CurRec);
+ if (!Regex)
+ return nullptr;
+
+ const auto *RegexType = dyn_cast<TypedInit>(Regex);
+ if (!RegexType || !isa<StringRecTy>(RegexType->getType())) {
+ Error(RegexLoc, "expected string type argument in !match operator");
+ return nullptr;
+ }
+
+ if (!consume(tgtok::r_paren)) {
+ TokError("expected ')' in !match");
+ return nullptr;
+ }
+
+ return MatchOpInit::get(Str, Regex)->Fold();
+ }
+
case tgtok::XConcat:
case tgtok::XADD:
case tgtok::XSUB:
diff --git a/llvm/test/TableGen/match.td b/llvm/test/TableGen/match.td
new file mode 100644
index 0000000000000..feb8987b545c1
--- /dev/null
+++ b/llvm/test/TableGen/match.td
@@ -0,0 +1,30 @@
+// RUN: llvm-tblgen %s | FileCheck %s
+// RUN: not llvm-tblgen -DERROR1 %s 2>&1 | FileCheck --check-prefix=ERROR1 %s
+// RUN: not llvm-tblgen -DERROR2 %s 2>&1 | FileCheck --check-prefix=ERROR2 %s
+// RUN: not llvm-tblgen -DERROR3 %s 2>&1 | FileCheck --check-prefix=ERROR3 %s
+// XFAIL: vg_leak
+
+def test {
+ bit test0 = !match("123 abc ABC", "[0-9 a-z A-Z]+");
+ bit test1 = !match("abc", "[0-9]+");
+}
+
+// CHECK-LABEL: def test {
+// CHECK-NEXT: bit test0 = 1;
+// CHECK-NEXT: bit test1 = 0;
+// CHECK-NEXT: }
+
+#ifdef ERROR1
+defvar error1 = !match(123, ".*");
+// ERROR1: error: expected string type argument in !match operator
+#endif
+
+#ifdef ERROR2
+defvar error2 = !match("abc", 123);
+// ERROR2: error: expected string type argument in !match operator
+#endif
+
+#ifdef ERROR3
+defvar error3 = !match("abc", "([)]");
+// ERROR3: error: invalid regex '([)]'
+#endif
More information about the llvm-commits
mailing list