[llvm-commits] [llvm] r168802 - in /llvm/trunk: include/llvm/Support/Regex.h lib/Support/Regex.cpp lib/Support/regcomp.c unittests/Support/RegexTest.cpp

Eli Bendersky eliben at google.com
Wed Nov 28 11:00:02 PST 2012


Author: eliben
Date: Wed Nov 28 13:00:02 2012
New Revision: 168802

URL: http://llvm.org/viewvc/llvm-project?rev=168802&view=rev
Log:
Add backreference matching capabilities to Support/Regex, with
appropriate unit tests. This change in itself is not expected to
affect any functionality at this point, but it will serve as a
stepping stone to improve FileCheck's variable matching capabilities.

Luckily, our regex implementation already supports backreferences,
although a bit of hacking is required to enable it. It supports both
Basic Regular Expressions (BREs) and Extended Regular Expressions
(EREs), without supporting backrefs for EREs, following POSIX strictly
in this respect. And EREs is what we actually use (rightly). This is
contrary to many implementations (including the default on Linux) of
POSIX regexes, that do allow backrefs in EREs.

Adding backref support to our EREs is a very simple change in the
regcomp parsing code. I fail to think of significant cases where it
would clash with existing things, and can bring more versatility to
the regexes we write. There's always the danger of a backref in a
specially crafted regex causing exponential matching times, but since
we mainly use them for testing purposes I don't think it's a big
problem. [it can also be placed behind a flag specific to FileCheck,
if needed].

For more details, see:

* http://lists.cs.uiuc.edu/pipermail/llvmdev/2012-November/055840.html
* http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20121126/156878.html


Modified:
    llvm/trunk/include/llvm/Support/Regex.h
    llvm/trunk/lib/Support/Regex.cpp
    llvm/trunk/lib/Support/regcomp.c
    llvm/trunk/unittests/Support/RegexTest.cpp

Modified: llvm/trunk/include/llvm/Support/Regex.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Support/Regex.h?rev=168802&r1=168801&r2=168802&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Support/Regex.h (original)
+++ llvm/trunk/include/llvm/Support/Regex.h Wed Nov 28 13:00:02 2012
@@ -7,7 +7,10 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file implements a POSIX regular expression matcher.
+// This file implements a POSIX regular expression matcher.  Both Basic and
+// Extended POSIX regular expressions (ERE) are supported.  EREs were extended
+// to support backreferences in matches.
+// This implementation also supports matching strings with embedded NUL chars.
 //
 //===----------------------------------------------------------------------===//
 
@@ -33,12 +36,14 @@
       /// null string after any newline in the string in addition to its normal
       /// function, and the $ anchor matches the null string before any
       /// newline in the string in addition to its normal function.
-      Newline=2
+      Newline=2,
+      /// By default, the POSIX extended regular expression (ERE) syntax is
+      /// assumed. Pass this flag to turn on basic regular expressions (BRE)
+      /// instead.
+      BasicRegex=4
     };
 
-    /// Compiles the given POSIX Extended Regular Expression \p Regex.
-    /// This implementation supports regexes and matching strings with embedded
-    /// NUL characters.
+    /// Compiles the given regular expression \p Regex.
     Regex(StringRef Regex, unsigned Flags = NoFlags);
     ~Regex();
 

Modified: llvm/trunk/lib/Support/Regex.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Support/Regex.cpp?rev=168802&r1=168801&r2=168802&view=diff
==============================================================================
--- llvm/trunk/lib/Support/Regex.cpp (original)
+++ llvm/trunk/lib/Support/Regex.cpp Wed Nov 28 13:00:02 2012
@@ -27,7 +27,9 @@
     flags |= REG_ICASE;
   if (Flags & Newline)
     flags |= REG_NEWLINE;
-  error = llvm_regcomp(preg, regex.data(), flags|REG_EXTENDED|REG_PEND);
+  if (!(Flags & BasicRegex))
+    flags |= REG_EXTENDED;
+  error = llvm_regcomp(preg, regex.data(), flags|REG_PEND);
 }
 
 Regex::~Regex() {

Modified: llvm/trunk/lib/Support/regcomp.c
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Support/regcomp.c?rev=168802&r1=168801&r2=168802&view=diff
==============================================================================
--- llvm/trunk/lib/Support/regcomp.c (original)
+++ llvm/trunk/lib/Support/regcomp.c Wed Nov 28 13:00:02 2012
@@ -303,6 +303,7 @@
 	sopno pos;
 	int count;
 	int count2;
+	int backrefnum;
 	sopno subno;
 	int wascaret = 0;
 
@@ -370,7 +371,34 @@
 	case '\\':
 		REQUIRE(MORE(), REG_EESCAPE);
 		c = GETNEXT();
-		ordinary(p, c);
+		if (c >= '1' && c <= '9') {
+			/* \[0-9] is taken to be a back-reference to a previously specified
+			 * matching group. backrefnum will hold the number. The matching
+			 * group must exist (i.e. if \4 is found there must have been at
+			 * least 4 matching groups specified in the pattern previously).
+			 */
+			backrefnum = c - '0';
+			if (p->pend[backrefnum] == 0) {
+				SETERROR(REG_ESUBREG);
+				break;
+			}
+
+			/* Make sure everything checks out and emit the sequence
+			 * that marks a back-reference to the parse structure.
+			 */
+			assert(backrefnum <= p->g->nsub);
+			EMIT(OBACK_, backrefnum);
+			assert(p->pbegin[backrefnum] != 0);
+			assert(OP(p->strip[p->pbegin[backrefnum]]) != OLPAREN);
+			assert(OP(p->strip[p->pend[backrefnum]]) != ORPAREN);
+			(void) dupl(p, p->pbegin[backrefnum]+1, p->pend[backrefnum]);
+			EMIT(O_BACK, backrefnum);
+			p->g->backrefs = 1;
+		} else {
+			/* Other chars are simply themselves when escaped with a backslash.
+			 */
+			ordinary(p, c);
+		}
 		break;
 	case '{':		/* okay as ordinary except if digit follows */
 		REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);

Modified: llvm/trunk/unittests/Support/RegexTest.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/unittests/Support/RegexTest.cpp?rev=168802&r1=168801&r2=168802&view=diff
==============================================================================
--- llvm/trunk/unittests/Support/RegexTest.cpp (original)
+++ llvm/trunk/unittests/Support/RegexTest.cpp Wed Nov 28 13:00:02 2012
@@ -51,7 +51,6 @@
   EXPECT_EQ(1u, Matches.size());
   EXPECT_EQ(String, Matches[0].str());
 
-
   std::string NulPattern="X[0-9]+X([a-f])?:([0-9]+)";
   String="YX99a:513b";
   NulPattern[7] = '\0';
@@ -62,6 +61,28 @@
   EXPECT_TRUE(r5.match(String));
 }
 
+TEST_F(RegexTest, Backreferences) {
+  Regex r1("([a-z]+)_\\1");
+  SmallVector<StringRef, 4> Matches;
+  EXPECT_TRUE(r1.match("abc_abc", &Matches));
+  EXPECT_EQ(2u, Matches.size());
+  EXPECT_FALSE(r1.match("abc_ab", &Matches));
+
+  Regex r2("a([0-9])b\\1c\\1");
+  EXPECT_TRUE(r2.match("a4b4c4", &Matches));
+  EXPECT_EQ(2u, Matches.size());
+  EXPECT_EQ("4", Matches[1].str());
+  EXPECT_FALSE(r2.match("a2b2c3"));
+
+  Regex r3("a([0-9])([a-z])b\\1\\2");
+  EXPECT_TRUE(r3.match("a6zb6z", &Matches));
+  EXPECT_EQ(3u, Matches.size());
+  EXPECT_EQ("6", Matches[1].str());
+  EXPECT_EQ("z", Matches[2].str());
+  EXPECT_FALSE(r3.match("a6zb6y"));
+  EXPECT_FALSE(r3.match("a6zb7z"));
+}
+
 TEST_F(RegexTest, Substitution) {
   std::string Error;
 





More information about the llvm-commits mailing list