[llvm] r288553 - Support escaping in TrigramIndex.
Ivan Krasin via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 2 15:30:16 PST 2016
Author: krasin
Date: Fri Dec 2 17:30:16 2016
New Revision: 288553
URL: http://llvm.org/viewvc/llvm-project?rev=288553&view=rev
Log:
Support escaping in TrigramIndex.
Summary:
This is a follow-up to r288303, where I introduced TrigramIndex
to speed up SpecialCaseList for cases where all rules are
simple wildcards, like *hello*wor.d*.
Here, I add support for escaping, so that it's possible to
specify rules like *c\+\+abi*.
Reviewers: pcc
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D27318
Modified:
llvm/trunk/lib/Support/TrigramIndex.cpp
llvm/trunk/unittests/Support/SpecialCaseListTest.cpp
llvm/trunk/unittests/Support/TrigramIndexTest.cpp
Modified: llvm/trunk/lib/Support/TrigramIndex.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Support/TrigramIndex.cpp?rev=288553&r1=288552&r2=288553&view=diff
==============================================================================
--- llvm/trunk/lib/Support/TrigramIndex.cpp (original)
+++ llvm/trunk/lib/Support/TrigramIndex.cpp Fri Dec 2 17:30:16 2016
@@ -26,28 +26,41 @@ using namespace llvm;
static const char RegexAdvancedMetachars[] = "()^$|+?[]\\{}";
-static bool isSimpleWildcard(StringRef Str) {
- // Check for regex metacharacters other than '*' and '.'.
- return Str.find_first_of(RegexAdvancedMetachars) == StringRef::npos;
+static bool isAdvancedMetachar(unsigned Char) {
+ return strchr(RegexAdvancedMetachars, Char) != nullptr;
}
void TrigramIndex::insert(std::string Regex) {
if (Defeated) return;
- if (!isSimpleWildcard(Regex)) {
- Defeated = true;
- return;
- }
-
std::set<unsigned> Was;
unsigned Cnt = 0;
unsigned Tri = 0;
unsigned Len = 0;
+ bool Escaped = false;
for (unsigned Char : Regex) {
- if (Char == '.' || Char == '*') {
- Tri = 0;
- Len = 0;
- continue;
+ if (!Escaped) {
+ // Regular expressions allow escaping symbols by preceding it with '\'.
+ if (Char == '\\') {
+ Escaped = true;
+ continue;
+ }
+ if (isAdvancedMetachar(Char)) {
+ // This is a more complicated regex than we can handle here.
+ Defeated = true;
+ return;
+ }
+ if (Char == '.' || Char == '*') {
+ Tri = 0;
+ Len = 0;
+ continue;
+ }
+ }
+ if (Escaped && Char >= '1' && Char <= '9') {
+ Defeated = true;
+ return;
}
+ // We have already handled escaping and can reset the flag.
+ Escaped = false;
Tri = ((Tri << 8) + Char) & 0xFFFFFF;
Len++;
if (Len < 3)
Modified: llvm/trunk/unittests/Support/SpecialCaseListTest.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/unittests/Support/SpecialCaseListTest.cpp?rev=288553&r1=288552&r2=288553&view=diff
==============================================================================
--- llvm/trunk/unittests/Support/SpecialCaseListTest.cpp (original)
+++ llvm/trunk/unittests/Support/SpecialCaseListTest.cpp Fri Dec 2 17:30:16 2016
@@ -178,4 +178,15 @@ TEST_F(SpecialCaseListTest, PopularTrigr
EXPECT_TRUE(SCL->inSection("fun", "aaaabbbaaa"));
}
+TEST_F(SpecialCaseListTest, EscapedSymbols) {
+ std::unique_ptr<SpecialCaseList> SCL = makeSpecialCaseList("src:*c\\+\\+abi*\n"
+ "src:*hello\\\\world*\n");
+ EXPECT_TRUE(SCL->inSection("src", "dir/c++abi"));
+ EXPECT_FALSE(SCL->inSection("src", "dir/c\\+\\+abi"));
+ EXPECT_FALSE(SCL->inSection("src", "c\\+\\+abi"));
+ EXPECT_TRUE(SCL->inSection("src", "C:\\hello\\world"));
+ EXPECT_TRUE(SCL->inSection("src", "hello\\world"));
+ EXPECT_FALSE(SCL->inSection("src", "hello\\\\world"));
+}
+
}
Modified: llvm/trunk/unittests/Support/TrigramIndexTest.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/unittests/Support/TrigramIndexTest.cpp?rev=288553&r1=288552&r2=288553&view=diff
==============================================================================
--- llvm/trunk/unittests/Support/TrigramIndexTest.cpp (original)
+++ llvm/trunk/unittests/Support/TrigramIndexTest.cpp Fri Dec 2 17:30:16 2016
@@ -94,9 +94,29 @@ TEST_F(TrigramIndexTest, TooComplicatedR
EXPECT_TRUE(TI->isDefeated());
}
-TEST_F(TrigramIndexTest, SpecialSymbol) {
+TEST_F(TrigramIndexTest, EscapedSymbols) {
std::unique_ptr<TrigramIndex> TI =
- makeTrigramIndex({"*c\\+\\+*"});
+ makeTrigramIndex({"*c\\+\\+*", "*hello\\\\world*", "a\\tb", "a\\0b"});
+ EXPECT_FALSE(TI->isDefeated());
+ EXPECT_FALSE(TI->isDefinitelyOut("c++"));
+ EXPECT_TRUE(TI->isDefinitelyOut("c\\+\\+"));
+ EXPECT_FALSE(TI->isDefinitelyOut("hello\\world"));
+ EXPECT_TRUE(TI->isDefinitelyOut("hello\\\\world"));
+ EXPECT_FALSE(TI->isDefinitelyOut("atb"));
+ EXPECT_TRUE(TI->isDefinitelyOut("a\\tb"));
+ EXPECT_TRUE(TI->isDefinitelyOut("a\tb"));
+ EXPECT_FALSE(TI->isDefinitelyOut("a0b"));
+}
+
+TEST_F(TrigramIndexTest, Backreference1) {
+ std::unique_ptr<TrigramIndex> TI =
+ makeTrigramIndex({"*foo\\1*"});
+ EXPECT_TRUE(TI->isDefeated());
+}
+
+TEST_F(TrigramIndexTest, Backreference2) {
+ std::unique_ptr<TrigramIndex> TI =
+ makeTrigramIndex({"*foo\\2*"});
EXPECT_TRUE(TI->isDefeated());
}
More information about the llvm-commits
mailing list