[lld] r285949 - Add [<chars>] to the glob matcher to eliminate use of llvm::Regex.

Rui Ueyama via llvm-commits llvm-commits at lists.llvm.org
Thu Nov 3 10:57:38 PDT 2016


Author: ruiu
Date: Thu Nov  3 12:57:38 2016
New Revision: 285949

URL: http://llvm.org/viewvc/llvm-project?rev=285949&view=rev
Log:
Add [<chars>] to the glob matcher to eliminate use of llvm::Regex.

Previously, it didn't support the character class, so we couldn't
eliminate the use of llvm::Regex. Now that it is supported, we
can remove compileGlobPattern, which converts a glob pattern to
a regex.

This patch also contains optimizations for exact/prefix/suffix matches.

Differential Revision: https://reviews.llvm.org/D26284

Modified:
    lld/trunk/ELF/LinkerScript.cpp
    lld/trunk/ELF/LinkerScript.h
    lld/trunk/ELF/Strings.cpp
    lld/trunk/ELF/Strings.h
    lld/trunk/ELF/SymbolTable.cpp
    lld/trunk/ELF/SymbolTable.h

Modified: lld/trunk/ELF/LinkerScript.cpp
URL: http://llvm.org/viewvc/llvm-project/lld/trunk/ELF/LinkerScript.cpp?rev=285949&r1=285948&r2=285949&view=diff
==============================================================================
--- lld/trunk/ELF/LinkerScript.cpp (original)
+++ lld/trunk/ELF/LinkerScript.cpp Thu Nov  3 12:57:38 2016
@@ -111,11 +111,11 @@ template <class ELFT>
 bool LinkerScript<ELFT>::shouldKeep(InputSectionBase<ELFT> *S) {
   for (InputSectionDescription *ID : Opt.KeptSections) {
     StringRef Filename = S->getFile()->getName();
-    if (!ID->FileRe.match(sys::path::filename(Filename)))
+    if (!ID->FilePat.match(sys::path::filename(Filename)))
       continue;
 
     for (SectionPattern &P : ID->SectionPatterns)
-      if (P.SectionRe.match(S->Name))
+      if (P.SectionPat.match(S->Name))
         return true;
   }
   return false;
@@ -178,13 +178,13 @@ void LinkerScript<ELFT>::computeInputSec
     size_t SizeBefore = I->Sections.size();
     for (ObjectFile<ELFT> *F : Symtab<ELFT>::X->getObjectFiles()) {
       StringRef Filename = sys::path::filename(F->getName());
-      if (!I->FileRe.match(Filename) || Pat.ExcludedFileRe.match(Filename))
+      if (!I->FilePat.match(Filename) || Pat.ExcludedFilePat.match(Filename))
         continue;
 
       for (InputSectionBase<ELFT> *S : F->getSections())
-        if (!isDiscarded(S) && !S->OutSec && Pat.SectionRe.match(S->Name))
+        if (!isDiscarded(S) && !S->OutSec && Pat.SectionPat.match(S->Name))
           I->Sections.push_back(S);
-      if (Pat.SectionRe.match("COMMON"))
+      if (Pat.SectionPat.match("COMMON"))
         I->Sections.push_back(InputSection<ELFT>::CommonInputSection);
     }
 
@@ -1211,7 +1211,7 @@ StringMatcher ScriptParser::readFilePatt
   std::vector<StringRef> V;
   while (!Error && !consume(")"))
     V.push_back(next());
-  return StringMatcher(std::move(V));
+  return StringMatcher(V);
 }
 
 SortSectionPolicy ScriptParser::readSortKind() {
@@ -1236,10 +1236,10 @@ SortSectionPolicy ScriptParser::readSort
 std::vector<SectionPattern> ScriptParser::readInputSectionsList() {
   std::vector<SectionPattern> Ret;
   while (!Error && peek() != ")") {
-    StringMatcher ExcludeFileRe;
+    StringMatcher ExcludeFilePat;
     if (consume("EXCLUDE_FILE")) {
       expect("(");
-      ExcludeFileRe = readFilePatterns();
+      ExcludeFilePat = readFilePatterns();
     }
 
     std::vector<StringRef> V;
@@ -1247,7 +1247,7 @@ std::vector<SectionPattern> ScriptParser
       V.push_back(next());
 
     if (!V.empty())
-      Ret.push_back({std::move(ExcludeFileRe), StringMatcher(std::move(V))});
+      Ret.push_back({std::move(ExcludeFilePat), StringMatcher(V)});
     else
       setError("section pattern is expected");
   }

Modified: lld/trunk/ELF/LinkerScript.h
URL: http://llvm.org/viewvc/llvm-project/lld/trunk/ELF/LinkerScript.h?rev=285949&r1=285948&r2=285949&view=diff
==============================================================================
--- lld/trunk/ELF/LinkerScript.h (original)
+++ lld/trunk/ELF/LinkerScript.h Thu Nov  3 12:57:38 2016
@@ -114,28 +114,20 @@ struct OutputSectionCommand : BaseComman
 // It can optionally have negative match pattern for EXCLUDED_FILE command.
 // Also it may be surrounded with SORT() command, so contains sorting rules.
 struct SectionPattern {
-  SectionPattern(StringMatcher &&Re1, StringMatcher &&Re2)
-      : ExcludedFileRe(std::forward<StringMatcher>(Re1)),
-        SectionRe(std::forward<StringMatcher>(Re2)) {}
+  SectionPattern(StringMatcher &&Pat1, StringMatcher &&Pat2)
+      : ExcludedFilePat(Pat1), SectionPat(Pat2) {}
 
-  SectionPattern(SectionPattern &&Other) {
-    std::swap(ExcludedFileRe, Other.ExcludedFileRe);
-    std::swap(SectionRe, Other.SectionRe);
-    std::swap(SortOuter, Other.SortOuter);
-    std::swap(SortInner, Other.SortInner);
-  }
-
-  StringMatcher ExcludedFileRe;
-  StringMatcher SectionRe;
+  StringMatcher ExcludedFilePat;
+  StringMatcher SectionPat;
   SortSectionPolicy SortOuter;
   SortSectionPolicy SortInner;
 };
 
 struct InputSectionDescription : BaseCommand {
   InputSectionDescription(StringRef FilePattern)
-      : BaseCommand(InputSectionKind), FileRe(FilePattern) {}
+      : BaseCommand(InputSectionKind), FilePat({FilePattern}) {}
   static bool classof(const BaseCommand *C);
-  StringMatcher FileRe;
+  StringMatcher FilePat;
 
   // Input sections that matches at least one of SectionPatterns
   // will be associated with this InputSectionDescription.

Modified: lld/trunk/ELF/Strings.cpp
URL: http://llvm.org/viewvc/llvm-project/lld/trunk/ELF/Strings.cpp?rev=285949&r1=285948&r2=285949&view=diff
==============================================================================
--- lld/trunk/ELF/Strings.cpp (original)
+++ lld/trunk/ELF/Strings.cpp Thu Nov  3 12:57:38 2016
@@ -21,36 +21,142 @@ using namespace llvm;
 using namespace lld;
 using namespace lld::elf;
 
-// Returns true if S matches T. S can contain glob meta-characters.
-// The asterisk ('*') matches zero or more characters, and the question
-// mark ('?') matches one character.
-static bool globMatch(StringRef S, StringRef T) {
+// This is a scanner for the glob pattern.
+// A glob pattern token is one of "*", "?", "[<chars>]", "[^<chars>]"
+// (which is a negative form of "[<chars>]"), or a non-meta character.
+// This function returns the first token in S.
+BitVector GlobPattern::scan(StringRef &S) {
+  switch (S[0]) {
+  case '*':
+    S = S.substr(1);
+    // '*' is represented by an empty bitvector.
+    // All other bitvectors are 256-bit long.
+    return BitVector();
+  case '?':
+    S = S.substr(1);
+    return BitVector(256, true);
+  case '[': {
+    size_t End = S.find(']', 1);
+    if (End == StringRef::npos) {
+      error("invalid glob pattern: " + Original);
+      return BitVector(256, false);
+    }
+    StringRef Chars = S.substr(1, End - 1);
+    S = S.substr(End + 1);
+    if (Chars.startswith("^"))
+      return expand(Chars.substr(1)).flip();
+    return expand(Chars);
+  }
+  default:
+    BitVector BV(256, false);
+    BV[S[0]] = true;
+    S = S.substr(1);
+    return BV;
+  }
+}
+
+// Expands character ranges and returns a bitmap.
+// For example, "a-cf-hz" is expanded to "abcfghz".
+BitVector GlobPattern::expand(StringRef S) {
+  BitVector BV(256, false);
+
+  // Expand "x-y".
   for (;;) {
-    if (S.empty())
-      return T.empty();
-    if (S[0] == '*') {
+    if (S.size() < 3)
+      break;
+
+    // If it doesn't start with something like "x-y",
+    // consume the first character and proceed.
+    if (S[1] != '-') {
+      BV[S[0]] = true;
       S = S.substr(1);
-      if (S.empty())
+      continue;
+    }
+
+    // It must be in the form of "x-y".
+    // Validate it and then interpret the range.
+    if (S[0] > S[2]) {
+      error("invalid glob pattern: " + Original);
+      return BV;
+    }
+    for (int C = S[0]; C <= S[2]; ++C)
+      BV[C] = true;
+    S = S.substr(3);
+  }
+
+  for (char C : S)
+    BV[C] = true;
+  return BV;
+}
+
+GlobPattern::GlobPattern(StringRef S) : Original(S) {
+  if (!hasWildcard(S)) {
+    // S doesn't contain any metacharacter,
+    // so the regular string comparison should work.
+    Exact = S;
+  } else if (S.endswith("*") && !hasWildcard(S.drop_back())) {
+    // S is something like "foo*". We can use startswith().
+    Prefix = S.drop_back();
+  } else if (S.startswith("*") && !hasWildcard(S.drop_front())) {
+    // S is something like "*foo". We can use endswith().
+    Suffix = S.drop_front();
+  } else {
+    // Otherwise, we need to do real glob pattern matching.
+    // Parse the pattern now.
+    while (!S.empty())
+      Tokens.push_back(scan(S));
+  }
+}
+
+bool GlobPattern::match(StringRef S) const {
+  if (Exact)
+    return S == *Exact;
+  if (Prefix)
+    return S.startswith(*Prefix);
+  if (Suffix)
+    return S.endswith(*Suffix);
+  return matchOne(Tokens, S);
+}
+
+// Runs glob pattern Pats against string S.
+bool GlobPattern::matchOne(ArrayRef<BitVector> Pats, StringRef S) const {
+  for (;;) {
+    if (Pats.empty())
+      return S.empty();
+
+    // If Pats[0] is '*', try to match Pats[1..] against all possible
+    // substrings of S to see at least one pattern succeeds.
+    if (Pats[0].size() == 0) {
+      Pats = Pats.slice(1);
+      if (Pats.empty())
         // Fast path. If a pattern is '*', it matches anything.
         return true;
-      for (size_t I = 0, E = T.size(); I < E; ++I)
-        if (globMatch(S, T.substr(I)))
+      for (size_t I = 0, E = S.size(); I < E; ++I)
+        if (matchOne(Pats, S.substr(I)))
           return true;
       return false;
     }
-    if (T.empty() || (S[0] != T[0] && S[0] != '?'))
+
+    // If Pats[0] is not '*', it must consume one character.
+    if (S.empty() || !Pats[0][S[0]])
       return false;
+    Pats = Pats.slice(1);
     S = S.substr(1);
-    T = T.substr(1);
   }
 }
 
-bool StringMatcher::match(StringRef S) {
-  for (StringRef P : Patterns)
-    if (globMatch(P, S))
+StringMatcher::StringMatcher(const std::vector<StringRef> &Pat) {
+  for (StringRef S : Pat)
+    Patterns.push_back(GlobPattern(S));
+}
+
+bool StringMatcher::match(StringRef S) const {
+  for (const GlobPattern &Pat : Patterns)
+    if (Pat.match(S))
       return true;
   return false;
 }
+
 // If an input string is in the form of "foo.N" where N is a number,
 // return N. Otherwise, returns 65536, which is one greater than the
 // lowest priority.
@@ -74,42 +180,6 @@ StringRef elf::unquote(StringRef S) {
   return S.substr(1, S.size() - 2);
 }
 
-// Converts a glob pattern to a regular expression.
-static std::string toRegex(StringRef S) {
-  std::string T;
-  bool InBracket = false;
-  while (!S.empty()) {
-    char C = S.front();
-    if (InBracket) {
-      InBracket = C != ']';
-      T += C;
-      S = S.drop_front();
-      continue;
-    }
-
-    if (C == '*')
-      T += ".*";
-    else if (C == '?')
-      T += '.';
-    else if (StringRef(".+^${}()|/\\").find_first_of(C) != StringRef::npos)
-      T += std::string("\\") + C;
-    else
-      T += C;
-
-    InBracket = C == '[';
-    S = S.substr(1);
-  }
-  return T;
-}
-
-// Converts multiple glob patterns to a regular expression.
-Regex elf::compileGlobPatterns(ArrayRef<StringRef> V) {
-  std::string T = "^(" + toRegex(V[0]);
-  for (StringRef S : V.slice(1))
-    T += "|" + toRegex(S);
-  return Regex(T + ")$");
-}
-
 // Converts a hex string (e.g. "deadbeef") to a vector.
 std::vector<uint8_t> elf::parseHex(StringRef S) {
   std::vector<uint8_t> Hex;

Modified: lld/trunk/ELF/Strings.h
URL: http://llvm.org/viewvc/llvm-project/lld/trunk/ELF/Strings.h?rev=285949&r1=285948&r2=285949&view=diff
==============================================================================
--- lld/trunk/ELF/Strings.h (original)
+++ lld/trunk/ELF/Strings.h Thu Nov  3 12:57:38 2016
@@ -12,29 +12,54 @@
 
 #include "lld/Core/LLVM.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/Optional.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/Support/Regex.h"
 #include <vector>
 
 namespace lld {
 namespace elf {
-llvm::Regex compileGlobPatterns(ArrayRef<StringRef> V);
+
 int getPriority(StringRef S);
 bool hasWildcard(StringRef S);
 std::vector<uint8_t> parseHex(StringRef S);
 bool isValidCIdentifier(StringRef S);
 StringRef unquote(StringRef S);
 
+// This class represents a glob pattern. Supported metacharacters
+// are "*", "?", "[<chars>]" and "[^<chars>]".
+class GlobPattern {
+public:
+  explicit GlobPattern(StringRef Pat);
+  bool match(StringRef S) const;
+
+private:
+  bool matchOne(ArrayRef<llvm::BitVector> Pat, StringRef S) const;
+  llvm::BitVector scan(StringRef &S);
+  llvm::BitVector expand(StringRef S);
+
+  // Parsed glob pattern.
+  std::vector<llvm::BitVector> Tokens;
+
+  // A glob pattern given to this class. This is for error reporting.
+  StringRef Original;
+
+  // The following members are for optimization.
+  llvm::Optional<StringRef> Exact;
+  llvm::Optional<StringRef> Prefix;
+  llvm::Optional<StringRef> Suffix;
+};
+
+// This class represents multiple glob patterns.
 class StringMatcher {
 public:
   StringMatcher() = default;
-  explicit StringMatcher(StringRef P) : Patterns({P}) {}
-  explicit StringMatcher(std::vector<StringRef> &&Pat)
-      : Patterns(std::move(Pat)) {}
+  explicit StringMatcher(const std::vector<StringRef> &Pat);
+
+  bool match(StringRef S) const;
 
-  bool match(StringRef S);
 private:
-  std::vector<StringRef> Patterns;
+  std::vector<GlobPattern> Patterns;
 };
 
 // Returns a demangled C++ symbol name. If Name is not a mangled

Modified: lld/trunk/ELF/SymbolTable.cpp
URL: http://llvm.org/viewvc/llvm-project/lld/trunk/ELF/SymbolTable.cpp?rev=285949&r1=285948&r2=285949&view=diff
==============================================================================
--- lld/trunk/ELF/SymbolTable.cpp (original)
+++ lld/trunk/ELF/SymbolTable.cpp Thu Nov  3 12:57:38 2016
@@ -470,12 +470,12 @@ template <class ELFT> SymbolBody *Symbol
 
 // Returns a list of defined symbols that match with a given regex.
 template <class ELFT>
-std::vector<SymbolBody *> SymbolTable<ELFT>::findAll(const Regex &Re) {
+std::vector<SymbolBody *> SymbolTable<ELFT>::findAll(const StringMatcher &M) {
   std::vector<SymbolBody *> Res;
   for (Symbol *Sym : SymVector) {
     SymbolBody *B = Sym->body();
     StringRef Name = B->getName();
-    if (!B->isUndefined() && const_cast<Regex &>(Re).match(Name))
+    if (!B->isUndefined() && M.match(Name))
       Res.push_back(B);
   }
   return Res;
@@ -611,10 +611,10 @@ findDemangled(std::map<std::string, std:
 
 static std::vector<SymbolBody *>
 findAllDemangled(const std::map<std::string, std::vector<SymbolBody *>> &D,
-                 const Regex &Re) {
+                 StringMatcher &M) {
   std::vector<SymbolBody *> Res;
   for (auto &P : D) {
-    if (const_cast<Regex &>(Re).match(P.first))
+    if (M.match(P.first))
       for (SymbolBody *Body : P.second)
         if (!Body->isUndefined())
           Res.push_back(Body);
@@ -639,8 +639,8 @@ template <class ELFT> void SymbolTable<E
   }
   if (Patterns.empty())
     return;
-  Regex Re = compileGlobPatterns(Patterns);
-  std::vector<SymbolBody *> Syms = findAll(Re);
+  StringMatcher M(Patterns);
+  std::vector<SymbolBody *> Syms = findAll(M);
   for (SymbolBody *B : Syms)
     B->symbol()->VersionId = VER_NDX_GLOBAL;
 }
@@ -696,9 +696,9 @@ template <class ELFT> void SymbolTable<E
     for (SymbolVersion &Sym : V.Globals) {
       if (!Sym.HasWildcards)
         continue;
-      Regex Re = compileGlobPatterns({Sym.Name});
+      StringMatcher M({Sym.Name});
       std::vector<SymbolBody *> Syms =
-          Sym.IsExternCpp ? findAllDemangled(Demangled, Re) : findAll(Re);
+          Sym.IsExternCpp ? findAllDemangled(Demangled, M) : findAll(M);
 
       // Exact matching takes precendence over fuzzy matching,
       // so we set a version to a symbol only if no version has been assigned

Modified: lld/trunk/ELF/SymbolTable.h
URL: http://llvm.org/viewvc/llvm-project/lld/trunk/ELF/SymbolTable.h?rev=285949&r1=285948&r2=285949&view=diff
==============================================================================
--- lld/trunk/ELF/SymbolTable.h (original)
+++ lld/trunk/ELF/SymbolTable.h Thu Nov  3 12:57:38 2016
@@ -12,9 +12,9 @@
 
 #include "InputFiles.h"
 #include "LTO.h"
+#include "Strings.h"
 #include "llvm/ADT/CachedHashString.h"
 #include "llvm/ADT/DenseMap.h"
-#include "llvm/Support/Regex.h"
 
 namespace lld {
 namespace elf {
@@ -92,7 +92,7 @@ public:
   void wrap(StringRef Name);
 
 private:
-  std::vector<SymbolBody *> findAll(const llvm::Regex &Re);
+  std::vector<SymbolBody *> findAll(const StringMatcher &M);
   std::pair<Symbol *, bool> insert(StringRef &Name);
   std::pair<Symbol *, bool> insert(StringRef &Name, uint8_t Type,
                                    uint8_t Visibility, bool CanOmitFromDynSym,




More information about the llvm-commits mailing list