[llvm] 8daace8 - [GlobPattern] Support brace expansions

Ellis Hoag via llvm-commits llvm-commits at lists.llvm.org
Wed Aug 30 08:30:55 PDT 2023


Author: Ellis Hoag
Date: 2023-08-30T08:30:39-07:00
New Revision: 8daace8b2d89b50daeeb2634ef96c915d1800e5f

URL: https://github.com/llvm/llvm-project/commit/8daace8b2d89b50daeeb2634ef96c915d1800e5f
DIFF: https://github.com/llvm/llvm-project/commit/8daace8b2d89b50daeeb2634ef96c915d1800e5f.diff

LOG: [GlobPattern] Support brace expansions

Extend `GlobPattern` to support brace expansions, e.g., `foo.{c,cpp}` as discussed in https://reviews.llvm.org/D152762#4425203.

The high level change was to turn `Tokens` into a list that gets larger when we see a new brace expansion term. Then in `GlobPattern::match()` we must check against each token group.

This is a breaking change since `{` will no longer match a literal without escaping. However, `\{` will match the literal `{` before and after this change. Also, from a brief survey of LLVM, it seems that `GlobPattern` is mostly used for symbol and path matching, which likely won't need `{` in their patterns.

See https://github.com/devongovett/glob-match#syntax for a nice glob reference.

Reviewed By: MaskRay

Differential Revision: https://reviews.llvm.org/D153587

Added: 
    

Modified: 
    llvm/include/llvm/Support/GlobPattern.h
    llvm/lib/Support/GlobPattern.cpp
    llvm/unittests/Support/GlobPatternTest.cpp

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/Support/GlobPattern.h b/llvm/include/llvm/Support/GlobPattern.h
index 26eb43b04bdf90..eb4735f39c8e54 100644
--- a/llvm/include/llvm/Support/GlobPattern.h
+++ b/llvm/include/llvm/Support/GlobPattern.h
@@ -6,8 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file implements a glob pattern matcher. The glob pattern is the
-// rule used by the shell.
+// This file implements a glob pattern matcher.
 //
 //===----------------------------------------------------------------------===//
 
@@ -20,30 +19,72 @@
 #include "llvm/Support/Error.h"
 #include <optional>
 
-// This class represents a glob pattern. Supported metacharacters
-// are "*", "?", "\", "[<chars>]", "[^<chars>]", and "[!<chars>]".
 namespace llvm {
 
+/// This class implements a glob pattern matcher similar to the one found in
+/// bash, but with some key differences. Namely, that \p "*" matches all
+/// characters and does not exclude path separators.
+///
+/// * \p "?" matches a single character.
+/// * \p "*" matches zero or more characters.
+/// * \p "[<chars>]" matches one character in the bracket. Character ranges,
+///   e.g., \p "[a-z]", and negative sets via \p "[^ab]" or \p "[!ab]" are also
+///   supported.
+/// * \p "{<glob>,...}" matches one of the globs in the list. Nested brace
+///   expansions are not supported. If \p MaxSubPatterns is empty then
+///   characters \p "{,}" are treated as literals.
+/// * \p "\" escapes the next character so it is treated as a literal.
+///
+/// Some known edge cases are:
+/// * \p "]" is allowed as the first character in a character class, i.e.,
+///   \p "[]]" is valid and matches the literal \p "]".
+/// * The empty character class, i.e., \p "[]", is invalid.
+/// * Empty or singleton brace expansions, e.g., \p "{}", \p "{a}", are invalid.
+/// * \p "}" and \p "," that are not inside a brace expansion are taken as
+///   literals, e.g., \p ",}" is valid but \p "{" is not.
+///
+/// For example, \p "*[/\\]foo.{c,cpp}" will match (unix or windows) paths to
+/// all files named \p "foo.c" or \p "foo.cpp".
 class GlobPattern {
 public:
-  static Expected<GlobPattern> create(StringRef Pat);
+  /// \param Pat the pattern to match against
+  /// \param MaxSubPatterns if provided limit the number of allowed subpatterns
+  ///                       created from expanding braces otherwise disable
+  ///                       brace expansion
+  static Expected<GlobPattern>
+  create(StringRef Pat, std::optional<size_t> MaxSubPatterns = {});
+  /// \returns \p true if \p S matches this glob pattern
   bool match(StringRef S) const;
 
   // Returns true for glob pattern "*". Can be used to avoid expensive
   // preparation/acquisition of the input for match().
-  bool isTrivialMatchAll() const { return Prefix.empty() && Pat == "*"; }
+  bool isTrivialMatchAll() const {
+    if (!Prefix.empty())
+      return false;
+    if (SubGlobs.size() != 1)
+      return false;
+    return SubGlobs[0].getPat() == "*";
+  }
 
 private:
-  bool matchOne(StringRef Str) const;
+  StringRef Prefix;
 
-  // Brackets with their end position and matched bytes.
-  struct Bracket {
-    const char *Next;
-    BitVector Bytes;
-  };
-  SmallVector<Bracket, 0> Brackets;
+  struct SubGlobPattern {
+    /// \param Pat the pattern to match against
+    static Expected<SubGlobPattern> create(StringRef Pat);
+    /// \returns \p true if \p S matches this glob pattern
+    bool match(StringRef S) const;
+    StringRef getPat() const { return StringRef(Pat.data(), Pat.size()); }
 
-  StringRef Prefix, Pat;
+    // Brackets with their end position and matched bytes.
+    struct Bracket {
+      size_t NextOffset;
+      BitVector Bytes;
+    };
+    SmallVector<Bracket, 0> Brackets;
+    SmallVector<char, 0> Pat;
+  };
+  SmallVector<SubGlobPattern, 1> SubGlobs;
 };
 }
 

diff --git a/llvm/lib/Support/GlobPattern.cpp b/llvm/lib/Support/GlobPattern.cpp
index 74e9bc88de4720..8c29451b620b23 100644
--- a/llvm/lib/Support/GlobPattern.cpp
+++ b/llvm/lib/Support/GlobPattern.cpp
@@ -11,7 +11,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Support/GlobPattern.h"
-#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Errc.h"
 
@@ -54,18 +53,115 @@ static Expected<BitVector> expand(StringRef S, StringRef Original) {
   return BV;
 }
 
-Expected<GlobPattern> GlobPattern::create(StringRef S) {
+// Identify brace expansions in S and return the list of patterns they expand
+// into.
+static Expected<SmallVector<std::string, 1>>
+parseBraceExpansions(StringRef S, std::optional<size_t> MaxSubPatterns) {
+  SmallVector<std::string> SubPatterns = {S.str()};
+  if (!MaxSubPatterns || !S.contains('{'))
+    return SubPatterns;
+
+  struct BraceExpansion {
+    size_t Start;
+    size_t Length;
+    SmallVector<StringRef, 2> Terms;
+  };
+  SmallVector<BraceExpansion, 0> BraceExpansions;
+
+  BraceExpansion *CurrentBE = nullptr;
+  size_t TermBegin;
+  for (size_t I = 0, E = S.size(); I != E; ++I) {
+    if (S[I] == '[') {
+      I = S.find(']', I + 2);
+      if (I == std::string::npos)
+        return make_error<StringError>("invalid glob pattern, unmatched '['",
+                                       errc::invalid_argument);
+    } else if (S[I] == '{') {
+      if (CurrentBE)
+        return make_error<StringError>(
+            "nested brace expansions are not supported",
+            errc::invalid_argument);
+      CurrentBE = &BraceExpansions.emplace_back();
+      CurrentBE->Start = I;
+      TermBegin = I + 1;
+    } else if (S[I] == ',') {
+      if (!CurrentBE)
+        continue;
+      CurrentBE->Terms.push_back(S.substr(TermBegin, I - TermBegin));
+      TermBegin = I + 1;
+    } else if (S[I] == '}') {
+      if (!CurrentBE)
+        continue;
+      if (CurrentBE->Terms.empty())
+        return make_error<StringError>(
+            "empty or singleton brace expansions are not supported",
+            errc::invalid_argument);
+      CurrentBE->Terms.push_back(S.substr(TermBegin, I - TermBegin));
+      CurrentBE->Length = I - CurrentBE->Start + 1;
+      CurrentBE = nullptr;
+    } else if (S[I] == '\\') {
+      if (++I == E)
+        return make_error<StringError>("invalid glob pattern, stray '\\'",
+                                       errc::invalid_argument);
+    }
+  }
+  if (CurrentBE)
+    return make_error<StringError>("incomplete brace expansion",
+                                   errc::invalid_argument);
+
+  size_t NumSubPatterns = 1;
+  for (auto &BE : BraceExpansions) {
+    if (NumSubPatterns > std::numeric_limits<size_t>::max() / BE.Terms.size()) {
+      NumSubPatterns = std::numeric_limits<size_t>::max();
+      break;
+    }
+    NumSubPatterns *= BE.Terms.size();
+  }
+  if (NumSubPatterns > *MaxSubPatterns)
+    return make_error<StringError>("too many brace expansions",
+                                   errc::invalid_argument);
+  // Replace brace expansions in reverse order so that we don't invalidate
+  // earlier start indices
+  for (auto &BE : reverse(BraceExpansions)) {
+    SmallVector<std::string> OrigSubPatterns;
+    std::swap(SubPatterns, OrigSubPatterns);
+    for (StringRef Term : BE.Terms)
+      for (StringRef Orig : OrigSubPatterns)
+        SubPatterns.emplace_back(Orig).replace(BE.Start, BE.Length, Term);
+  }
+  return SubPatterns;
+}
+
+Expected<GlobPattern>
+GlobPattern::create(StringRef S, std::optional<size_t> MaxSubPatterns) {
   GlobPattern Pat;
 
   // Store the prefix that does not contain any metacharacter.
-  size_t PrefixSize = S.find_first_of("?*[\\");
+  size_t PrefixSize = S.find_first_of("?*[{\\");
   Pat.Prefix = S.substr(0, PrefixSize);
   if (PrefixSize == std::string::npos)
     return Pat;
   S = S.substr(PrefixSize);
 
+  SmallVector<std::string, 1> SubPats;
+  if (auto Err = parseBraceExpansions(S, MaxSubPatterns).moveInto(SubPats))
+    return Err;
+  for (StringRef SubPat : SubPats) {
+    auto SubGlobOrErr = SubGlobPattern::create(SubPat);
+    if (!SubGlobOrErr)
+      return SubGlobOrErr.takeError();
+    Pat.SubGlobs.push_back(*SubGlobOrErr);
+  }
+
+  return Pat;
+}
+
+Expected<GlobPattern::SubGlobPattern>
+GlobPattern::SubGlobPattern::create(StringRef S) {
+  SubGlobPattern Pat;
+
   // Parse brackets.
-  Pat.Pat = S;
+  Pat.Pat.assign(S.begin(), S.end());
   for (size_t I = 0, E = S.size(); I != E; ++I) {
     if (S[I] == '[') {
       // ']' is allowed as the first character of a character class. '[]' is
@@ -83,7 +179,7 @@ Expected<GlobPattern> GlobPattern::create(StringRef S) {
         return BV.takeError();
       if (Invert)
         BV->flip();
-      Pat.Brackets.push_back(Bracket{S.data() + J + 1, std::move(*BV)});
+      Pat.Brackets.push_back(Bracket{J + 1, std::move(*BV)});
       I = J;
     } else if (S[I] == '\\') {
       if (++I == E)
@@ -95,13 +191,20 @@ Expected<GlobPattern> GlobPattern::create(StringRef S) {
 }
 
 bool GlobPattern::match(StringRef S) const {
-  return S.consume_front(Prefix) && matchOne(S);
+  if (!S.consume_front(Prefix))
+    return false;
+  if (SubGlobs.empty() && S.empty())
+    return true;
+  for (auto &Glob : SubGlobs)
+    if (Glob.match(S))
+      return true;
+  return false;
 }
 
 // Factor the pattern into segments split by '*'. The segment is matched
 // sequentianlly by finding the first occurrence past the end of the previous
 // match.
-bool GlobPattern::matchOne(StringRef Str) const {
+bool GlobPattern::SubGlobPattern::match(StringRef Str) const {
   const char *P = Pat.data(), *SegmentBegin = nullptr, *S = Str.data(),
              *SavedS = S;
   const char *const PEnd = P + Pat.size(), *const End = S + Str.size();
@@ -118,7 +221,7 @@ bool GlobPattern::matchOne(StringRef Str) const {
       continue;
     } else if (*P == '[') {
       if (Brackets[B].Bytes[uint8_t(*S)]) {
-        P = Brackets[B++].Next;
+        P = Pat.data() + Brackets[B++].NextOffset;
         ++S;
         continue;
       }
@@ -143,5 +246,5 @@ bool GlobPattern::matchOne(StringRef Str) const {
   }
   // All bytes in Str have been matched. Return true if the rest part of Pat is
   // empty or contains only '*'.
-  return Pat.find_first_not_of('*', P - Pat.data()) == std::string::npos;
+  return getPat().find_first_not_of('*', P - Pat.data()) == std::string::npos;
 }

diff --git a/llvm/unittests/Support/GlobPatternTest.cpp b/llvm/unittests/Support/GlobPatternTest.cpp
index 1408eaab50cc5f..e4f1025b009569 100644
--- a/llvm/unittests/Support/GlobPatternTest.cpp
+++ b/llvm/unittests/Support/GlobPatternTest.cpp
@@ -52,6 +52,17 @@ TEST_F(GlobPatternTest, Escape) {
   EXPECT_FALSE(Pat2->match("axxc"));
   EXPECT_FALSE(Pat2->match(""));
 
+  auto Pat3 = GlobPattern::create("\\{");
+  ASSERT_TRUE((bool)Pat3);
+  EXPECT_TRUE(Pat3->match("{"));
+  EXPECT_FALSE(Pat3->match("\\{"));
+  EXPECT_FALSE(Pat3->match(""));
+
+  auto Pat4 = GlobPattern::create("\\a");
+  ASSERT_TRUE((bool)Pat4);
+  EXPECT_TRUE(Pat4->match("a"));
+  EXPECT_FALSE(Pat4->match("\\a"));
+
   for (size_t I = 0; I != 4; ++I) {
     std::string S(I, '\\');
     Expected<GlobPattern> Pat = GlobPattern::create(S);
@@ -122,12 +133,15 @@ TEST_F(GlobPatternTest, BracketFrontOfCharacterClass) {
 }
 
 TEST_F(GlobPatternTest, SpecialCharsInCharacterClass) {
-  Expected<GlobPattern> Pat1 = GlobPattern::create("[*?^]");
-  EXPECT_TRUE((bool)Pat1);
+  auto Pat1 = GlobPattern::create("[*?^{},]");
+  ASSERT_TRUE((bool)Pat1);
   EXPECT_TRUE(Pat1->match("*"));
   EXPECT_TRUE(Pat1->match("?"));
   EXPECT_TRUE(Pat1->match("^"));
-  EXPECT_FALSE(Pat1->match("*?^"));
+  EXPECT_TRUE(Pat1->match("{"));
+  EXPECT_TRUE(Pat1->match("}"));
+  EXPECT_TRUE(Pat1->match(","));
+  EXPECT_FALSE(Pat1->match("*?^{},"));
   EXPECT_FALSE(Pat1->match(""));
 
   Expected<GlobPattern> Pat2 = GlobPattern::create("[*]");
@@ -137,13 +151,73 @@ TEST_F(GlobPatternTest, SpecialCharsInCharacterClass) {
 }
 
 TEST_F(GlobPatternTest, Invalid) {
-  Expected<GlobPattern> Pat1 = GlobPattern::create("[");
+  for (const auto &InvalidPattern : {"[", "[]"}) {
+    auto Pat1 = GlobPattern::create(InvalidPattern);
+    EXPECT_FALSE((bool)Pat1) << "Expected invalid pattern: " << InvalidPattern;
+    handleAllErrors(Pat1.takeError(), [&](ErrorInfoBase &EIB) {});
+  }
+}
+
+TEST_F(GlobPatternTest, InvalidBraceExpansion) {
+  for (const auto &InvalidPattern :
+       {"{", "{{", "{\\", "{\\}", "{}", "{a}", "[{}"}) {
+    auto Pat1 = GlobPattern::create(InvalidPattern, /*MaxSubPatterns=*/1024);
+    EXPECT_FALSE((bool)Pat1) << "Expected invalid pattern: " << InvalidPattern;
+    handleAllErrors(Pat1.takeError(), [&](ErrorInfoBase &EIB) {});
+  }
+  auto Pat1 = GlobPattern::create("{a,b}{c,d}{e,f}", /*MaxSubPatterns=*/7);
   EXPECT_FALSE((bool)Pat1);
   handleAllErrors(Pat1.takeError(), [&](ErrorInfoBase &EIB) {});
+}
 
-  Expected<GlobPattern> Pat2 = GlobPattern::create("[]");
-  EXPECT_FALSE((bool)Pat2);
-  handleAllErrors(Pat2.takeError(), [&](ErrorInfoBase &EIB) {});
+TEST_F(GlobPatternTest, BraceExpansion) {
+  auto Pat1 = GlobPattern::create("{a,b}{1,2}", /*MaxSubPatterns=*/1024);
+  ASSERT_TRUE((bool)Pat1);
+  EXPECT_TRUE(Pat1->match("a1"));
+  EXPECT_TRUE(Pat1->match("a2"));
+  EXPECT_TRUE(Pat1->match("b1"));
+  EXPECT_TRUE(Pat1->match("b2"));
+  EXPECT_FALSE(Pat1->match("ab"));
+
+  auto Pat2 = GlobPattern::create(",}{foo,\\,\\},z*}", /*MaxSubPatterns=*/1024);
+  ASSERT_TRUE((bool)Pat2);
+  EXPECT_TRUE(Pat2->match(",}foo"));
+  EXPECT_TRUE(Pat2->match(",},}"));
+  EXPECT_TRUE(Pat2->match(",}z"));
+  EXPECT_TRUE(Pat2->match(",}zoo"));
+  EXPECT_FALSE(Pat2->match(",}fooz"));
+  EXPECT_FALSE(Pat2->match("foo"));
+  EXPECT_FALSE(Pat2->match(""));
+
+  // This test breaks if we store terms separately and attempt to match them one
+  // by one instead of using subglobs
+  auto Pat3 = GlobPattern::create("{a,ab}b", /*MaxSubPatterns=*/1024);
+  ASSERT_TRUE((bool)Pat3);
+  EXPECT_TRUE(Pat3->match("ab"));
+  EXPECT_TRUE(Pat3->match("abb"));
+}
+
+TEST_F(GlobPatternTest, NoBraceExpansion) {
+  auto Pat1 = GlobPattern::create("{a,b}{1,2}");
+  ASSERT_TRUE((bool)Pat1);
+  EXPECT_TRUE(Pat1->match("{a,b}{1,2}"));
+  EXPECT_FALSE(Pat1->match("a1"));
+
+  auto Pat2 = GlobPattern::create("{{");
+  ASSERT_TRUE((bool)Pat2);
+  EXPECT_TRUE(Pat2->match("{{"));
+}
+
+TEST_F(GlobPatternTest, BraceExpansionCharacterClass) {
+  // Matches mangled names of C++ standard library functions
+  auto Pat =
+      GlobPattern::create("_Z{N,NK,}S[tabsiod]*", /*MaxSubPatterns=*/1024);
+  ASSERT_TRUE((bool)Pat);
+  EXPECT_TRUE(Pat->match("_ZNSt6vectorIiSaIiEE9push_backEOi"));
+  EXPECT_TRUE(Pat->match("_ZNKStfoo"));
+  EXPECT_TRUE(Pat->match("_ZNSafoo"));
+  EXPECT_TRUE(Pat->match("_ZStfoo"));
+  EXPECT_FALSE(Pat->match("_Zfoo"));
 }
 
 TEST_F(GlobPatternTest, ExtSym) {
@@ -169,7 +243,7 @@ TEST_F(GlobPatternTest, IsTrivialMatchAll) {
 }
 
 TEST_F(GlobPatternTest, NUL) {
-  for (char C : "?*{") {
+  for (char C : "?*") {
     std::string S(1, C);
     Expected<GlobPattern> Pat = GlobPattern::create(S);
     ASSERT_TRUE((bool)Pat);
@@ -185,13 +259,14 @@ TEST_F(GlobPatternTest, NUL) {
 
 TEST_F(GlobPatternTest, Pathological) {
   std::string P, S(40, 'a');
+  StringRef Pieces[] = {"a*", "[ba]*", "{b*,a*}*"};
   for (int I = 0; I != 30; ++I)
-    P += I % 2 ? "a*" : "[ba]*";
-  Expected<GlobPattern> Pat = GlobPattern::create(P);
+    P += Pieces[I % 3];
+  Expected<GlobPattern> Pat = GlobPattern::create(P, /*MaxSubPatterns=*/1024);
   ASSERT_TRUE((bool)Pat);
   EXPECT_TRUE(Pat->match(S));
   P += 'b';
-  Pat = GlobPattern::create(P);
+  Pat = GlobPattern::create(P, /*MaxSubPatterns=*/1024);
   ASSERT_TRUE((bool)Pat);
   EXPECT_FALSE(Pat->match(S));
   EXPECT_TRUE(Pat->match(S + 'b'));


        


More information about the llvm-commits mailing list