[llvm] [GlobPattern] Add GlobPattern::longest_substr(). (PR #164512)
    Vitaly Buka via llvm-commits 
    llvm-commits at lists.llvm.org
       
    Tue Oct 21 16:48:30 PDT 2025
    
    
  
https://github.com/vitalybuka updated https://github.com/llvm/llvm-project/pull/164512
>From 65427808c03811ef488d96c076860a9eb59ccd67 Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka at google.com>
Date: Tue, 21 Oct 2025 15:55:20 -0700
Subject: [PATCH 1/3] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20in?=
 =?UTF-8?q?itial=20version?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Created using spr 1.3.6
---
 llvm/include/llvm/Support/GlobPattern.h    | 22 +++++---
 llvm/lib/Support/GlobPattern.cpp           | 57 ++++++++++++++++++---
 llvm/unittests/Support/GlobPatternTest.cpp | 58 ++++++++++++++++++++++
 3 files changed, 122 insertions(+), 15 deletions(-)
diff --git a/llvm/include/llvm/Support/GlobPattern.h b/llvm/include/llvm/Support/GlobPattern.h
index c1b44849b9794..4824f3fa01e5b 100644
--- a/llvm/include/llvm/Support/GlobPattern.h
+++ b/llvm/include/llvm/Support/GlobPattern.h
@@ -63,22 +63,30 @@ class GlobPattern {
   // Returns true for glob pattern "*". Can be used to avoid expensive
   // preparation/acquisition of the input for match().
   bool isTrivialMatchAll() const {
-    if (!Prefix.empty())
+    if (PrefixSize)
       return false;
-    if (!Suffix.empty())
+    if (SuffixSize)
       return false;
     if (SubGlobs.size() != 1)
       return false;
     return SubGlobs[0].getPat() == "*";
   }
 
-  StringRef prefix() const { return Prefix; }
-  StringRef suffix() const { return Suffix; }
+  // The followind functions are as shortcuts to some matching. They are
+  // conservative to simplify implementations.
 
-private:
-  StringRef Prefix;
-  StringRef Suffix;
+  // Returns plain prefix of the pattern.
+  StringRef prefix() const { return Pattern.take_front(PrefixSize); }
+  // Returns plain suffix of the pattern.
+  StringRef suffix() const { return Pattern.take_back(SuffixSize); }
+  // Returns the longest plain substring of the pattern between of prefix and
+  // suffix.
+  StringRef longest_substr() const;
 
+private:
+  StringRef Pattern;
+  size_t PrefixSize = 0;
+  size_t SuffixSize = 0;
   struct SubGlobPattern {
     /// \param Pat the pattern to match against
     LLVM_ABI static Expected<SubGlobPattern> create(StringRef Pat);
diff --git a/llvm/lib/Support/GlobPattern.cpp b/llvm/lib/Support/GlobPattern.cpp
index 0ecf47dc1d3d1..dfc1508ce63af 100644
--- a/llvm/lib/Support/GlobPattern.cpp
+++ b/llvm/lib/Support/GlobPattern.cpp
@@ -132,24 +132,60 @@ parseBraceExpansions(StringRef S, std::optional<size_t> MaxSubPatterns) {
   return std::move(SubPatterns);
 }
 
+static StringRef maxPlainSubstring(StringRef S) {
+  StringRef R;
+  while (!S.empty()) {
+    size_t PrefixSize = S.find_first_of("?*[{\\");
+    if (PrefixSize == std::string::npos)
+      PrefixSize = S.size();
+
+    if (R.size() < PrefixSize)
+      R = S.take_front(PrefixSize);
+    S = S.drop_front(PrefixSize);
+
+    switch (S.front()) {
+    case '\\':
+      S = S.drop_front(2);
+      break;
+    case '[': {
+      size_t EndBracket = S.find_first_of("]");
+      if (EndBracket == std::string::npos)
+        return R; // Incorrect, but let SubGlobPattern::create handle it.
+      S = S.drop_front(EndBracket + 1);
+      break;
+    }
+    case '{':
+      // TODO: implement.
+      return {};
+    default:
+      S = S.drop_front(1);
+    }
+  }
+
+  return R;
+}
+
 Expected<GlobPattern>
 GlobPattern::create(StringRef S, std::optional<size_t> MaxSubPatterns) {
   GlobPattern Pat;
+  Pat.Pattern = S;
 
   // Store the prefix that does not contain any metacharacter.
-  size_t PrefixSize = S.find_first_of("?*[{\\");
-  Pat.Prefix = S.substr(0, PrefixSize);
-  if (PrefixSize == std::string::npos)
+  Pat.PrefixSize = S.find_first_of("?*[{\\");
+  if (Pat.PrefixSize == std::string::npos) {
+    Pat.PrefixSize = S.size();
     return Pat;
-  S = S.substr(PrefixSize);
+  }
+  S = S.substr(Pat.PrefixSize);
 
   // Just in case we stop on unmatched opening brackets.
   size_t SuffixStart = S.find_last_of("?*[]{}\\");
   assert(SuffixStart != std::string::npos);
   if (S[SuffixStart] == '\\')
     ++SuffixStart;
-  ++SuffixStart;
-  Pat.Suffix = S.substr(SuffixStart);
+  if (SuffixStart < S.size())
+    ++SuffixStart;
+  Pat.SuffixSize = S.size() - SuffixStart;
   S = S.substr(0, SuffixStart);
 
   SmallVector<std::string, 1> SubPats;
@@ -199,10 +235,15 @@ GlobPattern::SubGlobPattern::create(StringRef S) {
   return Pat;
 }
 
+StringRef GlobPattern::longest_substr() const {
+  return maxPlainSubstring(
+      Pattern.drop_front(PrefixSize).drop_back(SuffixSize));
+}
+
 bool GlobPattern::match(StringRef S) const {
-  if (!S.consume_front(Prefix))
+  if (!S.consume_front(prefix()))
     return false;
-  if (!S.consume_back(Suffix))
+  if (!S.consume_back(suffix()))
     return false;
   if (SubGlobs.empty() && S.empty())
     return true;
diff --git a/llvm/unittests/Support/GlobPatternTest.cpp b/llvm/unittests/Support/GlobPatternTest.cpp
index 58fd7678131c6..a0e0d1415f383 100644
--- a/llvm/unittests/Support/GlobPatternTest.cpp
+++ b/llvm/unittests/Support/GlobPatternTest.cpp
@@ -329,6 +329,64 @@ TEST_F(GlobPatternTest, PrefixSuffix) {
   EXPECT_EQ("cd", Pat->suffix());
 }
 
+TEST_F(GlobPatternTest, Substr) {
+  auto Pat = GlobPattern::create("");
+  ASSERT_TRUE((bool)Pat);
+  EXPECT_EQ("", Pat->longest_substr());
+
+  Pat = GlobPattern::create("abcd");
+  ASSERT_TRUE((bool)Pat);
+  EXPECT_EQ("", Pat->longest_substr());
+
+  Pat = GlobPattern::create("a*bcd");
+  ASSERT_TRUE((bool)Pat);
+  EXPECT_EQ("", Pat->longest_substr());
+
+  Pat = GlobPattern::create("*abcd");
+  ASSERT_TRUE((bool)Pat);
+  EXPECT_EQ("", Pat->longest_substr());
+
+  Pat = GlobPattern::create("abcd*");
+  ASSERT_TRUE((bool)Pat);
+  EXPECT_EQ("", Pat->longest_substr());
+
+  Pat = GlobPattern::create("a*bc*d");
+  ASSERT_TRUE((bool)Pat);
+  EXPECT_EQ("bc", Pat->longest_substr());
+
+  Pat = GlobPattern::create("a*bc*def*g");
+  ASSERT_TRUE((bool)Pat);
+  EXPECT_EQ("def", Pat->longest_substr());
+
+  Pat = GlobPattern::create("a*bcd*ef*g");
+  ASSERT_TRUE((bool)Pat);
+  EXPECT_EQ("bcd", Pat->longest_substr());
+
+  Pat = GlobPattern::create("a*bcd*efg*h");
+  ASSERT_TRUE((bool)Pat);
+  EXPECT_EQ("bcd", Pat->longest_substr());
+
+  Pat = GlobPattern::create("a*bcd[ef]g*h");
+  ASSERT_TRUE((bool)Pat);
+  EXPECT_EQ("bcd", Pat->longest_substr());
+
+  Pat = GlobPattern::create("a*bcde\\fg*h");
+  ASSERT_TRUE((bool)Pat);
+  EXPECT_EQ("bcde", Pat->longest_substr());
+
+  Pat = GlobPattern::create("a*bcde\\[fg*h");
+  ASSERT_TRUE((bool)Pat);
+  EXPECT_EQ("bcde", Pat->longest_substr());
+
+  Pat = GlobPattern::create("a*bcde?fg*h");
+  ASSERT_TRUE((bool)Pat);
+  EXPECT_EQ("bcde", Pat->longest_substr());
+
+  Pat = GlobPattern::create("a*bcdef{g}*h");
+  ASSERT_TRUE((bool)Pat);
+  EXPECT_EQ("", Pat->longest_substr());
+}
+
 TEST_F(GlobPatternTest, Pathological) {
   std::string P, S(40, 'a');
   StringRef Pieces[] = {"a*", "[ba]*", "{b*,a*}*"};
>From 8a3b8b6b84aa01cc1668ac07deb737f8800588c8 Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka at google.com>
Date: Tue, 21 Oct 2025 16:10:05 -0700
Subject: [PATCH 2/3] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20ch?=
 =?UTF-8?q?anges=20introduced=20through=20rebase?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Created using spr 1.3.6
[skip ci]
---
 llvm/include/llvm/Support/GlobPattern.h | 18 ++++++++++++------
 llvm/lib/Support/GlobPattern.cpp        | 19 +++++++++++--------
 2 files changed, 23 insertions(+), 14 deletions(-)
diff --git a/llvm/include/llvm/Support/GlobPattern.h b/llvm/include/llvm/Support/GlobPattern.h
index c1b44849b9794..6ebf64565559b 100644
--- a/llvm/include/llvm/Support/GlobPattern.h
+++ b/llvm/include/llvm/Support/GlobPattern.h
@@ -63,21 +63,27 @@ class GlobPattern {
   // Returns true for glob pattern "*". Can be used to avoid expensive
   // preparation/acquisition of the input for match().
   bool isTrivialMatchAll() const {
-    if (!Prefix.empty())
+    if (PrefixSize)
       return false;
-    if (!Suffix.empty())
+    if (SuffixSize)
       return false;
     if (SubGlobs.size() != 1)
       return false;
     return SubGlobs[0].getPat() == "*";
   }
 
-  StringRef prefix() const { return Prefix; }
-  StringRef suffix() const { return Suffix; }
+  // The following functions are just shortcuts for faster matching. They are
+  // conservative to simplify implementations.
+
+  // Returns plain prefix of the pattern.
+  StringRef prefix() const { return Pattern.take_front(PrefixSize); }
+  // Returns plain suffix of the pattern.
+  StringRef suffix() const { return Pattern.take_back(SuffixSize); }
 
 private:
-  StringRef Prefix;
-  StringRef Suffix;
+  StringRef Pattern;
+  size_t PrefixSize = 0;
+  size_t SuffixSize = 0;
 
   struct SubGlobPattern {
     /// \param Pat the pattern to match against
diff --git a/llvm/lib/Support/GlobPattern.cpp b/llvm/lib/Support/GlobPattern.cpp
index 0ecf47dc1d3d1..f56a8fcf4bf9d 100644
--- a/llvm/lib/Support/GlobPattern.cpp
+++ b/llvm/lib/Support/GlobPattern.cpp
@@ -135,21 +135,24 @@ parseBraceExpansions(StringRef S, std::optional<size_t> MaxSubPatterns) {
 Expected<GlobPattern>
 GlobPattern::create(StringRef S, std::optional<size_t> MaxSubPatterns) {
   GlobPattern Pat;
+  Pat.Pattern = S;
 
   // Store the prefix that does not contain any metacharacter.
-  size_t PrefixSize = S.find_first_of("?*[{\\");
-  Pat.Prefix = S.substr(0, PrefixSize);
-  if (PrefixSize == std::string::npos)
+  Pat.PrefixSize = S.find_first_of("?*[{\\");
+  if (Pat.PrefixSize == std::string::npos) {
+    Pat.PrefixSize = S.size();
     return Pat;
-  S = S.substr(PrefixSize);
+  }
+  S = S.substr(Pat.PrefixSize);
 
   // Just in case we stop on unmatched opening brackets.
   size_t SuffixStart = S.find_last_of("?*[]{}\\");
   assert(SuffixStart != std::string::npos);
   if (S[SuffixStart] == '\\')
     ++SuffixStart;
-  ++SuffixStart;
-  Pat.Suffix = S.substr(SuffixStart);
+  if (SuffixStart < S.size())
+    ++SuffixStart;
+  Pat.SuffixSize = S.size() - SuffixStart;
   S = S.substr(0, SuffixStart);
 
   SmallVector<std::string, 1> SubPats;
@@ -200,9 +203,9 @@ GlobPattern::SubGlobPattern::create(StringRef S) {
 }
 
 bool GlobPattern::match(StringRef S) const {
-  if (!S.consume_front(Prefix))
+  if (!S.consume_front(prefix()))
     return false;
-  if (!S.consume_back(Suffix))
+  if (!S.consume_back(suffix()))
     return false;
   if (SubGlobs.empty() && S.empty())
     return true;
>From 260c56f38d95ca1678402399955dea8e11d8cd89 Mon Sep 17 00:00:00 2001
From: Vitaly Buka <vitalybuka at google.com>
Date: Tue, 21 Oct 2025 16:42:08 -0700
Subject: [PATCH 3/3] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20ch?=
 =?UTF-8?q?anges=20introduced=20through=20rebase?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Created using spr 1.3.6
[skip ci]
---
 llvm/include/llvm/Support/GlobPattern.h | 18 ++++++++++++------
 llvm/lib/Support/GlobPattern.cpp        | 19 +++++++++++--------
 2 files changed, 23 insertions(+), 14 deletions(-)
diff --git a/llvm/include/llvm/Support/GlobPattern.h b/llvm/include/llvm/Support/GlobPattern.h
index c1b44849b9794..6ebf64565559b 100644
--- a/llvm/include/llvm/Support/GlobPattern.h
+++ b/llvm/include/llvm/Support/GlobPattern.h
@@ -63,21 +63,27 @@ class GlobPattern {
   // Returns true for glob pattern "*". Can be used to avoid expensive
   // preparation/acquisition of the input for match().
   bool isTrivialMatchAll() const {
-    if (!Prefix.empty())
+    if (PrefixSize)
       return false;
-    if (!Suffix.empty())
+    if (SuffixSize)
       return false;
     if (SubGlobs.size() != 1)
       return false;
     return SubGlobs[0].getPat() == "*";
   }
 
-  StringRef prefix() const { return Prefix; }
-  StringRef suffix() const { return Suffix; }
+  // The following functions are just shortcuts for faster matching. They are
+  // conservative to simplify implementations.
+
+  // Returns plain prefix of the pattern.
+  StringRef prefix() const { return Pattern.take_front(PrefixSize); }
+  // Returns plain suffix of the pattern.
+  StringRef suffix() const { return Pattern.take_back(SuffixSize); }
 
 private:
-  StringRef Prefix;
-  StringRef Suffix;
+  StringRef Pattern;
+  size_t PrefixSize = 0;
+  size_t SuffixSize = 0;
 
   struct SubGlobPattern {
     /// \param Pat the pattern to match against
diff --git a/llvm/lib/Support/GlobPattern.cpp b/llvm/lib/Support/GlobPattern.cpp
index 0ecf47dc1d3d1..f56a8fcf4bf9d 100644
--- a/llvm/lib/Support/GlobPattern.cpp
+++ b/llvm/lib/Support/GlobPattern.cpp
@@ -135,21 +135,24 @@ parseBraceExpansions(StringRef S, std::optional<size_t> MaxSubPatterns) {
 Expected<GlobPattern>
 GlobPattern::create(StringRef S, std::optional<size_t> MaxSubPatterns) {
   GlobPattern Pat;
+  Pat.Pattern = S;
 
   // Store the prefix that does not contain any metacharacter.
-  size_t PrefixSize = S.find_first_of("?*[{\\");
-  Pat.Prefix = S.substr(0, PrefixSize);
-  if (PrefixSize == std::string::npos)
+  Pat.PrefixSize = S.find_first_of("?*[{\\");
+  if (Pat.PrefixSize == std::string::npos) {
+    Pat.PrefixSize = S.size();
     return Pat;
-  S = S.substr(PrefixSize);
+  }
+  S = S.substr(Pat.PrefixSize);
 
   // Just in case we stop on unmatched opening brackets.
   size_t SuffixStart = S.find_last_of("?*[]{}\\");
   assert(SuffixStart != std::string::npos);
   if (S[SuffixStart] == '\\')
     ++SuffixStart;
-  ++SuffixStart;
-  Pat.Suffix = S.substr(SuffixStart);
+  if (SuffixStart < S.size())
+    ++SuffixStart;
+  Pat.SuffixSize = S.size() - SuffixStart;
   S = S.substr(0, SuffixStart);
 
   SmallVector<std::string, 1> SubPats;
@@ -200,9 +203,9 @@ GlobPattern::SubGlobPattern::create(StringRef S) {
 }
 
 bool GlobPattern::match(StringRef S) const {
-  if (!S.consume_front(Prefix))
+  if (!S.consume_front(prefix()))
     return false;
-  if (!S.consume_back(Suffix))
+  if (!S.consume_back(suffix()))
     return false;
   if (SubGlobs.empty() && S.empty())
     return true;
    
    
More information about the llvm-commits
mailing list