[llvm-commits] [llvm] r96503 - in /llvm/trunk: include/llvm/Support/Regex.h lib/Support/Regex.cpp unittests/Support/RegexTest.cpp

Daniel Dunbar daniel at zuster.org
Wed Feb 17 12:08:43 PST 2010


Author: ddunbar
Date: Wed Feb 17 14:08:42 2010
New Revision: 96503

URL: http://llvm.org/viewvc/llvm-project?rev=96503&view=rev
Log:
Add Regex::sub, for doing regular expression substitution with backreferences.

Modified:
    llvm/trunk/include/llvm/Support/Regex.h
    llvm/trunk/lib/Support/Regex.cpp
    llvm/trunk/unittests/Support/RegexTest.cpp

Modified: llvm/trunk/include/llvm/Support/Regex.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Support/Regex.h?rev=96503&r1=96502&r2=96503&view=diff

==============================================================================
--- llvm/trunk/include/llvm/Support/Regex.h (original)
+++ llvm/trunk/include/llvm/Support/Regex.h Wed Feb 17 14:08:42 2010
@@ -56,6 +56,19 @@
     ///
     /// This returns true on a successful match.
     bool match(const StringRef &String, SmallVectorImpl<StringRef> *Matches=0);
+
+    /// sub - Return the result of replacing the first match of the regex in
+    /// \arg String with the \arg Repl string. Backreferences like "\0" in the
+    /// replacement string are replaced with the appropriate match substring.
+    ///
+    /// Note that the replacement string has backslash escaping performed on
+    /// it. Invalid backreferences are ignored (replaced by empty strings).
+    ///
+    /// \param Error If non-null, it is cleared on entry and then set to a
+    /// description of the first problem found in \arg Repl (an invalid
+    /// backreference or a trailing backslash).
+    std::string sub(StringRef Repl, StringRef String, std::string *Error = 0);
+
   private:
     struct llvm_regex *preg;
     int error;

Modified: llvm/trunk/lib/Support/Regex.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Support/Regex.cpp?rev=96503&r1=96502&r2=96503&view=diff

==============================================================================
--- llvm/trunk/lib/Support/Regex.cpp (original)
+++ llvm/trunk/lib/Support/Regex.cpp Wed Feb 17 14:08:42 2010
@@ -90,3 +90,79 @@
 
   return true;
 }
+
+std::string Regex::sub(StringRef Repl, StringRef String,
+                       std::string *Error) {
+  SmallVector<StringRef, 8> Matches;
+
+  // Reset error, if given, so that only this call's problems are reported.
+  if (Error && !Error->empty()) *Error = "";
+
+  // Return the input if there was no match.
+  if (!match(String, &Matches))
+    return String;
+
+  // Otherwise splice in the replacement string, starting with the prefix before
+  // the match.
+  std::string Res(String.begin(), Matches[0].begin());
+
+  // Then the replacement string, honoring possible substitutions.
+  while (!Repl.empty()) {
+    // Skip to the next escape (split at the first backslash).
+    std::pair<StringRef, StringRef> Split = Repl.split('\\');
+
+    // Add the skipped substring.
+    Res += Split.first;
+
+    // Check for termination; a backslash with nothing after it is an error.
+    if (Split.second.empty()) {
+      if (Repl.size() != Split.first.size() &&
+          Error && Error->empty())
+        *Error = "replacement string contained trailing backslash";
+      break;
+    }
+
+    // Otherwise update the replacement string and interpret escapes.
+    Repl = Split.second;
+
+    // FIXME: We should have a StringExtras function for mapping C99 escapes.
+    switch (Repl[0]) {
+      // Treat all unrecognized characters as self-quoting.
+    default:
+      Res += Repl[0];
+      Repl = Repl.substr(1);
+      break;
+
+      // Single character escapes.
+    case 't':
+      Res += '\t';
+      Repl = Repl.substr(1);
+      break;
+    case 'n':
+      Res += '\n';
+      Repl = Repl.substr(1);
+      break;
+
+      // Decimal escapes are backreferences.
+    case '0': case '1': case '2': case '3': case '4':
+    case '5': case '6': case '7': case '8': case '9': {
+      // Extract the backreference number (the maximal run of leading digits).
+      StringRef Ref = Repl.slice(0, Repl.find_first_not_of("0123456789"));
+      Repl = Repl.substr(Ref.size());
+
+      unsigned RefValue;
+      if (!Ref.getAsInteger(10, RefValue) &&
+          RefValue < Matches.size())
+        Res += Matches[RefValue];
+      else if (Error && Error->empty())
+        *Error = "invalid backreference string '" + Ref.str() + "'";
+      break;
+    }
+    }
+  }
+
+  // And finally the suffix following the match.
+  Res += StringRef(Matches[0].end(), String.end() - Matches[0].end());
+
+  return Res;
+}

Modified: llvm/trunk/unittests/Support/RegexTest.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/unittests/Support/RegexTest.cpp?rev=96503&r1=96502&r2=96503&view=diff

==============================================================================
--- llvm/trunk/unittests/Support/RegexTest.cpp (original)
+++ llvm/trunk/unittests/Support/RegexTest.cpp Wed Feb 17 14:08:42 2010
@@ -62,4 +62,33 @@
   EXPECT_TRUE(r5.match(String));
 }
 
+TEST_F(RegexTest, Substitution) {
+  std::string Error;
+
+  EXPECT_EQ("aNUMber", Regex("[0-9]+").sub("NUM", "a1234ber"));
+
+  // Standard escapes; an unrecognized one such as \j is self-quoting.
+  EXPECT_EQ("a\\ber", Regex("[0-9]+").sub("\\\\", "a1234ber", &Error));
+  EXPECT_EQ(Error, "");
+  EXPECT_EQ("a\nber", Regex("[0-9]+").sub("\\n", "a1234ber", &Error));
+  EXPECT_EQ(Error, "");
+  EXPECT_EQ("a\tber", Regex("[0-9]+").sub("\\t", "a1234ber", &Error));
+  EXPECT_EQ(Error, "");
+  EXPECT_EQ("ajber", Regex("[0-9]+").sub("\\j", "a1234ber", &Error));
+  EXPECT_EQ(Error, "");
+  // A lone trailing backslash is dropped and reported through Error.
+  EXPECT_EQ("aber", Regex("[0-9]+").sub("\\", "a1234ber", &Error));
+  EXPECT_EQ(Error, "replacement string contained trailing backslash");
+  
+  // Backreferences: \0 is the whole match; out-of-range ones expand to "".
+  EXPECT_EQ("aa1234bber", Regex("a[0-9]+b").sub("a\\0b", "a1234ber", &Error));
+  EXPECT_EQ(Error, "");
+
+  EXPECT_EQ("a1234ber", Regex("a([0-9]+)b").sub("a\\1b", "a1234ber", &Error));
+  EXPECT_EQ(Error, "");
+
+  EXPECT_EQ("aber", Regex("a[0-9]+b").sub("a\\100b", "a1234ber", &Error));
+  EXPECT_EQ(Error, "invalid backreference string '100'");
+}
+
 }





More information about the llvm-commits mailing list