[PATCH] Implement TokenizeWindowsCommandLine.

Rui Ueyama ruiu at google.com
Mon Jul 29 21:36:26 PDT 2013


  - Updated the comment.

Hi rnk,

http://llvm-reviews.chandlerc.com/D1235

CHANGE SINCE LAST DIFF
  http://llvm-reviews.chandlerc.com/D1235?vs=3064&id=3065#toc

Files:
  lib/Support/CommandLine.cpp
  unittests/Support/CommandLineTest.cpp

Index: lib/Support/CommandLine.cpp
===================================================================
--- lib/Support/CommandLine.cpp
+++ lib/Support/CommandLine.cpp
@@ -498,9 +498,111 @@
     NewArgv.push_back(Saver.SaveString(Token.c_str()));
 }
 
+/// Backslashes are interpreted in a rather complicated way in the Windows-style
+/// command line, because they both separate path components and escape double
+/// quotes. This function consumes the run of backslashes starting at Src[I], as
+/// well as the following double quote if it's escaped, appending to Token.
+///
+///  * If an even number of backslashes is followed by a double quote, one
+///    backslash is output for every pair of backslashes, and the double quote
+///    remains unconsumed. It will later be interpreted as the start or end of
+///    a quoted string in the main loop outside of this function.
+///
+///  * If an odd number of backslashes is followed by a double quote, one
+///    backslash is output for every pair of backslashes, and a double quote is
+///    output for the last pair of backslash-double quote. The double quote is
+///    consumed in this case.
+///
+///  * Otherwise, backslashes are interpreted literally.
+static size_t parseBackslash(StringRef Src, size_t I, SmallString<128> &Token) {
+  size_t E = Src.size();
+  int BackslashCount = 0;
+  // Count the run of backslashes; afterwards I is the index just past the run.
+  do {
+    ++I;
+    ++BackslashCount;
+  } while (I != E && Src[I] == '\\');
+
+  bool followedByDoubleQuote = (I != E && Src[I] == '"');
+  if (followedByDoubleQuote) {
+    for (int J = 0; J < BackslashCount / 2; ++J)
+      Token.push_back('\\');
+    if (BackslashCount % 2 == 0)
+      return I - 1; // Quote not consumed: index of the last backslash.
+    Token.push_back('"');
+    return I; // Quote consumed: index of the quote itself.
+  }
+
+  for (int J = 0; J < BackslashCount; ++J)
+    Token.push_back('\\');
+  return I - 1; // Index of the last backslash; the caller's ++I moves past it.
+}
+
 void cl::TokenizeWindowsCommandLine(StringRef Src, StringSaver &Saver,
                                     SmallVectorImpl<const char *> &NewArgv) {
-  llvm_unreachable("FIXME not implemented");
+  SmallString<128> Token;
+
+  // This is a small state machine that splits Src into tokens and appends a
+  // saved copy of each one to NewArgv until it reaches the end of Src.
+  enum { INIT, UNQUOTED, QUOTED } state = INIT;
+  for (size_t I = 0, E = Src.size(); I != E; ++I) {
+    // INIT state indicates that the current input index is at the start of
+    // the string or between tokens.
+    if (state == INIT) {
+      if (isWhitespace(Src[I]))
+        continue;
+      if (Src[I] == '"') {
+        state = QUOTED;
+        continue;
+      }
+      if (Src[I] == '\\') {
+        I = parseBackslash(Src, I, Token); // I is now the last consumed index.
+        state = UNQUOTED;
+        continue;
+      }
+      Token.push_back(Src[I]);
+      state = UNQUOTED;
+      continue;
+    }
+
+    // UNQUOTED state means that it's reading a token not quoted by double
+    // quotes.
+    if (state == UNQUOTED) {
+      // Whitespace means the end of the token.
+      if (isWhitespace(Src[I])) {
+        NewArgv.push_back(Saver.SaveString(Token.c_str()));
+        Token.clear();
+        state = INIT;
+        continue;
+      }
+      if (Src[I] == '"') {
+        state = QUOTED; // A quoted run may start in the middle of a token.
+        continue;
+      }
+      if (Src[I] == '\\') {
+        I = parseBackslash(Src, I, Token);
+        continue;
+      }
+      Token.push_back(Src[I]);
+      continue;
+    }
+
+    // QUOTED state means that it's reading a token quoted by double quotes.
+    if (state == QUOTED) {
+      if (Src[I] == '"') {
+        state = UNQUOTED; // Closing quote; the same token may continue.
+        continue;
+      }
+      if (Src[I] == '\\') {
+        I = parseBackslash(Src, I, Token);
+        continue;
+      }
+      Token.push_back(Src[I]);
+    }
+  }
+  // Append the token in progress at EOF, even an empty one such as a final "".
+  if (state != INIT)
+    NewArgv.push_back(Saver.SaveString(Token.c_str()));
 }
 
 static bool ExpandResponseFile(const char *FName, StringSaver &Saver,
Index: unittests/Support/CommandLineTest.cpp
===================================================================
--- unittests/Support/CommandLineTest.cpp
+++ unittests/Support/CommandLineTest.cpp
@@ -125,21 +125,40 @@
   }
 };
 
-TEST(CommandLineTest, TokenizeGNUCommandLine) {
-  const char *Input = "foo\\ bar \"foo bar\" \'foo bar\' 'foo\\\\bar' "
-                      "foo\"bar\"baz C:\\src\\foo.cpp \"C:\\src\\foo.cpp\"";
-  const char *const Output[] = { "foo bar", "foo bar", "foo bar", "foo\\bar",
-                                 "foobarbaz", "C:\\src\\foo.cpp",
-                                 "C:\\src\\foo.cpp" };
+typedef void ParserFunction(StringRef Source, llvm::cl::StringSaver &Saver,
+                            SmallVectorImpl<const char *> &NewArgv);
+
+
+void testCommandLineTokenizer(ParserFunction *parse, const char *Input,
+                              const char *const Output[], size_t OutputSize) {
   SmallVector<const char *, 0> Actual;
   StrDupSaver Saver;
-  cl::TokenizeGNUCommandLine(Input, Saver, Actual);
-  EXPECT_EQ(array_lengthof(Output), Actual.size());
+  parse(Input, Saver, Actual);
+  EXPECT_EQ(OutputSize, Actual.size());
   for (unsigned I = 0, E = Actual.size(); I != E; ++I) {
-    if (I < array_lengthof(Output))
+    if (I < OutputSize)
       EXPECT_STREQ(Output[I], Actual[I]);
     free(const_cast<char *>(Actual[I]));
   }
 }
 
+TEST(CommandLineTest, TokenizeGNUCommandLine) {
+  const char *Input = "foo\\ bar \"foo bar\" \'foo bar\' 'foo\\\\bar' "
+                      "foo\"bar\"baz C:\\src\\foo.cpp \"C:\\src\\foo.cpp\"";
+  const char *const Output[] = { "foo bar", "foo bar", "foo bar", "foo\\bar",
+                                 "foobarbaz", "C:\\src\\foo.cpp",
+                                 "C:\\src\\foo.cpp" };
+  testCommandLineTokenizer(cl::TokenizeGNUCommandLine, Input, Output,
+                           array_lengthof(Output));
+}
+
+TEST(CommandLineTest, TokenizeWindowsCommandLine) {
+  const char *Input = "a\\b c\\\\d e\\\\\"f g\" h\\\"i j\\\\\\\"k \"lmn\" o pqr "
+                      "\"st \\\"u\" \\v";
+  const char *const Output[] = { "a\\b", "c\\\\d", "e\\f g", "h\"i", "j\\\"k",
+                                 "lmn", "o", "pqr", "st \"u", "\\v" };
+  testCommandLineTokenizer(cl::TokenizeWindowsCommandLine, Input, Output,
+                           array_lengthof(Output));
+}
+
 }  // anonymous namespace
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D1235.2.patch
Type: text/x-patch
Size: 6279 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20130729/543df70f/attachment.bin>


More information about the llvm-commits mailing list