[PATCH] Implement TokenizeWindowsCommandLine.
Rui Ueyama
ruiu at google.com
Mon Jul 29 21:14:31 PDT 2013
Hi rnk,
This is a follow-up patch to r187390 that implements the parser for
Windows-style command lines. It follows the rules described
at http://msdn.microsoft.com/en-us/library/windows/desktop/17w5ykft(v=vs.85).aspx
http://llvm-reviews.chandlerc.com/D1235
Files:
lib/Support/CommandLine.cpp
unittests/Support/CommandLineTest.cpp
Index: lib/Support/CommandLine.cpp
===================================================================
--- lib/Support/CommandLine.cpp
+++ lib/Support/CommandLine.cpp
@@ -498,9 +498,110 @@
NewArgv.push_back(Saver.SaveString(Token.c_str()));
}
+/// Decodes one run of backslashes from a Windows-style command line.
+///
+/// Backslashes need special handling there because the same character is used
+/// both as a path separator and to escape a double quote:
+///
+///  * An even number of backslashes followed by a double quote emits one
+///    backslash for every pair.  The double quote is left unconsumed, so the
+///    caller sees it as a quoted-string delimiter.
+///  * An odd number of backslashes followed by a double quote emits one
+///    backslash for every pair, and the remaining backslash-double quote pair
+///    emits a literal double quote.
+///  * Otherwise the backslashes are emitted literally.
+///
+/// \p Src[I] is assumed to be a backslash (it is counted without being
+/// inspected).  Decoded characters are appended to \p Token.  The return value
+/// is the index of the last character consumed -- the end of the run, or the
+/// double quote when that quote was escaped -- so that the caller's loop
+/// increment lands on the first unconsumed character.
+static size_t parseBackslash(StringRef Src, size_t I, SmallString<128> &Token) {
+  const size_t End = Src.size();
+
+  // Src[I] opens the run; advance Next to the first character past the run.
+  size_t Next = I + 1;
+  while (Next != End && Src[Next] == '\\')
+    ++Next;
+  const size_t RunLength = Next - I;
+
+  const bool QuoteFollows = Next != End && Src[Next] == '"';
+  if (!QuoteFollows) {
+    // Not an escape sequence: copy the whole run through unchanged.
+    for (size_t N = 0; N != RunLength; ++N)
+      Token.push_back('\\');
+    return Next - 1;
+  }
+
+  // Every pair of backslashes in front of a double quote collapses to one.
+  for (size_t N = 0; N != RunLength / 2; ++N)
+    Token.push_back('\\');
+  if (RunLength % 2 != 0) {
+    // The unpaired backslash escapes the quote: emit the quote and consume it.
+    Token.push_back('"');
+    return Next;
+  }
+  // Even run: stop on the last backslash so the caller handles the quote.
+  return Next - 1;
+}
+
+/// Splits \p Src into arguments using Windows-style quoting rules and appends
+/// each argument to \p NewArgv.  The argument strings are obtained from
+/// \p Saver.SaveString().
+///
+/// Unquoted whitespace separates arguments, double quotes toggle quoting and
+/// are not part of the argument, and runs of backslashes are decoded by
+/// parseBackslash().  An empty quoted string ("") produces an empty argument,
+/// whether it is followed by whitespace or by the end of the input.
void cl::TokenizeWindowsCommandLine(StringRef Src, StringSaver &Saver,
SmallVectorImpl<const char *> &NewArgv) {
- llvm_unreachable("FIXME not implemented");
+  SmallString<128> Token;
+
+  // This is a small state machine that consumes characters until it reaches
+  // the end of the source string:
+  //   INIT     - at the start of the input or between tokens.
+  //   UNQUOTED - inside a token, outside double quotes.
+  //   QUOTED   - inside a token, inside double quotes.
+  enum { INIT, UNQUOTED, QUOTED } state = INIT;
+  for (size_t I = 0, E = Src.size(); I != E; ++I) {
+    // INIT: skip separators; any other character starts a new token.
+    if (state == INIT) {
+      if (isWhitespace(Src[I]))
+        continue;
+      if (Src[I] == '"') {
+        state = QUOTED;
+        continue;
+      }
+      if (Src[I] == '\\') {
+        // parseBackslash returns the last index it consumed; the loop
+        // increment then advances past it.
+        I = parseBackslash(Src, I, Token);
+        state = UNQUOTED;
+        continue;
+      }
+      Token.push_back(Src[I]);
+      state = UNQUOTED;
+      continue;
+    }
+
+    // UNQUOTED: reading a token that is not inside double quotes.
+    if (state == UNQUOTED) {
+      // Whitespace ends the token.  The token is emitted even when it is
+      // empty, which is how "" becomes an empty argument.
+      if (isWhitespace(Src[I])) {
+        NewArgv.push_back(Saver.SaveString(Token.c_str()));
+        Token.clear();
+        state = INIT;
+        continue;
+      }
+      if (Src[I] == '"') {
+        state = QUOTED;
+        continue;
+      }
+      if (Src[I] == '\\') {
+        I = parseBackslash(Src, I, Token);
+        continue;
+      }
+      Token.push_back(Src[I]);
+      continue;
+    }
+
+    // QUOTED: reading a token inside double quotes; whitespace is literal.
+    if (state == QUOTED) {
+      if (Src[I] == '"') {
+        state = UNQUOTED;
+        continue;
+      }
+      if (Src[I] == '\\') {
+        I = parseBackslash(Src, I, Token);
+        continue;
+      }
+      Token.push_back(Src[I]);
+    }
+  }
+  // Flush the token that was still being read when the input ended.  Test the
+  // state rather than Token.empty(): a trailing "" is an empty argument and
+  // has to be kept, exactly as it is when whitespace follows it.
+  if (state != INIT)
+    NewArgv.push_back(Saver.SaveString(Token.c_str()));
}
static bool ExpandResponseFile(const char *FName, StringSaver &Saver,
Index: unittests/Support/CommandLineTest.cpp
===================================================================
--- unittests/Support/CommandLineTest.cpp
+++ unittests/Support/CommandLineTest.cpp
@@ -125,21 +125,40 @@
}
};
-TEST(CommandLineTest, TokenizeGNUCommandLine) {
- const char *Input = "foo\\ bar \"foo bar\" \'foo bar\' 'foo\\\\bar' "
- "foo\"bar\"baz C:\\src\\foo.cpp \"C:\\src\\foo.cpp\"";
- const char *const Output[] = { "foo bar", "foo bar", "foo bar", "foo\\bar",
- "foobarbaz", "C:\\src\\foo.cpp",
- "C:\\src\\foo.cpp" };
+/// Signature shared by the command-line tokenizers under test: tokenize
+/// Source and append the resulting argument strings to NewArgv, using Saver
+/// to produce each string.
+typedef void ParserFunction(StringRef Source, llvm::cl::StringSaver &Saver,
+ SmallVectorImpl<const char *> &NewArgv);
+
+
+/// Runs the tokenizer \p parse on \p Input and checks the result against the
+/// first \p OutputSize entries of \p Output: the token count must equal
+/// OutputSize, and each token with an index below OutputSize must compare
+/// equal to the expected string at that index.  Every produced string is
+/// passed to free() afterwards (presumably StrDupSaver allocates them with
+/// strdup -- confirm against its definition).
+void testCommandLineTokenizer(ParserFunction *parse, const char *Input,
+ const char *const Output[], size_t OutputSize) {
SmallVector<const char *, 0> Actual;
StrDupSaver Saver;
- cl::TokenizeGNUCommandLine(Input, Saver, Actual);
- EXPECT_EQ(array_lengthof(Output), Actual.size());
+ parse(Input, Saver, Actual);
+ EXPECT_EQ(OutputSize, Actual.size());
for (unsigned I = 0, E = Actual.size(); I != E; ++I) {
- if (I < array_lengthof(Output))
+ if (I < OutputSize)
EXPECT_STREQ(Output[I], Actual[I]);
free(const_cast<char *>(Actual[I]));
}
}
+TEST(CommandLineTest, TokenizeGNUCommandLine) {
+ // With the C++ string escapes removed, the input reads:
+ //   foo\ bar "foo bar" 'foo bar' 'foo\\bar' foo"bar"baz C:\src\foo.cpp "C:\src\foo.cpp"
+ // and is expected to tokenize into the seven strings listed in Output.
+ const char *Input = "foo\\ bar \"foo bar\" \'foo bar\' 'foo\\\\bar' "
+ "foo\"bar\"baz C:\\src\\foo.cpp \"C:\\src\\foo.cpp\"";
+ const char *const Output[] = { "foo bar", "foo bar", "foo bar", "foo\\bar",
+ "foobarbaz", "C:\\src\\foo.cpp",
+ "C:\\src\\foo.cpp" };
+ testCommandLineTokenizer(cl::TokenizeGNUCommandLine, Input, Output,
+ array_lengthof(Output));
+}
+
+TEST(CommandLineTest, TokenizeWindowsCommandLine) {
+ // With the C++ string escapes removed, the input reads:
+ //   a\b c\\d e\\"f g" h\"i j\\\"k "lmn" o pqr "st \"u" \v
+ // Expected decoding:
+ //  * backslashes not followed by a double quote stay literal
+ //    (a\b, c\\d and \v are unchanged);
+ //  * an even run before a quote is halved and the quote delimits
+ //    (e\\"f g" becomes e\f g);
+ //  * an odd run before a quote is halved and the quote is literal
+ //    (h\"i becomes h"i, j\\\"k becomes j\"k, "st \"u" becomes st "u).
+ const char *Input = "a\\b c\\\\d e\\\\\"f g\" h\\\"i j\\\\\\\"k \"lmn\" o pqr "
+ "\"st \\\"u\" \\v";
+ const char *const Output[] = { "a\\b", "c\\\\d", "e\\f g", "h\"i", "j\\\"k",
+ "lmn", "o", "pqr", "st \"u", "\\v" };
+ testCommandLineTokenizer(cl::TokenizeWindowsCommandLine, Input, Output,
+ array_lengthof(Output));
+}
+
} // anonymous namespace
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D1235.1.patch
Type: text/x-patch
Size: 6172 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20130729/e34b36a5/attachment.bin>
More information about the llvm-commits
mailing list