[llvm] 32814df - [Windows] Fix handling of \" in program name on cmd line.

Simon Tatham via llvm-commits llvm-commits at lists.llvm.org
Tue May 3 03:58:25 PDT 2022


Author: Simon Tatham
Date: 2022-05-03T11:57:50+01:00
New Revision: 32814df442690d4673759296d850804773a7ea5b

URL: https://github.com/llvm/llvm-project/commit/32814df442690d4673759296d850804773a7ea5b
DIFF: https://github.com/llvm/llvm-project/commit/32814df442690d4673759296d850804773a7ea5b.diff

LOG: [Windows] Fix handling of \" in program name on cmd line.

Bugzilla #47579: if you invoke clang on Windows via a pathname in
which a quoted section closes just after a backslash, e.g.

  "C:\Program Files\Whatever\"clang.exe

then cmd.exe and CreateProcess will correctly find the binary, because
when they parse the program name at the start of the command line,
they don't regard the \ before the " as having any kind of escaping
effect. This is different from the behaviour of the Windows standard C
library when it parses the rest of the command line, which would
consider that \" not to close the quoted string.

But this confuses windows::GetCommandLineArguments, because the
Windows API function GetCommandLineW() will return a command line
containing that \" sequence, and cl::TokenizeWindowsCommandLine will
tokenize the whole string according to the C library's rules. So it
will misidentify where the program name stops and the arguments start.

To fix this, I've introduced a new variant function
cl::TokenizeWindowsCommandLineFull(), intended to be applied to the
string returned from GetCommandLineW(). It parses the first word of
the command line according to CreateProcess's rules, considering \ to
never be an escaping character; thereafter, it switches over to the C
library rules for the rest of the command line.

Reviewed By: hans

Differential Revision: https://reviews.llvm.org/D122914

Added: 
    

Modified: 
    llvm/include/llvm/Support/CommandLine.h
    llvm/lib/Support/CommandLine.cpp
    llvm/lib/Support/Windows/Process.inc
    llvm/unittests/Support/CommandLineTest.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Support/CommandLine.h b/llvm/include/llvm/Support/CommandLine.h
index 3765aca33066c..6461164fceffb 100644
--- a/llvm/include/llvm/Support/CommandLine.h
+++ b/llvm/include/llvm/Support/CommandLine.h
@@ -1996,12 +1996,15 @@ void TokenizeGNUCommandLine(StringRef Source, StringSaver &Saver,
                             SmallVectorImpl<const char *> &NewArgv,
                             bool MarkEOLs = false);
 
-/// Tokenizes a Windows command line which may contain quotes and escaped
-/// quotes.
+/// Tokenizes a string of Windows command line arguments, which may contain
+/// quotes and escaped quotes.
 ///
 /// See MSDN docs for CommandLineToArgvW for information on the quoting rules.
 /// http://msdn.microsoft.com/en-us/library/windows/desktop/17w5ykft(v=vs.85).aspx
 ///
+/// For handling a full Windows command line including the executable name at
+/// the start, see TokenizeWindowsCommandLineFull below.
+///
 /// \param [in] Source The string to be split on whitespace with quotes.
 /// \param [in] Saver Delegates back to the caller for saving parsed strings.
 /// \param [in] MarkEOLs true if tokenizing a response file and you want end of
@@ -2018,6 +2021,23 @@ void TokenizeWindowsCommandLine(StringRef Source, StringSaver &Saver,
 void TokenizeWindowsCommandLineNoCopy(StringRef Source, StringSaver &Saver,
                                       SmallVectorImpl<StringRef> &NewArgv);
 
+/// Tokenizes a Windows full command line, including command name at the start.
+///
+/// This uses the same syntax rules as TokenizeWindowsCommandLine for all but
+/// the first token. But the first token is expected to be parsed as the
+/// executable file name in the way CreateProcess would do it, rather than the
+/// way the C library startup code would do it: CreateProcess does not consider
+/// that \ is ever an escape character (because " is not a valid filename char,
+/// hence there's never a need to escape it to be used literally).
+///
+/// Parameters are the same as for TokenizeWindowsCommandLine. In particular,
+/// if you set MarkEOLs = true, then the first word of every line will be
+/// parsed using the special rules for command names, making this function
+/// suitable for parsing a file full of commands to execute.
+void TokenizeWindowsCommandLineFull(StringRef Source, StringSaver &Saver,
+                                    SmallVectorImpl<const char *> &NewArgv,
+                                    bool MarkEOLs = false);
+
 /// String tokenization function type.  Should be compatible with either
 /// Windows or Unix command line tokenizers.
 using TokenizerCallback = void (*)(StringRef Source, StringSaver &Saver,

diff  --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp
index 2f749bf7a6989..3e5fff960e9c1 100644
--- a/llvm/lib/Support/CommandLine.cpp
+++ b/llvm/lib/Support/CommandLine.cpp
@@ -918,21 +918,34 @@ static size_t parseBackslash(StringRef Src, size_t I, SmallString<128> &Token) {
   return I - 1;
 }
 
-// Windows treats whitespace, double quotes, and backslashes specially.
+// Windows treats whitespace, double quotes, and backslashes specially, except
+// when parsing the first token of a full command line, in which case
+// backslashes are not special.
 static bool isWindowsSpecialChar(char C) {
   return isWhitespaceOrNull(C) || C == '\\' || C == '\"';
 }
+static bool isWindowsSpecialCharInCommandName(char C) {
+  return isWhitespaceOrNull(C) || C == '\"';
+}
 
 // Windows tokenization implementation. The implementation is designed to be
 // inlined and specialized for the two user entry points.
-static inline void
-tokenizeWindowsCommandLineImpl(StringRef Src, StringSaver &Saver,
-                               function_ref<void(StringRef)> AddToken,
-                               bool AlwaysCopy, function_ref<void()> MarkEOL) {
+static inline void tokenizeWindowsCommandLineImpl(
+    StringRef Src, StringSaver &Saver, function_ref<void(StringRef)> AddToken,
+    bool AlwaysCopy, function_ref<void()> MarkEOL, bool InitialCommandName) {
   SmallString<128> Token;
 
+  // Sometimes, this function will be handling a full command line including an
+  // executable pathname at the start. In that situation, the initial pathname
+  // needs 
diff erent handling from the following arguments, because when
+  // CreateProcess or cmd.exe scans the pathname, it doesn't treat \ as
+  // escaping the quote character, whereas when libc scans the rest of the
+  // command line, it does.
+  bool CommandName = InitialCommandName;
+
   // Try to do as much work inside the state machine as possible.
   enum { INIT, UNQUOTED, QUOTED } State = INIT;
+
   for (size_t I = 0, E = Src.size(); I < E; ++I) {
     switch (State) {
     case INIT: {
@@ -947,19 +960,29 @@ tokenizeWindowsCommandLineImpl(StringRef Src, StringSaver &Saver,
       if (I >= E)
         break;
       size_t Start = I;
-      while (I < E && !isWindowsSpecialChar(Src[I]))
-        ++I;
+      if (CommandName) {
+        while (I < E && !isWindowsSpecialCharInCommandName(Src[I]))
+          ++I;
+      } else {
+        while (I < E && !isWindowsSpecialChar(Src[I]))
+          ++I;
+      }
       StringRef NormalChars = Src.slice(Start, I);
       if (I >= E || isWhitespaceOrNull(Src[I])) {
         // No special characters: slice out the substring and start the next
         // token. Copy the string if the caller asks us to.
         AddToken(AlwaysCopy ? Saver.save(NormalChars) : NormalChars);
-        if (I < E && Src[I] == '\n')
+        if (I < E && Src[I] == '\n') {
           MarkEOL();
+          CommandName = InitialCommandName;
+        } else {
+          CommandName = false;
+        }
       } else if (Src[I] == '\"') {
         Token += NormalChars;
         State = QUOTED;
       } else if (Src[I] == '\\') {
+        assert(!CommandName && "or else we'd have treated it as a normal char");
         Token += NormalChars;
         I = parseBackslash(Src, I, Token);
         State = UNQUOTED;
@@ -976,12 +999,16 @@ tokenizeWindowsCommandLineImpl(StringRef Src, StringSaver &Saver,
         // token.
         AddToken(Saver.save(Token.str()));
         Token.clear();
-        if (Src[I] == '\n')
+        if (Src[I] == '\n') {
+          CommandName = InitialCommandName;
           MarkEOL();
+        } else {
+          CommandName = false;
+        }
         State = INIT;
       } else if (Src[I] == '\"') {
         State = QUOTED;
-      } else if (Src[I] == '\\') {
+      } else if (Src[I] == '\\' && !CommandName) {
         I = parseBackslash(Src, I, Token);
       } else {
         Token.push_back(Src[I]);
@@ -999,7 +1026,7 @@ tokenizeWindowsCommandLineImpl(StringRef Src, StringSaver &Saver,
           // Otherwise, end the quoted portion and return to the unquoted state.
           State = UNQUOTED;
         }
-      } else if (Src[I] == '\\') {
+      } else if (Src[I] == '\\' && !CommandName) {
         I = parseBackslash(Src, I, Token);
       } else {
         Token.push_back(Src[I]);
@@ -1021,7 +1048,7 @@ void cl::TokenizeWindowsCommandLine(StringRef Src, StringSaver &Saver,
       NewArgv.push_back(nullptr);
   };
   tokenizeWindowsCommandLineImpl(Src, Saver, AddToken,
-                                 /*AlwaysCopy=*/true, OnEOL);
+                                 /*AlwaysCopy=*/true, OnEOL, false);
 }
 
 void cl::TokenizeWindowsCommandLineNoCopy(StringRef Src, StringSaver &Saver,
@@ -1029,7 +1056,19 @@ void cl::TokenizeWindowsCommandLineNoCopy(StringRef Src, StringSaver &Saver,
   auto AddToken = [&](StringRef Tok) { NewArgv.push_back(Tok); };
   auto OnEOL = []() {};
   tokenizeWindowsCommandLineImpl(Src, Saver, AddToken, /*AlwaysCopy=*/false,
-                                 OnEOL);
+                                 OnEOL, false);
+}
+
+void cl::TokenizeWindowsCommandLineFull(StringRef Src, StringSaver &Saver,
+                                        SmallVectorImpl<const char *> &NewArgv,
+                                        bool MarkEOLs) {
+  auto AddToken = [&](StringRef Tok) { NewArgv.push_back(Tok.data()); };
+  auto OnEOL = [&]() {
+    if (MarkEOLs)
+      NewArgv.push_back(nullptr);
+  };
+  tokenizeWindowsCommandLineImpl(Src, Saver, AddToken,
+                                 /*AlwaysCopy=*/true, OnEOL, true);
 }
 
 void cl::tokenizeConfigFile(StringRef Source, StringSaver &Saver,

diff  --git a/llvm/lib/Support/Windows/Process.inc b/llvm/lib/Support/Windows/Process.inc
index b1af298d9e83e..e4156747ca686 100644
--- a/llvm/lib/Support/Windows/Process.inc
+++ b/llvm/lib/Support/Windows/Process.inc
@@ -247,7 +247,7 @@ windows::GetCommandLineArguments(SmallVectorImpl<const char *> &Args,
 
   SmallVector<const char *, 20> TmpArgs;
   StringSaver Saver(Alloc);
-  cl::TokenizeWindowsCommandLine(Cmd, Saver, TmpArgs, /*MarkEOLs=*/false);
+  cl::TokenizeWindowsCommandLineFull(Cmd, Saver, TmpArgs, /*MarkEOLs=*/false);
 
   for (const char *Arg : TmpArgs) {
     EC = WildcardExpand(Arg, Args, Saver);

diff  --git a/llvm/unittests/Support/CommandLineTest.cpp b/llvm/unittests/Support/CommandLineTest.cpp
index 7f751e5e101bd..ec9901aa9501b 100644
--- a/llvm/unittests/Support/CommandLineTest.cpp
+++ b/llvm/unittests/Support/CommandLineTest.cpp
@@ -257,6 +257,24 @@ TEST(CommandLineTest, TokenizeWindowsCommandLineQuotedLastArgument) {
   testCommandLineTokenizer(cl::TokenizeWindowsCommandLine, Input3, Output3);
 }
 
+TEST(CommandLineTest, TokenizeWindowsCommandLineExeName) {
+  const char Input1[] =
+      R"("C:\Program Files\Whatever\"clang.exe z.c -DY=\"x\")";
+  const char *const Output1[] = {"C:\\Program Files\\Whatever\\clang.exe",
+                                 "z.c", "-DY=\"x\""};
+  testCommandLineTokenizer(cl::TokenizeWindowsCommandLineFull, Input1, Output1);
+
+  const char Input2[] = "\"a\\\"b c\\\"d\n\"e\\\"f g\\\"h\n";
+  const char *const Output2[] = {"a\\b", "c\"d", nullptr,
+                                 "e\\f", "g\"h", nullptr};
+  testCommandLineTokenizer(cl::TokenizeWindowsCommandLineFull, Input2, Output2,
+                           /*MarkEOLs=*/true);
+
+  const char Input3[] = R"(\\server\share\subdir\clang.exe)";
+  const char *const Output3[] = {"\\\\server\\share\\subdir\\clang.exe"};
+  testCommandLineTokenizer(cl::TokenizeWindowsCommandLineFull, Input3, Output3);
+}
+
 TEST(CommandLineTest, TokenizeAndMarkEOLs) {
   // Clang uses EOL marking in response files to support options that consume
   // the rest of the arguments on the current line, but do not consume arguments


        


More information about the llvm-commits mailing list