[lld] 270d3fa - [COFF] Add and use a zero-copy tokenizer for .drectve

Reid Kleckner via llvm-commits llvm-commits at lists.llvm.org
Sat May 2 10:47:09 PDT 2020


Author: Reid Kleckner
Date: 2020-05-02T10:47:02-07:00
New Revision: 270d3faf6e0d09ec00ba51b46241534bc6455256

URL: https://github.com/llvm/llvm-project/commit/270d3faf6e0d09ec00ba51b46241534bc6455256
DIFF: https://github.com/llvm/llvm-project/commit/270d3faf6e0d09ec00ba51b46241534bc6455256.diff

LOG: [COFF] Add and use a zero-copy tokenizer for .drectve

This generalizes the main Windows command line tokenizer to be able to
produce StringRef substrings as well as freshly copied C strings. The
implementation is still shared with the normal tokenizer, which is
important, because we have unit tests for that.

.drective sections can be very long. They can potentially list up to
every symbol in the object file by name. It is worth avoiding these
string copies.

This saves a lot of memory when linking chrome.dll with PGO
instrumentation:

             BEFORE      AFTER      % IMP
peak memory: 6657.76MB   4983.54MB  -25%
real:        4m30.875s   2m26.250s  -46%

The time improvement may not be real, my machine was noisy while running
this, but that the peak memory usage improvement should be real.

This change may also help apps that heavily use dllexport annotations,
because those also use linker directives in object files. Apps that do
not use many directives are unlikely to be affected.

Reviewed By: thakis

Differential Revision: https://reviews.llvm.org/D79262

Added: 
    

Modified: 
    lld/COFF/DriverUtils.cpp
    llvm/include/llvm/Support/CommandLine.h
    llvm/lib/Support/CommandLine.cpp

Removed: 
    


################################################################################
diff  --git a/lld/COFF/DriverUtils.cpp b/lld/COFF/DriverUtils.cpp
index 5c6210a0c27a..daaddad1239d 100644
--- a/lld/COFF/DriverUtils.cpp
+++ b/lld/COFF/DriverUtils.cpp
@@ -861,19 +861,25 @@ opt::InputArgList ArgParser::parse(ArrayRef<const char *> argv) {
 }
 
 // Tokenizes and parses a given string as command line in .drective section.
-// /EXPORT options are processed in fastpath.
 ParsedDirectives ArgParser::parseDirectives(StringRef s) {
   ParsedDirectives result;
   SmallVector<const char *, 16> rest;
 
-  for (StringRef tok : tokenize(s)) {
+  // Handle /EXPORT and /INCLUDE in a fast path. These directives can appear for
+  // potentially every symbol in the object, so they must be handled quickly.
+  SmallVector<StringRef, 16> tokens;
+  cl::TokenizeWindowsCommandLineNoCopy(s, saver, tokens);
+  for (StringRef tok : tokens) {
     if (tok.startswith_lower("/export:") || tok.startswith_lower("-export:"))
       result.exports.push_back(tok.substr(strlen("/export:")));
     else if (tok.startswith_lower("/include:") ||
              tok.startswith_lower("-include:"))
       result.includes.push_back(tok.substr(strlen("/include:")));
-    else
-      rest.push_back(tok.data());
+    else {
+      // Save non-null-terminated strings to make proper C strings.
+      bool HasNul = tok.data()[tok.size()] == '\0';
+      rest.push_back(HasNul ? tok.data() : saver.save(tok).data());
+    }
   }
 
   // Make InputArgList from unparsed string vectors.

diff  --git a/llvm/include/llvm/Support/CommandLine.h b/llvm/include/llvm/Support/CommandLine.h
index 2c87d8fbbaad..9a75cbc4e90d 100644
--- a/llvm/include/llvm/Support/CommandLine.h
+++ b/llvm/include/llvm/Support/CommandLine.h
@@ -2027,6 +2027,13 @@ void TokenizeWindowsCommandLine(StringRef Source, StringSaver &Saver,
                                 SmallVectorImpl<const char *> &NewArgv,
                                 bool MarkEOLs = false);
 
+/// Tokenizes a Windows command line while attempting to avoid copies. If no
+/// quoting or escaping was used, this produces substrings of the original
+/// string. If a token requires unquoting, it will be allocated with the
+/// StringSaver.
+void TokenizeWindowsCommandLineNoCopy(StringRef Source, StringSaver &Saver,
+                                      SmallVectorImpl<StringRef> &NewArgv);
+
 /// String tokenization function type.  Should be compatible with either
 /// Windows or Unix command line tokenizers.
 using TokenizerCallback = void (*)(StringRef Source, StringSaver &Saver,

diff  --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp
index 0025806ca235..aa7e79652af9 100644
--- a/llvm/lib/Support/CommandLine.cpp
+++ b/llvm/lib/Support/CommandLine.cpp
@@ -919,91 +919,118 @@ static size_t parseBackslash(StringRef Src, size_t I, SmallString<128> &Token) {
   return I - 1;
 }
 
-void cl::TokenizeWindowsCommandLine(StringRef Src, StringSaver &Saver,
-                                    SmallVectorImpl<const char *> &NewArgv,
-                                    bool MarkEOLs) {
+// Windows treats whitespace, double quotes, and backslashes specially.
+static bool isWindowsSpecialChar(char C) {
+  return isWhitespaceOrNull(C) || C == '\\' || C == '\"';
+}
+
+// Windows tokenization implementation. The implementation is designed to be
+// inlined and specialized for the two user entry points.
+static inline void
+tokenizeWindowsCommandLineImpl(StringRef Src, StringSaver &Saver,
+                               function_ref<void(StringRef)> AddToken,
+                               bool AlwaysCopy, function_ref<void()> MarkEOL) {
   SmallString<128> Token;
 
-  // This is a small state machine to consume characters until it reaches the
-  // end of the source string.
+  // Try to do as much work inside the state machine as possible.
   enum { INIT, UNQUOTED, QUOTED } State = INIT;
-  for (size_t I = 0, E = Src.size(); I != E; ++I) {
-    char C = Src[I];
-
-    // INIT state indicates that the current input index is at the start of
-    // the string or between tokens.
-    if (State == INIT) {
-      if (isWhitespaceOrNull(C)) {
-        // Mark the end of lines in response files
-        if (MarkEOLs && C == '\n')
-          NewArgv.push_back(nullptr);
-        continue;
+  for (size_t I = 0, E = Src.size(); I < E; ++I) {
+    switch (State) {
+    case INIT: {
+      assert(Token.empty() && "token should be empty in initial state");
+      // Eat whitespace before a token.
+      while (I < E && isWhitespaceOrNull(Src[I])) {
+        if (Src[I] == '\n')
+          MarkEOL();
+        ++I;
       }
-      if (C == '"') {
+      // Stop if this was trailing whitespace.
+      if (I >= E)
+        break;
+      size_t Start = I;
+      while (I < E && !isWindowsSpecialChar(Src[I]))
+        ++I;
+      StringRef NormalChars = Src.slice(Start, I);
+      if (I >= E || isWhitespaceOrNull(Src[I])) {
+        if (I < E && Src[I] == '\n')
+          MarkEOL();
+        // No special characters: slice out the substring and start the next
+        // token. Copy the string if the caller asks us to.
+        AddToken(AlwaysCopy ? Saver.save(NormalChars) : NormalChars);
+      } else if (Src[I] == '\"') {
+        Token += NormalChars;
         State = QUOTED;
-        continue;
-      }
-      if (C == '\\') {
+      } else if (Src[I] == '\\') {
+        Token += NormalChars;
         I = parseBackslash(Src, I, Token);
         State = UNQUOTED;
-        continue;
+      } else {
+        llvm_unreachable("unexpected special character");
       }
-      Token.push_back(C);
-      State = UNQUOTED;
-      continue;
+      break;
     }
 
-    // UNQUOTED state means that it's reading a token not quoted by double
-    // quotes.
-    if (State == UNQUOTED) {
-      // Whitespace means the end of the token.
-      if (isWhitespaceOrNull(C)) {
-        NewArgv.push_back(Saver.save(StringRef(Token)).data());
+    case UNQUOTED:
+      if (isWhitespaceOrNull(Src[I])) {
+        // Whitespace means the end of the token. If we are in this state, the
+        // token must have contained a special character, so we must copy the
+        // token.
+        AddToken(Saver.save(Token.str()));
         Token.clear();
+        if (Src[I] == '\n')
+          MarkEOL();
         State = INIT;
-        // Mark the end of lines in response files
-        if (MarkEOLs && C == '\n')
-          NewArgv.push_back(nullptr);
-        continue;
-      }
-      if (C == '"') {
+      } else if (Src[I] == '\"') {
         State = QUOTED;
-        continue;
-      }
-      if (C == '\\') {
+      } else if (Src[I] == '\\') {
         I = parseBackslash(Src, I, Token);
-        continue;
+      } else {
+        Token.push_back(Src[I]);
       }
-      Token.push_back(C);
-      continue;
-    }
+      break;
 
-    // QUOTED state means that it's reading a token quoted by double quotes.
-    if (State == QUOTED) {
-      if (C == '"') {
+    case QUOTED:
+      if (Src[I] == '\"') {
         if (I < (E - 1) && Src[I + 1] == '"') {
           // Consecutive double-quotes inside a quoted string implies one
           // double-quote.
           Token.push_back('"');
-          I = I + 1;
-          continue;
+          ++I;
+        } else {
+          // Otherwise, end the quoted portion and return to the unquoted state.
+          State = UNQUOTED;
         }
-        State = UNQUOTED;
-        continue;
-      }
-      if (C == '\\') {
+      } else if (Src[I] == '\\') {
         I = parseBackslash(Src, I, Token);
-        continue;
+      } else {
+        Token.push_back(Src[I]);
       }
-      Token.push_back(C);
+      break;
     }
   }
-  // Append the last token after hitting EOF with no whitespace.
+
   if (!Token.empty())
-    NewArgv.push_back(Saver.save(StringRef(Token)).data());
-  // Mark the end of response files
-  if (MarkEOLs)
-    NewArgv.push_back(nullptr);
+    AddToken(Saver.save(Token.str()));
+}
+
+void cl::TokenizeWindowsCommandLine(StringRef Src, StringSaver &Saver,
+                                    SmallVectorImpl<const char *> &NewArgv,
+                                    bool MarkEOLs) {
+  auto AddToken = [&](StringRef Tok) { NewArgv.push_back(Tok.data()); };
+  auto OnEOL = [&]() {
+    if (MarkEOLs)
+      NewArgv.push_back(nullptr);
+  };
+  tokenizeWindowsCommandLineImpl(Src, Saver, AddToken,
+                                 /*AlwaysCopy=*/true, OnEOL);
+}
+
+void cl::TokenizeWindowsCommandLineNoCopy(StringRef Src, StringSaver &Saver,
+                                          SmallVectorImpl<StringRef> &NewArgv) {
+  auto AddToken = [&](StringRef Tok) { NewArgv.push_back(Tok); };
+  auto OnEOL = []() {};
+  tokenizeWindowsCommandLineImpl(Src, Saver, AddToken, /*AlwaysCopy=*/false,
+                                 OnEOL);
 }
 
 void cl::tokenizeConfigFile(StringRef Source, StringSaver &Saver,


        


More information about the llvm-commits mailing list