[clang] 370bee4 - [clang-format] Fix whitespace counting stuff

Sat Jun 25 19:21:41 PDT 2022

Author: sstwcw
Date: 2022-06-26T01:27:27Z
New Revision: 370bee480139bd37fe8c0c8c03ecd19ed9223f01

URL: https://github.com/llvm/llvm-project/commit/370bee480139bd37fe8c0c8c03ecd19ed9223f01
DIFF: https://github.com/llvm/llvm-project/commit/370bee480139bd37fe8c0c8c03ecd19ed9223f01.diff

LOG: [clang-format] Fix whitespace counting stuff

The current way of counting whitespace would count backticks as
whitespace.  For Verilog stuff we need backticks to be handled
correctly.  For JavaScript the current way is to compare the entire
token text to see if it's a backtick.  However, when the backtick is the
first token following an escaped newline, the escaped newline will be
part of the tok::unknown token.  Verilog has macros and escaped newlines
unlike JavaScript.  So we can't regard an entire tok::unknown token as
whitespace.  Previously, the start of every token would be matched for
newlines.  Now, it is all whitespace instead of just newlines.

The column counting problem has already been fixed for JavaScript in
e71b4cbdd140f059667f84464bd0ac0ebc348387 by counting columns elsewhere.

Reviewed By: HazardyKnusperkeks

Differential Revision: https://reviews.llvm.org/D124748

Added: 
    

Modified: 
    clang/lib/Format/FormatTokenLexer.cpp
    clang/lib/Format/FormatTokenLexer.h

Removed: 
    


################################################################################
diff  --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp
index 214a52780888b..8aee794b3f4f7 100644

--- a/clang/lib/Format/FormatTokenLexer.cpp
+++ b/clang/lib/Format/FormatTokenLexer.cpp
@@ -840,6 +840,56 @@ FormatToken *FormatTokenLexer::getStashedToken() {
   return FormatTok;
 }
 
+/// Truncate the current token to the new length and make the lexer continue
+/// from the end of the truncated token. Used for other languages that have
+/// 
diff erent token boundaries, like JavaScript in which a comment ends at a
+/// line break regardless of whether the line break follows a backslash. Also
+/// used to set the lexer to the end of whitespace if the lexer regards
+/// whitespace and an unrecognized symbol as one token.
+void FormatTokenLexer::truncateToken(size_t NewLen) {
+  assert(NewLen <= FormatTok->TokenText.size());
+  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(
+      Lex->getBufferLocation() - FormatTok->TokenText.size() + NewLen)));
+  FormatTok->TokenText = FormatTok->TokenText.substr(0, NewLen);
+  FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
+      FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
+      Encoding);
+  FormatTok->Tok.setLength(NewLen);
+}
+
+/// Count the length of leading whitespace in a token.
+static size_t countLeadingWhitespace(StringRef Text) {
+  // Basically counting the length matched by this regex.
+  // "^([\n\r\f\v \t]|(\\\\|\\?\\?/)[\n\r])+"
+  // Directly using the regex turned out to be slow. With the regex
+  // version formatting all files in this directory took about 1.25
+  // seconds. This version took about 0.5 seconds.
+  const char *Cur = Text.begin();
+  while (Cur < Text.end()) {
+    if (isspace(Cur[0])) {
+      ++Cur;
+    } else if (Cur[0] == '\\' && (Cur[1] == '\n' || Cur[1] == '\r')) {
+      // A '\' followed by a newline always escapes the newline, regardless
+      // of whether there is another '\' before it.
+      // The source has a null byte at the end. So the end of the entire input
+      // isn't reached yet. Also the lexer doesn't break apart an escaped
+      // newline.
+      assert(Text.end() - Cur >= 2);
+      Cur += 2;
+    } else if (Cur[0] == '?' && Cur[1] == '?' && Cur[2] == '/' &&
+               (Cur[3] == '\n' || Cur[3] == '\r')) {
+      // Newlines can also be escaped by a '?' '?' '/' trigraph. By the way, the
+      // characters are quoted individually in this comment because if we write
+      // them together some compilers warn that we have a trigraph in the code.
+      assert(Text.end() - Cur >= 4);
+      Cur += 4;
+    } else {
+      break;
+    }
+  }
+  return Cur - Text.begin();
+}
+
 FormatToken *FormatTokenLexer::getNextToken() {
   if (StateStack.top() == LexerState::TOKEN_STASHED) {
     StateStack.pop();
@@ -854,34 +904,33 @@ FormatToken *FormatTokenLexer::getNextToken() {
   IsFirstToken = false;
 
   // Consume and record whitespace until we find a significant token.
+  // Some tok::unknown tokens are not just whitespace, e.g. whitespace
+  // followed by a symbol such as backtick. Those symbols may be
+  // significant in other languages.
   unsigned WhitespaceLength = TrailingWhitespace;
-  while (FormatTok->is(tok::unknown)) {
+  while (FormatTok->isNot(tok::eof)) {
+    auto LeadingWhitespace = countLeadingWhitespace(FormatTok->TokenText);
+    if (LeadingWhitespace == 0)
+      break;
+    if (LeadingWhitespace < FormatTok->TokenText.size())
+      truncateToken(LeadingWhitespace);
     StringRef Text = FormatTok->TokenText;
-    auto EscapesNewline = [&](int pos) {
-      // A '\r' here is just part of '\r\n'. Skip it.
-      if (pos >= 0 && Text[pos] == '\r')
-        --pos;
-      // See whether there is an odd number of '\' before this.
-      // FIXME: This is wrong. A '\' followed by a newline is always removed,
-      // regardless of whether there is another '\' before it.
-      // FIXME: Newlines can also be escaped by a '?' '?' '/' trigraph.
-      unsigned count = 0;
-      for (; pos >= 0; --pos, ++count)
-        if (Text[pos] != '\\')
-          break;
-      return count & 1;
-    };
-    // FIXME: This miscounts tok:unknown tokens that are not just
-    // whitespace, e.g. a '`' character.
+    bool InEscape = false;
     for (int i = 0, e = Text.size(); i != e; ++i) {
       switch (Text[i]) {
+      case '\r':
+        // If this is a CRLF sequence, break here and the LF will be handled on
+        // the next loop iteration. Otherwise, this is a single Mac CR, treat it
+        // the same as a single LF.
+        if (i + 1 < e && Text[i + 1] == '\n')
+          break;
+        LLVM_FALLTHROUGH;
       case '\n':
         ++FormatTok->NewlinesBefore;
-        FormatTok->HasUnescapedNewline = !EscapesNewline(i - 1);
-        FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
-        Column = 0;
-        break;
-      case '\r':
+        if (!InEscape)
+          FormatTok->HasUnescapedNewline = true;
+        else
+          InEscape = false;
         FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
         Column = 0;
         break;
@@ -897,24 +946,32 @@ FormatToken *FormatTokenLexer::getNextToken() {
             Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0);
         break;
       case '\\':
-        if (i + 1 == e || (Text[i + 1] != '\r' && Text[i + 1] != '\n'))
-          FormatTok->setType(TT_ImplicitStringLiteral);
+      case '?':
+      case '/':
+        // The text was entirely whitespace when this loop was entered. Thus
+        // this has to be an escape sequence.
+        assert(Text.substr(i, 2) == "\\\r" || Text.substr(i, 2) == "\\\n" ||
+               Text.substr(i, 4) == "\?\?/\r" ||
+               Text.substr(i, 4) == "\?\?/\n" ||
+               (i >= 1 && (Text.substr(i - 1, 4) == "\?\?/\r" ||
+                           Text.substr(i - 1, 4) == "\?\?/\n")) ||
+               (i >= 2 && (Text.substr(i - 2, 4) == "\?\?/\r" ||
+                           Text.substr(i - 2, 4) == "\?\?/\n")));
+        InEscape = true;
         break;
       default:
-        FormatTok->setType(TT_ImplicitStringLiteral);
+        // This shouldn't happen.
+        assert(false);
         break;
       }
-      if (FormatTok->getType() == TT_ImplicitStringLiteral)
-        break;
     }
-
-    if (FormatTok->is(TT_ImplicitStringLiteral))
-      break;
-    WhitespaceLength += FormatTok->Tok.getLength();
-
+    WhitespaceLength += Text.size();
     readRawToken(*FormatTok);
   }
 
+  if (FormatTok->is(tok::unknown))
+    FormatTok->setType(TT_ImplicitStringLiteral);
+
   // JavaScript and Java do not allow to escape the end of the line with a
   // backslash. Backslashes are syntax errors in plain source, but can occur in
   // comments. When a single line comment ends with a \, it'll cause the next
@@ -928,42 +985,13 @@ FormatToken *FormatTokenLexer::getNextToken() {
     while (BackslashPos != StringRef::npos) {
       if (BackslashPos + 1 < FormatTok->TokenText.size() &&
           FormatTok->TokenText[BackslashPos + 1] == '\n') {
-        const char *Offset = Lex->getBufferLocation();
-        Offset -= FormatTok->TokenText.size();
-        Offset += BackslashPos + 1;
-        resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
-        FormatTok->TokenText = FormatTok->TokenText.substr(0, BackslashPos + 1);
-        FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
-            FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
-            Encoding);
+        truncateToken(BackslashPos + 1);
         break;
       }
       BackslashPos = FormatTok->TokenText.find('\\', BackslashPos + 1);
     }
   }
 
-  // In case the token starts with escaped newlines, we want to
-  // take them into account as whitespace - this pattern is quite frequent
-  // in macro definitions.
-  // FIXME: Add a more explicit test.
-  while (FormatTok->TokenText.size() > 1 && FormatTok->TokenText[0] == '\\') {
-    unsigned SkippedWhitespace = 0;
-    if (FormatTok->TokenText.size() > 2 &&
-        (FormatTok->TokenText[1] == '\r' && FormatTok->TokenText[2] == '\n')) {
-      SkippedWhitespace = 3;
-    } else if (FormatTok->TokenText[1] == '\n') {
-      SkippedWhitespace = 2;
-    } else {
-      break;
-    }
-
-    ++FormatTok->NewlinesBefore;
-    WhitespaceLength += SkippedWhitespace;
-    FormatTok->LastNewlineOffset = SkippedWhitespace;
-    Column = 0;
-    FormatTok->TokenText = FormatTok->TokenText.substr(SkippedWhitespace);
-  }
-
   FormatTok->WhitespaceRange = SourceRange(
       WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
 

diff  --git a/clang/lib/Format/FormatTokenLexer.h b/clang/lib/Format/FormatTokenLexer.h
index 3e7bad7b71513..4856d29a0f9cc 100644
--- a/clang/lib/Format/FormatTokenLexer.h
+++ b/clang/lib/Format/FormatTokenLexer.h
@@ -92,6 +92,8 @@ class FormatTokenLexer {
 
   bool tryMergeConflictMarkers();
 
+  void truncateToken(size_t NewLen);
+
   FormatToken *getStashedToken();
 
   FormatToken *getNextToken();