r250648 - clang-format: [JS] Handle string literals spanning character classes.

Sun Oct 18 00:02:29 PDT 2015

Author: djasper
Date: Sun Oct 18 02:02:28 2015
New Revision: 250648

URL: http://llvm.org/viewvc/llvm-project?rev=250648&view=rev
Log:
clang-format: [JS] Handle string literals spanning character classes.

If a RegExp contains a character group with a quote (/["]/), the
trailing end of it is first tokenized as a string literal, which leads
to the merging code seeing an unbalanced bracket.

This change parses regex literals from the left hand side. That
simplifies the parsing code and also allows correctly handling escapes
and character classes, hopefully correctly parsing all regex literals.

Patch by Martin Probst, thank you.
Review: http://reviews.llvm.org/D13765

Modified:
    cfe/trunk/lib/Format/Format.cpp
    cfe/trunk/unittests/Format/FormatTestJS.cpp

Modified: cfe/trunk/lib/Format/Format.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Format/Format.cpp?rev=250648&r1=250647&r2=250648&view=diff
==============================================================================

--- cfe/trunk/lib/Format/Format.cpp (original)
+++ cfe/trunk/lib/Format/Format.cpp Sun Oct 18 02:02:28 2015
@@ -732,6 +732,8 @@ public:
     assert(FirstInLineIndex == 0);
     do {
       Tokens.push_back(getNextToken());
+      if (Style.Language == FormatStyle::LK_JavaScript)
+        tryParseJSRegexLiteral();
       tryMergePreviousTokens();
       if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
         FirstInLineIndex = Tokens.size() - 1;
@@ -751,10 +753,6 @@ private:
       return;
 
     if (Style.Language == FormatStyle::LK_JavaScript) {
-      if (tryMergeJSRegexLiteral())
-        return;
-      if (tryMergeEscapeSequence())
-        return;
       if (tryMergeTemplateString())
         return;
 
@@ -826,107 +824,97 @@ private:
     return true;
   }
 
-  // Tries to merge an escape sequence, i.e. a "\\" and the following
-  // character. Use e.g. inside JavaScript regex literals.
-  bool tryMergeEscapeSequence() {
-    if (Tokens.size() < 2)
-      return false;
-    FormatToken *Previous = Tokens[Tokens.size() - 2];
-    if (Previous->isNot(tok::unknown) || Previous->TokenText != "\\")
-      return false;
-    ++Previous->ColumnWidth;
-    StringRef Text = Previous->TokenText;
-    Previous->TokenText = StringRef(Text.data(), Text.size() + 1);
-    resetLexer(SourceMgr.getFileOffset(Tokens.back()->Tok.getLocation()) + 1);
-    Tokens.resize(Tokens.size() - 1);
-    Column = Previous->OriginalColumn + Previous->ColumnWidth;
+  // Returns \c true if \p Tok can only be followed by an operand in JavaScript.
+  bool precedesOperand(FormatToken *Tok) {
+    // NB: tok::r_paren is deliberately missing although it can precede an
+    // operand, e.g. in `if (foo) /bar/.exec(...);`. Usually a ')' ends an
+    // expression (`(a + b) / 2`); the regex case is rare enough to ignore.
+    return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
+                        tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
+                        tok::colon, tok::question, tok::tilde) ||
+           Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
+                        tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
+                        tok::kw_typeof, Keywords.kw_instanceof,
+                        Keywords.kw_in) ||
+           Tok->isBinaryOperator();
+  }
+
+  bool canPrecedeRegexLiteral(FormatToken *Prev) {
+    if (!Prev)
+      return true;
+
+    // Regex literals can only follow prefix unary operators, not postfix
+    // ones. The '++'/'--' is a prefix operator only if the token before it
+    // introduces an operand (or there is none); otherwise it is postfix and
+    // the slash is a division operator rather than the start of a regex.
+    if (Prev->isOneOf(tok::plusplus, tok::minusminus))
+      return (Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]));
+
+    // The previous token must introduce an operand location where regex
+    // literals can occur.
+    if (!precedesOperand(Prev))
+      return false;
+
     return true;
   }
 
-  // Try to determine whether the current token ends a JavaScript regex literal.
-  // We heuristically assume that this is a regex literal if we find two
-  // unescaped slashes on a line and the token before the first slash is one of
-  // "(;,{}![:?", a binary operator or 'return', as those cannot be followed by
-  // a division.
-  bool tryMergeJSRegexLiteral() {
-    if (Tokens.size() < 2)
-      return false;
-
-    // If this is a string literal with a slash inside, compute the slash's
-    // offset and try to find the beginning of the regex literal.
-    // Also look at tok::unknown, as it can be an unterminated char literal.
-    size_t SlashInStringPos = StringRef::npos;
-    if (Tokens.back()->isOneOf(tok::string_literal, tok::char_constant,
-                               tok::unknown)) {
-      // Start search from position 1 as otherwise, this is an unknown token
-      // for an unterminated /*-comment which is handled elsewhere.
-      SlashInStringPos = Tokens.back()->TokenText.find('/', 1);
-      if (SlashInStringPos == StringRef::npos)
-        return false;
+  // Tries to parse a JavaScript regex literal starting at the current token,
+  // if that is a '/' or '/=' in a location where JavaScript allows regex
+  // literals. If so, the token becomes a TT_RegexLiteral string_literal that
+  // spans the scanned regex text, and lexing resumes right after that text.
+  void tryParseJSRegexLiteral() {
+    FormatToken *RegexToken = Tokens.back();
+    if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
+      return;
+
+    FormatToken *Prev = nullptr;
+    for (auto I = Tokens.rbegin() + 1, E = Tokens.rend(); I != E; ++I) {
+      // NB: Because previous pointers are not initialized yet, this cannot use
+      // Token.getPreviousNonComment.
+      if ((*I)->isNot(tok::comment)) {
+        Prev = *I;
+        break;
+      }
     }
 
-    // If a regex literal ends in "\//", this gets represented by an unknown
-    // token "\" and a comment.
-    bool MightEndWithEscapedSlash =
-        Tokens.back()->is(tok::comment) &&
-        Tokens.back()->TokenText.startswith("//") &&
-        Tokens[Tokens.size() - 2]->TokenText == "\\";
-    if (!MightEndWithEscapedSlash && SlashInStringPos == StringRef::npos &&
-        (Tokens.back()->isNot(tok::slash) ||
-         (Tokens[Tokens.size() - 2]->is(tok::unknown) &&
-          Tokens[Tokens.size() - 2]->TokenText == "\\")))
-      return false;
+    if (!canPrecedeRegexLiteral(Prev))
+      return;
 
-    unsigned TokenCount = 0;
+    // 'Manually' lex ahead in the current file buffer.
+    const char *Offset = Lex->getBufferLocation();
+    const char *RegexBegin = Offset - RegexToken->TokenText.size();
+    StringRef Buffer = Lex->getBuffer();
     bool InCharacterClass = false;
-    for (auto I = Tokens.rbegin() + 1, E = Tokens.rend(); I != E; ++I) {
-      ++TokenCount;
-      auto Prev = I + 1;
-      while (Prev != E && Prev[0]->is(tok::comment))
-        ++Prev;
-      // Slashes in character classes (delimited by [ and ]) do not need
-      // escaping. Escaping of the squares themselves is already handled by
-      // \c tryMergeEscapeSequence(), a plain tok::r_square must be non-escaped.
-      if (I[0]->is(tok::r_square))
+    bool HaveClosingSlash = false;
+    for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {
+      // Regular expressions are terminated with a '/', which can only be
+      // escaped using '\' or a character class between '[' and ']'.
+      // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
+      switch (*Offset) {
+      case '\\':
+        // Skip the escaped character. NOTE(review): no end-of-buffer check.
+        ++Offset;
+        break;
+      case '[':
         InCharacterClass = true;
-      if (I[0]->is(tok::l_square)) {
-        if (!InCharacterClass)
-          return false;
+        break;
+      case ']':
         InCharacterClass = false;
+        break;
+      case '/':
+        if (!InCharacterClass)
+          HaveClosingSlash = true;
+        break;
       }
-      if (!InCharacterClass && I[0]->isOneOf(tok::slash, tok::slashequal) &&
-          (Prev == E ||
-           ((Prev[0]->isOneOf(tok::l_paren, tok::semi, tok::l_brace,
-                              tok::r_brace, tok::exclaim, tok::l_square,
-                              tok::colon, tok::comma, tok::question,
-                              tok::kw_return) ||
-             Prev[0]->isBinaryOperator())))) {
-        unsigned LastColumn = Tokens.back()->OriginalColumn;
-        SourceLocation Loc = Tokens.back()->Tok.getLocation();
-        if (MightEndWithEscapedSlash) {
-          // This regex literal ends in '\//'. Skip past the '//' of the last
-          // token and re-start lexing from there.
-          resetLexer(SourceMgr.getFileOffset(Loc) + 2);
-        } else if (SlashInStringPos != StringRef::npos) {
-          // This regex literal ends in a string_literal with a slash inside.
-          // Calculate end column and reset lexer appropriately.
-          resetLexer(SourceMgr.getFileOffset(Loc) + SlashInStringPos + 1);
-          LastColumn += SlashInStringPos;
-        }
-        Tokens.resize(Tokens.size() - TokenCount);
-        Tokens.back()->Tok.setKind(tok::unknown);
-        Tokens.back()->Type = TT_RegexLiteral;
-        // Treat regex literals like other string_literals.
-        Tokens.back()->Tok.setKind(tok::string_literal);
-        Tokens.back()->ColumnWidth += LastColumn - I[0]->OriginalColumn;
-        return true;
-      }
-
-      // There can't be a newline inside a regex literal.
-      if (I[0]->NewlinesBefore > 0)
-        return false;
     }
-    return false;
+
+    RegexToken->Type = TT_RegexLiteral;
+    // Treat regex literals like other string_literals.
+    RegexToken->Tok.setKind(tok::string_literal);
+    RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
+    RegexToken->ColumnWidth = RegexToken->TokenText.size();
+
+    resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
   }
 
   bool tryMergeTemplateString() {

Modified: cfe/trunk/unittests/Format/FormatTestJS.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/unittests/Format/FormatTestJS.cpp?rev=250648&r1=250647&r2=250648&view=diff
==============================================================================
--- cfe/trunk/unittests/Format/FormatTestJS.cpp (original)
+++ cfe/trunk/unittests/Format/FormatTestJS.cpp Sun Oct 18 02:02:28 2015
@@ -600,6 +600,13 @@ TEST_F(FormatTestJS, RegexLiteralClassif
 
   // Not regex literals.
   verifyFormat("var a = a / 2 + b / 3;");
+  verifyFormat("var a = a++ / 2;");
+  // Prefix unary can operate on regex literals, not that it makes sense.
+  verifyFormat("var a = ++/a/;");
+
+  // Known issue: a regular expression directly following a closing
+  // parenthesis is not detected and gets formatted as a division.
+  verifyFormat("if (foo) / bar /.exec(baz);");
 }
 
 TEST_F(FormatTestJS, RegexLiteralSpecialCharacters) {
@@ -625,6 +632,9 @@ TEST_F(FormatTestJS, RegexLiteralSpecial
   verifyFormat("var regex = /[\\/]/;");
   verifyFormat("var regex = /\\[/;");
   verifyFormat("var regex = /\\\\[/]/;");
+  verifyFormat("var regex = /}[\"]/;");
+  verifyFormat("var regex = /}[/\"]/;");
+  verifyFormat("var regex = /}[\"/]/;");
 
   verifyFormat("var regex = /\\b/;");
   verifyFormat("var regex = /\\B/;");