r186454 - clang-format: Improve handling of unterminated string literals.

Tue Jul 16 13:28:34 PDT 2013

Author: djasper
Date: Tue Jul 16 15:28:33 2013
New Revision: 186454

URL: http://llvm.org/viewvc/llvm-project?rev=186454&view=rev
Log:
clang-format: Improve handling of unterminated string literals.

Before, clang-format would simply eat these as they were recognized as
whitespace. With this patch, they are mostly left alone.

Modified:
    cfe/trunk/lib/Format/Format.cpp
    cfe/trunk/lib/Format/FormatToken.h
    cfe/trunk/lib/Format/TokenAnnotator.cpp
    cfe/trunk/unittests/Format/FormatTest.cpp

Modified: cfe/trunk/lib/Format/Format.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Format/Format.cpp?rev=186454&r1=186453&r2=186454&view=diff
==============================================================================

--- cfe/trunk/lib/Format/Format.cpp (original)
+++ cfe/trunk/lib/Format/Format.cpp Tue Jul 16 15:28:33 2013
@@ -905,6 +905,10 @@ private:
       // Only break up default narrow strings.
       if (!Current.TokenText.startswith("\""))
         return 0;
+      // Exempts unterminated string literals from line breaking. The user will
+      // likely want to terminate the string before any line breaking is done.
+      if (Current.IsUnterminatedLiteral)
+         return 0;
 
       Token.reset(new BreakableStringLiteral(Current, StartColumn,
                                              Line.InPPDirective, Encoding));
@@ -1259,8 +1263,7 @@ private:
     }
 
     FormatTok = new (Allocator.Allocate()) FormatToken;
-    Lex.LexFromRawLexer(FormatTok->Tok);
-    StringRef Text = rawTokenText(FormatTok->Tok);
+    readRawToken(*FormatTok);
     SourceLocation WhitespaceStart =
         FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
     if (SourceMgr.getFileOffset(WhitespaceStart) == 0)
@@ -1269,16 +1272,16 @@ private:
     // Consume and record whitespace until we find a significant token.
     unsigned WhitespaceLength = TrailingWhitespace;
     while (FormatTok->Tok.is(tok::unknown)) {
-      unsigned Newlines = Text.count('\n');
+      unsigned Newlines = FormatTok->TokenText.count('\n');
       if (Newlines > 0)
-        FormatTok->LastNewlineOffset = WhitespaceLength + Text.rfind('\n') + 1;
+        FormatTok->LastNewlineOffset =
+            WhitespaceLength + FormatTok->TokenText.rfind('\n') + 1;
       FormatTok->NewlinesBefore += Newlines;
-      unsigned EscapedNewlines = Text.count("\\\n");
+      unsigned EscapedNewlines = FormatTok->TokenText.count("\\\n");
       FormatTok->HasUnescapedNewline |= EscapedNewlines != Newlines;
       WhitespaceLength += FormatTok->Tok.getLength();
 
-      Lex.LexFromRawLexer(FormatTok->Tok);
-      Text = rawTokenText(FormatTok->Tok);
+      readRawToken(*FormatTok);
     }
 
     // In case the token starts with escaped newlines, we want to
@@ -1287,30 +1290,31 @@ private:
     // FIXME: What do we want to do with other escaped spaces, and escaped
     // spaces or newlines in the middle of tokens?
     // FIXME: Add a more explicit test.
-    while (Text.size() > 1 && Text[0] == '\\' && Text[1] == '\n') {
+    while (FormatTok->TokenText.size() > 1 && FormatTok->TokenText[0] == '\\' &&
+           FormatTok->TokenText[1] == '\n') {
       // FIXME: ++FormatTok->NewlinesBefore is missing...
       WhitespaceLength += 2;
-      Text = Text.substr(2);
+      FormatTok->TokenText = FormatTok->TokenText.substr(2);
     }
 
     TrailingWhitespace = 0;
     if (FormatTok->Tok.is(tok::comment)) {
-      StringRef UntrimmedText = Text;
-      Text = Text.rtrim();
-      TrailingWhitespace = UntrimmedText.size() - Text.size();
+      StringRef UntrimmedText = FormatTok->TokenText;
+      FormatTok->TokenText = FormatTok->TokenText.rtrim();
+      TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
     } else if (FormatTok->Tok.is(tok::raw_identifier)) {
-      IdentifierInfo &Info = IdentTable.get(Text);
+      IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
       FormatTok->Tok.setIdentifierInfo(&Info);
       FormatTok->Tok.setKind(Info.getTokenID());
     } else if (FormatTok->Tok.is(tok::greatergreater)) {
       FormatTok->Tok.setKind(tok::greater);
-      Text = Text.substr(0, 1);
+      FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
       GreaterStashed = true;
     }
 
     // Now FormatTok is the next non-whitespace token.
-    FormatTok->TokenText = Text;
-    FormatTok->CodePointCount = encoding::getCodePointCount(Text, Encoding);
+    FormatTok->CodePointCount =
+        encoding::getCodePointCount(FormatTok->TokenText, Encoding);
 
     FormatTok->WhitespaceRange = SourceRange(
         WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
@@ -1327,10 +1331,18 @@ private:
   llvm::SpecificBumpPtrAllocator<FormatToken> Allocator;
   SmallVector<FormatToken *, 16> Tokens;
 
-  /// Returns the text of \c FormatTok.
-  StringRef rawTokenText(Token &Tok) {
-    return StringRef(SourceMgr.getCharacterData(Tok.getLocation()),
-                     Tok.getLength());
+  void readRawToken(FormatToken &Tok) {
+    Lex.LexFromRawLexer(Tok.Tok);
+    Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
+                              Tok.Tok.getLength());
+
+    // For formatting, treat unterminated string literals like normal string
+    // literals.
+    if (Tok.is(tok::unknown) && !Tok.TokenText.empty() &&
+        Tok.TokenText[0] == '"') {
+      Tok.Tok.setKind(tok::string_literal);
+      Tok.IsUnterminatedLiteral = true;
+    }
   }
 };
 

Modified: cfe/trunk/lib/Format/FormatToken.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Format/FormatToken.h?rev=186454&r1=186453&r2=186454&view=diff
==============================================================================
--- cfe/trunk/lib/Format/FormatToken.h (original)
+++ cfe/trunk/lib/Format/FormatToken.h Tue Jul 16 15:28:33 2013
@@ -77,11 +77,12 @@ struct FormatToken {
   FormatToken()
       : NewlinesBefore(0), HasUnescapedNewline(false), LastNewlineOffset(0),
         CodePointCount(0), IsFirst(false), MustBreakBefore(false),
-        BlockKind(BK_Unknown), Type(TT_Unknown), SpacesRequiredBefore(0),
-        CanBreakBefore(false), ClosesTemplateDeclaration(false),
-        ParameterCount(0), PackingKind(PPK_Inconclusive), TotalLength(0),
-        UnbreakableTailLength(0), BindingStrength(0), SplitPenalty(0),
-        LongestObjCSelectorName(0), FakeRParens(0), LastInChainOfCalls(false),
+        IsUnterminatedLiteral(false), BlockKind(BK_Unknown), Type(TT_Unknown),
+        SpacesRequiredBefore(0), CanBreakBefore(false),
+        ClosesTemplateDeclaration(false), ParameterCount(0),
+        PackingKind(PPK_Inconclusive), TotalLength(0), UnbreakableTailLength(0),
+        BindingStrength(0), SplitPenalty(0), LongestObjCSelectorName(0),
+        FakeRParens(0), LastInChainOfCalls(false),
         PartOfMultiVariableDeclStmt(false), MatchingParen(NULL), Previous(NULL),
         Next(NULL) {}
 
@@ -133,6 +134,9 @@ struct FormatToken {
   /// escaped newlines.
   StringRef TokenText;
 
+  /// \brief Set to \c true if this token is an unterminated literal.
+  bool IsUnterminatedLiteral;
+
   /// \brief Contains the kind of block if this token is a brace.
   BraceBlockKind BlockKind;
 

Modified: cfe/trunk/lib/Format/TokenAnnotator.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Format/TokenAnnotator.cpp?rev=186454&r1=186453&r2=186454&view=diff
==============================================================================
--- cfe/trunk/lib/Format/TokenAnnotator.cpp (original)
+++ cfe/trunk/lib/Format/TokenAnnotator.cpp Tue Jul 16 15:28:33 2013
@@ -948,13 +948,14 @@ void TokenAnnotator::calculateFormatting
       Current->SpacesRequiredBefore =
           spaceRequiredBefore(Line, *Current) ? 1 : 0;
 
-    if (Current->MustBreakBefore) {
-    } else if (Current->is(tok::comment)) {
+    if (Current->is(tok::comment)) {
       Current->MustBreakBefore = Current->NewlinesBefore > 0;
     } else if (Current->Previous->isTrailingComment() ||
                (Current->is(tok::string_literal) &&
                 Current->Previous->is(tok::string_literal))) {
       Current->MustBreakBefore = true;
+    } else if (Current->Previous->IsUnterminatedLiteral) {
+      Current->MustBreakBefore = true;
     } else if (Current->is(tok::lessless) && Current->Next &&
                Current->Previous->is(tok::string_literal) &&
                Current->Next->is(tok::string_literal)) {
@@ -969,8 +970,6 @@ void TokenAnnotator::calculateFormatting
                Current->getNextNonComment() &&
                Current->getNextNonComment()->is(tok::string_literal)) {
       Current->MustBreakBefore = true;
-    } else {
-      Current->MustBreakBefore = false;
     }
     Current->CanBreakBefore =
         Current->MustBreakBefore || canBreakBefore(Line, *Current);

Modified: cfe/trunk/unittests/Format/FormatTest.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/unittests/Format/FormatTest.cpp?rev=186454&r1=186453&r2=186454&view=diff
==============================================================================
--- cfe/trunk/unittests/Format/FormatTest.cpp (original)
+++ cfe/trunk/unittests/Format/FormatTest.cpp Tue Jul 16 15:28:33 2013
@@ -5004,6 +5004,11 @@ TEST_F(FormatTest, SkipsUnknownStringLit
             format("L\"unsupported literal\";", getGoogleStyleWithColumns(15)));
   EXPECT_EQ("R\"x(raw literal)x\";",
             format("R\"x(raw literal)x\";", getGoogleStyleWithColumns(15)));
+  verifyFormat("string a = \"unterminated;");
+  EXPECT_EQ("function(\"unterminated,\n"
+            "         OtherParameter);",
+            format("function(  \"unterminated,\n"
+                   "    OtherParameter);"));
 }
 
 TEST_F(FormatTest, DoesNotTryToParseUDLiteralsInPreCpp11Code) {