[clang] f9f0919 - [clang-format] Improve support for multiline C# strings

Jonathan Coe via cfe-commits cfe-commits at lists.llvm.org
Thu Jan 30 05:46:12 PST 2020


Author: Jonathan Coe
Date: 2020-01-30T13:45:48Z
New Revision: f9f0919db7ea033a205c87eb08c81c4baaecd846

URL: https://github.com/llvm/llvm-project/commit/f9f0919db7ea033a205c87eb08c81c4baaecd846
DIFF: https://github.com/llvm/llvm-project/commit/f9f0919db7ea033a205c87eb08c81c4baaecd846.diff

LOG: [clang-format] Improve support for multiline C# strings

Reviewers: krasimir

Reviewed By: krasimir

Tags: #clang-format

Differential Revision: https://reviews.llvm.org/D73622

Added: 
    

Modified: 
    clang/lib/Format/ContinuationIndenter.cpp
    clang/lib/Format/FormatTokenLexer.cpp
    clang/lib/Format/FormatTokenLexer.h
    clang/unittests/Format/FormatTestCSharp.cpp

Removed: 
    


################################################################################
diff  --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp
index ec2de35ca0d2..1ea7eb031d36 100644
--- a/clang/lib/Format/ContinuationIndenter.cpp
+++ b/clang/lib/Format/ContinuationIndenter.cpp
@@ -1760,7 +1760,7 @@ ContinuationIndenter::createBreakableToken(const FormatToken &Current,
                                            LineState &State, bool AllowBreak) {
   unsigned StartColumn = State.Column - Current.ColumnWidth;
   if (Current.isStringLiteral()) {
-    // FIXME: String literal breaking is currently disabled for C#,Java and
+    // FIXME: String literal breaking is currently disabled for C#, Java and
     // JavaScript, as it requires strings to be merged using "+" which we
     // don't support.
     if (Style.Language == FormatStyle::LK_Java ||

diff  --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp
index ba0bbf68f12f..98650951c7d0 100644
--- a/clang/lib/Format/FormatTokenLexer.cpp
+++ b/clang/lib/Format/FormatTokenLexer.cpp
@@ -57,6 +57,10 @@ ArrayRef<FormatToken *> FormatTokenLexer::lex() {
     if (Style.Language == FormatStyle::LK_TextProto)
       tryParsePythonComment();
     tryMergePreviousTokens();
+    if (Style.isCSharp())
+      // This needs to come after tokens have been merged so that C#
+      // string literals are correctly identified.
+      handleCSharpVerbatimAndInterpolatedStrings();
     if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
       FirstInLineIndex = Tokens.size() - 1;
   } while (Tokens.back()->Tok.isNot(tok::eof));
@@ -181,12 +185,12 @@ bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
 // Search for verbatim or interpolated string literals @"ABC" or
 // $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to
 // prevent splitting of @, $ and ".
+// Merging of multiline verbatim strings with embedded '"' is handled in
+// handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing.
 bool FormatTokenLexer::tryMergeCSharpStringLiteral() {
   if (Tokens.size() < 2)
     return false;
 
-  auto &CSharpStringLiteral = *(Tokens.end() - 2);
-
   // Interpolated strings could contain { } with " characters inside.
   // $"{x ?? "null"}"
   // should not be split into $"{x ?? ", null, "}" but should treated as a
@@ -236,27 +240,12 @@ bool FormatTokenLexer::tryMergeCSharpStringLiteral() {
     }
   }
 
-  // verbatim strings could contain "" which C# sees as an escaped ".
-  // @"""Hello""" will have been tokenized as @"" "Hello" "" and needs
-  // merging into a single string literal.
+  // Look for @"aaaaaa" or $"aaaaaa".
   auto &String = *(Tokens.end() - 1);
   if (!String->is(tok::string_literal))
     return false;
 
-  if (CSharpStringLiteral->Type == TT_CSharpStringLiteral &&
-      (CSharpStringLiteral->TokenText.startswith(R"(@")") ||
-       CSharpStringLiteral->TokenText.startswith(R"($@")"))) {
-    CSharpStringLiteral->TokenText = StringRef(
-        CSharpStringLiteral->TokenText.begin(),
-        String->TokenText.end() - CSharpStringLiteral->TokenText.begin());
-    CSharpStringLiteral->ColumnWidth += String->ColumnWidth;
-    Tokens.erase(Tokens.end() - 1);
-    return true;
-  }
-
   auto &At = *(Tokens.end() - 2);
-
-  // Look for @"aaaaaa" or $"aaaaaa".
   if (!(At->is(tok::at) || At->TokenText == "$"))
     return false;
 
@@ -498,6 +487,68 @@ void FormatTokenLexer::tryParseJSRegexLiteral() {
   resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
 }
 
+void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() {
+  FormatToken *CSharpStringLiteral = Tokens.back();
+
+  if (CSharpStringLiteral->Type != TT_CSharpStringLiteral)
+    return;
+
+  // Deal with multiline strings.
+  if (!(CSharpStringLiteral->TokenText.startswith(R"(@")") ||
+        CSharpStringLiteral->TokenText.startswith(R"($@")")))
+    return;
+
+  const char *StrBegin =
+      Lex->getBufferLocation() - CSharpStringLiteral->TokenText.size();
+  const char *Offset = StrBegin;
+  if (CSharpStringLiteral->TokenText.startswith(R"(@")"))
+    Offset += 2;
+  else // CSharpStringLiteral->TokenText.startswith(R"($@")")
+    Offset += 3;
+
+  // Look for a terminating '"' in the current file buffer.
+  // Make no effort to format code within an interpolated or verbatim string.
+  for (; Offset != Lex->getBuffer().end(); ++Offset) {
+    if (Offset[0] == '"') {
+      // "" within a verbatim string is an escaped double quote: skip it.
+      if (Offset + 1 < Lex->getBuffer().end() && Offset[1] == '"')
+        ++Offset;
+      else
+        break;
+    }
+  }
+
+  // Make no attempt to format code properly if a verbatim string is
+  // unterminated.
+  if (Offset == Lex->getBuffer().end())
+    return;
+
+  StringRef LiteralText(StrBegin, Offset - StrBegin + 1);
+  CSharpStringLiteral->TokenText = LiteralText;
+
+  // Adjust width for potentially multiline string literals.
+  size_t FirstBreak = LiteralText.find('\n');
+  StringRef FirstLineText = FirstBreak == StringRef::npos
+                                ? LiteralText
+                                : LiteralText.substr(0, FirstBreak);
+  CSharpStringLiteral->ColumnWidth = encoding::columnWidthWithTabs(
+      FirstLineText, CSharpStringLiteral->OriginalColumn, Style.TabWidth,
+      Encoding);
+  size_t LastBreak = LiteralText.rfind('\n');
+  if (LastBreak != StringRef::npos) {
+    CSharpStringLiteral->IsMultiline = true;
+    unsigned StartColumn = 0; // The template tail spans the entire line.
+    CSharpStringLiteral->LastLineColumnWidth = encoding::columnWidthWithTabs(
+        LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn,
+        Style.TabWidth, Encoding);
+  }
+
+  SourceLocation loc = Offset < Lex->getBuffer().end()
+                           ? Lex->getSourceLocation(Offset + 1)
+                           : SourceMgr.getLocForEndOfFile(ID);
+  resetLexer(SourceMgr.getFileOffset(loc));
+}
+
 void FormatTokenLexer::handleTemplateStrings() {
   FormatToken *BacktickToken = Tokens.back();
 

diff  --git a/clang/lib/Format/FormatTokenLexer.h b/clang/lib/Format/FormatTokenLexer.h
index 053b759d2440..be13ac8f6735 100644
--- a/clang/lib/Format/FormatTokenLexer.h
+++ b/clang/lib/Format/FormatTokenLexer.h
@@ -79,6 +79,8 @@ class FormatTokenLexer {
   // nested template parts by balancing curly braces.
   void handleTemplateStrings();
 
+  void handleCSharpVerbatimAndInterpolatedStrings();
+
   void tryParsePythonComment();
 
   bool tryMerge_TMacro();

diff  --git a/clang/unittests/Format/FormatTestCSharp.cpp b/clang/unittests/Format/FormatTestCSharp.cpp
index 222745f219f0..3d1b597174d8 100644
--- a/clang/unittests/Format/FormatTestCSharp.cpp
+++ b/clang/unittests/Format/FormatTestCSharp.cpp
@@ -412,9 +412,9 @@ TEST_F(FormatTestCSharp, CSharpSpaceAfterCStyleCast) {
 TEST_F(FormatTestCSharp, CSharpEscapedQuotesInVerbatimStrings) {
   FormatStyle Style = getGoogleStyle(FormatStyle::LK_CSharp);
 
-  verifyFormat(R"(string str = @"""")", Style);
-  verifyFormat(R"(string str = @"""Hello world""")", Style);
-  verifyFormat(R"(string str = $@"""Hello {friend}""")", Style);
+  verifyFormat(R"(string str = @"""";)", Style);
+  verifyFormat(R"(string str = @"""Hello world""";)", Style);
+  verifyFormat(R"(string str = $@"""Hello {friend}""";)", Style);
 }
 
 TEST_F(FormatTestCSharp, CSharpQuotesInInterpolatedStrings) {
@@ -425,5 +425,37 @@ TEST_F(FormatTestCSharp, CSharpQuotesInInterpolatedStrings) {
   verifyFormat(R"(string str3 = $"{braceCount}}} braces";)", Style);
 }
 
+TEST_F(FormatTestCSharp, CSharpNewlinesInVerbatimStrings) {
+  // Use MS style as Google Style inserts a line break before multiline strings.
+
+  // verifyFormat does not understand multiline C# string-literals
+  // so check the format explicitly.
+
+  FormatStyle Style = getMicrosoftStyle(FormatStyle::LK_CSharp);
+
+  std::string Code = R"(string s1 = $@"some code:
+  class {className} {{
+    {className}() {{}}
+  }}";)";
+
+  EXPECT_EQ(Code, format(Code, Style));
+
+  // Multiline string in the middle of a function call.
+  Code = R"(
+var x = foo(className, $@"some code:
+  class {className} {{
+    {className}() {{}}
+  }}",
+            y);)"; // y aligned with `className` arg.
+
+  EXPECT_EQ(Code, format(Code, Style));
+
+  // Interpolated string with embedded multiline string.
+  Code = R"(Console.WriteLine($"{string.Join(@",
+		", values)}");)";
+
+  EXPECT_EQ(Code, format(Code, Style));
+}
+
 } // namespace format
 } // end namespace clang


        


More information about the cfe-commits mailing list