[clang] [clang-format] TableGen multi line string support. (PR #78032)

Hirofumi Nakamura via cfe-commits cfe-commits at lists.llvm.org
Tue Jan 16 04:12:16 PST 2024


https://github.com/hnakamura5 updated https://github.com/llvm/llvm-project/pull/78032

>From d0767350f26215e86dee039427183630b3f02668 Mon Sep 17 00:00:00 2001
From: hnakamura5 <hnakamura5 at outlook.com>
Date: Sat, 13 Jan 2024 21:44:34 +0900
Subject: [PATCH 1/4] [clang-format] TableGen multi line string support.

---
 clang/lib/Format/ContinuationIndenter.cpp     |  3 +
 clang/lib/Format/FormatToken.h                |  1 +
 clang/lib/Format/FormatTokenLexer.cpp         | 57 +++++++++++++++++++
 clang/lib/Format/FormatTokenLexer.h           |  3 +
 clang/lib/Format/TokenAnnotator.cpp           |  2 +-
 clang/unittests/Format/TokenAnnotatorTest.cpp |  5 ++
 6 files changed, 70 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp
index 102504182c4505..e6eaaa9ab45706 100644
--- a/clang/lib/Format/ContinuationIndenter.cpp
+++ b/clang/lib/Format/ContinuationIndenter.cpp
@@ -1591,6 +1591,9 @@ unsigned ContinuationIndenter::moveStateToNextToken(LineState &State,
     State.StartOfStringLiteral = State.Column + 1;
   if (Current.is(TT_CSharpStringLiteral) && State.StartOfStringLiteral == 0) {
     State.StartOfStringLiteral = State.Column + 1;
+  } else if (Current.is(TT_TableGenMultiLineString) &&
+             State.StartOfStringLiteral == 0) {
+    State.StartOfStringLiteral = State.Column + 1;
   } else if (Current.isStringLiteral() && State.StartOfStringLiteral == 0) {
     State.StartOfStringLiteral = State.Column;
   } else if (!Current.isOneOf(tok::comment, tok::identifier, tok::hash) &&
diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h
index d5ef627f1348d3..dede89f2600150 100644
--- a/clang/lib/Format/FormatToken.h
+++ b/clang/lib/Format/FormatToken.h
@@ -148,6 +148,7 @@ namespace format {
   TYPE(StructLBrace)                                                           \
   TYPE(StructRBrace)                                                           \
   TYPE(StructuredBindingLSquare)                                               \
+  TYPE(TableGenMultiLineString)                                                \
   TYPE(TemplateCloser)                                                         \
   TYPE(TemplateOpener)                                                         \
   TYPE(TemplateString)                                                         \
diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp
index a1fd6dd6effe6c..1060009bdcf131 100644
--- a/clang/lib/Format/FormatTokenLexer.cpp
+++ b/clang/lib/Format/FormatTokenLexer.cpp
@@ -93,6 +93,8 @@ ArrayRef<FormatToken *> FormatTokenLexer::lex() {
       // string literals are correctly identified.
       handleCSharpVerbatimAndInterpolatedStrings();
     }
+    if (Style.isTableGen())
+      handleTableGenMultilineString();
     if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
       FirstInLineIndex = Tokens.size() - 1;
   } while (Tokens.back()->isNot(tok::eof));
@@ -272,6 +274,14 @@ void FormatTokenLexer::tryMergePreviousTokens() {
       return;
     }
   }
+  if (Style.isTableGen()) {
+    if (tryMergeTokens({tok::l_square, tok::l_brace},
+                       TT_TableGenMultiLineString)) {
+      // Multi line string starts with [{
+      Tokens.back()->Tok.setKind(tok::string_literal);
+      return;
+    }
+  }
 }
 
 bool FormatTokenLexer::tryMergeNSStringLiteral() {
@@ -763,6 +773,53 @@ void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() {
   resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset + 1)));
 }
 
+void FormatTokenLexer::handleTableGenMultilineString() {
+  FormatToken *MultiLineString = Tokens.back();
+  if (MultiLineString->isNot(TT_TableGenMultiLineString))
+    return;
+
+  bool PrevIsRBrace = false;
+  const char *FirstBreak = nullptr;
+  const char *LastBreak = nullptr;
+  const char *Begin = MultiLineString->TokenText.begin();
+  // Skip until }], the closer of multi line string found.
+  for (const char *Current = Begin, *End = Lex->getBuffer().end();
+       Current != End; ++Current) {
+    if (PrevIsRBrace && *Current == ']') {
+      // }] is the end of multi line string.
+      if (!FirstBreak)
+        FirstBreak = Current;
+      MultiLineString->TokenText = StringRef(Begin, Current - Begin + 1);
+      // ColumnWidth is only the width of the first line.
+      MultiLineString->ColumnWidth = encoding::columnWidthWithTabs(
+          StringRef(Begin, FirstBreak - Begin + 1),
+          MultiLineString->OriginalColumn, Style.TabWidth, Encoding);
+      if (LastBreak) {
+        // Set LastLineColumnWidth if multi line string has multiple lines.
+        MultiLineString->LastLineColumnWidth = encoding::columnWidthWithTabs(
+            StringRef(LastBreak + 1, Current - LastBreak),
+            MultiLineString->OriginalColumn, Style.TabWidth, Encoding);
+      }
+      resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Current + 1)));
+      return;
+    }
+    PrevIsRBrace = false;
+    if (*Current == '\n') {
+      MultiLineString->IsMultiline = true;
+      // Assure LastBreak is not equal to FirstBreak.
+      if (!FirstBreak)
+        FirstBreak = Current;
+      LastBreak = Current;
+      continue;
+    }
+    if (*Current == '}') {
+      // Memorize '}'. If next character is ']', they are the closer.
+      PrevIsRBrace = true;
+      continue;
+    }
+  }
+}
+
 void FormatTokenLexer::handleTemplateStrings() {
   FormatToken *BacktickToken = Tokens.back();
 
diff --git a/clang/lib/Format/FormatTokenLexer.h b/clang/lib/Format/FormatTokenLexer.h
index bb6a8ab69c1be1..1dec6bbc41514c 100644
--- a/clang/lib/Format/FormatTokenLexer.h
+++ b/clang/lib/Format/FormatTokenLexer.h
@@ -95,6 +95,9 @@ class FormatTokenLexer {
 
   void handleCSharpVerbatimAndInterpolatedStrings();
 
+  // Handles TableGen multiline strings. It has the form [{ ... }].
+  void handleTableGenMultilineString();
+
   void tryParsePythonComment();
 
   bool tryMerge_TMacro();
diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index 24ce18a64348c1..661118970336a2 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -1710,7 +1710,7 @@ class AnnotatingParser {
             TT_UnionLBrace, TT_RequiresClause,
             TT_RequiresClauseInARequiresExpression, TT_RequiresExpression,
             TT_RequiresExpressionLParen, TT_RequiresExpressionLBrace,
-            TT_BracedListLBrace)) {
+            TT_BracedListLBrace, TT_TableGenMultiLineString)) {
       CurrentToken->setType(TT_Unknown);
     }
     CurrentToken->Role.reset();
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index 92f57a77cdaf01..5ca6a76f840bdf 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -2193,6 +2193,11 @@ TEST_F(TokenAnnotatorTest, UnderstandTableGenTokens) {
   ASSERT_TRUE(Keywords.isTableGenDefinition(*Tokens[0]));
   ASSERT_TRUE(Tokens[0]->is(Keywords.kw_def));
   ASSERT_TRUE(Tokens[1]->is(TT_StartOfName));
+
+  // Code, the multiline string token.
+  Tokens = Annotate("[{ code is multiline string }]");
+  ASSERT_EQ(Tokens.size(), 2u) << Tokens;
+  EXPECT_TOKEN(Tokens[0], tok::string_literal, TT_TableGenMultiLineString);
 }
 
 TEST_F(TokenAnnotatorTest, UnderstandConstructors) {

>From bba0a09813b68afa5e7d0eb90da71d7e7453456b Mon Sep 17 00:00:00 2001
From: hnakamura5 <hnakamura5 at outlook.com>
Date: Sun, 14 Jan 2024 14:31:23 +0900
Subject: [PATCH 2/4] Fixed the reviewed points.

---
 clang/lib/Format/FormatToken.h                |  1 +
 clang/lib/Format/FormatTokenLexer.cpp         | 78 ++++++++-----------
 clang/lib/Format/TokenAnnotator.cpp           |  2 +-
 clang/unittests/Format/TokenAnnotatorTest.cpp | 11 +++
 4 files changed, 45 insertions(+), 47 deletions(-)

diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h
index dede89f2600150..666245f401a204 100644
--- a/clang/lib/Format/FormatToken.h
+++ b/clang/lib/Format/FormatToken.h
@@ -434,6 +434,7 @@ struct FormatToken {
     setType(T);
   }
   bool isTypeFinalized() const { return TypeIsFinalized; }
+  void setTypeIsFinalized() { TypeIsFinalized = true; }
 
   /// Used to set an operator precedence explicitly.
   prec::Level ForcedPrecedence = prec::Unknown;
diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp
index 1060009bdcf131..cf8d4193029cb4 100644
--- a/clang/lib/Format/FormatTokenLexer.cpp
+++ b/clang/lib/Format/FormatTokenLexer.cpp
@@ -274,13 +274,13 @@ void FormatTokenLexer::tryMergePreviousTokens() {
       return;
     }
   }
-  if (Style.isTableGen()) {
-    if (tryMergeTokens({tok::l_square, tok::l_brace},
-                       TT_TableGenMultiLineString)) {
-      // Multi line string starts with [{
-      Tokens.back()->Tok.setKind(tok::string_literal);
-      return;
-    }
+  // TableGen's Multi line string starts with [{
+  if (Style.isTableGen() && tryMergeTokens({tok::l_square, tok::l_brace},
+                                           TT_TableGenMultiLineString)) {
+    // This must never be annotated as other types.
+    Tokens.back()->setTypeIsFinalized();
+    Tokens.back()->Tok.setKind(tok::string_literal);
+    return;
   }
 }
 
@@ -778,45 +778,31 @@ void FormatTokenLexer::handleTableGenMultilineString() {
   if (MultiLineString->isNot(TT_TableGenMultiLineString))
     return;
 
-  bool PrevIsRBrace = false;
-  const char *FirstBreak = nullptr;
-  const char *LastBreak = nullptr;
-  const char *Begin = MultiLineString->TokenText.begin();
-  // Skip until }], the closer of multi line string found.
-  for (const char *Current = Begin, *End = Lex->getBuffer().end();
-       Current != End; ++Current) {
-    if (PrevIsRBrace && *Current == ']') {
-      // }] is the end of multi line string.
-      if (!FirstBreak)
-        FirstBreak = Current;
-      MultiLineString->TokenText = StringRef(Begin, Current - Begin + 1);
-      // ColumnWidth is only the width of the first line.
-      MultiLineString->ColumnWidth = encoding::columnWidthWithTabs(
-          StringRef(Begin, FirstBreak - Begin + 1),
-          MultiLineString->OriginalColumn, Style.TabWidth, Encoding);
-      if (LastBreak) {
-        // Set LastLineColumnWidth if multi line string has multiple lines.
-        MultiLineString->LastLineColumnWidth = encoding::columnWidthWithTabs(
-            StringRef(LastBreak + 1, Current - LastBreak),
-            MultiLineString->OriginalColumn, Style.TabWidth, Encoding);
-      }
-      resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Current + 1)));
-      return;
-    }
-    PrevIsRBrace = false;
-    if (*Current == '\n') {
-      MultiLineString->IsMultiline = true;
-      // Assure LastBreak is not equal to FirstBreak.
-      if (!FirstBreak)
-        FirstBreak = Current;
-      LastBreak = Current;
-      continue;
-    }
-    if (*Current == '}') {
-      // Memorize '}'. If next character is ']', they are the closer.
-      PrevIsRBrace = true;
-      continue;
-    }
+  auto OpenOffset = Lex->getCurrentBufferOffset() - 2 /* "[{" */;
+  // "}]" is the end of multi line string.
+  auto CloseOffset = Lex->getBuffer().find("}]", OpenOffset);
+  if (CloseOffset == StringRef::npos)
+    return;
+  auto Text = Lex->getBuffer().substr(OpenOffset, CloseOffset + 2);
+  MultiLineString->TokenText = Text;
+  resetLexer(SourceMgr.getFileOffset(
+      Lex->getSourceLocation(Lex->getBufferLocation() - 2 + Text.size())));
+  // Set ColumnWidth and LastLineColumnWidth.
+  auto FirstLineText = Text;
+  auto FirstBreak = Text.find('\n');
+  if (FirstBreak != StringRef::npos) {
+    MultiLineString->IsMultiline = true;
+    FirstLineText = Text.substr(0, FirstBreak + 1);
+  }
+  // ColumnWidth holds only the width of the first line.
+  MultiLineString->ColumnWidth = encoding::columnWidthWithTabs(
+      FirstLineText, MultiLineString->OriginalColumn, Style.TabWidth, Encoding);
+  auto LastBreak = Text.rfind('\n');
+  if (LastBreak != StringRef::npos) {
+    // Set LastLineColumnWidth if it has multiple lines.
+    MultiLineString->LastLineColumnWidth = encoding::columnWidthWithTabs(
+        Text.substr(LastBreak + 1, Text.size()),
+        MultiLineString->OriginalColumn, Style.TabWidth, Encoding);
   }
 }
 
diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp
index 661118970336a2..24ce18a64348c1 100644
--- a/clang/lib/Format/TokenAnnotator.cpp
+++ b/clang/lib/Format/TokenAnnotator.cpp
@@ -1710,7 +1710,7 @@ class AnnotatingParser {
             TT_UnionLBrace, TT_RequiresClause,
             TT_RequiresClauseInARequiresExpression, TT_RequiresExpression,
             TT_RequiresExpressionLParen, TT_RequiresExpressionLBrace,
-            TT_BracedListLBrace, TT_TableGenMultiLineString)) {
+            TT_BracedListLBrace)) {
       CurrentToken->setType(TT_Unknown);
     }
     CurrentToken->Role.reset();
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index 5ca6a76f840bdf..117d8fe8f7dc12 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -2198,6 +2198,17 @@ TEST_F(TokenAnnotatorTest, UnderstandTableGenTokens) {
   Tokens = Annotate("[{ code is multiline string }]");
   ASSERT_EQ(Tokens.size(), 2u) << Tokens;
   EXPECT_TOKEN(Tokens[0], tok::string_literal, TT_TableGenMultiLineString);
+  EXPECT_FALSE(Tokens[0]->IsMultiline);
+  // Case with multiple lines.
+  Tokens = Annotate("[{ It can break\n"
+                    "   across lines and the line breaks\n"
+                    "   are retained in \n"
+                    "   the string. }]");
+  ASSERT_EQ(Tokens.size(), 2u) << Tokens;
+  EXPECT_TOKEN(Tokens[0], tok::string_literal, TT_TableGenMultiLineString);
+  EXPECT_EQ(Tokens[0]->ColumnWidth, sizeof("[{ It can break\n") - 1);
+  EXPECT_TRUE(Tokens[0]->IsMultiline);
+  EXPECT_EQ(Tokens[0]->LastLineColumnWidth, sizeof("   the string. }]") - 1);
 }
 
 TEST_F(TokenAnnotatorTest, UnderstandConstructors) {

>From 137f614b4e0ffbe0f6305db7b6eb97fd44240d5e Mon Sep 17 00:00:00 2001
From: hnakamura5 <hnakamura5 at outlook.com>
Date: Mon, 15 Jan 2024 21:38:31 +0900
Subject: [PATCH 3/4] Fixed the reviewed points of redundant substr's param and
 line detection

---
 clang/lib/Format/FormatTokenLexer.cpp | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp
index cf8d4193029cb4..1e7475d50c0247 100644
--- a/clang/lib/Format/FormatTokenLexer.cpp
+++ b/clang/lib/Format/FormatTokenLexer.cpp
@@ -787,23 +787,21 @@ void FormatTokenLexer::handleTableGenMultilineString() {
   MultiLineString->TokenText = Text;
   resetLexer(SourceMgr.getFileOffset(
       Lex->getSourceLocation(Lex->getBufferLocation() - 2 + Text.size())));
-  // Set ColumnWidth and LastLineColumnWidth.
   auto FirstLineText = Text;
   auto FirstBreak = Text.find('\n');
+  // Set ColumnWidth and LastLineColumnWidth when it has multiple lines.
   if (FirstBreak != StringRef::npos) {
     MultiLineString->IsMultiline = true;
     FirstLineText = Text.substr(0, FirstBreak + 1);
+    // LastLineColumnWidth holds the width of the last line.
+    auto LastBreak = Text.rfind('\n');
+    MultiLineString->LastLineColumnWidth = encoding::columnWidthWithTabs(
+        Text.substr(LastBreak + 1), MultiLineString->OriginalColumn,
+        Style.TabWidth, Encoding);
   }
   // ColumnWidth holds only the width of the first line.
   MultiLineString->ColumnWidth = encoding::columnWidthWithTabs(
       FirstLineText, MultiLineString->OriginalColumn, Style.TabWidth, Encoding);
-  auto LastBreak = Text.rfind('\n');
-  if (LastBreak != StringRef::npos) {
-    // Set LastLineColumnWidth if it has multiple lines.
-    MultiLineString->LastLineColumnWidth = encoding::columnWidthWithTabs(
-        Text.substr(LastBreak + 1, Text.size()),
-        MultiLineString->OriginalColumn, Style.TabWidth, Encoding);
-  }
 }
 
 void FormatTokenLexer::handleTemplateStrings() {

>From fce68fa403e92a4eb97605944751aeac9e0ac3c1 Mon Sep 17 00:00:00 2001
From: hnakamura5 <hnakamura5 at outlook.com>
Date: Tue, 16 Jan 2024 20:58:35 +0900
Subject: [PATCH 4/4] Removed SetTypeIsFinalized.

---
 clang/lib/Format/FormatToken.h        | 1 -
 clang/lib/Format/FormatTokenLexer.cpp | 4 ++--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h
index 666245f401a204..dede89f2600150 100644
--- a/clang/lib/Format/FormatToken.h
+++ b/clang/lib/Format/FormatToken.h
@@ -434,7 +434,6 @@ struct FormatToken {
     setType(T);
   }
   bool isTypeFinalized() const { return TypeIsFinalized; }
-  void setTypeIsFinalized() { TypeIsFinalized = true; }
 
   /// Used to set an operator precedence explicitly.
   prec::Level ForcedPrecedence = prec::Unknown;
diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp
index 1e7475d50c0247..25ac9be57c81a9 100644
--- a/clang/lib/Format/FormatTokenLexer.cpp
+++ b/clang/lib/Format/FormatTokenLexer.cpp
@@ -277,8 +277,8 @@ void FormatTokenLexer::tryMergePreviousTokens() {
   // TableGen's Multi line string starts with [{
   if (Style.isTableGen() && tryMergeTokens({tok::l_square, tok::l_brace},
                                            TT_TableGenMultiLineString)) {
-    // This must never be annotated as other types.
-    Tokens.back()->setTypeIsFinalized();
+    // Set again with finalizing. This must never be annotated as other types.
+    Tokens.back()->setFinalizedType(TT_TableGenMultiLineString);
     Tokens.back()->Tok.setKind(tok::string_literal);
     return;
   }



More information about the cfe-commits mailing list