[clang] [clang-format] Support of TableGen identifiers beginning with a number. (PR #78571)

Fri Jan 19 07:35:19 PST 2024

https://github.com/hnakamura5 updated https://github.com/llvm/llvm-project/pull/78571

>From b472c08735b3ce3b6f7b81e499a2ef16c3faad4a Mon Sep 17 00:00:00 2001
From: hnakamura5 <hnakamura5 at outlook.com>
Date: Thu, 18 Jan 2024 21:49:06 +0900
Subject: [PATCH 1/2] [clang-format] Support of TableGen identifiers beginning
 with a number.

---
 clang/lib/Format/FormatTokenLexer.cpp         | 44 ++++++++++++++++++-
 clang/lib/Format/FormatTokenLexer.h           |  4 ++
 clang/unittests/Format/TokenAnnotatorTest.cpp | 21 +++++++++
 3 files changed, 68 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp
index 25ac9be57c81a9a..f1982533f112c75 100644
--- a/clang/lib/Format/FormatTokenLexer.cpp
+++ b/clang/lib/Format/FormatTokenLexer.cpp
@@ -93,8 +93,10 @@ ArrayRef<FormatToken *> FormatTokenLexer::lex() {
       // string literals are correctly identified.
       handleCSharpVerbatimAndInterpolatedStrings();
     }
-    if (Style.isTableGen())
+    if (Style.isTableGen()) {
       handleTableGenMultilineString();
+      handleTableGenNumericLikeIdentifier();
+    }
     if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
       FirstInLineIndex = Tokens.size() - 1;
   } while (Tokens.back()->isNot(tok::eof));
@@ -804,6 +806,46 @@ void FormatTokenLexer::handleTableGenMultilineString() {
       FirstLineText, MultiLineString->OriginalColumn, Style.TabWidth, Encoding);
 }
 
+void FormatTokenLexer::handleTableGenNumericLikeIdentifier() {
+  FormatToken *Tok = Tokens.back();
+  // TableGen identifiers can begin with digits. Such tokens are lexed as
+  // numeric_constant now.
+  if (Tok->isNot(tok::numeric_constant))
+    return;
+  StringRef Text = Tok->TokenText;
+  // Identifiers cannot begin with + or -.
+  if (Text.size() < 1 || Text[0] == '+' || Text[0] == '-')
+    return;
+  // The following check is based on llvm::TGLexer::LexToken.
+  if (isdigit(Text[0])) {
+    size_t I = 0;
+    char NextChar = (char)0;
+    // Identifiers in TalbleGen may begin with digits. Skip to first non-digit.
+    do {
+      NextChar = Text[I++];
+    } while (I < Text.size() && isdigit(NextChar));
+    // All the characters are digits.
+    if (I >= Text.size())
+      return;
+    // Base character. But it does not check the first 0 and that the base is
+    // the second character.
+    if (NextChar == 'x' || NextChar == 'b') {
+      char NextNextChar = Text[I];
+      // This is regarded as binary number.
+      if (isxdigit(NextNextChar)) {
+        if (NextChar == 'b' && (NextNextChar == '0' || NextNextChar == '1'))
+          return;
+        // Regarded as hex number or decimal number.
+        if (NextChar == 'x' || isdigit(NextNextChar))
+          return;
+      }
+    }
+  }
+  // Otherwise, this is actually a identifier.
+  Tok->Tok.setKind(tok::identifier);
+  Tok->Tok.setIdentifierInfo(nullptr);
+}
+
 void FormatTokenLexer::handleTemplateStrings() {
   FormatToken *BacktickToken = Tokens.back();
 
diff --git a/clang/lib/Format/FormatTokenLexer.h b/clang/lib/Format/FormatTokenLexer.h
index 1dec6bbc41514cd..65dd733bd53352a 100644
--- a/clang/lib/Format/FormatTokenLexer.h
+++ b/clang/lib/Format/FormatTokenLexer.h
@@ -97,6 +97,10 @@ class FormatTokenLexer {
 
   // Handles TableGen multiline strings. It has the form [{ ... }].
   void handleTableGenMultilineString();
+  // Handles TableGen numeric like identifiers.
+  // They have a forms of [0-9]*[_a-zA-Z]([_a-zA-Z0-9]*). But limited to the
+  // case it is not lexed as an integer.
+  void handleTableGenNumericLikeIdentifier();
 
   void tryParsePythonComment();
 
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index 117d8fe8f7dc12e..753e749befa57e9 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -2209,6 +2209,27 @@ TEST_F(TokenAnnotatorTest, UnderstandTableGenTokens) {
   EXPECT_EQ(Tokens[0]->ColumnWidth, sizeof("[{ It can break\n") - 1);
   EXPECT_TRUE(Tokens[0]->IsMultiline);
   EXPECT_EQ(Tokens[0]->LastLineColumnWidth, sizeof("   the string. }]") - 1);
+
+  // Identifier tokens. In TableGen, identifiers can begin with a number.
+  // In ambiguous cases, the lexer tries to lex it as a number.
+  // Even if the try fails, it does not fall back to identifier lexing and
+  // regard as an error.
+  // The ambiguity is not documented. The result of those tests are based on the
+  // implementation of llvm::TGLexer::LexToken.
+  Tokens = Annotate("1234");
+  EXPECT_TOKEN(Tokens[0], tok::numeric_constant, TT_Unknown);
+  Tokens = Annotate("0x1abC");
+  EXPECT_TOKEN(Tokens[0], tok::numeric_constant, TT_Unknown);
+  // This is invalid syntax of number, but not an identifier.
+  Tokens = Annotate("0x1234x");
+  EXPECT_TOKEN(Tokens[0], tok::numeric_constant, TT_Unknown);
+  Tokens = Annotate("identifier");
+  EXPECT_TOKEN(Tokens[0], tok::identifier, TT_Unknown);
+  // Identifier beginning with a number.
+  Tokens = Annotate("2dVector");
+  EXPECT_TOKEN(Tokens[0], tok::identifier, TT_Unknown);
+  Tokens = Annotate("01234Vector");
+  EXPECT_TOKEN(Tokens[0], tok::identifier, TT_Unknown);
 }
 
 TEST_F(TokenAnnotatorTest, UnderstandConstructors) {

>From 70e6c7cf3f41571697ce5eb37ed1bc20d161b247 Mon Sep 17 00:00:00 2001
From: hnakamura5 <hnakamura5 at outlook.com>
Date: Sat, 20 Jan 2024 00:30:59 +0900
Subject: [PATCH 2/2] Changed the algorithm of
 handleTableGenNumericLikeIdentifier

---
 clang/lib/Format/FormatTokenLexer.cpp         | 52 +++++++++----------
 clang/unittests/Format/TokenAnnotatorTest.cpp |  2 +
 2 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp
index f1982533f112c75..52a55ea23b5f2f7 100644
--- a/clang/lib/Format/FormatTokenLexer.cpp
+++ b/clang/lib/Format/FormatTokenLexer.cpp
@@ -813,37 +813,35 @@ void FormatTokenLexer::handleTableGenNumericLikeIdentifier() {
   if (Tok->isNot(tok::numeric_constant))
     return;
   StringRef Text = Tok->TokenText;
-  // Identifiers cannot begin with + or -.
+  // The following check is based on llvm::TGLexer::LexToken.
+  // That lexes the token as a number if any of the following holds:
+  // 1. It starts with '+', '-'.
+  // 2. All the characters are digits.
+  // 3. The first non-digit character is 'b', and the next is '0' or '1'.
+  // 4. The first non-digit character is 'x', and the next is a hex digit.
+  // Note that in the case 3 and 4, if the next character does not exists in
+  // this token, the token is an identifier.
   if (Text.size() < 1 || Text[0] == '+' || Text[0] == '-')
     return;
-  // The following check is based on llvm::TGLexer::LexToken.
-  if (isdigit(Text[0])) {
-    size_t I = 0;
-    char NextChar = (char)0;
-    // Identifiers in TalbleGen may begin with digits. Skip to first non-digit.
-    do {
-      NextChar = Text[I++];
-    } while (I < Text.size() && isdigit(NextChar));
-    // All the characters are digits.
-    if (I >= Text.size())
+  const auto NonDigitPos = Text.find_if([](char C) { return !isdigit(C); });
+  // All the characters are digits
+  if (NonDigitPos == StringRef::npos)
+    return;
+  char FirstNonDigit = Text[NonDigitPos];
+  if (NonDigitPos < Text.size() - 1) {
+    char TheNext = Text[NonDigitPos + 1];
+    // Regarded as a binary number.
+    if (FirstNonDigit == 'b' && (TheNext == '0' || TheNext == '1'))
       return;
-    // Base character. But it does not check the first 0 and that the base is
-    // the second character.
-    if (NextChar == 'x' || NextChar == 'b') {
-      char NextNextChar = Text[I];
-      // This is regarded as binary number.
-      if (isxdigit(NextNextChar)) {
-        if (NextChar == 'b' && (NextNextChar == '0' || NextNextChar == '1'))
-          return;
-        // Regarded as hex number or decimal number.
-        if (NextChar == 'x' || isdigit(NextNextChar))
-          return;
-      }
-    }
+    // Regarded as hex number.
+    if (FirstNonDigit == 'x' && isxdigit(TheNext))
+      return;
+  }
+  if (isalpha(FirstNonDigit) || FirstNonDigit == '_') {
+    // This is actually an identifier in TableGen.
+    Tok->Tok.setKind(tok::identifier);
+    Tok->Tok.setIdentifierInfo(nullptr);
   }
-  // Otherwise, this is actually a identifier.
-  Tok->Tok.setKind(tok::identifier);
-  Tok->Tok.setIdentifierInfo(nullptr);
 }
 
 void FormatTokenLexer::handleTemplateStrings() {
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index 753e749befa57e9..64b2abac5cce531 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -2226,6 +2226,8 @@ TEST_F(TokenAnnotatorTest, UnderstandTableGenTokens) {
   Tokens = Annotate("identifier");
   EXPECT_TOKEN(Tokens[0], tok::identifier, TT_Unknown);
   // Identifier beginning with a number.
+  Tokens = Annotate("0x");
+  EXPECT_TOKEN(Tokens[0], tok::identifier, TT_Unknown);
   Tokens = Annotate("2dVector");
   EXPECT_TOKEN(Tokens[0], tok::identifier, TT_Unknown);
   Tokens = Annotate("01234Vector");