[clang] [clang-format] Support of TableGen identifiers beginning with a number. (PR #78571)
via cfe-commits
cfe-commits at lists.llvm.org
Thu Jan 18 04:58:11 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-clang-format
Author: Hirofumi Nakamura (hnakamura5)
<details>
<summary>Changes</summary>
TableGen allows identifiers to begin with a number.
This patch adds support for recognizing such identifiers.
---
Full diff: https://github.com/llvm/llvm-project/pull/78571.diff
3 Files Affected:
- (modified) clang/lib/Format/FormatTokenLexer.cpp (+43-1)
- (modified) clang/lib/Format/FormatTokenLexer.h (+4)
- (modified) clang/unittests/Format/TokenAnnotatorTest.cpp (+21)
``````````diff
diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp
index 25ac9be57c81a9..f1982533f112c7 100644
--- a/clang/lib/Format/FormatTokenLexer.cpp
+++ b/clang/lib/Format/FormatTokenLexer.cpp
@@ -93,8 +93,10 @@ ArrayRef<FormatToken *> FormatTokenLexer::lex() {
// string literals are correctly identified.
handleCSharpVerbatimAndInterpolatedStrings();
}
- if (Style.isTableGen())
+ if (Style.isTableGen()) {
handleTableGenMultilineString();
+ handleTableGenNumericLikeIdentifier();
+ }
if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
FirstInLineIndex = Tokens.size() - 1;
} while (Tokens.back()->isNot(tok::eof));
@@ -804,6 +806,46 @@ void FormatTokenLexer::handleTableGenMultilineString() {
FirstLineText, MultiLineString->OriginalColumn, Style.TabWidth, Encoding);
}
+void FormatTokenLexer::handleTableGenNumericLikeIdentifier() {
+ FormatToken *Tok = Tokens.back();
+ // TableGen identifiers can begin with digits. Such tokens arrive here lexed
+ // as numeric_constant; any other token kind is left untouched.
+ if (Tok->isNot(tok::numeric_constant))
+ return;
+ StringRef Text = Tok->TokenText;
+ // Empty text, or a leading + or -, cannot be an identifier; leave it as is.
+ if (Text.size() < 1 || Text[0] == '+' || Text[0] == '-')
+ return;
+ // The following check is based on llvm::TGLexer::LexToken.
+ if (isdigit(Text[0])) {
+ size_t I = 0;
+ char NextChar = (char)0;
+ // Consume the leading digits; NextChar ends up as the last character read.
+ do {
+ NextChar = Text[I++];
+ } while (I < Text.size() && isdigit(NextChar));
+ // Scan hit the end of the token: keep it numeric. NOTE(review): this also fires when only the last char is a non-digit (e.g. "2d") - confirm.
+ if (I >= Text.size())
+ return;
+ // Base-prefix character ('x' or 'b'). Neither a leading 0 nor the prefix
+ // being the second character of the token is checked here.
+ if (NextChar == 'x' || NextChar == 'b') {
+ char NextNextChar = Text[I];
+ // The character right after the prefix decides whether this is a number.
+ if (isxdigit(NextNextChar)) {
+ if (NextChar == 'b' && (NextNextChar == '0' || NextNextChar == '1'))
+ return;
+ // Hex digit after 'x' => number. NOTE(review): the isdigit arm also keeps 'b' followed by 2-9 numeric - confirm.
+ if (NextChar == 'x' || isdigit(NextNextChar))
+ return;
+ }
+ }
+ }
+ // Otherwise, this is actually an identifier; retag the token accordingly.
+ Tok->Tok.setKind(tok::identifier);
+ Tok->Tok.setIdentifierInfo(nullptr);
+}
+
void FormatTokenLexer::handleTemplateStrings() {
FormatToken *BacktickToken = Tokens.back();
diff --git a/clang/lib/Format/FormatTokenLexer.h b/clang/lib/Format/FormatTokenLexer.h
index 1dec6bbc41514c..65dd733bd53352 100644
--- a/clang/lib/Format/FormatTokenLexer.h
+++ b/clang/lib/Format/FormatTokenLexer.h
@@ -97,6 +97,10 @@ class FormatTokenLexer {
// Handles TableGen multiline strings. It has the form [{ ... }].
void handleTableGenMultilineString();
+ // Handles TableGen numeric-like identifiers.
+ // They have the form [0-9]*[_a-zA-Z]([_a-zA-Z0-9]*), but only in the
+ // case where the token is not lexed as an integer.
+ void handleTableGenNumericLikeIdentifier();
void tryParsePythonComment();
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index 117d8fe8f7dc12..753e749befa57e 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -2209,6 +2209,27 @@ TEST_F(TokenAnnotatorTest, UnderstandTableGenTokens) {
EXPECT_EQ(Tokens[0]->ColumnWidth, sizeof("[{ It can break\n") - 1);
EXPECT_TRUE(Tokens[0]->IsMultiline);
EXPECT_EQ(Tokens[0]->LastLineColumnWidth, sizeof(" the string. }]") - 1);
+
+ // Identifier tokens. In TableGen, identifiers can begin with a number.
+ // In ambiguous cases, the lexer tries to lex the token as a number.
+ // Even if that attempt fails, it does not fall back to identifier lexing
+ // and reports an error instead.
+ // The ambiguity is not documented. The expected results of these tests are
+ // based on the implementation of llvm::TGLexer::LexToken.
+ Tokens = Annotate("1234");
+ EXPECT_TOKEN(Tokens[0], tok::numeric_constant, TT_Unknown);
+ Tokens = Annotate("0x1abC");
+ EXPECT_TOKEN(Tokens[0], tok::numeric_constant, TT_Unknown);
+ // This is invalid syntax of number, but not an identifier.
+ Tokens = Annotate("0x1234x");
+ EXPECT_TOKEN(Tokens[0], tok::numeric_constant, TT_Unknown);
+ Tokens = Annotate("identifier");
+ EXPECT_TOKEN(Tokens[0], tok::identifier, TT_Unknown);
+ // Identifier beginning with a number.
+ Tokens = Annotate("2dVector");
+ EXPECT_TOKEN(Tokens[0], tok::identifier, TT_Unknown);
+ Tokens = Annotate("01234Vector");
+ EXPECT_TOKEN(Tokens[0], tok::identifier, TT_Unknown);
}
TEST_F(TokenAnnotatorTest, UnderstandConstructors) {
``````````
</details>
https://github.com/llvm/llvm-project/pull/78571
More information about the cfe-commits
mailing list