[llvm] [TableGen] Remove explicit recursion in LexToken (PR #143697)
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 11 07:58:27 PDT 2025
https://github.com/jayfoad updated https://github.com/llvm/llvm-project/pull/143697
>From b08e3994d93440791b3733f073dd0e1869d42955 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Wed, 11 Jun 2025 13:56:04 +0100
Subject: [PATCH 1/2] [TableGen] Remove explicit recursion in LexToken
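When profiling a Release+Asserts build of llvm-tblgen I noticed that
TGLexer::LexToken was calling itself once for every whitespace character
it skipped, so lexing a run of hundreds of space characters recursed
hundreds of levels deep. Replace the recursive calls with a jump back to
the start of the function.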
---
llvm/lib/TableGen/TGLexer.cpp | 20 +++++++++++---------
1 file changed, 11 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/TableGen/TGLexer.cpp b/llvm/lib/TableGen/TGLexer.cpp
index 46487cba9453e..5a3f86372beca 100644
--- a/llvm/lib/TableGen/TGLexer.cpp
+++ b/llvm/lib/TableGen/TGLexer.cpp
@@ -174,6 +174,7 @@ int TGLexer::peekNextChar(int Index) const {
}
tgtok::TokKind TGLexer::LexToken(bool FileOrLineStart) {
+restart:
TokStart = CurPtr;
// This always consumes at least one character.
int CurChar = getNextChar();
@@ -188,12 +189,12 @@ tgtok::TokKind TGLexer::LexToken(bool FileOrLineStart) {
return ReturnError(TokStart, "unexpected character");
case EOF:
// Lex next token, if we just left an include file.
- // Note that leaving an include file means that the next
- // symbol is located at the end of the 'include "..."'
- // construct, so LexToken() is called with default
- // false parameter.
- if (processEOF())
- return LexToken();
+ if (processEOF()) {
+ // Leaving an include file means that the next symbol is located at the
+ // end of the 'include "..."' construct.
+ FileOrLineStart = false;
+ goto restart;
+ }
// Return EOF denoting the end of lexing.
return tgtok::Eof;
@@ -238,10 +239,11 @@ tgtok::TokKind TGLexer::LexToken(bool FileOrLineStart) {
case ' ':
case '\t':
// Ignore whitespace.
- return LexToken(FileOrLineStart);
+ goto restart;
case '\n':
// Ignore whitespace, and identify the new line.
- return LexToken(true);
+ FileOrLineStart = true;
+ goto restart;
case '/':
// If this is the start of a // comment, skip until the end of the line or
// the end of the buffer.
@@ -252,7 +254,7 @@ tgtok::TokKind TGLexer::LexToken(bool FileOrLineStart) {
return tgtok::Error;
} else // Otherwise, this is an error.
return ReturnError(TokStart, "unexpected character");
- return LexToken(FileOrLineStart);
+ goto restart;
case '-': case '+':
case '0': case '1': case '2': case '3': case '4': case '5': case '6':
case '7': case '8': case '9': {
>From 781fc1b5d8e5c3de1c5c63c1751c3ee333e7e619 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Wed, 11 Jun 2025 15:57:02 +0100
Subject: [PATCH 2/2] Use `while (true)` and `break`
---
llvm/lib/TableGen/TGLexer.cpp | 269 ++++++++++++++++++++--------------
1 file changed, 156 insertions(+), 113 deletions(-)
diff --git a/llvm/lib/TableGen/TGLexer.cpp b/llvm/lib/TableGen/TGLexer.cpp
index 5a3f86372beca..8ceb72694bdec 100644
--- a/llvm/lib/TableGen/TGLexer.cpp
+++ b/llvm/lib/TableGen/TGLexer.cpp
@@ -174,131 +174,174 @@ int TGLexer::peekNextChar(int Index) const {
}
tgtok::TokKind TGLexer::LexToken(bool FileOrLineStart) {
-restart:
- TokStart = CurPtr;
- // This always consumes at least one character.
- int CurChar = getNextChar();
+ while (true) {
+ TokStart = CurPtr;
+ // This always consumes at least one character.
+ int CurChar = getNextChar();
- switch (CurChar) {
- default:
- // Handle letters: [a-zA-Z_]
- if (isValidIDChar(CurChar, /*First=*/true))
- return LexIdentifier();
-
- // Unknown character, emit an error.
- return ReturnError(TokStart, "unexpected character");
- case EOF:
- // Lex next token, if we just left an include file.
- if (processEOF()) {
- // Leaving an include file means that the next symbol is located at the
- // end of the 'include "..."' construct.
- FileOrLineStart = false;
- goto restart;
- }
+ switch (CurChar) {
+ default:
+ // Handle letters: [a-zA-Z_]
+ if (isValidIDChar(CurChar, /*First=*/true))
+ return LexIdentifier();
- // Return EOF denoting the end of lexing.
- return tgtok::Eof;
-
- case ':': return tgtok::colon;
- case ';': return tgtok::semi;
- case ',': return tgtok::comma;
- case '<': return tgtok::less;
- case '>': return tgtok::greater;
- case ']': return tgtok::r_square;
- case '{': return tgtok::l_brace;
- case '}': return tgtok::r_brace;
- case '(': return tgtok::l_paren;
- case ')': return tgtok::r_paren;
- case '=': return tgtok::equal;
- case '?': return tgtok::question;
- case '#':
- if (FileOrLineStart) {
- tgtok::TokKind Kind = prepIsDirective();
- if (Kind != tgtok::Error)
- return lexPreprocessor(Kind);
- }
+ // Unknown character, emit an error.
+ return ReturnError(TokStart, "unexpected character");
+ case EOF:
+ // Lex next token, if we just left an include file.
+ if (processEOF()) {
+ // Leaving an include file means that the next symbol is located at the
+ // end of the 'include "..."' construct.
+ FileOrLineStart = false;
+ break;
+ }
- return tgtok::paste;
+ // Return EOF denoting the end of lexing.
+ return tgtok::Eof;
+
+ case ':':
+ return tgtok::colon;
+ case ';':
+ return tgtok::semi;
+ case ',':
+ return tgtok::comma;
+ case '<':
+ return tgtok::less;
+ case '>':
+ return tgtok::greater;
+ case ']':
+ return tgtok::r_square;
+ case '{':
+ return tgtok::l_brace;
+ case '}':
+ return tgtok::r_brace;
+ case '(':
+ return tgtok::l_paren;
+ case ')':
+ return tgtok::r_paren;
+ case '=':
+ return tgtok::equal;
+ case '?':
+ return tgtok::question;
+ case '#':
+ if (FileOrLineStart) {
+ tgtok::TokKind Kind = prepIsDirective();
+ if (Kind != tgtok::Error)
+ return lexPreprocessor(Kind);
+ }
+
+ return tgtok::paste;
- // The period is a separate case so we can recognize the "..."
- // range punctuator.
- case '.':
- if (peekNextChar(0) == '.') {
- ++CurPtr; // Eat second dot.
+ // The period is a separate case so we can recognize the "..."
+ // range punctuator.
+ case '.':
if (peekNextChar(0) == '.') {
- ++CurPtr; // Eat third dot.
- return tgtok::dotdotdot;
+ ++CurPtr; // Eat second dot.
+ if (peekNextChar(0) == '.') {
+ ++CurPtr; // Eat third dot.
+ return tgtok::dotdotdot;
+ }
+ return ReturnError(TokStart, "invalid '..' punctuation");
}
- return ReturnError(TokStart, "invalid '..' punctuation");
- }
- return tgtok::dot;
+ return tgtok::dot;
- case '\r':
- llvm_unreachable("getNextChar() must never return '\r'");
+ case '\r':
+ llvm_unreachable("getNextChar() must never return '\r'");
- case ' ':
- case '\t':
- // Ignore whitespace.
- goto restart;
- case '\n':
- // Ignore whitespace, and identify the new line.
- FileOrLineStart = true;
- goto restart;
- case '/':
- // If this is the start of a // comment, skip until the end of the line or
- // the end of the buffer.
- if (*CurPtr == '/')
- SkipBCPLComment();
- else if (*CurPtr == '*') {
- if (SkipCComment())
- return tgtok::Error;
- } else // Otherwise, this is an error.
- return ReturnError(TokStart, "unexpected character");
- goto restart;
- case '-': case '+':
- case '0': case '1': case '2': case '3': case '4': case '5': case '6':
- case '7': case '8': case '9': {
- int NextChar = 0;
- if (isDigit(CurChar)) {
- // Allow identifiers to start with a number if it is followed by
- // an identifier. This can happen with paste operations like
- // foo#8i.
- int i = 0;
- do {
- NextChar = peekNextChar(i++);
- } while (isDigit(NextChar));
-
- if (NextChar == 'x' || NextChar == 'b') {
- // If this is [0-9]b[01] or [0-9]x[0-9A-fa-f] this is most
- // likely a number.
- int NextNextChar = peekNextChar(i);
- switch (NextNextChar) {
- default:
- break;
- case '0': case '1':
- if (NextChar == 'b')
- return LexNumber();
- [[fallthrough]];
- case '2': case '3': case '4': case '5':
- case '6': case '7': case '8': case '9':
- case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
- case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
- if (NextChar == 'x')
- return LexNumber();
- break;
+ case ' ':
+ case '\t':
+ // Ignore whitespace.
+ break;
+ case '\n':
+ // Ignore whitespace, and identify the new line.
+ FileOrLineStart = true;
+ break;
+ case '/':
+ // If this is the start of a // comment, skip until the end of the line or
+ // the end of the buffer.
+ if (*CurPtr == '/')
+ SkipBCPLComment();
+ else if (*CurPtr == '*') {
+ if (SkipCComment())
+ return tgtok::Error;
+ } else // Otherwise, this is an error.
+ return ReturnError(TokStart, "unexpected character");
+ break;
+ case '-':
+ case '+':
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9': {
+ int NextChar = 0;
+ if (isDigit(CurChar)) {
+ // Allow identifiers to start with a number if it is followed by
+ // an identifier. This can happen with paste operations like
+ // foo#8i.
+ int i = 0;
+ do {
+ NextChar = peekNextChar(i++);
+ } while (isDigit(NextChar));
+
+ if (NextChar == 'x' || NextChar == 'b') {
+ // If this is [0-9]b[01] or [0-9]x[0-9A-fa-f] this is most
+ // likely a number.
+ int NextNextChar = peekNextChar(i);
+ switch (NextNextChar) {
+ default:
+ break;
+ case '0':
+ case '1':
+ if (NextChar == 'b')
+ return LexNumber();
+ [[fallthrough]];
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ case 'a':
+ case 'b':
+ case 'c':
+ case 'd':
+ case 'e':
+ case 'f':
+ case 'A':
+ case 'B':
+ case 'C':
+ case 'D':
+ case 'E':
+ case 'F':
+ if (NextChar == 'x')
+ return LexNumber();
+ break;
+ }
}
}
- }
- if (isValidIDChar(NextChar, /*First=*/true))
- return LexIdentifier();
+ if (isValidIDChar(NextChar, /*First=*/true))
+ return LexIdentifier();
- return LexNumber();
- }
- case '"': return LexString();
- case '$': return LexVarName();
- case '[': return LexBracket();
- case '!': return LexExclaim();
+ return LexNumber();
+ }
+ case '"':
+ return LexString();
+ case '$':
+ return LexVarName();
+ case '[':
+ return LexBracket();
+ case '!':
+ return LexExclaim();
+ }
}
}
More information about the llvm-commits
mailing list