[llvm] [TableGen] Remove explicit recursion in LexToken (PR #143697)

Wed Jun 11 07:58:27 PDT 2025

https://github.com/jayfoad updated https://github.com/llvm/llvm-project/pull/143697

>From b08e3994d93440791b3733f073dd0e1869d42955 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Wed, 11 Jun 2025 13:56:04 +0100
Subject: [PATCH 1/2] [TableGen] Remove explicit recursion in LexToken

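When profiling a Release+Asserts build of llvm-tblgen, I noticed that
TGLexer::LexToken was calling itself recursively hundreds of times to
lex a run of hundreds of space characters. Replace the self-recursive
calls with a jump back to the start of the function.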
---
 llvm/lib/TableGen/TGLexer.cpp | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/TableGen/TGLexer.cpp b/llvm/lib/TableGen/TGLexer.cpp
index 46487cba9453e..5a3f86372beca 100644
--- a/llvm/lib/TableGen/TGLexer.cpp
+++ b/llvm/lib/TableGen/TGLexer.cpp
@@ -174,6 +174,7 @@ int TGLexer::peekNextChar(int Index) const {
 }
 
 tgtok::TokKind TGLexer::LexToken(bool FileOrLineStart) {
+restart:
   TokStart = CurPtr;
   // This always consumes at least one character.
   int CurChar = getNextChar();
@@ -188,12 +189,12 @@ tgtok::TokKind TGLexer::LexToken(bool FileOrLineStart) {
     return ReturnError(TokStart, "unexpected character");
   case EOF:
     // Lex next token, if we just left an include file.
-    // Note that leaving an include file means that the next
-    // symbol is located at the end of the 'include "..."'
-    // construct, so LexToken() is called with default
-    // false parameter.
-    if (processEOF())
-      return LexToken();
+    if (processEOF()) {
+      // Leaving an include file means that the next symbol is located at the
+      // end of the 'include "..."'  construct.
+      FileOrLineStart = false;
+      goto restart;
+    }
 
     // Return EOF denoting the end of lexing.
     return tgtok::Eof;
@@ -238,10 +239,11 @@ tgtok::TokKind TGLexer::LexToken(bool FileOrLineStart) {
   case ' ':
   case '\t':
     // Ignore whitespace.
-    return LexToken(FileOrLineStart);
+    goto restart;
   case '\n':
     // Ignore whitespace, and identify the new line.
-    return LexToken(true);
+    FileOrLineStart = true;
+    goto restart;
   case '/':
     // If this is the start of a // comment, skip until the end of the line or
     // the end of the buffer.
@@ -252,7 +254,7 @@ tgtok::TokKind TGLexer::LexToken(bool FileOrLineStart) {
         return tgtok::Error;
     } else // Otherwise, this is an error.
       return ReturnError(TokStart, "unexpected character");
-    return LexToken(FileOrLineStart);
+    goto restart;
   case '-': case '+':
   case '0': case '1': case '2': case '3': case '4': case '5': case '6':
   case '7': case '8': case '9': {

>From 781fc1b5d8e5c3de1c5c63c1751c3ee333e7e619 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Wed, 11 Jun 2025 15:57:02 +0100
Subject: [PATCH 2/2] Use `while (true)` and `break`

---
 llvm/lib/TableGen/TGLexer.cpp | 269 ++++++++++++++++++++--------------
 1 file changed, 156 insertions(+), 113 deletions(-)

diff --git a/llvm/lib/TableGen/TGLexer.cpp b/llvm/lib/TableGen/TGLexer.cpp
index 5a3f86372beca..8ceb72694bdec 100644
--- a/llvm/lib/TableGen/TGLexer.cpp
+++ b/llvm/lib/TableGen/TGLexer.cpp
@@ -174,131 +174,174 @@ int TGLexer::peekNextChar(int Index) const {
 }
 
 tgtok::TokKind TGLexer::LexToken(bool FileOrLineStart) {
-restart:
-  TokStart = CurPtr;
-  // This always consumes at least one character.
-  int CurChar = getNextChar();
+  while (true) {
+    TokStart = CurPtr;
+    // This always consumes at least one character.
+    int CurChar = getNextChar();
 
-  switch (CurChar) {
-  default:
-    // Handle letters: [a-zA-Z_]
-    if (isValidIDChar(CurChar, /*First=*/true))
-      return LexIdentifier();
-
-    // Unknown character, emit an error.
-    return ReturnError(TokStart, "unexpected character");
-  case EOF:
-    // Lex next token, if we just left an include file.
-    if (processEOF()) {
-      // Leaving an include file means that the next symbol is located at the
-      // end of the 'include "..."'  construct.
-      FileOrLineStart = false;
-      goto restart;
-    }
+    switch (CurChar) {
+    default:
+      // Handle letters: [a-zA-Z_]
+      if (isValidIDChar(CurChar, /*First=*/true))
+        return LexIdentifier();
 
-    // Return EOF denoting the end of lexing.
-    return tgtok::Eof;
-
-  case ':': return tgtok::colon;
-  case ';': return tgtok::semi;
-  case ',': return tgtok::comma;
-  case '<': return tgtok::less;
-  case '>': return tgtok::greater;
-  case ']': return tgtok::r_square;
-  case '{': return tgtok::l_brace;
-  case '}': return tgtok::r_brace;
-  case '(': return tgtok::l_paren;
-  case ')': return tgtok::r_paren;
-  case '=': return tgtok::equal;
-  case '?': return tgtok::question;
-  case '#':
-    if (FileOrLineStart) {
-      tgtok::TokKind Kind = prepIsDirective();
-      if (Kind != tgtok::Error)
-        return lexPreprocessor(Kind);
-    }
+      // Unknown character, emit an error.
+      return ReturnError(TokStart, "unexpected character");
+    case EOF:
+      // Lex next token, if we just left an include file.
+      if (processEOF()) {
+        // Leaving an include file means that the next symbol is located at the
+        // end of the 'include "..."'  construct.
+        FileOrLineStart = false;
+        break;
+      }
 
-    return tgtok::paste;
+      // Return EOF denoting the end of lexing.
+      return tgtok::Eof;
+
+    case ':':
+      return tgtok::colon;
+    case ';':
+      return tgtok::semi;
+    case ',':
+      return tgtok::comma;
+    case '<':
+      return tgtok::less;
+    case '>':
+      return tgtok::greater;
+    case ']':
+      return tgtok::r_square;
+    case '{':
+      return tgtok::l_brace;
+    case '}':
+      return tgtok::r_brace;
+    case '(':
+      return tgtok::l_paren;
+    case ')':
+      return tgtok::r_paren;
+    case '=':
+      return tgtok::equal;
+    case '?':
+      return tgtok::question;
+    case '#':
+      if (FileOrLineStart) {
+        tgtok::TokKind Kind = prepIsDirective();
+        if (Kind != tgtok::Error)
+          return lexPreprocessor(Kind);
+      }
+
+      return tgtok::paste;
 
-  // The period is a separate case so we can recognize the "..."
-  // range punctuator.
-  case '.':
-    if (peekNextChar(0) == '.') {
-      ++CurPtr; // Eat second dot.
+      // The period is a separate case so we can recognize the "..."
+      // range punctuator.
+    case '.':
       if (peekNextChar(0) == '.') {
-        ++CurPtr; // Eat third dot.
-        return tgtok::dotdotdot;
+        ++CurPtr; // Eat second dot.
+        if (peekNextChar(0) == '.') {
+          ++CurPtr; // Eat third dot.
+          return tgtok::dotdotdot;
+        }
+        return ReturnError(TokStart, "invalid '..' punctuation");
       }
-      return ReturnError(TokStart, "invalid '..' punctuation");
-    }
-    return tgtok::dot;
+      return tgtok::dot;
 
-  case '\r':
-    llvm_unreachable("getNextChar() must never return '\r'");
+    case '\r':
+      llvm_unreachable("getNextChar() must never return '\r'");
 
-  case ' ':
-  case '\t':
-    // Ignore whitespace.
-    goto restart;
-  case '\n':
-    // Ignore whitespace, and identify the new line.
-    FileOrLineStart = true;
-    goto restart;
-  case '/':
-    // If this is the start of a // comment, skip until the end of the line or
-    // the end of the buffer.
-    if (*CurPtr == '/')
-      SkipBCPLComment();
-    else if (*CurPtr == '*') {
-      if (SkipCComment())
-        return tgtok::Error;
-    } else // Otherwise, this is an error.
-      return ReturnError(TokStart, "unexpected character");
-    goto restart;
-  case '-': case '+':
-  case '0': case '1': case '2': case '3': case '4': case '5': case '6':
-  case '7': case '8': case '9': {
-    int NextChar = 0;
-    if (isDigit(CurChar)) {
-      // Allow identifiers to start with a number if it is followed by
-      // an identifier.  This can happen with paste operations like
-      // foo#8i.
-      int i = 0;
-      do {
-        NextChar = peekNextChar(i++);
-      } while (isDigit(NextChar));
-
-      if (NextChar == 'x' || NextChar == 'b') {
-        // If this is [0-9]b[01] or [0-9]x[0-9A-fa-f] this is most
-        // likely a number.
-        int NextNextChar = peekNextChar(i);
-        switch (NextNextChar) {
-        default:
-          break;
-        case '0': case '1':
-          if (NextChar == 'b')
-            return LexNumber();
-          [[fallthrough]];
-        case '2': case '3': case '4': case '5':
-        case '6': case '7': case '8': case '9':
-        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
-        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
-          if (NextChar == 'x')
-            return LexNumber();
-          break;
+    case ' ':
+    case '\t':
+      // Ignore whitespace.
+      break;
+    case '\n':
+      // Ignore whitespace, and identify the new line.
+      FileOrLineStart = true;
+      break;
+    case '/':
+      // If this is the start of a // comment, skip until the end of the line or
+      // the end of the buffer.
+      if (*CurPtr == '/')
+        SkipBCPLComment();
+      else if (*CurPtr == '*') {
+        if (SkipCComment())
+          return tgtok::Error;
+      } else // Otherwise, this is an error.
+        return ReturnError(TokStart, "unexpected character");
+      break;
+    case '-':
+    case '+':
+    case '0':
+    case '1':
+    case '2':
+    case '3':
+    case '4':
+    case '5':
+    case '6':
+    case '7':
+    case '8':
+    case '9': {
+      int NextChar = 0;
+      if (isDigit(CurChar)) {
+        // Allow identifiers to start with a number if it is followed by
+        // an identifier.  This can happen with paste operations like
+        // foo#8i.
+        int i = 0;
+        do {
+          NextChar = peekNextChar(i++);
+        } while (isDigit(NextChar));
+
+        if (NextChar == 'x' || NextChar == 'b') {
+          // If this is [0-9]b[01] or [0-9]x[0-9A-fa-f] this is most
+          // likely a number.
+          int NextNextChar = peekNextChar(i);
+          switch (NextNextChar) {
+          default:
+            break;
+          case '0':
+          case '1':
+            if (NextChar == 'b')
+              return LexNumber();
+            [[fallthrough]];
+          case '2':
+          case '3':
+          case '4':
+          case '5':
+          case '6':
+          case '7':
+          case '8':
+          case '9':
+          case 'a':
+          case 'b':
+          case 'c':
+          case 'd':
+          case 'e':
+          case 'f':
+          case 'A':
+          case 'B':
+          case 'C':
+          case 'D':
+          case 'E':
+          case 'F':
+            if (NextChar == 'x')
+              return LexNumber();
+            break;
+          }
         }
       }
-    }
 
-    if (isValidIDChar(NextChar, /*First=*/true))
-      return LexIdentifier();
+      if (isValidIDChar(NextChar, /*First=*/true))
+        return LexIdentifier();
 
-    return LexNumber();
-  }
-  case '"': return LexString();
-  case '$': return LexVarName();
-  case '[': return LexBracket();
-  case '!': return LexExclaim();
+      return LexNumber();
+    }
+    case '"':
+      return LexString();
+    case '$':
+      return LexVarName();
+    case '[':
+      return LexBracket();
+    case '!':
+      return LexExclaim();
+    }
   }
 }