[llvm] e6751bf - [NFC][TableGen] Eliminate use of isalpha/isdigit from TGLexer (#104837)

Tue Aug 20 09:54:16 PDT 2024

Author: Rahul Joshi
Date: 2024-08-20T09:54:12-07:00
New Revision: e6751bfbf77bd9888b7882a46e2453e4ed149c7b

URL: https://github.com/llvm/llvm-project/commit/e6751bfbf77bd9888b7882a46e2453e4ed149c7b
DIFF: https://github.com/llvm/llvm-project/commit/e6751bfbf77bd9888b7882a46e2453e4ed149c7b.diff

LOG: [NFC][TableGen] Eliminate use of isalpha/isdigit from TGLexer (#104837)

- Replace use of std::isalpha, std::isdigit, std::isxdigit with LLVM's
StringExtras versions, to avoid possibly locale-dependent behavior (e.g.
glibc).
- Create helper function for common checks for valid identifier
characters.

Added: 
    

Modified: 
    llvm/lib/TableGen/TGLexer.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/TableGen/TGLexer.cpp b/llvm/lib/TableGen/TGLexer.cpp
index c1fae91e1fd9f2..62a884e01a5306 100644

--- a/llvm/lib/TableGen/TGLexer.cpp
+++ b/llvm/lib/TableGen/TGLexer.cpp
@@ -12,6 +12,7 @@
 
 #include "TGLexer.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Config/config.h" // for strtoull()/strtoll() define
@@ -20,7 +21,6 @@
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/TableGen/Error.h"
 #include <algorithm>
-#include <cctype>
 #include <cerrno>
 #include <cstdint>
 #include <cstdio>
@@ -38,6 +38,17 @@ struct PreprocessorDir {
 };
 } // end anonymous namespace
 
+/// Returns true if `C` is a valid character in an identifier. If `First` is
+/// true, returns true if `C` is a valid first character of an identifier,
+/// else returns true if `C` is a valid non-first character of an identifier.
+/// Identifiers match the following regular expression:
+///   [a-zA-Z_][0-9a-zA-Z_]*
+static bool isValidIDChar(char C, bool First) {
+  if (C == '_' || isAlpha(C))
+    return true;
+  return !First && isDigit(C);
+}
+
 constexpr PreprocessorDir PreprocessorDirs[] = {{tgtok::Ifdef, "ifdef"},
                                                 {tgtok::Ifndef, "ifndef"},
                                                 {tgtok::Else, "else"},
@@ -51,14 +62,14 @@ static const char *lexMacroName(StringRef Str) {
 
   // Macro names start with [a-zA-Z_].
   const char *Next = Str.begin();
-  if (*Next != '_' && !isalpha(*Next))
+  if (!isValidIDChar(*Next, /*First=*/true))
     return Next;
   // Eat the first character of the name.
   ++Next;
 
   // Match the rest of the identifier regex: [0-9a-zA-Z_]*
   const char *End = Str.end();
-  while (Next != End && (isalpha(*Next) || isdigit(*Next) || *Next == '_'))
+  while (Next != End && isValidIDChar(*Next, /*First=*/false))
     ++Next;
   return Next;
 }
@@ -173,7 +184,7 @@ tgtok::TokKind TGLexer::LexToken(bool FileOrLineStart) {
   switch (CurChar) {
   default:
     // Handle letters: [a-zA-Z_]
-    if (isalpha(CurChar) || CurChar == '_')
+    if (isValidIDChar(CurChar, /*First=*/true))
       return LexIdentifier();
 
     // Unknown character, emit an error.
@@ -250,14 +261,14 @@ tgtok::TokKind TGLexer::LexToken(bool FileOrLineStart) {
   case '0': case '1': case '2': case '3': case '4': case '5': case '6':
   case '7': case '8': case '9': {
     int NextChar = 0;
-    if (isdigit(CurChar)) {
+    if (isDigit(CurChar)) {
       // Allow identifiers to start with a number if it is followed by
       // an identifier.  This can happen with paste operations like
       // foo#8i.
       int i = 0;
       do {
         NextChar = peekNextChar(i++);
-      } while (isdigit(NextChar));
+      } while (isDigit(NextChar));
 
       if (NextChar == 'x' || NextChar == 'b') {
         // If this is [0-9]b[01] or [0-9]x[0-9A-fa-f] this is most
@@ -281,7 +292,7 @@ tgtok::TokKind TGLexer::LexToken(bool FileOrLineStart) {
       }
     }
 
-    if (isalpha(NextChar) || NextChar == '_')
+    if (isValidIDChar(NextChar, /*First=*/true))
       return LexIdentifier();
 
     return LexNumber();
@@ -347,13 +358,13 @@ tgtok::TokKind TGLexer::LexString() {
 }
 
 tgtok::TokKind TGLexer::LexVarName() {
-  if (!isalpha(CurPtr[0]) && CurPtr[0] != '_')
+  if (!isValidIDChar(CurPtr[0], /*First=*/true))
     return ReturnError(TokStart, "Invalid variable name");
 
   // Otherwise, we're ok, consume the rest of the characters.
   const char *VarNameStart = CurPtr++;
 
-  while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
+  while (isValidIDChar(*CurPtr, /*First=*/false))
     ++CurPtr;
 
   CurStrVal.assign(VarNameStart, CurPtr);
@@ -365,7 +376,7 @@ tgtok::TokKind TGLexer::LexIdentifier() {
   const char *IdentStart = TokStart;
 
   // Match the rest of the identifier regex: [0-9a-zA-Z_]*
-  while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
+  while (isValidIDChar(*CurPtr, /*First=*/false))
     ++CurPtr;
 
   // Check to see if this identifier is a reserved keyword.
@@ -500,7 +511,7 @@ tgtok::TokKind TGLexer::LexNumber() {
       Base = 16;
       do
         ++CurPtr;
-      while (isxdigit(CurPtr[0]));
+      while (isHexDigit(CurPtr[0]));
     } else if (CurPtr[0] == 'b') {
       Base = 2;
       do
@@ -515,7 +526,7 @@ tgtok::TokKind TGLexer::LexNumber() {
   // Check if it's a decimal value.
   if (Base == 0) {
     // Check for a sign without a digit.
-    if (!isdigit(CurPtr[0])) {
+    if (!isDigit(CurPtr[0])) {
       if (CurPtr[-1] == '-')
         return tgtok::minus;
       else if (CurPtr[-1] == '+')
@@ -526,7 +537,7 @@ tgtok::TokKind TGLexer::LexNumber() {
     NumStart = TokStart;
     IsMinus = CurPtr[-1] == '-';
 
-    while (isdigit(CurPtr[0]))
+    while (isDigit(CurPtr[0]))
       ++CurPtr;
   }
 
@@ -574,11 +585,11 @@ tgtok::TokKind TGLexer::LexBracket() {
 
 /// LexExclaim - Lex '!' and '![a-zA-Z]+'.
 tgtok::TokKind TGLexer::LexExclaim() {
-  if (!isalpha(*CurPtr))
+  if (!isAlpha(*CurPtr))
     return ReturnError(CurPtr - 1, "Invalid \"!operator\"");
 
   const char *Start = CurPtr++;
-  while (isalpha(*CurPtr))
+  while (isAlpha(*CurPtr))
     ++CurPtr;
 
   // Check to see which operator this is.