[clang] 1c876ff - Revert "Perf/lexer faster slow get char and size (#70543)"

Sun Oct 29 18:12:01 PDT 2023

Author: Nico Weber
Date: 2023-10-29T21:11:39-04:00
New Revision: 1c876ff5155c4feeb2b2885eb3e6abda17c4b7f4

URL: https://github.com/llvm/llvm-project/commit/1c876ff5155c4feeb2b2885eb3e6abda17c4b7f4
DIFF: https://github.com/llvm/llvm-project/commit/1c876ff5155c4feeb2b2885eb3e6abda17c4b7f4.diff

LOG: Revert "Perf/lexer faster slow get char and size (#70543)"

This reverts commit d8f5a18b6e587aeaa8b99707e87b652f49b160cd.
Breaks build, see:
https://github.com/llvm/llvm-project/pull/70543#issuecomment-1784227421

Added: 
    

Modified: 
    clang/include/clang/Lex/Lexer.h
    clang/lib/Lex/DependencyDirectivesScanner.cpp
    clang/lib/Lex/Lexer.cpp

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/Lex/Lexer.h b/clang/include/clang/Lex/Lexer.h
index 899e665e7454652..ac0ef14c591bdd7 100644

--- a/clang/include/clang/Lex/Lexer.h
+++ b/clang/include/clang/Lex/Lexer.h
@@ -575,23 +575,19 @@ class Lexer : public PreprocessorLexer {
   /// sequence.
   static bool isNewLineEscaped(const char *BufferStart, const char *Str);
 
-  /// Represents a char and the number of bytes parsed to produce it.
-  struct SizedChar {
-    char Char;
-    unsigned Size;
-  };
-
   /// getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever
   /// emit a warning.
-  static inline SizedChar getCharAndSizeNoWarn(const char *Ptr,
-                                               const LangOptions &LangOpts) {
+  static inline char getCharAndSizeNoWarn(const char *Ptr, unsigned &Size,
+                                          const LangOptions &LangOpts) {
     // If this is not a trigraph and not a UCN or escaped newline, return
     // quickly.
     if (isObviouslySimpleCharacter(Ptr[0])) {
-      return {*Ptr, 1u};
+      Size = 1;
+      return *Ptr;
     }
 
-    return getCharAndSizeSlowNoWarn(Ptr, LangOpts);
+    Size = 0;
+    return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
   }
 
   /// Returns the leading whitespace for line that corresponds to the given
@@ -669,7 +665,8 @@ class Lexer : public PreprocessorLexer {
     // quickly.
     if (isObviouslySimpleCharacter(Ptr[0])) return *Ptr++;
 
-    auto [C, Size] = getCharAndSizeSlow(Ptr, &Tok);
+    unsigned Size = 0;
+    char C = getCharAndSizeSlow(Ptr, Size, &Tok);
     Ptr += Size;
     return C;
   }
@@ -685,7 +682,9 @@ class Lexer : public PreprocessorLexer {
 
     // Otherwise, re-lex the character with a current token, allowing
     // diagnostics to be emitted and flags to be set.
-    return Ptr + getCharAndSizeSlow(Ptr, &Tok).Size;
+    Size = 0;
+    getCharAndSizeSlow(Ptr, Size, &Tok);
+    return Ptr+Size;
   }
 
   /// getCharAndSize - Peek a single 'character' from the specified buffer,
@@ -700,14 +699,14 @@ class Lexer : public PreprocessorLexer {
       return *Ptr;
     }
 
-    auto CharAndSize = getCharAndSizeSlow(Ptr);
-    Size = CharAndSize.Size;
-    return CharAndSize.Char;
+    Size = 0;
+    return getCharAndSizeSlow(Ptr, Size);
   }
 
   /// getCharAndSizeSlow - Handle the slow/uncommon case of the getCharAndSize
   /// method.
-  SizedChar getCharAndSizeSlow(const char *Ptr, Token *Tok = nullptr);
+  char getCharAndSizeSlow(const char *Ptr, unsigned &Size,
+                          Token *Tok = nullptr);
 
   /// getEscapedNewLineSize - Return the size of the specified escaped newline,
   /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" on entry
@@ -721,8 +720,8 @@ class Lexer : public PreprocessorLexer {
 
   /// getCharAndSizeSlowNoWarn - Same as getCharAndSizeSlow, but never emits a
   /// diagnostic.
-  static SizedChar getCharAndSizeSlowNoWarn(const char *Ptr,
-                                            const LangOptions &LangOpts);
+  static char getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
+                                       const LangOptions &LangOpts);
 
   //===--------------------------------------------------------------------===//
   // Other lexer functions.

diff  --git a/clang/lib/Lex/DependencyDirectivesScanner.cpp b/clang/lib/Lex/DependencyDirectivesScanner.cpp
index 980f865cf24c97e..2bd2c5f8388c0dd 100644
--- a/clang/lib/Lex/DependencyDirectivesScanner.cpp
+++ b/clang/lib/Lex/DependencyDirectivesScanner.cpp
@@ -565,8 +565,9 @@ Scanner::cleanStringIfNeeded(const dependency_directives_scan::Token &Tok) {
   const char *BufPtr = Input.begin() + Tok.Offset;
   const char *AfterIdent = Input.begin() + Tok.getEnd();
   while (BufPtr < AfterIdent) {
-    auto [Char, Size] = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
-    Spelling[SpellingLength++] = Char;
+    unsigned Size;
+    Spelling[SpellingLength++] =
+        Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
     BufPtr += Size;
   }
 

diff  --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp
index 1c53997527732a9..675ec28e514797e 100644
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@@ -287,9 +287,9 @@ static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
   if (tok::isStringLiteral(Tok.getKind())) {
     // Munch the encoding-prefix and opening double-quote.
     while (BufPtr < BufEnd) {
-      auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
-      Spelling[Length++] = CharAndSize.Char;
-      BufPtr += CharAndSize.Size;
+      unsigned Size;
+      Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
+      BufPtr += Size;
 
       if (Spelling[Length - 1] == '"')
         break;
@@ -316,9 +316,9 @@ static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
   }
 
   while (BufPtr < BufEnd) {
-    auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
-    Spelling[Length++] = CharAndSize.Char;
-    BufPtr += CharAndSize.Size;
+    unsigned Size;
+    Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
+    BufPtr += Size;
   }
 
   assert(Length < Tok.getLength() &&
@@ -772,9 +772,10 @@ unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo,
   // If we have a character that may be a trigraph or escaped newline, use a
   // lexer to parse it correctly.
   for (; CharNo; --CharNo) {
-    auto CharAndSize = Lexer::getCharAndSizeNoWarn(TokPtr, LangOpts);
-    TokPtr += CharAndSize.Size;
-    PhysOffset += CharAndSize.Size;
+    unsigned Size;
+    Lexer::getCharAndSizeNoWarn(TokPtr, Size, LangOpts);
+    TokPtr += Size;
+    PhysOffset += Size;
   }
 
   // Final detail: if we end up on an escaped newline, we want to return the
@@ -1356,16 +1357,15 @@ SourceLocation Lexer::findLocationAfterToken(
 ///
 /// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
 /// be updated to match.
-Lexer::SizedChar Lexer::getCharAndSizeSlow(const char *Ptr, Token *Tok) {
-  unsigned Size = 0;
+char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
+                               Token *Tok) {
   // If we have a slash, look for an escaped newline.
   if (Ptr[0] == '\\') {
     ++Size;
     ++Ptr;
 Slash:
     // Common case, backslash-char where the char is not whitespace.
-    if (!isWhitespace(Ptr[0]))
-      return {'\\', Size};
+    if (!isWhitespace(Ptr[0])) return '\\';
 
     // See if we have optional whitespace characters between the slash and
     // newline.
@@ -1382,13 +1382,11 @@ Lexer::SizedChar Lexer::getCharAndSizeSlow(const char *Ptr, Token *Tok) {
       Ptr  += EscapedNewLineSize;
 
       // Use slow version to accumulate a correct size field.
-      auto CharAndSize = getCharAndSizeSlow(Ptr, Tok);
-      CharAndSize.Size += Size;
-      return CharAndSize;
+      return getCharAndSizeSlow(Ptr, Size, Tok);
     }
 
     // Otherwise, this is not an escaped newline, just return the slash.
-    return {'\\', Size};
+    return '\\';
   }
 
   // If this is a trigraph, process it.
@@ -1403,12 +1401,13 @@ Lexer::SizedChar Lexer::getCharAndSizeSlow(const char *Ptr, Token *Tok) {
       Ptr += 3;
       Size += 3;
       if (C == '\\') goto Slash;
-      return {C, Size};
+      return C;
     }
   }
 
   // If this is neither, return a single character.
-  return {*Ptr, Size + 1u};
+  ++Size;
+  return *Ptr;
 }
 
 /// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
@@ -1417,18 +1416,15 @@ Lexer::SizedChar Lexer::getCharAndSizeSlow(const char *Ptr, Token *Tok) {
 ///
 /// NOTE: When this method is updated, getCharAndSizeSlow (above) should
 /// be updated to match.
-Lexer::SizedChar Lexer::getCharAndSizeSlowNoWarn(const char *Ptr,
-                                                 const LangOptions &LangOpts) {
-
-  unsigned Size = 0;
+char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
+                                     const LangOptions &LangOpts) {
   // If we have a slash, look for an escaped newline.
   if (Ptr[0] == '\\') {
     ++Size;
     ++Ptr;
 Slash:
     // Common case, backslash-char where the char is not whitespace.
-    if (!isWhitespace(Ptr[0]))
-      return {'\\', Size};
+    if (!isWhitespace(Ptr[0])) return '\\';
 
     // See if we have optional whitespace characters followed by a newline.
     if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
@@ -1437,13 +1433,11 @@ Lexer::SizedChar Lexer::getCharAndSizeSlowNoWarn(const char *Ptr,
       Ptr  += EscapedNewLineSize;
 
       // Use slow version to accumulate a correct size field.
-      auto CharAndSize = getCharAndSizeSlowNoWarn(Ptr, LangOpts);
-      CharAndSize.Size += Size;
-      return CharAndSize;
+      return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
     }
 
     // Otherwise, this is not an escaped newline, just return the slash.
-    return {'\\', Size};
+    return '\\';
   }
 
   // If this is a trigraph, process it.
@@ -1454,12 +1448,13 @@ Lexer::SizedChar Lexer::getCharAndSizeSlowNoWarn(const char *Ptr,
       Ptr += 3;
       Size += 3;
       if (C == '\\') goto Slash;
-      return {C, Size};
+      return C;
     }
   }
 
   // If this is neither, return a single character.
-  return {*Ptr, Size + 1u};
+  ++Size;
+  return *Ptr;
 }
 
 //===----------------------------------------------------------------------===//
@@ -1969,14 +1964,11 @@ bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) {
 /// isHexaLiteral - Return true if Start points to a hex constant.
 /// in microsoft mode (where this is supposed to be several 
diff erent tokens).
 bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
-  auto CharAndSize1 = Lexer::getCharAndSizeNoWarn(Start, LangOpts);
-  char C1 = CharAndSize1.Char;
+  unsigned Size;
+  char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, LangOpts);
   if (C1 != '0')
     return false;
-
-  auto CharAndSize2 =
-      Lexer::getCharAndSizeNoWarn(Start + CharAndSize1.Size, LangOpts);
-  char C2 = CharAndSize2.Char;
+  char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, LangOpts);
   return (C2 == 'x' || C2 == 'X');
 }
 
@@ -2020,7 +2012,8 @@ bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
 
   // If we have a digit separator, continue.
   if (C == '\'' && (LangOpts.CPlusPlus14 || LangOpts.C23)) {
-    auto [Next, NextSize] = getCharAndSizeNoWarn(CurPtr + Size, LangOpts);
+    unsigned NextSize;
+    char Next = getCharAndSizeNoWarn(CurPtr + Size, NextSize, LangOpts);
     if (isAsciiIdentifierContinue(Next)) {
       if (!isLexingRawMode())
         Diag(CurPtr, LangOpts.CPlusPlus
@@ -2092,8 +2085,8 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
       unsigned Consumed = Size;
       unsigned Chars = 1;
       while (true) {
-        auto [Next, NextSize] =
-            getCharAndSizeNoWarn(CurPtr + Consumed, LangOpts);
+        unsigned NextSize;
+        char Next = getCharAndSizeNoWarn(CurPtr + Consumed, NextSize, LangOpts);
         if (!isAsciiIdentifierContinue(Next)) {
           // End of suffix. Check whether this is on the allowed list.
           const StringRef CompleteSuffix(Buffer, Chars);