[clang] Perf/lexer faster slow get char and size (PR #70543)

Sat Oct 28 23:50:59 PDT 2023

https://github.com/serge-sans-paille updated https://github.com/llvm/llvm-project/pull/70543

>From 7bbcabd82edc1736cc22243e109dc0858036c6ac Mon Sep 17 00:00:00 2001
From: serge-sans-paille <sguelton at mozilla.com>
Date: Fri, 27 Oct 2023 22:48:08 +0200
Subject: [PATCH 1/2] [clang] Change GetCharAndSizeSlow interface to by-value
 style

Instead of passing the Size by reference, assuming it is initialized,
return it alongside the expected char result as a POD.

This makes the interface less error prone: previous interface expected
the Size reference to be initialized, and it was often forgotten,
leading to uninitialized variable usage. This patch fixes the issue.

This also generates faster code, as the returned POD (a char and an
unsigned) fits in 64 bits. The speedup according to compile time tracker
reach -O.7%, with a good number of -0.4%. Details are available on

        https://llvm-compile-time-tracker.com/compare.php?from=3fe63f81fcb999681daa11b2890c82fda3aaeef5&to=fc76a9202f737472ecad4d6e0b0bf87a013866f3&stat=instructions:u

And icing on the cake, on my setup it also shaves 2kB out of
libclang-cpp :-)
---
 clang/include/clang/Lex/Lexer.h               | 35 ++++----
 clang/lib/Lex/DependencyDirectivesScanner.cpp |  7 +-
 clang/lib/Lex/Lexer.cpp                       | 79 +++++++++++--------
 3 files changed, 65 insertions(+), 56 deletions(-)

diff --git a/clang/include/clang/Lex/Lexer.h b/clang/include/clang/Lex/Lexer.h
index ac0ef14c591bdd7..899e665e7454652 100644
--- a/clang/include/clang/Lex/Lexer.h
+++ b/clang/include/clang/Lex/Lexer.h
@@ -575,19 +575,23 @@ class Lexer : public PreprocessorLexer {
   /// sequence.
   static bool isNewLineEscaped(const char *BufferStart, const char *Str);
 
+  /// Represents a char and the number of bytes parsed to produce it.
+  struct SizedChar {
+    char Char;
+    unsigned Size;
+  };
+
   /// getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever
   /// emit a warning.
-  static inline char getCharAndSizeNoWarn(const char *Ptr, unsigned &Size,
-                                          const LangOptions &LangOpts) {
+  static inline SizedChar getCharAndSizeNoWarn(const char *Ptr,
+                                               const LangOptions &LangOpts) {
     // If this is not a trigraph and not a UCN or escaped newline, return
     // quickly.
     if (isObviouslySimpleCharacter(Ptr[0])) {
-      Size = 1;
-      return *Ptr;
+      return {*Ptr, 1u};
     }
 
-    Size = 0;
-    return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
+    return getCharAndSizeSlowNoWarn(Ptr, LangOpts);
   }
 
   /// Returns the leading whitespace for line that corresponds to the given
@@ -665,8 +669,7 @@ class Lexer : public PreprocessorLexer {
     // quickly.
     if (isObviouslySimpleCharacter(Ptr[0])) return *Ptr++;
 
-    unsigned Size = 0;
-    char C = getCharAndSizeSlow(Ptr, Size, &Tok);
+    auto [C, Size] = getCharAndSizeSlow(Ptr, &Tok);
     Ptr += Size;
     return C;
   }
@@ -682,9 +685,7 @@ class Lexer : public PreprocessorLexer {
 
     // Otherwise, re-lex the character with a current token, allowing
     // diagnostics to be emitted and flags to be set.
-    Size = 0;
-    getCharAndSizeSlow(Ptr, Size, &Tok);
-    return Ptr+Size;
+    return Ptr + getCharAndSizeSlow(Ptr, &Tok).Size;
   }
 
   /// getCharAndSize - Peek a single 'character' from the specified buffer,
@@ -699,14 +700,14 @@ class Lexer : public PreprocessorLexer {
       return *Ptr;
     }
 
-    Size = 0;
-    return getCharAndSizeSlow(Ptr, Size);
+    auto CharAndSize = getCharAndSizeSlow(Ptr);
+    Size = CharAndSize.Size;
+    return CharAndSize.Char;
   }
 
   /// getCharAndSizeSlow - Handle the slow/uncommon case of the getCharAndSize
   /// method.
-  char getCharAndSizeSlow(const char *Ptr, unsigned &Size,
-                          Token *Tok = nullptr);
+  SizedChar getCharAndSizeSlow(const char *Ptr, Token *Tok = nullptr);
 
   /// getEscapedNewLineSize - Return the size of the specified escaped newline,
   /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" on entry
@@ -720,8 +721,8 @@ class Lexer : public PreprocessorLexer {
 
   /// getCharAndSizeSlowNoWarn - Same as getCharAndSizeSlow, but never emits a
   /// diagnostic.
-  static char getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
-                                       const LangOptions &LangOpts);
+  static SizedChar getCharAndSizeSlowNoWarn(const char *Ptr,
+                                            const LangOptions &LangOpts);
 
   //===--------------------------------------------------------------------===//
   // Other lexer functions.
diff --git a/clang/lib/Lex/DependencyDirectivesScanner.cpp b/clang/lib/Lex/DependencyDirectivesScanner.cpp
index 2bd2c5f8388c0dd..42b89f98a24f3e1 100644
--- a/clang/lib/Lex/DependencyDirectivesScanner.cpp
+++ b/clang/lib/Lex/DependencyDirectivesScanner.cpp
@@ -565,10 +565,9 @@ Scanner::cleanStringIfNeeded(const dependency_directives_scan::Token &Tok) {
   const char *BufPtr = Input.begin() + Tok.Offset;
   const char *AfterIdent = Input.begin() + Tok.getEnd();
   while (BufPtr < AfterIdent) {
-    unsigned Size;
-    Spelling[SpellingLength++] =
-        Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
-    BufPtr += Size;
+    auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
+    Spelling[SpellingLength++] = CharAndSize.Char;
+    BufPtr += CharAndSize.Size;
   }
 
   return SplitIds.try_emplace(StringRef(Spelling.begin(), SpellingLength), 0)
diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp
index 675ec28e514797e..7a2b399f3d0aa30 100644
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@@ -287,9 +287,9 @@ static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
   if (tok::isStringLiteral(Tok.getKind())) {
     // Munch the encoding-prefix and opening double-quote.
     while (BufPtr < BufEnd) {
-      unsigned Size;
-      Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
-      BufPtr += Size;
+      auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
+      Spelling[Length++] = CharAndSize.Char;
+      BufPtr += CharAndSize.Size;
 
       if (Spelling[Length - 1] == '"')
         break;
@@ -316,9 +316,9 @@ static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
   }
 
   while (BufPtr < BufEnd) {
-    unsigned Size;
-    Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
-    BufPtr += Size;
+    auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
+    Spelling[Length++] = CharAndSize.Char;
+    BufPtr += CharAndSize.Size;
   }
 
   assert(Length < Tok.getLength() &&
@@ -772,10 +772,9 @@ unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo,
   // If we have a character that may be a trigraph or escaped newline, use a
   // lexer to parse it correctly.
   for (; CharNo; --CharNo) {
-    unsigned Size;
-    Lexer::getCharAndSizeNoWarn(TokPtr, Size, LangOpts);
-    TokPtr += Size;
-    PhysOffset += Size;
+    auto CharAndSize = Lexer::getCharAndSizeNoWarn(TokPtr, LangOpts);
+    TokPtr += CharAndSize.Size;
+    PhysOffset += CharAndSize.Size;
   }
 
   // Final detail: if we end up on an escaped newline, we want to return the
@@ -1357,15 +1356,16 @@ SourceLocation Lexer::findLocationAfterToken(
 ///
 /// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
 /// be updated to match.
-char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
-                               Token *Tok) {
+Lexer::SizedChar Lexer::getCharAndSizeSlow(const char *Ptr, Token *Tok) {
+  unsigned Size = 0;
   // If we have a slash, look for an escaped newline.
   if (Ptr[0] == '\\') {
     ++Size;
     ++Ptr;
 Slash:
     // Common case, backslash-char where the char is not whitespace.
-    if (!isWhitespace(Ptr[0])) return '\\';
+    if (!isWhitespace(Ptr[0]))
+      return {'\\', Size};
 
     // See if we have optional whitespace characters between the slash and
     // newline.
@@ -1382,11 +1382,13 @@ char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
       Ptr  += EscapedNewLineSize;
 
       // Use slow version to accumulate a correct size field.
-      return getCharAndSizeSlow(Ptr, Size, Tok);
+      auto CharAndSize = getCharAndSizeSlow(Ptr, Tok);
+      CharAndSize.Size += Size;
+      return CharAndSize;
     }
 
     // Otherwise, this is not an escaped newline, just return the slash.
-    return '\\';
+    return {'\\', Size};
   }
 
   // If this is a trigraph, process it.
@@ -1401,13 +1403,12 @@ char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
       Ptr += 3;
       Size += 3;
       if (C == '\\') goto Slash;
-      return C;
+      return {C, Size};
     }
   }
 
   // If this is neither, return a single character.
-  ++Size;
-  return *Ptr;
+  return {*Ptr, Size + 1u};
 }
 
 /// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
@@ -1416,15 +1417,18 @@ char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
 ///
 /// NOTE: When this method is updated, getCharAndSizeSlow (above) should
 /// be updated to match.
-char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
-                                     const LangOptions &LangOpts) {
+Lexer::SizedChar Lexer::getCharAndSizeSlowNoWarn(const char *Ptr,
+                                                 const LangOptions &LangOpts) {
+
+  unsigned Size = 0;
   // If we have a slash, look for an escaped newline.
   if (Ptr[0] == '\\') {
     ++Size;
     ++Ptr;
 Slash:
     // Common case, backslash-char where the char is not whitespace.
-    if (!isWhitespace(Ptr[0])) return '\\';
+    if (!isWhitespace(Ptr[0]))
+      return {'\\', Size};
 
     // See if we have optional whitespace characters followed by a newline.
     if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
@@ -1433,11 +1437,13 @@ char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
       Ptr  += EscapedNewLineSize;
 
       // Use slow version to accumulate a correct size field.
-      return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
+      auto CharAndSize = getCharAndSizeSlowNoWarn(Ptr, LangOpts);
+      CharAndSize.Size += Size;
+      return CharAndSize;
     }
 
     // Otherwise, this is not an escaped newline, just return the slash.
-    return '\\';
+    return {'\\', Size};
   }
 
   // If this is a trigraph, process it.
@@ -1448,13 +1454,12 @@ char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
       Ptr += 3;
       Size += 3;
       if (C == '\\') goto Slash;
-      return C;
+      return {C, Size};
     }
   }
 
   // If this is neither, return a single character.
-  ++Size;
-  return *Ptr;
+  return {*Ptr, Size + 1u};
 }
 
 //===----------------------------------------------------------------------===//
@@ -1964,11 +1969,14 @@ bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) {
 /// isHexaLiteral - Return true if Start points to a hex constant.
 /// in microsoft mode (where this is supposed to be several different tokens).
 bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
-  unsigned Size;
-  char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, LangOpts);
+  auto CharAndSize1 = Lexer::getCharAndSizeNoWarn(Start, LangOpts);
+  char C1 = CharAndSize1.Char;
   if (C1 != '0')
     return false;
-  char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, LangOpts);
+
+  auto CharAndSize2 =
+      Lexer::getCharAndSizeNoWarn(Start + CharAndSize1.Size, LangOpts);
+  char C2 = CharAndSize2.Char;
   return (C2 == 'x' || C2 == 'X');
 }
 
@@ -2012,15 +2020,15 @@ bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
 
   // If we have a digit separator, continue.
   if (C == '\'' && (LangOpts.CPlusPlus14 || LangOpts.C23)) {
-    unsigned NextSize;
-    char Next = getCharAndSizeNoWarn(CurPtr + Size, NextSize, LangOpts);
+    auto NextCharAndSize = getCharAndSizeNoWarn(CurPtr + Size, LangOpts);
+    char Next = NextCharAndSize.Char;
     if (isAsciiIdentifierContinue(Next)) {
       if (!isLexingRawMode())
         Diag(CurPtr, LangOpts.CPlusPlus
                          ? diag::warn_cxx11_compat_digit_separator
                          : diag::warn_c23_compat_digit_separator);
       CurPtr = ConsumeChar(CurPtr, Size, Result);
-      CurPtr = ConsumeChar(CurPtr, NextSize, Result);
+      CurPtr = ConsumeChar(CurPtr, NextCharAndSize.Size, Result);
       return LexNumericConstant(Result, CurPtr);
     }
   }
@@ -2085,8 +2093,9 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
       unsigned Consumed = Size;
       unsigned Chars = 1;
       while (true) {
-        unsigned NextSize;
-        char Next = getCharAndSizeNoWarn(CurPtr + Consumed, NextSize, LangOpts);
+        auto NextCharAndSize =
+            getCharAndSizeNoWarn(CurPtr + Consumed, LangOpts);
+        char Next = NextCharAndSize.Char;
         if (!isAsciiIdentifierContinue(Next)) {
           // End of suffix. Check whether this is on the allowed list.
           const StringRef CompleteSuffix(Buffer, Chars);
@@ -2100,7 +2109,7 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
           break;
 
         Buffer[Chars++] = Next;
-        Consumed += NextSize;
+        Consumed += NextCharAndSize.Size;
       }
     }
 

>From a20790e9e87d2ede14605b277184d2b8c5b0979f Mon Sep 17 00:00:00 2001
From: serge-sans-paille <sguelton at mozilla.com>
Date: Sun, 29 Oct 2023 07:50:27 +0100
Subject: [PATCH 2/2] fixup! [clang] Change GetCharAndSizeSlow interface to
 by-value style

---
 clang/lib/Lex/DependencyDirectivesScanner.cpp |  6 +++---
 clang/lib/Lex/Lexer.cpp                       | 11 ++++-------
 2 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/clang/lib/Lex/DependencyDirectivesScanner.cpp b/clang/lib/Lex/DependencyDirectivesScanner.cpp
index 42b89f98a24f3e1..980f865cf24c97e 100644
--- a/clang/lib/Lex/DependencyDirectivesScanner.cpp
+++ b/clang/lib/Lex/DependencyDirectivesScanner.cpp
@@ -565,9 +565,9 @@ Scanner::cleanStringIfNeeded(const dependency_directives_scan::Token &Tok) {
   const char *BufPtr = Input.begin() + Tok.Offset;
   const char *AfterIdent = Input.begin() + Tok.getEnd();
   while (BufPtr < AfterIdent) {
-    auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
-    Spelling[SpellingLength++] = CharAndSize.Char;
-    BufPtr += CharAndSize.Size;
+    auto [Char, Size] = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
+    Spelling[SpellingLength++] = Char;
+    BufPtr += Size;
   }
 
   return SplitIds.try_emplace(StringRef(Spelling.begin(), SpellingLength), 0)
diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp
index 7a2b399f3d0aa30..26c7755fb6fd383 100644
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@@ -2020,15 +2020,14 @@ bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
 
   // If we have a digit separator, continue.
   if (C == '\'' && (LangOpts.CPlusPlus14 || LangOpts.C23)) {
-    auto NextCharAndSize = getCharAndSizeNoWarn(CurPtr + Size, LangOpts);
-    char Next = NextCharAndSize.Char;
+    auto [Next, NextSize] = getCharAndSizeNoWarn(CurPtr + Size, LangOpts);
     if (isAsciiIdentifierContinue(Next)) {
       if (!isLexingRawMode())
         Diag(CurPtr, LangOpts.CPlusPlus
                          ? diag::warn_cxx11_compat_digit_separator
                          : diag::warn_c23_compat_digit_separator);
       CurPtr = ConsumeChar(CurPtr, Size, Result);
-      CurPtr = ConsumeChar(CurPtr, NextCharAndSize.Size, Result);
+      CurPtr = ConsumeChar(CurPtr, NextSize, Result);
       return LexNumericConstant(Result, CurPtr);
     }
   }
@@ -2093,9 +2092,7 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
       unsigned Consumed = Size;
       unsigned Chars = 1;
       while (true) {
-        auto NextCharAndSize =
-            getCharAndSizeNoWarn(CurPtr + Consumed, LangOpts);
-        char Next = NextCharAndSize.Char;
+        auto [Next, NextSize] = getCharAndSizeNoWarn(CurPtr + Consumed, LangOpts);
         if (!isAsciiIdentifierContinue(Next)) {
           // End of suffix. Check whether this is on the allowed list.
           const StringRef CompleteSuffix(Buffer, Chars);
@@ -2109,7 +2106,7 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
           break;
 
         Buffer[Chars++] = Next;
-        Consumed += NextCharAndSize.Size;
+        Consumed += NextSize;
       }
     }