[clang] Perf/lexer faster slow get char and size (PR #70543)

Sat Oct 28 01:19:09 PDT 2023

https://github.com/serge-sans-paille updated https://github.com/llvm/llvm-project/pull/70543

>From c9d34dae319de3eed1a23c23ff7b6d7673a04b79 Mon Sep 17 00:00:00 2001
From: serge-sans-paille <sguelton at mozilla.com>
Date: Fri, 27 Oct 2023 22:48:08 +0200
Subject: [PATCH] [clang] Change GetCharAndSizeSlow interface to by-value style

Instead of passing the Size by reference, assuming it is initialized,
return it along the expected char result as an std::pair.

This makes the interface less error prone: previous interface expected
the Size reference to be initialized, and it was often forgotten,
leading to uninitialized variable usage. This patch fixes the issue.

This also generates faster code, as the returned pair (a char and an
unsigned) fits in 64 bits. The speedup according to compile time tracker
reach -O.7%, with a good number of -0.4%. Details are available on

        https://llvm-compile-time-tracker.com/compare.php?from=3fe63f81fcb999681daa11b2890c82fda3aaeef5&to=fc76a9202f737472ecad4d6e0b0bf87a013866f3&stat=instructions:u

And icing on the cake, on my setup it also shaves 2kB out of
libclang-cpp :-)
---
 clang/include/clang/Lex/Lexer.h               | 30 +++----
 clang/lib/Lex/DependencyDirectivesScanner.cpp |  7 +-
 clang/lib/Lex/Lexer.cpp                       | 80 +++++++++++--------
 3 files changed, 61 insertions(+), 56 deletions(-)

diff --git a/clang/include/clang/Lex/Lexer.h b/clang/include/clang/Lex/Lexer.h
index ac0ef14c591bdd7..9565fbd0da7feb3 100644
--- a/clang/include/clang/Lex/Lexer.h
+++ b/clang/include/clang/Lex/Lexer.h
@@ -577,17 +577,15 @@ class Lexer : public PreprocessorLexer {
 
   /// getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever
   /// emit a warning.
-  static inline char getCharAndSizeNoWarn(const char *Ptr, unsigned &Size,
-                                          const LangOptions &LangOpts) {
+  static inline std::pair<char, unsigned>
+  getCharAndSizeNoWarn(const char *Ptr, const LangOptions &LangOpts) {
     // If this is not a trigraph and not a UCN or escaped newline, return
     // quickly.
     if (isObviouslySimpleCharacter(Ptr[0])) {
-      Size = 1;
-      return *Ptr;
+      return std::make_pair(*Ptr, 1u);
     }
 
-    Size = 0;
-    return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
+    return getCharAndSizeSlowNoWarn(Ptr, LangOpts);
   }
 
   /// Returns the leading whitespace for line that corresponds to the given
@@ -665,8 +663,7 @@ class Lexer : public PreprocessorLexer {
     // quickly.
     if (isObviouslySimpleCharacter(Ptr[0])) return *Ptr++;
 
-    unsigned Size = 0;
-    char C = getCharAndSizeSlow(Ptr, Size, &Tok);
+    auto [C, Size] = getCharAndSizeSlow(Ptr, &Tok);
     Ptr += Size;
     return C;
   }
@@ -682,9 +679,7 @@ class Lexer : public PreprocessorLexer {
 
     // Otherwise, re-lex the character with a current token, allowing
     // diagnostics to be emitted and flags to be set.
-    Size = 0;
-    getCharAndSizeSlow(Ptr, Size, &Tok);
-    return Ptr+Size;
+    return Ptr + getCharAndSizeSlow(Ptr, &Tok).second;
   }
 
   /// getCharAndSize - Peek a single 'character' from the specified buffer,
@@ -699,14 +694,15 @@ class Lexer : public PreprocessorLexer {
       return *Ptr;
     }
 
-    Size = 0;
-    return getCharAndSizeSlow(Ptr, Size);
+    auto CharAndSize = getCharAndSizeSlow(Ptr);
+    Size = CharAndSize.second;
+    return CharAndSize.first;
   }
 
   /// getCharAndSizeSlow - Handle the slow/uncommon case of the getCharAndSize
   /// method.
-  char getCharAndSizeSlow(const char *Ptr, unsigned &Size,
-                          Token *Tok = nullptr);
+  std::pair<char, unsigned> getCharAndSizeSlow(const char *Ptr,
+                                               Token *Tok = nullptr);
 
   /// getEscapedNewLineSize - Return the size of the specified escaped newline,
   /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" on entry
@@ -720,8 +716,8 @@ class Lexer : public PreprocessorLexer {
 
   /// getCharAndSizeSlowNoWarn - Same as getCharAndSizeSlow, but never emits a
   /// diagnostic.
-  static char getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
-                                       const LangOptions &LangOpts);
+  static std::pair<char, unsigned>
+  getCharAndSizeSlowNoWarn(const char *Ptr, const LangOptions &LangOpts);
 
   //===--------------------------------------------------------------------===//
   // Other lexer functions.
diff --git a/clang/lib/Lex/DependencyDirectivesScanner.cpp b/clang/lib/Lex/DependencyDirectivesScanner.cpp
index 2bd2c5f8388c0dd..f32b0f6767b9319 100644
--- a/clang/lib/Lex/DependencyDirectivesScanner.cpp
+++ b/clang/lib/Lex/DependencyDirectivesScanner.cpp
@@ -565,10 +565,9 @@ Scanner::cleanStringIfNeeded(const dependency_directives_scan::Token &Tok) {
   const char *BufPtr = Input.begin() + Tok.Offset;
   const char *AfterIdent = Input.begin() + Tok.getEnd();
   while (BufPtr < AfterIdent) {
-    unsigned Size;
-    Spelling[SpellingLength++] =
-        Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
-    BufPtr += Size;
+    auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
+    Spelling[SpellingLength++] = CharAndSize.first;
+    BufPtr += CharAndSize.second;
   }
 
   return SplitIds.try_emplace(StringRef(Spelling.begin(), SpellingLength), 0)
diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp
index 675ec28e514797e..cee5c49a3c72e18 100644
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@@ -287,9 +287,9 @@ static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
   if (tok::isStringLiteral(Tok.getKind())) {
     // Munch the encoding-prefix and opening double-quote.
     while (BufPtr < BufEnd) {
-      unsigned Size;
-      Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
-      BufPtr += Size;
+      auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
+      Spelling[Length++] = CharAndSize.first;
+      BufPtr += CharAndSize.second;
 
       if (Spelling[Length - 1] == '"')
         break;
@@ -316,9 +316,9 @@ static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
   }
 
   while (BufPtr < BufEnd) {
-    unsigned Size;
-    Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
-    BufPtr += Size;
+    auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
+    Spelling[Length++] = CharAndSize.first;
+    BufPtr += CharAndSize.second;
   }
 
   assert(Length < Tok.getLength() &&
@@ -772,10 +772,9 @@ unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo,
   // If we have a character that may be a trigraph or escaped newline, use a
   // lexer to parse it correctly.
   for (; CharNo; --CharNo) {
-    unsigned Size;
-    Lexer::getCharAndSizeNoWarn(TokPtr, Size, LangOpts);
-    TokPtr += Size;
-    PhysOffset += Size;
+    auto CharAndSize = Lexer::getCharAndSizeNoWarn(TokPtr, LangOpts);
+    TokPtr += CharAndSize.second;
+    PhysOffset += CharAndSize.second;
   }
 
   // Final detail: if we end up on an escaped newline, we want to return the
@@ -1357,15 +1356,17 @@ SourceLocation Lexer::findLocationAfterToken(
 ///
 /// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
 /// be updated to match.
-char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
-                               Token *Tok) {
+std::pair<char, unsigned> Lexer::getCharAndSizeSlow(const char *Ptr,
+                                                    Token *Tok) {
+  unsigned Size = 0;
   // If we have a slash, look for an escaped newline.
   if (Ptr[0] == '\\') {
     ++Size;
     ++Ptr;
 Slash:
     // Common case, backslash-char where the char is not whitespace.
-    if (!isWhitespace(Ptr[0])) return '\\';
+    if (!isWhitespace(Ptr[0]))
+      return std::make_pair('\\', Size);
 
     // See if we have optional whitespace characters between the slash and
     // newline.
@@ -1382,11 +1383,13 @@ char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
       Ptr  += EscapedNewLineSize;
 
       // Use slow version to accumulate a correct size field.
-      return getCharAndSizeSlow(Ptr, Size, Tok);
+      auto CharAndSize = getCharAndSizeSlow(Ptr, Tok);
+      CharAndSize.second += Size;
+      return CharAndSize;
     }
 
     // Otherwise, this is not an escaped newline, just return the slash.
-    return '\\';
+    return std::make_pair('\\', Size);
   }
 
   // If this is a trigraph, process it.
@@ -1401,13 +1404,12 @@ char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
       Ptr += 3;
       Size += 3;
       if (C == '\\') goto Slash;
-      return C;
+      return std::make_pair(C, Size);
     }
   }
 
   // If this is neither, return a single character.
-  ++Size;
-  return *Ptr;
+  return std::make_pair(*Ptr, Size + 1);
 }
 
 /// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
@@ -1416,15 +1418,18 @@ char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
 ///
 /// NOTE: When this method is updated, getCharAndSizeSlow (above) should
 /// be updated to match.
-char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
-                                     const LangOptions &LangOpts) {
+std::pair<char, unsigned>
+Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, const LangOptions &LangOpts) {
+
+  unsigned Size = 0;
   // If we have a slash, look for an escaped newline.
   if (Ptr[0] == '\\') {
     ++Size;
     ++Ptr;
 Slash:
     // Common case, backslash-char where the char is not whitespace.
-    if (!isWhitespace(Ptr[0])) return '\\';
+    if (!isWhitespace(Ptr[0]))
+      return std::make_pair('\\', Size);
 
     // See if we have optional whitespace characters followed by a newline.
     if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
@@ -1433,11 +1438,13 @@ char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
       Ptr  += EscapedNewLineSize;
 
       // Use slow version to accumulate a correct size field.
-      return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
+      auto CharAndSize = getCharAndSizeSlowNoWarn(Ptr, LangOpts);
+      CharAndSize.second += Size;
+      return CharAndSize;
     }
 
     // Otherwise, this is not an escaped newline, just return the slash.
-    return '\\';
+    return std::make_pair('\\', Size);
   }
 
   // If this is a trigraph, process it.
@@ -1448,13 +1455,12 @@ char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
       Ptr += 3;
       Size += 3;
       if (C == '\\') goto Slash;
-      return C;
+      return std::make_pair(C, Size);
     }
   }
 
   // If this is neither, return a single character.
-  ++Size;
-  return *Ptr;
+  return std::make_pair(*Ptr, Size + 1);
 }
 
 //===----------------------------------------------------------------------===//
@@ -1964,11 +1970,14 @@ bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) {
 /// isHexaLiteral - Return true if Start points to a hex constant.
 /// in microsoft mode (where this is supposed to be several different tokens).
 bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
-  unsigned Size;
-  char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, LangOpts);
+  auto CharAndSize1 = Lexer::getCharAndSizeNoWarn(Start, LangOpts);
+  char C1 = CharAndSize1.first;
   if (C1 != '0')
     return false;
-  char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, LangOpts);
+
+  auto CharAndSize2 =
+      Lexer::getCharAndSizeNoWarn(Start + CharAndSize1.second, LangOpts);
+  char C2 = CharAndSize2.first;
   return (C2 == 'x' || C2 == 'X');
 }
 
@@ -2012,15 +2021,15 @@ bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
 
   // If we have a digit separator, continue.
   if (C == '\'' && (LangOpts.CPlusPlus14 || LangOpts.C23)) {
-    unsigned NextSize;
-    char Next = getCharAndSizeNoWarn(CurPtr + Size, NextSize, LangOpts);
+    auto NextCharAndSize = getCharAndSizeNoWarn(CurPtr + Size, LangOpts);
+    char Next = NextCharAndSize.first;
     if (isAsciiIdentifierContinue(Next)) {
       if (!isLexingRawMode())
         Diag(CurPtr, LangOpts.CPlusPlus
                          ? diag::warn_cxx11_compat_digit_separator
                          : diag::warn_c23_compat_digit_separator);
       CurPtr = ConsumeChar(CurPtr, Size, Result);
-      CurPtr = ConsumeChar(CurPtr, NextSize, Result);
+      CurPtr = ConsumeChar(CurPtr, NextCharAndSize.second, Result);
       return LexNumericConstant(Result, CurPtr);
     }
   }
@@ -2085,8 +2094,9 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
       unsigned Consumed = Size;
       unsigned Chars = 1;
       while (true) {
-        unsigned NextSize;
-        char Next = getCharAndSizeNoWarn(CurPtr + Consumed, NextSize, LangOpts);
+        auto NextCharAndSize =
+            getCharAndSizeNoWarn(CurPtr + Consumed, LangOpts);
+        char Next = NextCharAndSize.first;
         if (!isAsciiIdentifierContinue(Next)) {
           // End of suffix. Check whether this is on the allowed list.
           const StringRef CompleteSuffix(Buffer, Chars);
@@ -2100,7 +2110,7 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
           break;
 
         Buffer[Chars++] = Next;
-        Consumed += NextSize;
+        Consumed += NextCharAndSize.second;
       }
     }