[clang] Perf/lexer faster slow get char and size (PR #70543)

Sat Oct 28 01:18:42 PDT 2023

https://github.com/serge-sans-paille created https://github.com/llvm/llvm-project/pull/70543

None

>From 3fe63f81fcb999681daa11b2890c82fda3aaeef5 Mon Sep 17 00:00:00 2001
From: serge-sans-paille <sguelton at mozilla.com>
Date: Thu, 26 Oct 2023 22:31:43 +0200
Subject: [PATCH 1/2] [clang] Change representation of CurLexerKind

The previous representation used an enumeration combined with a switch
to dispatch to the appropriate lexer.

Use function pointer so that the dispatching is just an indirect call,
which is actually better because lexing is a costly task compared to a
function call.

This also makes the code slightly cleaner. Speedups on the
compile-time tracker are consistent and range from -0.05% to -0.20%
for NewPM-O0-g, see

        https://llvm-compile-time-tracker.com/compare.php?from=f9906508bc4f05d3950e2219b4c56f6c078a61ef&to=608c85ec1283638db949d73e062bcc3355001ce4&stat=instructions:u

Considering just the preprocessing task, preprocessing the sqlite
amalgamation takes 0.6% fewer instructions (according to valgrind
--tool=callgrind).
---
 clang/include/clang/Lex/Preprocessor.h    | 46 +++++++++++-----
 clang/lib/Lex/PPCaching.cpp               |  8 +--
 clang/lib/Lex/PPLexerChange.cpp           | 20 +++----
 clang/lib/Lex/Preprocessor.cpp            | 67 ++++++-----------------
 clang/utils/ClangVisualizers/clang.natvis |  2 +-
 5 files changed, 62 insertions(+), 81 deletions(-)

diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h
index 18d88407ae12c90..634d3924aa2248b 100644
--- a/clang/include/clang/Lex/Preprocessor.h
+++ b/clang/include/clang/Lex/Preprocessor.h
@@ -751,13 +751,8 @@ class Preprocessor {
   std::unique_ptr<TokenLexer> CurTokenLexer;
 
   /// The kind of lexer we're currently working with.
-  enum CurLexerKind {
-    CLK_Lexer,
-    CLK_TokenLexer,
-    CLK_CachingLexer,
-    CLK_DependencyDirectivesLexer,
-    CLK_LexAfterModuleImport
-  } CurLexerKind = CLK_Lexer;
+  typedef bool (*LexerCallback)(Preprocessor &, Token &);
+  LexerCallback CurLexerCallback = &CLK_Lexer;
 
   /// If the current lexer is for a submodule that is being built, this
   /// is that submodule.
@@ -767,7 +762,7 @@ class Preprocessor {
   /// \#included, and macros currently being expanded from, not counting
   /// CurLexer/CurTokenLexer.
   struct IncludeStackInfo {
-    enum CurLexerKind           CurLexerKind;
+    LexerCallback CurLexerCallback;
     Module                     *TheSubmodule;
     std::unique_ptr<Lexer>      TheLexer;
     PreprocessorLexer          *ThePPLexer;
@@ -776,12 +771,12 @@ class Preprocessor {
 
     // The following constructors are completely useless copies of the default
     // versions, only needed to pacify MSVC.
-    IncludeStackInfo(enum CurLexerKind CurLexerKind, Module *TheSubmodule,
+    IncludeStackInfo(LexerCallback CurLexerCallback, Module *TheSubmodule,
                      std::unique_ptr<Lexer> &&TheLexer,
                      PreprocessorLexer *ThePPLexer,
                      std::unique_ptr<TokenLexer> &&TheTokenLexer,
                      ConstSearchDirIterator TheDirLookup)
-        : CurLexerKind(std::move(CurLexerKind)),
+        : CurLexerCallback(std::move(CurLexerCallback)),
           TheSubmodule(std::move(TheSubmodule)), TheLexer(std::move(TheLexer)),
           ThePPLexer(std::move(ThePPLexer)),
           TheTokenLexer(std::move(TheTokenLexer)),
@@ -1901,7 +1896,7 @@ class Preprocessor {
   /// Determine whether it's possible for a future call to Lex to produce an
   /// annotation token created by a previous call to EnterAnnotationToken.
   bool mightHavePendingAnnotationTokens() {
-    return CurLexerKind != CLK_Lexer;
+    return CurLexerCallback != CLK_Lexer;
   }
 
   /// Update the current token to represent the provided
@@ -1914,7 +1909,7 @@ class Preprocessor {
 
   /// Recompute the current lexer kind based on the CurLexer/
   /// CurTokenLexer pointers.
-  void recomputeCurLexerKind();
+  void recomputeCurLexerCallback();
 
   /// Returns true if incremental processing is enabled
   bool isIncrementalProcessingEnabled() const { return IncrementalProcessing; }
@@ -2430,8 +2425,9 @@ class Preprocessor {
   friend void TokenLexer::ExpandFunctionArguments();
 
   void PushIncludeMacroStack() {
-    assert(CurLexerKind != CLK_CachingLexer && "cannot push a caching lexer");
-    IncludeMacroStack.emplace_back(CurLexerKind, CurLexerSubmodule,
+    assert(CurLexerCallback != CLK_CachingLexer &&
+           "cannot push a caching lexer");
+    IncludeMacroStack.emplace_back(CurLexerCallback, CurLexerSubmodule,
                                    std::move(CurLexer), CurPPLexer,
                                    std::move(CurTokenLexer), CurDirLookup);
     CurPPLexer = nullptr;
@@ -2443,7 +2439,7 @@ class Preprocessor {
     CurTokenLexer = std::move(IncludeMacroStack.back().TheTokenLexer);
     CurDirLookup  = IncludeMacroStack.back().TheDirLookup;
     CurLexerSubmodule = IncludeMacroStack.back().TheSubmodule;
-    CurLexerKind = IncludeMacroStack.back().CurLexerKind;
+    CurLexerCallback = IncludeMacroStack.back().CurLexerCallback;
     IncludeMacroStack.pop_back();
   }
 
@@ -2899,6 +2895,26 @@ class Preprocessor {
   /// \return true iff this PP is currently in a "-Wunsafe-buffer-usage"
   ///          opt-out region
   bool isPPInSafeBufferOptOutRegion(SourceLocation &StartLoc);
+
+private:
+  /// Helper functions to forward lexing to the actual lexer. They all share the
+  /// same signature.
+  static bool CLK_Lexer(Preprocessor &P, Token &Result) {
+    return P.CurLexer->Lex(Result);
+  }
+  static bool CLK_TokenLexer(Preprocessor &P, Token &Result) {
+    return P.CurTokenLexer->Lex(Result);
+  }
+  static bool CLK_CachingLexer(Preprocessor &P, Token &Result) {
+    P.CachingLex(Result);
+    return true;
+  }
+  static bool CLK_DependencyDirectivesLexer(Preprocessor &P, Token &Result) {
+    return P.CurLexer->LexDependencyDirectiveToken(Result);
+  }
+  static bool CLK_LexAfterModuleImport(Preprocessor &P, Token &Result) {
+    return P.LexAfterModuleImport(Result);
+  }
 };
 
 /// Abstract base class that describes a handler that will receive
diff --git a/clang/lib/Lex/PPCaching.cpp b/clang/lib/Lex/PPCaching.cpp
index e05e52ba9bb5362..b8219835a588bea 100644
--- a/clang/lib/Lex/PPCaching.cpp
+++ b/clang/lib/Lex/PPCaching.cpp
@@ -42,7 +42,7 @@ void Preprocessor::Backtrack() {
          && "EnableBacktrackAtThisPos was not called!");
   CachedLexPos = BacktrackPositions.back();
   BacktrackPositions.pop_back();
-  recomputeCurLexerKind();
+  recomputeCurLexerCallback();
 }
 
 void Preprocessor::CachingLex(Token &Result) {
@@ -88,7 +88,7 @@ void Preprocessor::EnterCachingLexMode() {
          "entered caching lex mode while lexing something else");
 
   if (InCachingLexMode()) {
-    assert(CurLexerKind == CLK_CachingLexer && "Unexpected lexer kind");
+    assert(CurLexerCallback == CLK_CachingLexer && "Unexpected lexer kind");
     return;
   }
 
@@ -96,9 +96,9 @@ void Preprocessor::EnterCachingLexMode() {
 }
 
 void Preprocessor::EnterCachingLexModeUnchecked() {
-  assert(CurLexerKind != CLK_CachingLexer && "already in caching lex mode");
+  assert(CurLexerCallback != CLK_CachingLexer && "already in caching lex mode");
   PushIncludeMacroStack();
-  CurLexerKind = CLK_CachingLexer;
+  CurLexerCallback = CLK_CachingLexer;
 }
 
 
diff --git a/clang/lib/Lex/PPLexerChange.cpp b/clang/lib/Lex/PPLexerChange.cpp
index b8575e1adfc5b3f..1378880f8b0e5e0 100644
--- a/clang/lib/Lex/PPLexerChange.cpp
+++ b/clang/lib/Lex/PPLexerChange.cpp
@@ -122,10 +122,10 @@ void Preprocessor::EnterSourceFileWithLexer(Lexer *TheLexer,
   CurPPLexer = TheLexer;
   CurDirLookup = CurDir;
   CurLexerSubmodule = nullptr;
-  if (CurLexerKind != CLK_LexAfterModuleImport)
-    CurLexerKind = TheLexer->isDependencyDirectivesLexer()
-                       ? CLK_DependencyDirectivesLexer
-                       : CLK_Lexer;
+  if (CurLexerCallback != CLK_LexAfterModuleImport)
+    CurLexerCallback = TheLexer->isDependencyDirectivesLexer()
+                           ? CLK_DependencyDirectivesLexer
+                           : CLK_Lexer;
 
   // Notify the client, if desired, that we are in a new source file.
   if (Callbacks && !CurLexer->Is_PragmaLexer) {
@@ -161,8 +161,8 @@ void Preprocessor::EnterMacro(Token &Tok, SourceLocation ILEnd,
   PushIncludeMacroStack();
   CurDirLookup = nullptr;
   CurTokenLexer = std::move(TokLexer);
-  if (CurLexerKind != CLK_LexAfterModuleImport)
-    CurLexerKind = CLK_TokenLexer;
+  if (CurLexerCallback != CLK_LexAfterModuleImport)
+    CurLexerCallback = CLK_TokenLexer;
 }
 
 /// EnterTokenStream - Add a "macro" context to the top of the include stack,
@@ -180,7 +180,7 @@ void Preprocessor::EnterMacro(Token &Tok, SourceLocation ILEnd,
 void Preprocessor::EnterTokenStream(const Token *Toks, unsigned NumToks,
                                     bool DisableMacroExpansion, bool OwnsTokens,
                                     bool IsReinject) {
-  if (CurLexerKind == CLK_CachingLexer) {
+  if (CurLexerCallback == CLK_CachingLexer) {
     if (CachedLexPos < CachedTokens.size()) {
       assert(IsReinject && "new tokens in the middle of cached stream");
       // We're entering tokens into the middle of our cached token stream. We
@@ -216,8 +216,8 @@ void Preprocessor::EnterTokenStream(const Token *Toks, unsigned NumToks,
   PushIncludeMacroStack();
   CurDirLookup = nullptr;
   CurTokenLexer = std::move(TokLexer);
-  if (CurLexerKind != CLK_LexAfterModuleImport)
-    CurLexerKind = CLK_TokenLexer;
+  if (CurLexerCallback != CLK_LexAfterModuleImport)
+    CurLexerCallback = CLK_TokenLexer;
 }
 
 /// Compute the relative path that names the given file relative to
@@ -452,7 +452,7 @@ bool Preprocessor::HandleEndOfFile(Token &Result, bool isEndOfMacro) {
       CurLexer.reset();
 
       CurPPLexer = nullptr;
-      recomputeCurLexerKind();
+      recomputeCurLexerCallback();
       return true;
     }
 
diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp
index ede4c51487ffbe7..f791b4d2363c9fe 100644
--- a/clang/lib/Lex/Preprocessor.cpp
+++ b/clang/lib/Lex/Preprocessor.cpp
@@ -380,15 +380,15 @@ StringRef Preprocessor::getLastMacroWithSpelling(
   return BestSpelling;
 }
 
-void Preprocessor::recomputeCurLexerKind() {
+void Preprocessor::recomputeCurLexerCallback() {
   if (CurLexer)
-    CurLexerKind = CurLexer->isDependencyDirectivesLexer()
-                       ? CLK_DependencyDirectivesLexer
-                       : CLK_Lexer;
+    CurLexerCallback = CurLexer->isDependencyDirectivesLexer()
+                           ? CLK_DependencyDirectivesLexer
+                           : CLK_Lexer;
   else if (CurTokenLexer)
-    CurLexerKind = CLK_TokenLexer;
+    CurLexerCallback = CLK_TokenLexer;
   else
-    CurLexerKind = CLK_CachingLexer;
+    CurLexerCallback = CLK_CachingLexer;
 }
 
 bool Preprocessor::SetCodeCompletionPoint(FileEntryRef File,
@@ -643,23 +643,7 @@ void Preprocessor::SkipTokensWhileUsingPCH() {
   while (true) {
     bool InPredefines =
         (CurLexer && CurLexer->getFileID() == getPredefinesFileID());
-    switch (CurLexerKind) {
-    case CLK_Lexer:
-      CurLexer->Lex(Tok);
-     break;
-    case CLK_TokenLexer:
-      CurTokenLexer->Lex(Tok);
-      break;
-    case CLK_CachingLexer:
-      CachingLex(Tok);
-      break;
-    case CLK_DependencyDirectivesLexer:
-      CurLexer->LexDependencyDirectiveToken(Tok);
-      break;
-    case CLK_LexAfterModuleImport:
-      LexAfterModuleImport(Tok);
-      break;
-    }
+    CurLexerCallback(*this, Tok);
     if (Tok.is(tok::eof) && !InPredefines) {
       ReachedMainFileEOF = true;
       break;
@@ -868,12 +852,12 @@ bool Preprocessor::HandleIdentifier(Token &Identifier) {
        Identifier.is(tok::kw_import)) &&
       !InMacroArgs && !DisableMacroExpansion &&
       (getLangOpts().Modules || getLangOpts().DebuggerSupport) &&
-      CurLexerKind != CLK_CachingLexer) {
+      CurLexerCallback != CLK_CachingLexer) {
     ModuleImportLoc = Identifier.getLocation();
     NamedModuleImportPath.clear();
     IsAtImport = true;
     ModuleImportExpectsIdentifier = true;
-    CurLexerKind = CLK_LexAfterModuleImport;
+    CurLexerCallback = CLK_LexAfterModuleImport;
   }
   return true;
 }
@@ -882,27 +866,8 @@ void Preprocessor::Lex(Token &Result) {
   ++LexLevel;
 
   // We loop here until a lex function returns a token; this avoids recursion.
-  bool ReturnedToken;
-  do {
-    switch (CurLexerKind) {
-    case CLK_Lexer:
-      ReturnedToken = CurLexer->Lex(Result);
-      break;
-    case CLK_TokenLexer:
-      ReturnedToken = CurTokenLexer->Lex(Result);
-      break;
-    case CLK_CachingLexer:
-      CachingLex(Result);
-      ReturnedToken = true;
-      break;
-    case CLK_DependencyDirectivesLexer:
-      ReturnedToken = CurLexer->LexDependencyDirectiveToken(Result);
-      break;
-    case CLK_LexAfterModuleImport:
-      ReturnedToken = LexAfterModuleImport(Result);
-      break;
-    }
-  } while (!ReturnedToken);
+  while (!CurLexerCallback(*this, Result))
+    ;
 
   if (Result.is(tok::unknown) && TheModuleLoader.HadFatalFailure)
     return;
@@ -965,7 +930,7 @@ void Preprocessor::Lex(Token &Result) {
           NamedModuleImportPath.clear();
           IsAtImport = false;
           ModuleImportExpectsIdentifier = true;
-          CurLexerKind = CLK_LexAfterModuleImport;
+          CurLexerCallback = CLK_LexAfterModuleImport;
         }
         break;
       } else if (Result.getIdentifierInfo() == getIdentifierInfo("module")) {
@@ -1166,7 +1131,7 @@ void Preprocessor::CollectPpImportSuffix(SmallVectorImpl<Token> &Toks) {
 /// We respond to a pp-import by importing macros from the named module.
 bool Preprocessor::LexAfterModuleImport(Token &Result) {
   // Figure out what kind of lexer we actually have.
-  recomputeCurLexerKind();
+  recomputeCurLexerCallback();
 
   // Lex the next token. The header-name lexing rules are used at the start of
   // a pp-import.
@@ -1183,7 +1148,7 @@ bool Preprocessor::LexAfterModuleImport(Token &Result) {
       Name += ":";
       NamedModuleImportPath.push_back(
           {getIdentifierInfo(Name), Result.getLocation()});
-      CurLexerKind = CLK_LexAfterModuleImport;
+      CurLexerCallback = CLK_LexAfterModuleImport;
       return true;
     }
   } else {
@@ -1283,7 +1248,7 @@ bool Preprocessor::LexAfterModuleImport(Token &Result) {
     NamedModuleImportPath.push_back(
         std::make_pair(Result.getIdentifierInfo(), Result.getLocation()));
     ModuleImportExpectsIdentifier = false;
-    CurLexerKind = CLK_LexAfterModuleImport;
+    CurLexerCallback = CLK_LexAfterModuleImport;
     return true;
   }
 
@@ -1292,7 +1257,7 @@ bool Preprocessor::LexAfterModuleImport(Token &Result) {
   // attribute-specifier-seq here under the Standard C++ Modules.)
   if (!ModuleImportExpectsIdentifier && Result.getKind() == tok::period) {
     ModuleImportExpectsIdentifier = true;
-    CurLexerKind = CLK_LexAfterModuleImport;
+    CurLexerCallback = CLK_LexAfterModuleImport;
     return true;
   }
 
diff --git a/clang/utils/ClangVisualizers/clang.natvis b/clang/utils/ClangVisualizers/clang.natvis
index cbb63dc08de2338..44b43ec05d8af59 100644
--- a/clang/utils/ClangVisualizers/clang.natvis
+++ b/clang/utils/ClangVisualizers/clang.natvis
@@ -817,7 +817,7 @@ For later versions of Visual Studio, no setup is required-->
     <DisplayString IncludeView="cached"> {IncludeMacroStack._Mypair._Myval2._Mylast - 1,na}</DisplayString>
     <DisplayString Condition="CurLexer._Mypair._Myval2 != 0">{CurLexer._Mypair._Myval2,na}</DisplayString>
     <DisplayString Condition="CurTokenLexer._Mypair._Myval2 != 0">Expanding Macro: {CurTokenLexer._Mypair._Myval2,na}</DisplayString>
-    <!-- Can't use CurLexerKind because natvis sees the type rather than the variable -->
+    <!-- Can't use CurLexerCallback because natvis sees the type rather than the variable -->
     <DisplayString Condition="IncludeMacroStack._Mypair._Myval2._Mylast - IncludeMacroStack._Mypair._Myval2._Myfirst">
       {this,view(cached)}
     </DisplayString>

>From 867ef93d35e5ecbe4dade43df716492c2cfba420 Mon Sep 17 00:00:00 2001
From: serge-sans-paille <sguelton at mozilla.com>
Date: Fri, 27 Oct 2023 22:48:08 +0200
Subject: [PATCH 2/2] [clang] Change GetCharAndSizeSlow interface to by-value
 style

Instead of passing the Size by reference, assuming it is initialized,
return it along with the expected char result as a std::pair.

This makes the interface less error-prone: the previous interface expected
the Size reference to be initialized, and this was often forgotten,
leading to uninitialized variable usage. This patch fixes the issue.

This also generates faster code, as the returned pair (a char and an
unsigned) fits in 64 bits. The speedups according to the compile-time tracker
reach -0.7%, with a good number of results around -0.4%. Details are available on

        https://llvm-compile-time-tracker.com/compare.php?from=3fe63f81fcb999681daa11b2890c82fda3aaeef5&to=fc76a9202f737472ecad4d6e0b0bf87a013866f3&stat=instructions:u

And icing on the cake, on my setup it also shaves 2kB out of
libclang-cpp :-)
---
 clang/include/clang/Lex/Lexer.h               | 30 +++----
 clang/lib/Lex/DependencyDirectivesScanner.cpp |  7 +-
 clang/lib/Lex/Lexer.cpp                       | 80 +++++++++++--------
 3 files changed, 61 insertions(+), 56 deletions(-)

diff --git a/clang/include/clang/Lex/Lexer.h b/clang/include/clang/Lex/Lexer.h
index ac0ef14c591bdd7..9565fbd0da7feb3 100644
--- a/clang/include/clang/Lex/Lexer.h
+++ b/clang/include/clang/Lex/Lexer.h
@@ -577,17 +577,15 @@ class Lexer : public PreprocessorLexer {
 
   /// getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever
   /// emit a warning.
-  static inline char getCharAndSizeNoWarn(const char *Ptr, unsigned &Size,
-                                          const LangOptions &LangOpts) {
+  static inline std::pair<char, unsigned>
+  getCharAndSizeNoWarn(const char *Ptr, const LangOptions &LangOpts) {
     // If this is not a trigraph and not a UCN or escaped newline, return
     // quickly.
     if (isObviouslySimpleCharacter(Ptr[0])) {
-      Size = 1;
-      return *Ptr;
+      return std::make_pair(*Ptr, 1u);
     }
 
-    Size = 0;
-    return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
+    return getCharAndSizeSlowNoWarn(Ptr, LangOpts);
   }
 
   /// Returns the leading whitespace for line that corresponds to the given
@@ -665,8 +663,7 @@ class Lexer : public PreprocessorLexer {
     // quickly.
     if (isObviouslySimpleCharacter(Ptr[0])) return *Ptr++;
 
-    unsigned Size = 0;
-    char C = getCharAndSizeSlow(Ptr, Size, &Tok);
+    auto [C, Size] = getCharAndSizeSlow(Ptr, &Tok);
     Ptr += Size;
     return C;
   }
@@ -682,9 +679,7 @@ class Lexer : public PreprocessorLexer {
 
     // Otherwise, re-lex the character with a current token, allowing
     // diagnostics to be emitted and flags to be set.
-    Size = 0;
-    getCharAndSizeSlow(Ptr, Size, &Tok);
-    return Ptr+Size;
+    return Ptr + getCharAndSizeSlow(Ptr, &Tok).second;
   }
 
   /// getCharAndSize - Peek a single 'character' from the specified buffer,
@@ -699,14 +694,15 @@ class Lexer : public PreprocessorLexer {
       return *Ptr;
     }
 
-    Size = 0;
-    return getCharAndSizeSlow(Ptr, Size);
+    auto CharAndSize = getCharAndSizeSlow(Ptr);
+    Size = CharAndSize.second;
+    return CharAndSize.first;
   }
 
   /// getCharAndSizeSlow - Handle the slow/uncommon case of the getCharAndSize
   /// method.
-  char getCharAndSizeSlow(const char *Ptr, unsigned &Size,
-                          Token *Tok = nullptr);
+  std::pair<char, unsigned> getCharAndSizeSlow(const char *Ptr,
+                                               Token *Tok = nullptr);
 
   /// getEscapedNewLineSize - Return the size of the specified escaped newline,
   /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" on entry
@@ -720,8 +716,8 @@ class Lexer : public PreprocessorLexer {
 
   /// getCharAndSizeSlowNoWarn - Same as getCharAndSizeSlow, but never emits a
   /// diagnostic.
-  static char getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
-                                       const LangOptions &LangOpts);
+  static std::pair<char, unsigned>
+  getCharAndSizeSlowNoWarn(const char *Ptr, const LangOptions &LangOpts);
 
   //===--------------------------------------------------------------------===//
   // Other lexer functions.
diff --git a/clang/lib/Lex/DependencyDirectivesScanner.cpp b/clang/lib/Lex/DependencyDirectivesScanner.cpp
index 2bd2c5f8388c0dd..f32b0f6767b9319 100644
--- a/clang/lib/Lex/DependencyDirectivesScanner.cpp
+++ b/clang/lib/Lex/DependencyDirectivesScanner.cpp
@@ -565,10 +565,9 @@ Scanner::cleanStringIfNeeded(const dependency_directives_scan::Token &Tok) {
   const char *BufPtr = Input.begin() + Tok.Offset;
   const char *AfterIdent = Input.begin() + Tok.getEnd();
   while (BufPtr < AfterIdent) {
-    unsigned Size;
-    Spelling[SpellingLength++] =
-        Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
-    BufPtr += Size;
+    auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
+    Spelling[SpellingLength++] = CharAndSize.first;
+    BufPtr += CharAndSize.second;
   }
 
   return SplitIds.try_emplace(StringRef(Spelling.begin(), SpellingLength), 0)
diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp
index 675ec28e514797e..cee5c49a3c72e18 100644
--- a/clang/lib/Lex/Lexer.cpp
+++ b/clang/lib/Lex/Lexer.cpp
@@ -287,9 +287,9 @@ static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
   if (tok::isStringLiteral(Tok.getKind())) {
     // Munch the encoding-prefix and opening double-quote.
     while (BufPtr < BufEnd) {
-      unsigned Size;
-      Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
-      BufPtr += Size;
+      auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
+      Spelling[Length++] = CharAndSize.first;
+      BufPtr += CharAndSize.second;
 
       if (Spelling[Length - 1] == '"')
         break;
@@ -316,9 +316,9 @@ static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
   }
 
   while (BufPtr < BufEnd) {
-    unsigned Size;
-    Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
-    BufPtr += Size;
+    auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
+    Spelling[Length++] = CharAndSize.first;
+    BufPtr += CharAndSize.second;
   }
 
   assert(Length < Tok.getLength() &&
@@ -772,10 +772,9 @@ unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo,
   // If we have a character that may be a trigraph or escaped newline, use a
   // lexer to parse it correctly.
   for (; CharNo; --CharNo) {
-    unsigned Size;
-    Lexer::getCharAndSizeNoWarn(TokPtr, Size, LangOpts);
-    TokPtr += Size;
-    PhysOffset += Size;
+    auto CharAndSize = Lexer::getCharAndSizeNoWarn(TokPtr, LangOpts);
+    TokPtr += CharAndSize.second;
+    PhysOffset += CharAndSize.second;
   }
 
   // Final detail: if we end up on an escaped newline, we want to return the
@@ -1357,15 +1356,17 @@ SourceLocation Lexer::findLocationAfterToken(
 ///
 /// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
 /// be updated to match.
-char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
-                               Token *Tok) {
+std::pair<char, unsigned> Lexer::getCharAndSizeSlow(const char *Ptr,
+                                                    Token *Tok) {
+  unsigned Size = 0;
   // If we have a slash, look for an escaped newline.
   if (Ptr[0] == '\\') {
     ++Size;
     ++Ptr;
 Slash:
     // Common case, backslash-char where the char is not whitespace.
-    if (!isWhitespace(Ptr[0])) return '\\';
+    if (!isWhitespace(Ptr[0]))
+      return std::make_pair('\\', Size);
 
     // See if we have optional whitespace characters between the slash and
     // newline.
@@ -1382,11 +1383,13 @@ char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
       Ptr  += EscapedNewLineSize;
 
       // Use slow version to accumulate a correct size field.
-      return getCharAndSizeSlow(Ptr, Size, Tok);
+      auto CharAndSize = getCharAndSizeSlow(Ptr, Tok);
+      CharAndSize.second += Size;
+      return CharAndSize;
     }
 
     // Otherwise, this is not an escaped newline, just return the slash.
-    return '\\';
+    return std::make_pair('\\', Size);
   }
 
   // If this is a trigraph, process it.
@@ -1401,13 +1404,12 @@ char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
       Ptr += 3;
       Size += 3;
       if (C == '\\') goto Slash;
-      return C;
+      return std::make_pair(C, Size);
     }
   }
 
   // If this is neither, return a single character.
-  ++Size;
-  return *Ptr;
+  return std::make_pair(*Ptr, Size + 1);
 }
 
 /// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
@@ -1416,15 +1418,18 @@ char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
 ///
 /// NOTE: When this method is updated, getCharAndSizeSlow (above) should
 /// be updated to match.
-char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
-                                     const LangOptions &LangOpts) {
+std::pair<char, unsigned>
+Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, const LangOptions &LangOpts) {
+
+  unsigned Size = 0;
   // If we have a slash, look for an escaped newline.
   if (Ptr[0] == '\\') {
     ++Size;
     ++Ptr;
 Slash:
     // Common case, backslash-char where the char is not whitespace.
-    if (!isWhitespace(Ptr[0])) return '\\';
+    if (!isWhitespace(Ptr[0]))
+      return std::make_pair('\\', Size);
 
     // See if we have optional whitespace characters followed by a newline.
     if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
@@ -1433,11 +1438,13 @@ char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
       Ptr  += EscapedNewLineSize;
 
       // Use slow version to accumulate a correct size field.
-      return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
+      auto CharAndSize = getCharAndSizeSlowNoWarn(Ptr, LangOpts);
+      CharAndSize.second += Size;
+      return CharAndSize;
     }
 
     // Otherwise, this is not an escaped newline, just return the slash.
-    return '\\';
+    return std::make_pair('\\', Size);
   }
 
   // If this is a trigraph, process it.
@@ -1448,13 +1455,12 @@ char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
       Ptr += 3;
       Size += 3;
       if (C == '\\') goto Slash;
-      return C;
+      return std::make_pair(C, Size);
     }
   }
 
   // If this is neither, return a single character.
-  ++Size;
-  return *Ptr;
+  return std::make_pair(*Ptr, Size + 1);
 }
 
 //===----------------------------------------------------------------------===//
@@ -1964,11 +1970,14 @@ bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) {
 /// isHexaLiteral - Return true if Start points to a hex constant.
 /// in microsoft mode (where this is supposed to be several different tokens).
 bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
-  unsigned Size;
-  char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, LangOpts);
+  auto CharAndSize1 = Lexer::getCharAndSizeNoWarn(Start, LangOpts);
+  char C1 = CharAndSize1.first;
   if (C1 != '0')
     return false;
-  char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, LangOpts);
+
+  auto CharAndSize2 =
+      Lexer::getCharAndSizeNoWarn(Start + CharAndSize1.second, LangOpts);
+  char C2 = CharAndSize2.first;
   return (C2 == 'x' || C2 == 'X');
 }
 
@@ -2012,15 +2021,15 @@ bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
 
   // If we have a digit separator, continue.
   if (C == '\'' && (LangOpts.CPlusPlus14 || LangOpts.C23)) {
-    unsigned NextSize;
-    char Next = getCharAndSizeNoWarn(CurPtr + Size, NextSize, LangOpts);
+    auto NextCharAndSize = getCharAndSizeNoWarn(CurPtr + Size, LangOpts);
+    char Next = NextCharAndSize.first;
     if (isAsciiIdentifierContinue(Next)) {
       if (!isLexingRawMode())
         Diag(CurPtr, LangOpts.CPlusPlus
                          ? diag::warn_cxx11_compat_digit_separator
                          : diag::warn_c23_compat_digit_separator);
       CurPtr = ConsumeChar(CurPtr, Size, Result);
-      CurPtr = ConsumeChar(CurPtr, NextSize, Result);
+      CurPtr = ConsumeChar(CurPtr, NextCharAndSize.second, Result);
       return LexNumericConstant(Result, CurPtr);
     }
   }
@@ -2085,8 +2094,9 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
       unsigned Consumed = Size;
       unsigned Chars = 1;
       while (true) {
-        unsigned NextSize;
-        char Next = getCharAndSizeNoWarn(CurPtr + Consumed, NextSize, LangOpts);
+        auto NextCharAndSize =
+            getCharAndSizeNoWarn(CurPtr + Consumed, LangOpts);
+        char Next = NextCharAndSize.first;
         if (!isAsciiIdentifierContinue(Next)) {
           // End of suffix. Check whether this is on the allowed list.
           const StringRef CompleteSuffix(Buffer, Chars);
@@ -2100,7 +2110,7 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
           break;
 
         Buffer[Chars++] = Next;
-        Consumed += NextSize;
+        Consumed += NextCharAndSize.second;
       }
     }