[clang] [clang][Diagnostics] Highlight code snippets (PR #66514)

Timm Baeder via cfe-commits cfe-commits at lists.llvm.org
Tue Jan 16 07:59:50 PST 2024


Timm =?utf-8?q?Bäder?= <tbaeder at redhat.com>,
Timm =?utf-8?q?Bäder?= <tbaeder at redhat.com>,
Timm =?utf-8?q?Bäder?= <tbaeder at redhat.com>,
Timm =?utf-8?q?Bäder?= <tbaeder at redhat.com>,
Timm =?utf-8?q?Bäder?= <tbaeder at redhat.com>,
Timm =?utf-8?q?Bäder?= <tbaeder at redhat.com>,
Timm =?utf-8?q?Bäder?= <tbaeder at redhat.com>,
Timm =?utf-8?q?Bäder?= <tbaeder at redhat.com>,
Timm =?utf-8?q?Bäder?= <tbaeder at redhat.com>,
Timm =?utf-8?q?Bäder?= <tbaeder at redhat.com>,
Timm =?utf-8?q?Bäder?= <tbaeder at redhat.com>,
Timm =?utf-8?q?Bäder?= <tbaeder at redhat.com>,
Timm =?utf-8?q?Bäder?= <tbaeder at redhat.com>,
Timm =?utf-8?q?Bäder?= <tbaeder at redhat.com>,
Timm =?utf-8?q?Bäder?= <tbaeder at redhat.com>,
Timm =?utf-8?q?Bäder?= <tbaeder at redhat.com>,
Timm =?utf-8?q?Bäder?= <tbaeder at redhat.com>,
Timm =?utf-8?q?Bäder?= <tbaeder at redhat.com>,
Timm =?utf-8?q?Bäder?= <tbaeder at redhat.com>,
Timm =?utf-8?q?Bäder?= <tbaeder at redhat.com>,
Timm =?utf-8?q?Bäder?= <tbaeder at redhat.com>,
Timm =?utf-8?q?Bäder?= <tbaeder at redhat.com>,
Timm =?utf-8?q?Bäder?= <tbaeder at redhat.com>,
Timm =?utf-8?q?Bäder?= <tbaeder at redhat.com>,
Timm =?utf-8?q?Bäder?= <tbaeder at redhat.com>,
Timm =?utf-8?q?Bäder?= <tbaeder at redhat.com>,
Timm =?utf-8?q?Bäder?= <tbaeder at redhat.com>,
Timm =?utf-8?q?Bäder?= <tbaeder at redhat.com>,
Timm =?utf-8?q?Bäder?= <tbaeder at redhat.com>,
Timm =?utf-8?q?Bäder?= <tbaeder at redhat.com>,
Timm =?utf-8?q?Bäder?= <tbaeder at redhat.com>,
Timm =?utf-8?q?Bäder?= <tbaeder at redhat.com>,
Timm =?utf-8?q?Bäder?= <tbaeder at redhat.com>,
Timm =?utf-8?q?Bäder?= <tbaeder at redhat.com>
Message-ID:
In-Reply-To: <llvm.org/llvm/llvm-project/pull/66514 at github.com>


https://github.com/tbaederr updated https://github.com/llvm/llvm-project/pull/66514

>From 841ce2be3823a24ea39702076589411c5299686b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder at redhat.com>
Date: Fri, 15 Sep 2023 15:51:39 +0200
Subject: [PATCH 01/35] [clang][Diagnostics] Highlight code snippets

Add some primitive syntax highlighting to our code snippet output.
---
 .../clang/Frontend/CodeSnippetHighlighter.h   |  46 +++++++
 clang/include/clang/Frontend/TextDiagnostic.h |   2 +
 clang/lib/Frontend/CMakeLists.txt             |   1 +
 clang/lib/Frontend/CodeSnippetHighlighter.cpp | 120 ++++++++++++++++++
 clang/lib/Frontend/TextDiagnostic.cpp         |  26 ++++
 5 files changed, 195 insertions(+)
 create mode 100644 clang/include/clang/Frontend/CodeSnippetHighlighter.h
 create mode 100644 clang/lib/Frontend/CodeSnippetHighlighter.cpp

diff --git a/clang/include/clang/Frontend/CodeSnippetHighlighter.h b/clang/include/clang/Frontend/CodeSnippetHighlighter.h
new file mode 100644
index 00000000000000..776954b59e2e1a
--- /dev/null
+++ b/clang/include/clang/Frontend/CodeSnippetHighlighter.h
@@ -0,0 +1,46 @@
+//===--- CodeSnippetHighlighter.h - Code snippet highlighting ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_FRONTEND_CODESNIPPETHIGHLIGHTER_H
+#define LLVM_CLANG_FRONTEND_CODESNIPPETHIGHLIGHTER_H
+
+#include "clang/Basic/LangOptions.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Support/raw_ostream.h"
+#include <vector>
+
+namespace clang {
+
+struct StyleRange {
+  unsigned Start;
+  unsigned End;
+  const enum llvm::raw_ostream::Colors c;
+};
+
+class CodeSnippetHighlighter final {
+public:
+  CodeSnippetHighlighter() = default;
+
+  /// Produce StyleRanges for the given line.
+  /// The returned vector contains non-overlapping style ranges. They are sorted
+  /// from beginning of the line to the end.
+  std::vector<StyleRange> highlightLine(llvm::StringRef SourceLine,
+                                        const LangOptions &LangOpts);
+
+private:
+  bool Initialized = false;
+  /// Fills Keywords and Literals.
+  void ensureTokenData();
+
+  llvm::SmallSet<StringRef, 12> Keywords;
+  llvm::SmallSet<StringRef, 12> Literals;
+};
+
+} // namespace clang
+
+#endif
diff --git a/clang/include/clang/Frontend/TextDiagnostic.h b/clang/include/clang/Frontend/TextDiagnostic.h
index 7eb0ab0cdc9bca..59fd4d4f9408d4 100644
--- a/clang/include/clang/Frontend/TextDiagnostic.h
+++ b/clang/include/clang/Frontend/TextDiagnostic.h
@@ -15,6 +15,7 @@
 #ifndef LLVM_CLANG_FRONTEND_TEXTDIAGNOSTIC_H
 #define LLVM_CLANG_FRONTEND_TEXTDIAGNOSTIC_H
 
+#include "clang/Frontend/CodeSnippetHighlighter.h"
 #include "clang/Frontend/DiagnosticRenderer.h"
 
 namespace clang {
@@ -33,6 +34,7 @@ namespace clang {
 /// printing coming out of libclang.
 class TextDiagnostic : public DiagnosticRenderer {
   raw_ostream &OS;
+  CodeSnippetHighlighter SnippetHighlighter;
 
 public:
   TextDiagnostic(raw_ostream &OS,
diff --git a/clang/lib/Frontend/CMakeLists.txt b/clang/lib/Frontend/CMakeLists.txt
index a9166672088459..db0dea04514b0c 100644
--- a/clang/lib/Frontend/CMakeLists.txt
+++ b/clang/lib/Frontend/CMakeLists.txt
@@ -42,6 +42,7 @@ add_clang_library(clangFrontend
   TextDiagnosticPrinter.cpp
   VerifyDiagnosticConsumer.cpp
   InterfaceStubFunctionsConsumer.cpp
+  CodeSnippetHighlighter.cpp
 
   DEPENDS
   ClangDriverOptions
diff --git a/clang/lib/Frontend/CodeSnippetHighlighter.cpp b/clang/lib/Frontend/CodeSnippetHighlighter.cpp
new file mode 100644
index 00000000000000..829a533ad2692e
--- /dev/null
+++ b/clang/lib/Frontend/CodeSnippetHighlighter.cpp
@@ -0,0 +1,120 @@
+
+#include "clang/Frontend/CodeSnippetHighlighter.h"
+#include "clang/Basic/DiagnosticOptions.h"
+#include "clang/Basic/SourceManager.h"
+#include "clang/Lex/Lexer.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace clang;
+
+void CodeSnippetHighlighter::ensureTokenData() {
+  if (Initialized)
+    return;
+
+  // List of keywords, literals and types we want to highlight.
+  // These are best-effort, as is everything we do wrt. highlighting.
+  Keywords.insert("_Static_assert");
+  Keywords.insert("auto");
+  Keywords.insert("concept");
+  Keywords.insert("const");
+  Keywords.insert("consteval");
+  Keywords.insert("constexpr");
+  Keywords.insert("delete");
+  Keywords.insert("do");
+  Keywords.insert("else");
+  Keywords.insert("final");
+  Keywords.insert("for");
+  Keywords.insert("if");
+  Keywords.insert("mutable");
+  Keywords.insert("namespace");
+  Keywords.insert("new");
+  Keywords.insert("private");
+  Keywords.insert("public");
+  Keywords.insert("requires");
+  Keywords.insert("return");
+  Keywords.insert("static");
+  Keywords.insert("static_assert");
+  Keywords.insert("using");
+  Keywords.insert("void");
+  Keywords.insert("volatile");
+  Keywords.insert("while");
+
+  // Builtin types we highlight
+  Keywords.insert("void");
+  Keywords.insert("char");
+  Keywords.insert("short");
+  Keywords.insert("int");
+  Keywords.insert("unsigned");
+  Keywords.insert("long");
+  Keywords.insert("float");
+  Keywords.insert("double");
+
+  Literals.insert("true");
+  Literals.insert("false");
+  Literals.insert("nullptr");
+
+  Initialized = true;
+}
+
+static SourceManager createTempSourceManager() {
+  FileSystemOptions FileOpts;
+  FileManager FileMgr(FileOpts);
+  llvm::IntrusiveRefCntPtr<DiagnosticIDs> DiagIDs(new DiagnosticIDs());
+  llvm::IntrusiveRefCntPtr<DiagnosticOptions> DiagOpts(new DiagnosticOptions());
+  DiagnosticsEngine diags(DiagIDs, DiagOpts);
+  return SourceManager(diags, FileMgr);
+}
+
+static Lexer createTempLexer(llvm::MemoryBufferRef B, SourceManager &FakeSM,
+                             const LangOptions &LangOpts) {
+  return Lexer(FakeSM.createFileID(B), B, FakeSM, LangOpts);
+}
+
+std::vector<StyleRange>
+CodeSnippetHighlighter::highlightLine(StringRef SourceLine,
+                                      const LangOptions &LangOpts) {
+  ensureTokenData();
+
+  constexpr raw_ostream::Colors CommentColor = raw_ostream::BLACK;
+  constexpr raw_ostream::Colors LiteralColor = raw_ostream::GREEN;
+  constexpr raw_ostream::Colors KeywordColor = raw_ostream::YELLOW;
+
+  const auto MemBuf = llvm::MemoryBuffer::getMemBuffer(SourceLine);
+  SourceManager FakeSM = createTempSourceManager();
+  Lexer L = createTempLexer(MemBuf->getMemBufferRef(), FakeSM, LangOpts);
+  L.SetKeepWhitespaceMode(true);
+
+  std::vector<StyleRange> Styles;
+  bool Stop = false;
+  while (!Stop) {
+    Token tok;
+    Stop = L.LexFromRawLexer(tok);
+    if (tok.is(tok::unknown))
+      continue;
+
+    bool Invalid;
+    unsigned Start =
+        FakeSM.getSpellingColumnNumber(tok.getLocation(), &Invalid) - 1;
+    if (Invalid)
+      continue;
+
+    if (tok.is(tok::raw_identifier)) {
+      // Almost everything we lex is an identifier, since we use a raw lexer.
+      // Some should be highlightes as literals, others as keywords.
+      if (Keywords.contains(tok.getRawIdentifier()))
+        Styles.push_back(
+            StyleRange{Start, Start + tok.getLength(), KeywordColor});
+      else if (Literals.contains(tok.getRawIdentifier()))
+        Styles.push_back(
+            StyleRange{Start, Start + tok.getLength(), LiteralColor});
+    } else if (tok::isLiteral(tok.getKind())) {
+      Styles.push_back(
+          StyleRange{Start, Start + tok.getLength(), LiteralColor});
+    } else if (tok.is(tok::comment)) {
+      Styles.push_back(
+          StyleRange{Start, Start + tok.getLength(), CommentColor});
+    }
+  }
+
+  return Styles;
+}
diff --git a/clang/lib/Frontend/TextDiagnostic.cpp b/clang/lib/Frontend/TextDiagnostic.cpp
index 779dead5d058d1..13d7d1e048cf99 100644
--- a/clang/lib/Frontend/TextDiagnostic.cpp
+++ b/clang/lib/Frontend/TextDiagnostic.cpp
@@ -11,6 +11,7 @@
 #include "clang/Basic/DiagnosticOptions.h"
 #include "clang/Basic/FileManager.h"
 #include "clang/Basic/SourceManager.h"
+#include "clang/Frontend/CodeSnippetHighlighter.h"
 #include "clang/Lex/Lexer.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
@@ -1278,6 +1279,9 @@ void TextDiagnostic::emitSnippetAndCaret(
 void TextDiagnostic::emitSnippet(StringRef SourceLine,
                                  unsigned MaxLineNoDisplayWidth,
                                  unsigned LineNo) {
+  std::vector<StyleRange> Styles =
+      SnippetHighlighter.highlightLine(SourceLine, LangOpts);
+
   // Emit line number.
   if (MaxLineNoDisplayWidth > 0) {
     unsigned LineNoDisplayWidth = getNumDisplayWidth(LineNo);
@@ -1287,11 +1291,33 @@ void TextDiagnostic::emitSnippet(StringRef SourceLine,
 
   // Print the source line one character at a time.
   bool PrintReversed = false;
+  bool HighlightingEnabled = DiagOpts->ShowColors;
   size_t I = 0;
   while (I < SourceLine.size()) {
     auto [Str, WasPrintable] =
         printableTextForNextCharacter(SourceLine, &I, DiagOpts->TabStop);
 
+    // Just stop highlighting anything for this line if we found a non-printable
+    // character.
+    if (!WasPrintable)
+      HighlightingEnabled = false;
+
+    // FIXME: I hope we can do this in some nicer way.
+    if (HighlightingEnabled) {
+      std::optional<enum raw_ostream::Colors> H;
+      for (auto &P : Styles) {
+        if (P.Start < I && P.End >= I) {
+          H = P.c;
+          break;
+        }
+      }
+
+      if (H) {
+        OS.changeColor(*H, false);
+      } else
+        OS.resetColor();
+    }
+
     // Toggle inverted colors on or off for this character.
     if (DiagOpts->ShowColors) {
       if (WasPrintable == PrintReversed) {

>From cdd82e013b17038de6a2b80e4f6dc125785aa5e2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder at redhat.com>
Date: Wed, 20 Sep 2023 15:28:10 +0200
Subject: [PATCH 02/35] Get identifier table from Preprocessor

---
 .../clang/Frontend/CodeSnippetHighlighter.h   |  11 +-
 clang/include/clang/Frontend/TextDiagnostic.h |   7 +-
 clang/lib/Frontend/CodeSnippetHighlighter.cpp | 101 +++++-------------
 clang/lib/Frontend/TextDiagnostic.cpp         |   8 +-
 clang/lib/Frontend/TextDiagnosticPrinter.cpp  |   2 +-
 5 files changed, 39 insertions(+), 90 deletions(-)

diff --git a/clang/include/clang/Frontend/CodeSnippetHighlighter.h b/clang/include/clang/Frontend/CodeSnippetHighlighter.h
index 776954b59e2e1a..ec03375221f9ff 100644
--- a/clang/include/clang/Frontend/CodeSnippetHighlighter.h
+++ b/clang/include/clang/Frontend/CodeSnippetHighlighter.h
@@ -22,6 +22,8 @@ struct StyleRange {
   const enum llvm::raw_ostream::Colors c;
 };
 
+class Preprocessor;
+
 class CodeSnippetHighlighter final {
 public:
   CodeSnippetHighlighter() = default;
@@ -30,15 +32,8 @@ class CodeSnippetHighlighter final {
   /// The returned vector contains non-overlapping style ranges. They are sorted
   /// from beginning of the line to the end.
   std::vector<StyleRange> highlightLine(llvm::StringRef SourceLine,
+                                        const Preprocessor *PP,
                                         const LangOptions &LangOpts);
-
-private:
-  bool Initialized = false;
-  /// Fills Keywords and Literals.
-  void ensureTokenData();
-
-  llvm::SmallSet<StringRef, 12> Keywords;
-  llvm::SmallSet<StringRef, 12> Literals;
 };
 
 } // namespace clang
diff --git a/clang/include/clang/Frontend/TextDiagnostic.h b/clang/include/clang/Frontend/TextDiagnostic.h
index 59fd4d4f9408d4..8cdb9b141a8a4a 100644
--- a/clang/include/clang/Frontend/TextDiagnostic.h
+++ b/clang/include/clang/Frontend/TextDiagnostic.h
@@ -19,7 +19,6 @@
 #include "clang/Frontend/DiagnosticRenderer.h"
 
 namespace clang {
-
 /// Class to encapsulate the logic for formatting and printing a textual
 /// diagnostic message.
 ///
@@ -34,12 +33,12 @@ namespace clang {
 /// printing coming out of libclang.
 class TextDiagnostic : public DiagnosticRenderer {
   raw_ostream &OS;
+  const Preprocessor *PP;
   CodeSnippetHighlighter SnippetHighlighter;
 
 public:
-  TextDiagnostic(raw_ostream &OS,
-                 const LangOptions &LangOpts,
-                 DiagnosticOptions *DiagOpts);
+  TextDiagnostic(raw_ostream &OS, const LangOptions &LangOpts,
+                 const Preprocessor *PP, DiagnosticOptions *DiagOpts);
 
   ~TextDiagnostic() override;
 
diff --git a/clang/lib/Frontend/CodeSnippetHighlighter.cpp b/clang/lib/Frontend/CodeSnippetHighlighter.cpp
index 829a533ad2692e..63b3707fbb7ef8 100644
--- a/clang/lib/Frontend/CodeSnippetHighlighter.cpp
+++ b/clang/lib/Frontend/CodeSnippetHighlighter.cpp
@@ -3,59 +3,12 @@
 #include "clang/Basic/DiagnosticOptions.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Lex/Lexer.h"
+#include "clang/Lex/Preprocessor.h"
+#include "clang/Lex/PreprocessorOptions.h"
 #include "llvm/Support/raw_ostream.h"
 
 using namespace clang;
 
-void CodeSnippetHighlighter::ensureTokenData() {
-  if (Initialized)
-    return;
-
-  // List of keywords, literals and types we want to highlight.
-  // These are best-effort, as is everything we do wrt. highlighting.
-  Keywords.insert("_Static_assert");
-  Keywords.insert("auto");
-  Keywords.insert("concept");
-  Keywords.insert("const");
-  Keywords.insert("consteval");
-  Keywords.insert("constexpr");
-  Keywords.insert("delete");
-  Keywords.insert("do");
-  Keywords.insert("else");
-  Keywords.insert("final");
-  Keywords.insert("for");
-  Keywords.insert("if");
-  Keywords.insert("mutable");
-  Keywords.insert("namespace");
-  Keywords.insert("new");
-  Keywords.insert("private");
-  Keywords.insert("public");
-  Keywords.insert("requires");
-  Keywords.insert("return");
-  Keywords.insert("static");
-  Keywords.insert("static_assert");
-  Keywords.insert("using");
-  Keywords.insert("void");
-  Keywords.insert("volatile");
-  Keywords.insert("while");
-
-  // Builtin types we highlight
-  Keywords.insert("void");
-  Keywords.insert("char");
-  Keywords.insert("short");
-  Keywords.insert("int");
-  Keywords.insert("unsigned");
-  Keywords.insert("long");
-  Keywords.insert("float");
-  Keywords.insert("double");
-
-  Literals.insert("true");
-  Literals.insert("false");
-  Literals.insert("nullptr");
-
-  Initialized = true;
-}
-
 static SourceManager createTempSourceManager() {
   FileSystemOptions FileOpts;
   FileManager FileMgr(FileOpts);
@@ -70,49 +23,51 @@ static Lexer createTempLexer(llvm::MemoryBufferRef B, SourceManager &FakeSM,
   return Lexer(FakeSM.createFileID(B), B, FakeSM, LangOpts);
 }
 
-std::vector<StyleRange>
-CodeSnippetHighlighter::highlightLine(StringRef SourceLine,
-                                      const LangOptions &LangOpts) {
-  ensureTokenData();
-
+std::vector<StyleRange> CodeSnippetHighlighter::highlightLine(
+    StringRef SourceLine, const Preprocessor *PP, const LangOptions &LangOpts) {
   constexpr raw_ostream::Colors CommentColor = raw_ostream::BLACK;
   constexpr raw_ostream::Colors LiteralColor = raw_ostream::GREEN;
   constexpr raw_ostream::Colors KeywordColor = raw_ostream::YELLOW;
 
-  const auto MemBuf = llvm::MemoryBuffer::getMemBuffer(SourceLine);
   SourceManager FakeSM = createTempSourceManager();
+  const auto MemBuf = llvm::MemoryBuffer::getMemBuffer(SourceLine);
   Lexer L = createTempLexer(MemBuf->getMemBufferRef(), FakeSM, LangOpts);
   L.SetKeepWhitespaceMode(true);
 
   std::vector<StyleRange> Styles;
   bool Stop = false;
   while (!Stop) {
-    Token tok;
-    Stop = L.LexFromRawLexer(tok);
-    if (tok.is(tok::unknown))
+    Token T;
+    Stop = L.LexFromRawLexer(T);
+    if (T.is(tok::unknown))
       continue;
 
     bool Invalid;
     unsigned Start =
-        FakeSM.getSpellingColumnNumber(tok.getLocation(), &Invalid) - 1;
+        FakeSM.getSpellingColumnNumber(T.getLocation(), &Invalid) - 1;
     if (Invalid)
       continue;
 
-    if (tok.is(tok::raw_identifier)) {
-      // Almost everything we lex is an identifier, since we use a raw lexer.
-      // Some should be highlightes as literals, others as keywords.
-      if (Keywords.contains(tok.getRawIdentifier()))
-        Styles.push_back(
-            StyleRange{Start, Start + tok.getLength(), KeywordColor});
-      else if (Literals.contains(tok.getRawIdentifier()))
+    if (T.is(tok::raw_identifier)) {
+      StringRef RawIdent = T.getRawIdentifier();
+      // Special case true/false/nullptr literals, since they will otherwise be
+      // treated as keywords.
+      if (RawIdent == "true" || RawIdent == "false" || RawIdent == "nullptr") {
         Styles.push_back(
-            StyleRange{Start, Start + tok.getLength(), LiteralColor});
-    } else if (tok::isLiteral(tok.getKind())) {
-      Styles.push_back(
-          StyleRange{Start, Start + tok.getLength(), LiteralColor});
-    } else if (tok.is(tok::comment)) {
-      Styles.push_back(
-          StyleRange{Start, Start + tok.getLength(), CommentColor});
+            StyleRange{Start, Start + T.getLength(), LiteralColor});
+      } else {
+        const IdentifierInfo *II = PP->getIdentifierInfo(RawIdent);
+        assert(II);
+
+        if (II->isKeyword(LangOpts)) {
+          Styles.push_back(
+              StyleRange{Start, Start + T.getLength(), KeywordColor});
+        }
+      }
+    } else if (tok::isLiteral(T.getKind())) {
+      Styles.push_back(StyleRange{Start, Start + T.getLength(), LiteralColor});
+    } else if (T.is(tok::comment)) {
+      Styles.push_back(StyleRange{Start, Start + T.getLength(), CommentColor});
     }
   }
 
diff --git a/clang/lib/Frontend/TextDiagnostic.cpp b/clang/lib/Frontend/TextDiagnostic.cpp
index 13d7d1e048cf99..e840cdd952d09f 100644
--- a/clang/lib/Frontend/TextDiagnostic.cpp
+++ b/clang/lib/Frontend/TextDiagnostic.cpp
@@ -645,10 +645,10 @@ static bool printWordWrapped(raw_ostream &OS, StringRef Str, unsigned Columns,
   return Wrapped;
 }
 
-TextDiagnostic::TextDiagnostic(raw_ostream &OS,
-                               const LangOptions &LangOpts,
+TextDiagnostic::TextDiagnostic(raw_ostream &OS, const LangOptions &LangOpts,
+                               const Preprocessor *PP,
                                DiagnosticOptions *DiagOpts)
-  : DiagnosticRenderer(LangOpts, DiagOpts), OS(OS) {}
+    : DiagnosticRenderer(LangOpts, DiagOpts), OS(OS), PP(PP) {}
 
 TextDiagnostic::~TextDiagnostic() {}
 
@@ -1280,7 +1280,7 @@ void TextDiagnostic::emitSnippet(StringRef SourceLine,
                                  unsigned MaxLineNoDisplayWidth,
                                  unsigned LineNo) {
   std::vector<StyleRange> Styles =
-      SnippetHighlighter.highlightLine(SourceLine, LangOpts);
+      SnippetHighlighter.highlightLine(SourceLine, PP, LangOpts);
 
   // Emit line number.
   if (MaxLineNoDisplayWidth > 0) {
diff --git a/clang/lib/Frontend/TextDiagnosticPrinter.cpp b/clang/lib/Frontend/TextDiagnosticPrinter.cpp
index 0ff5376098ffe8..3bc3935078baad 100644
--- a/clang/lib/Frontend/TextDiagnosticPrinter.cpp
+++ b/clang/lib/Frontend/TextDiagnosticPrinter.cpp
@@ -36,7 +36,7 @@ TextDiagnosticPrinter::~TextDiagnosticPrinter() {
 void TextDiagnosticPrinter::BeginSourceFile(const LangOptions &LO,
                                             const Preprocessor *PP) {
   // Build the TextDiagnostic utility.
-  TextDiag.reset(new TextDiagnostic(OS, LO, &*DiagOpts));
+  TextDiag.reset(new TextDiagnostic(OS, LO, PP, &*DiagOpts));
 }
 
 void TextDiagnosticPrinter::EndSourceFile() {

>From 302d603219b421cd5b30d04af2d13d52b1905440 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder at redhat.com>
Date: Wed, 20 Sep 2023 17:24:42 +0200
Subject: [PATCH 03/35] Move the PP parameter to the end of the TextDiagnostic
 ctor

---
 clang/include/clang/Frontend/TextDiagnostic.h | 2 +-
 clang/lib/Frontend/CodeSnippetHighlighter.cpp | 2 ++
 clang/lib/Frontend/TextDiagnostic.cpp         | 4 ++--
 clang/lib/Frontend/TextDiagnosticPrinter.cpp  | 2 +-
 4 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/clang/include/clang/Frontend/TextDiagnostic.h b/clang/include/clang/Frontend/TextDiagnostic.h
index 8cdb9b141a8a4a..43c39ff96a2d1c 100644
--- a/clang/include/clang/Frontend/TextDiagnostic.h
+++ b/clang/include/clang/Frontend/TextDiagnostic.h
@@ -38,7 +38,7 @@ class TextDiagnostic : public DiagnosticRenderer {
 
 public:
   TextDiagnostic(raw_ostream &OS, const LangOptions &LangOpts,
-                 const Preprocessor *PP, DiagnosticOptions *DiagOpts);
+                 DiagnosticOptions *DiagOpts, const Preprocessor *PP = nullptr);
 
   ~TextDiagnostic() override;
 
diff --git a/clang/lib/Frontend/CodeSnippetHighlighter.cpp b/clang/lib/Frontend/CodeSnippetHighlighter.cpp
index 63b3707fbb7ef8..32bd61f3746023 100644
--- a/clang/lib/Frontend/CodeSnippetHighlighter.cpp
+++ b/clang/lib/Frontend/CodeSnippetHighlighter.cpp
@@ -25,6 +25,8 @@ static Lexer createTempLexer(llvm::MemoryBufferRef B, SourceManager &FakeSM,
 
 std::vector<StyleRange> CodeSnippetHighlighter::highlightLine(
     StringRef SourceLine, const Preprocessor *PP, const LangOptions &LangOpts) {
+  if (!PP)
+    return {};
   constexpr raw_ostream::Colors CommentColor = raw_ostream::BLACK;
   constexpr raw_ostream::Colors LiteralColor = raw_ostream::GREEN;
   constexpr raw_ostream::Colors KeywordColor = raw_ostream::YELLOW;
diff --git a/clang/lib/Frontend/TextDiagnostic.cpp b/clang/lib/Frontend/TextDiagnostic.cpp
index e840cdd952d09f..c9207e9dfbf921 100644
--- a/clang/lib/Frontend/TextDiagnostic.cpp
+++ b/clang/lib/Frontend/TextDiagnostic.cpp
@@ -646,8 +646,8 @@ static bool printWordWrapped(raw_ostream &OS, StringRef Str, unsigned Columns,
 }
 
 TextDiagnostic::TextDiagnostic(raw_ostream &OS, const LangOptions &LangOpts,
-                               const Preprocessor *PP,
-                               DiagnosticOptions *DiagOpts)
+                               DiagnosticOptions *DiagOpts,
+                               const Preprocessor *PP)
     : DiagnosticRenderer(LangOpts, DiagOpts), OS(OS), PP(PP) {}
 
 TextDiagnostic::~TextDiagnostic() {}
diff --git a/clang/lib/Frontend/TextDiagnosticPrinter.cpp b/clang/lib/Frontend/TextDiagnosticPrinter.cpp
index 3bc3935078baad..b2fb762537573e 100644
--- a/clang/lib/Frontend/TextDiagnosticPrinter.cpp
+++ b/clang/lib/Frontend/TextDiagnosticPrinter.cpp
@@ -36,7 +36,7 @@ TextDiagnosticPrinter::~TextDiagnosticPrinter() {
 void TextDiagnosticPrinter::BeginSourceFile(const LangOptions &LO,
                                             const Preprocessor *PP) {
   // Build the TextDiagnostic utility.
-  TextDiag.reset(new TextDiagnostic(OS, LO, PP, &*DiagOpts));
+  TextDiag.reset(new TextDiagnostic(OS, LO, &*DiagOpts, PP));
 }
 
 void TextDiagnosticPrinter::EndSourceFile() {

>From 0ff47e91abe07d03e70623c2f1d322db817f9393 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder at redhat.com>
Date: Thu, 21 Sep 2023 06:38:24 +0200
Subject: [PATCH 04/35] Tune colors

---
 clang/include/clang/Frontend/CodeSnippetHighlighter.h | 2 +-
 clang/lib/Frontend/CodeSnippetHighlighter.cpp         | 6 +++---
 clang/lib/Frontend/TextDiagnostic.cpp                 | 7 ++++---
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/clang/include/clang/Frontend/CodeSnippetHighlighter.h b/clang/include/clang/Frontend/CodeSnippetHighlighter.h
index ec03375221f9ff..c2a0184085d5da 100644
--- a/clang/include/clang/Frontend/CodeSnippetHighlighter.h
+++ b/clang/include/clang/Frontend/CodeSnippetHighlighter.h
@@ -19,7 +19,7 @@ namespace clang {
 struct StyleRange {
   unsigned Start;
   unsigned End;
-  const enum llvm::raw_ostream::Colors c;
+  const enum llvm::raw_ostream::Colors color;
 };
 
 class Preprocessor;
diff --git a/clang/lib/Frontend/CodeSnippetHighlighter.cpp b/clang/lib/Frontend/CodeSnippetHighlighter.cpp
index 32bd61f3746023..dba7f5d2848505 100644
--- a/clang/lib/Frontend/CodeSnippetHighlighter.cpp
+++ b/clang/lib/Frontend/CodeSnippetHighlighter.cpp
@@ -27,9 +27,9 @@ std::vector<StyleRange> CodeSnippetHighlighter::highlightLine(
     StringRef SourceLine, const Preprocessor *PP, const LangOptions &LangOpts) {
   if (!PP)
     return {};
-  constexpr raw_ostream::Colors CommentColor = raw_ostream::BLACK;
-  constexpr raw_ostream::Colors LiteralColor = raw_ostream::GREEN;
-  constexpr raw_ostream::Colors KeywordColor = raw_ostream::YELLOW;
+  constexpr raw_ostream::Colors CommentColor = raw_ostream::GREEN;
+  constexpr raw_ostream::Colors LiteralColor = raw_ostream::CYAN;
+  constexpr raw_ostream::Colors KeywordColor = raw_ostream::BLUE;
 
   SourceManager FakeSM = createTempSourceManager();
   const auto MemBuf = llvm::MemoryBuffer::getMemBuffer(SourceLine);
diff --git a/clang/lib/Frontend/TextDiagnostic.cpp b/clang/lib/Frontend/TextDiagnostic.cpp
index c9207e9dfbf921..35a92a8044f2e5 100644
--- a/clang/lib/Frontend/TextDiagnostic.cpp
+++ b/clang/lib/Frontend/TextDiagnostic.cpp
@@ -19,6 +19,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Locale.h"
 #include "llvm/Support/Path.h"
+#include "llvm/Support/Process.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 #include <optional>
@@ -1307,14 +1308,14 @@ void TextDiagnostic::emitSnippet(StringRef SourceLine,
       std::optional<enum raw_ostream::Colors> H;
       for (auto &P : Styles) {
         if (P.Start < I && P.End >= I) {
-          H = P.c;
+          H = P.color;
           break;
         }
       }
 
-      if (H) {
+      if (H)
         OS.changeColor(*H, false);
-      } else
+      else
         OS.resetColor();
     }
 

>From c25d12d894c75553d5b9ae5a00aae1282a151314 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder at redhat.com>
Date: Thu, 21 Sep 2023 11:01:43 +0200
Subject: [PATCH 05/35] Lex the entire file

---
 .../clang/Frontend/CodeSnippetHighlighter.h   |   7 +-
 clang/include/clang/Frontend/TextDiagnostic.h |   3 +-
 clang/lib/Frontend/CodeSnippetHighlighter.cpp | 138 ++++++++++++------
 clang/lib/Frontend/TextDiagnostic.cpp         |  14 +-
 4 files changed, 111 insertions(+), 51 deletions(-)

diff --git a/clang/include/clang/Frontend/CodeSnippetHighlighter.h b/clang/include/clang/Frontend/CodeSnippetHighlighter.h
index c2a0184085d5da..51c14880fb9548 100644
--- a/clang/include/clang/Frontend/CodeSnippetHighlighter.h
+++ b/clang/include/clang/Frontend/CodeSnippetHighlighter.h
@@ -23,6 +23,8 @@ struct StyleRange {
 };
 
 class Preprocessor;
+class FileID;
+class SourceManager;
 
 class CodeSnippetHighlighter final {
 public:
@@ -31,9 +33,10 @@ class CodeSnippetHighlighter final {
   /// Produce StyleRanges for the given line.
   /// The returned vector contains non-overlapping style ranges. They are sorted
   /// from beginning of the line to the end.
-  std::vector<StyleRange> highlightLine(llvm::StringRef SourceLine,
+  std::vector<StyleRange> highlightLine(unsigned LineNumber,
                                         const Preprocessor *PP,
-                                        const LangOptions &LangOpts);
+                                        const LangOptions &LangOpts, FileID FID,
+                                        const SourceManager &SM);
 };
 
 } // namespace clang
diff --git a/clang/include/clang/Frontend/TextDiagnostic.h b/clang/include/clang/Frontend/TextDiagnostic.h
index 43c39ff96a2d1c..102b33aedd5ef9 100644
--- a/clang/include/clang/Frontend/TextDiagnostic.h
+++ b/clang/include/clang/Frontend/TextDiagnostic.h
@@ -105,7 +105,8 @@ class TextDiagnostic : public DiagnosticRenderer {
                            ArrayRef<FixItHint> Hints);
 
   void emitSnippet(StringRef SourceLine, unsigned MaxLineNoDisplayWidth,
-                   unsigned LineNo);
+                   FileID FID, const SourceManager &SM, unsigned LineNo,
+                   unsigned DisplayLineNo);
 
   void emitParseableFixits(ArrayRef<FixItHint> Hints, const SourceManager &SM);
 };
diff --git a/clang/lib/Frontend/CodeSnippetHighlighter.cpp b/clang/lib/Frontend/CodeSnippetHighlighter.cpp
index dba7f5d2848505..d319e690a355b2 100644
--- a/clang/lib/Frontend/CodeSnippetHighlighter.cpp
+++ b/clang/lib/Frontend/CodeSnippetHighlighter.cpp
@@ -9,34 +9,47 @@
 
 using namespace clang;
 
-static SourceManager createTempSourceManager() {
-  FileSystemOptions FileOpts;
-  FileManager FileMgr(FileOpts);
-  llvm::IntrusiveRefCntPtr<DiagnosticIDs> DiagIDs(new DiagnosticIDs());
-  llvm::IntrusiveRefCntPtr<DiagnosticOptions> DiagOpts(new DiagnosticOptions());
-  DiagnosticsEngine diags(DiagIDs, DiagOpts);
-  return SourceManager(diags, FileMgr);
-}
-
-static Lexer createTempLexer(llvm::MemoryBufferRef B, SourceManager &FakeSM,
-                             const LangOptions &LangOpts) {
-  return Lexer(FakeSM.createFileID(B), B, FakeSM, LangOpts);
-}
+static constexpr raw_ostream::Colors CommentColor = raw_ostream::GREEN;
+static constexpr raw_ostream::Colors LiteralColor = raw_ostream::CYAN;
+static constexpr raw_ostream::Colors KeywordColor = raw_ostream::BLUE;
 
 std::vector<StyleRange> CodeSnippetHighlighter::highlightLine(
-    StringRef SourceLine, const Preprocessor *PP, const LangOptions &LangOpts) {
+    unsigned LineNumber, const Preprocessor *PP, const LangOptions &LangOpts,
+    FileID FID, const SourceManager &SM) {
   if (!PP)
     return {};
-  constexpr raw_ostream::Colors CommentColor = raw_ostream::GREEN;
-  constexpr raw_ostream::Colors LiteralColor = raw_ostream::CYAN;
-  constexpr raw_ostream::Colors KeywordColor = raw_ostream::BLUE;
 
-  SourceManager FakeSM = createTempSourceManager();
-  const auto MemBuf = llvm::MemoryBuffer::getMemBuffer(SourceLine);
-  Lexer L = createTempLexer(MemBuf->getMemBufferRef(), FakeSM, LangOpts);
+  // Classify the given token and append it to the given vector.
+  auto appendStyle = [PP, &LangOpts](std::vector<StyleRange> &Vec,
+                                     const Token &T, unsigned Start,
+                                     unsigned Length) -> void {
+    if (T.is(tok::raw_identifier)) {
+      StringRef RawIdent = T.getRawIdentifier();
+      // Special case true/false/nullptr literals, since they will otherwise be
+      // treated as keywords.
+      if (RawIdent == "true" || RawIdent == "false" || RawIdent == "nullptr") {
+        Vec.push_back(StyleRange{Start, Start + Length, LiteralColor});
+      } else {
+        const IdentifierInfo *II = PP->getIdentifierInfo(RawIdent);
+        assert(II);
+
+        if (II->isKeyword(LangOpts)) {
+          Vec.push_back(StyleRange{Start, Start + Length, KeywordColor});
+        }
+      }
+    } else if (tok::isLiteral(T.getKind())) {
+      Vec.push_back(StyleRange{Start, Start + Length, LiteralColor});
+    } else if (T.is(tok::comment)) {
+      Vec.push_back(StyleRange{Start, Start + Length, CommentColor});
+    }
+  };
+
+  auto Buff = SM.getBufferOrNone(FID);
+  assert(Buff);
+  Lexer L = Lexer(FID, *Buff, SM, LangOpts);
   L.SetKeepWhitespaceMode(true);
+  std::vector<std::vector<StyleRange>> Lines;
 
-  std::vector<StyleRange> Styles;
   bool Stop = false;
   while (!Stop) {
     Token T;
@@ -45,33 +58,74 @@ std::vector<StyleRange> CodeSnippetHighlighter::highlightLine(
       continue;
 
     bool Invalid;
-    unsigned Start =
-        FakeSM.getSpellingColumnNumber(T.getLocation(), &Invalid) - 1;
+    unsigned StartCol =
+        SM.getSpellingColumnNumber(T.getLocation(), &Invalid) - 1;
+    if (Invalid)
+      continue;
+    unsigned StartLine =
+        SM.getSpellingLineNumber(T.getLocation(), &Invalid) - 1;
     if (Invalid)
       continue;
 
-    if (T.is(tok::raw_identifier)) {
-      StringRef RawIdent = T.getRawIdentifier();
-      // Special case true/false/nullptr literals, since they will otherwise be
-      // treated as keywords.
-      if (RawIdent == "true" || RawIdent == "false" || RawIdent == "nullptr") {
-        Styles.push_back(
-            StyleRange{Start, Start + T.getLength(), LiteralColor});
-      } else {
-        const IdentifierInfo *II = PP->getIdentifierInfo(RawIdent);
-        assert(II);
+    while (Lines.size() <= StartLine)
+      Lines.push_back({});
 
-        if (II->isKeyword(LangOpts)) {
-          Styles.push_back(
-              StyleRange{Start, Start + T.getLength(), KeywordColor});
-        }
+    unsigned EndLine = SM.getSpellingLineNumber(T.getEndLoc(), &Invalid) - 1;
+    if (Invalid)
+      continue;
+
+    // Simple tokens.
+    if (StartLine == EndLine) {
+      appendStyle(Lines[StartLine], T, StartCol, T.getLength());
+      continue;
+    }
+    unsigned NumLines = EndLine - StartLine;
+
+    // For tokens that span multiple lines (think multiline comments), we
+    // divide them into multiple StyleRanges.
+    unsigned EndCol = SM.getSpellingColumnNumber(T.getEndLoc(), &Invalid) - 1;
+    if (Invalid)
+      continue;
+
+    std::string Spelling = Lexer::getSpelling(T, SM, LangOpts);
+
+    unsigned L = 0;
+    unsigned LineLength = 0;
+    for (unsigned I = 0; I <= Spelling.size(); ++I) {
+      // This line is done.
+      if (Spelling[I] == '\n' || Spelling[I] == '\r' || I == Spelling.size()) {
+        while (Lines.size() <= StartLine + L)
+          Lines.push_back({});
+
+        if (L == 0) // First line
+          appendStyle(Lines[StartLine + L], T, StartCol, LineLength);
+        else if (L == NumLines) // Last line
+          appendStyle(Lines[StartLine + L], T, 0, EndCol);
+        else
+          appendStyle(Lines[StartLine + L], T, 0, LineLength);
+        ++L;
+        LineLength = 0;
+        continue;
       }
-    } else if (tok::isLiteral(T.getKind())) {
-      Styles.push_back(StyleRange{Start, Start + T.getLength(), LiteralColor});
-    } else if (T.is(tok::comment)) {
-      Styles.push_back(StyleRange{Start, Start + T.getLength(), CommentColor});
+      ++LineLength;
+    }
+  }
+
+#if 0
+  llvm::errs() << "--\nLine Style info: \n";
+  int I = 0;
+  for (std::vector<StyleRange> &Line : Lines) {
+    llvm::errs() << I << ": ";
+    for (const auto &R : Line) {
+      llvm::errs() << "{" << R.Start << ", " << R.End << "}, ";
     }
+    llvm::errs() << "\n";
+
+    ++I;
   }
+#endif
 
-  return Styles;
+  while (Lines.size() <= LineNumber)
+    Lines.push_back({});
+  return Lines[LineNumber];
 }
diff --git a/clang/lib/Frontend/TextDiagnostic.cpp b/clang/lib/Frontend/TextDiagnostic.cpp
index 35a92a8044f2e5..5aea7b8f421074 100644
--- a/clang/lib/Frontend/TextDiagnostic.cpp
+++ b/clang/lib/Frontend/TextDiagnostic.cpp
@@ -1249,7 +1249,8 @@ void TextDiagnostic::emitSnippetAndCaret(
     }
 
     // Emit what we have computed.
-    emitSnippet(SourceLine, MaxLineNoDisplayWidth, DisplayLineNo);
+    emitSnippet(SourceLine, MaxLineNoDisplayWidth, FID, SM, LineNo,
+                DisplayLineNo);
 
     if (!CaretLine.empty()) {
       indentForLineNumbers();
@@ -1278,16 +1279,17 @@ void TextDiagnostic::emitSnippetAndCaret(
 }
 
 void TextDiagnostic::emitSnippet(StringRef SourceLine,
-                                 unsigned MaxLineNoDisplayWidth,
-                                 unsigned LineNo) {
+                                 unsigned MaxLineNoDisplayWidth, FileID FID,
+                                 const SourceManager &SM, unsigned LineNo,
+                                 unsigned DisplayLineNo) {
   std::vector<StyleRange> Styles =
-      SnippetHighlighter.highlightLine(SourceLine, PP, LangOpts);
+      SnippetHighlighter.highlightLine(LineNo - 1, PP, LangOpts, FID, SM);
 
   // Emit line number.
   if (MaxLineNoDisplayWidth > 0) {
-    unsigned LineNoDisplayWidth = getNumDisplayWidth(LineNo);
+    unsigned LineNoDisplayWidth = getNumDisplayWidth(DisplayLineNo);
     OS.indent(MaxLineNoDisplayWidth - LineNoDisplayWidth + 1)
-        << LineNo << " | ";
+        << DisplayLineNo << " | ";
   }
 
   // Print the source line one character at a time.

>From 51af576ba14cfbcff472d3d3632e562c1a9df136 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder at redhat.com>
Date: Fri, 22 Sep 2023 06:48:55 +0200
Subject: [PATCH 06/35] Try to fix PCH test

---
 clang/lib/Frontend/CodeSnippetHighlighter.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/clang/lib/Frontend/CodeSnippetHighlighter.cpp b/clang/lib/Frontend/CodeSnippetHighlighter.cpp
index d319e690a355b2..8905fbfb29b892 100644
--- a/clang/lib/Frontend/CodeSnippetHighlighter.cpp
+++ b/clang/lib/Frontend/CodeSnippetHighlighter.cpp
@@ -19,6 +19,10 @@ std::vector<StyleRange> CodeSnippetHighlighter::highlightLine(
   if (!PP)
     return {};
 
+  // Might cause emission of another diagnostic.
+  if (PP->getIdentifierTable().getExternalIdentifierLookup())
+    return {};
+
   // Classify the given token and append it to the given vector.
   auto appendStyle = [PP, &LangOpts](std::vector<StyleRange> &Vec,
                                      const Token &T, unsigned Start,

>From 11e024cc2b1f087e3bcd7e70bd675936c58b34ae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder at redhat.com>
Date: Tue, 26 Sep 2023 08:11:58 +0200
Subject: [PATCH 07/35] Measurements

---
 clang/lib/Frontend/CodeSnippetHighlighter.cpp | 24 +++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/clang/lib/Frontend/CodeSnippetHighlighter.cpp b/clang/lib/Frontend/CodeSnippetHighlighter.cpp
index 8905fbfb29b892..17614a962ee6a7 100644
--- a/clang/lib/Frontend/CodeSnippetHighlighter.cpp
+++ b/clang/lib/Frontend/CodeSnippetHighlighter.cpp
@@ -6,6 +6,7 @@
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Lex/PreprocessorOptions.h"
 #include "llvm/Support/raw_ostream.h"
+#include <chrono>
 
 using namespace clang;
 
@@ -16,6 +17,9 @@ static constexpr raw_ostream::Colors KeywordColor = raw_ostream::BLUE;
 std::vector<StyleRange> CodeSnippetHighlighter::highlightLine(
     unsigned LineNumber, const Preprocessor *PP, const LangOptions &LangOpts,
     FileID FID, const SourceManager &SM) {
+  std::chrono::steady_clock::time_point begin =
+      std::chrono::steady_clock::now();
+
   if (!PP)
     return {};
 
@@ -23,6 +27,7 @@ std::vector<StyleRange> CodeSnippetHighlighter::highlightLine(
   if (PP->getIdentifierTable().getExternalIdentifierLookup())
     return {};
 
+  size_t NTokens = 0;
   // Classify the given token and append it to the given vector.
   auto appendStyle = [PP, &LangOpts](std::vector<StyleRange> &Vec,
                                      const Token &T, unsigned Start,
@@ -56,6 +61,7 @@ std::vector<StyleRange> CodeSnippetHighlighter::highlightLine(
 
   bool Stop = false;
   while (!Stop) {
+    ++NTokens;
     Token T;
     Stop = L.LexFromRawLexer(T);
     if (T.is(tok::unknown))
@@ -131,5 +137,23 @@ std::vector<StyleRange> CodeSnippetHighlighter::highlightLine(
 
   while (Lines.size() <= LineNumber)
     Lines.push_back({});
+
+  std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
+  llvm::errs() << "Lexed " << Lines.size() << " lines and " << NTokens
+               << " Tokens\n";
+  llvm::errs() << "That took "
+               << std::chrono::duration_cast<std::chrono::microseconds>(end -
+                                                                        begin)
+                      .count()
+               << " microseconds\n";
+  llvm::errs() << "That took "
+               << std::chrono::duration_cast<std::chrono::milliseconds>(end -
+                                                                        begin)
+                      .count()
+               << " milliseconds\n";
+  llvm::errs()
+      << "That took "
+      << std::chrono::duration_cast<std::chrono::seconds>(end - begin).count()
+      << " seconds\n";
   return Lines[LineNumber];
 }

>From 77267b8f15b2c66fd774596b04db86163db85d9e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder at redhat.com>
Date: Wed, 27 Sep 2023 08:05:09 +0200
Subject: [PATCH 08/35] Slightly improve performance by bailing out earlier

---
 clang/lib/Frontend/CodeSnippetHighlighter.cpp | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/clang/lib/Frontend/CodeSnippetHighlighter.cpp b/clang/lib/Frontend/CodeSnippetHighlighter.cpp
index 17614a962ee6a7..71e5c30e56d146 100644
--- a/clang/lib/Frontend/CodeSnippetHighlighter.cpp
+++ b/clang/lib/Frontend/CodeSnippetHighlighter.cpp
@@ -41,14 +41,13 @@ std::vector<StyleRange> CodeSnippetHighlighter::highlightLine(
       } else {
         const IdentifierInfo *II = PP->getIdentifierInfo(RawIdent);
         assert(II);
-
-        if (II->isKeyword(LangOpts)) {
+        if (II->isKeyword(LangOpts))
           Vec.push_back(StyleRange{Start, Start + Length, KeywordColor});
-        }
       }
     } else if (tok::isLiteral(T.getKind())) {
       Vec.push_back(StyleRange{Start, Start + Length, LiteralColor});
-    } else if (T.is(tok::comment)) {
+    } else {
+      assert(T.is(tok::comment));
       Vec.push_back(StyleRange{Start, Start + Length, CommentColor});
     }
   };
@@ -67,6 +66,11 @@ std::vector<StyleRange> CodeSnippetHighlighter::highlightLine(
     if (T.is(tok::unknown))
       continue;
 
+    // We are only interested in identifiers, literals and comments.
+    if (!T.is(tok::raw_identifier) && !T.is(tok::comment) &&
+        !tok::isLiteral(T.getKind()))
+      continue;
+
     bool Invalid;
     unsigned StartCol =
         SM.getSpellingColumnNumber(T.getLocation(), &Invalid) - 1;
@@ -138,6 +142,7 @@ std::vector<StyleRange> CodeSnippetHighlighter::highlightLine(
   while (Lines.size() <= LineNumber)
     Lines.push_back({});
 
+#if 0
   std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
   llvm::errs() << "Lexed " << Lines.size() << " lines and " << NTokens
                << " Tokens\n";
@@ -155,5 +160,6 @@ std::vector<StyleRange> CodeSnippetHighlighter::highlightLine(
       << "That took "
       << std::chrono::duration_cast<std::chrono::seconds>(end - begin).count()
       << " seconds\n";
+#endif
   return Lines[LineNumber];
 }

>From 81c3d1c30a7fbf53081914c4172899c566d47979 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder at redhat.com>
Date: Wed, 27 Sep 2023 10:45:36 +0200
Subject: [PATCH 09/35] Only care about tokens that touch our LineNumber.

---
 .../clang/Frontend/CodeSnippetHighlighter.h   |  2 +-
 clang/lib/Frontend/CodeSnippetHighlighter.cpp | 19 +++++++++++++------
 clang/lib/Frontend/TextDiagnostic.cpp         |  2 +-
 3 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/clang/include/clang/Frontend/CodeSnippetHighlighter.h b/clang/include/clang/Frontend/CodeSnippetHighlighter.h
index 51c14880fb9548..a65bd3991d4eff 100644
--- a/clang/include/clang/Frontend/CodeSnippetHighlighter.h
+++ b/clang/include/clang/Frontend/CodeSnippetHighlighter.h
@@ -19,7 +19,7 @@ namespace clang {
 struct StyleRange {
   unsigned Start;
   unsigned End;
-  const enum llvm::raw_ostream::Colors color;
+  const enum llvm::raw_ostream::Colors Color;
 };
 
 class Preprocessor;
diff --git a/clang/lib/Frontend/CodeSnippetHighlighter.cpp b/clang/lib/Frontend/CodeSnippetHighlighter.cpp
index 71e5c30e56d146..7663155c6c8392 100644
--- a/clang/lib/Frontend/CodeSnippetHighlighter.cpp
+++ b/clang/lib/Frontend/CodeSnippetHighlighter.cpp
@@ -71,23 +71,30 @@ std::vector<StyleRange> CodeSnippetHighlighter::highlightLine(
         !tok::isLiteral(T.getKind()))
       continue;
 
-    bool Invalid;
-    unsigned StartCol =
-        SM.getSpellingColumnNumber(T.getLocation(), &Invalid) - 1;
+    bool Invalid = false;
+    unsigned EndLine = SM.getSpellingLineNumber(T.getEndLoc(), &Invalid) - 1;
     if (Invalid)
       continue;
+
+    if (EndLine < LineNumber)
+      continue;
     unsigned StartLine =
         SM.getSpellingLineNumber(T.getLocation(), &Invalid) - 1;
     if (Invalid)
       continue;
+    if (StartLine > LineNumber)
+      break;
 
-    while (Lines.size() <= StartLine)
-      Lines.push_back({});
+    // Must have an intersection at this point
+    assert(StartLine <= LineNumber && EndLine >= LineNumber);
 
-    unsigned EndLine = SM.getSpellingLineNumber(T.getEndLoc(), &Invalid) - 1;
+    unsigned StartCol =
+        SM.getSpellingColumnNumber(T.getLocation(), &Invalid) - 1;
     if (Invalid)
       continue;
 
+    while (Lines.size() <= StartLine)
+      Lines.push_back({});
     // Simple tokens.
     if (StartLine == EndLine) {
       appendStyle(Lines[StartLine], T, StartCol, T.getLength());
diff --git a/clang/lib/Frontend/TextDiagnostic.cpp b/clang/lib/Frontend/TextDiagnostic.cpp
index 5aea7b8f421074..f2793d23522f1a 100644
--- a/clang/lib/Frontend/TextDiagnostic.cpp
+++ b/clang/lib/Frontend/TextDiagnostic.cpp
@@ -1310,7 +1310,7 @@ void TextDiagnostic::emitSnippet(StringRef SourceLine,
       std::optional<enum raw_ostream::Colors> H;
       for (auto &P : Styles) {
         if (P.Start < I && P.End >= I) {
-          H = P.color;
+          H = P.Color;
           break;
         }
       }

>From 02a80357b155fe41baea8c902c3a6d39d63191e3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder at redhat.com>
Date: Fri, 6 Oct 2023 15:28:25 +0200
Subject: [PATCH 10/35] Add checkpoints to Preprocessor

---
 .../clang/Frontend/CodeSnippetHighlighter.h   | 13 ++--
 clang/include/clang/Frontend/TextDiagnostic.h |  2 +-
 clang/include/clang/Lex/Preprocessor.h        |  5 ++
 clang/lib/Frontend/CodeSnippetHighlighter.cpp | 64 +++++++++++--------
 clang/lib/Frontend/TextDiagnostic.cpp         | 10 +--
 clang/lib/Lex/Preprocessor.cpp                | 27 ++++++++
 6 files changed, 81 insertions(+), 40 deletions(-)

diff --git a/clang/include/clang/Frontend/CodeSnippetHighlighter.h b/clang/include/clang/Frontend/CodeSnippetHighlighter.h
index a65bd3991d4eff..451a182b3e3531 100644
--- a/clang/include/clang/Frontend/CodeSnippetHighlighter.h
+++ b/clang/include/clang/Frontend/CodeSnippetHighlighter.h
@@ -10,16 +10,15 @@
 #define LLVM_CLANG_FRONTEND_CODESNIPPETHIGHLIGHTER_H
 
 #include "clang/Basic/LangOptions.h"
-#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/raw_ostream.h"
-#include <vector>
 
 namespace clang {
 
 struct StyleRange {
   unsigned Start;
   unsigned End;
-  const enum llvm::raw_ostream::Colors Color;
+  enum llvm::raw_ostream::Colors Color;
 };
 
 class Preprocessor;
@@ -33,10 +32,10 @@ class CodeSnippetHighlighter final {
   /// Produce StyleRanges for the given line.
   /// The returned vector contains non-overlapping style ranges. They are sorted
   /// from beginning of the line to the end.
-  std::vector<StyleRange> highlightLine(unsigned LineNumber,
-                                        const Preprocessor *PP,
-                                        const LangOptions &LangOpts, FileID FID,
-                                        const SourceManager &SM);
+  llvm::SmallVector<StyleRange>
+  highlightLine(unsigned LineNumber, const Preprocessor *PP,
+                const LangOptions &LangOpts, FileID FID,
+                const SourceManager &SM, const char *LineStart);
 };
 
 } // namespace clang
diff --git a/clang/include/clang/Frontend/TextDiagnostic.h b/clang/include/clang/Frontend/TextDiagnostic.h
index 102b33aedd5ef9..ecd5bb4a4f568d 100644
--- a/clang/include/clang/Frontend/TextDiagnostic.h
+++ b/clang/include/clang/Frontend/TextDiagnostic.h
@@ -106,7 +106,7 @@ class TextDiagnostic : public DiagnosticRenderer {
 
   void emitSnippet(StringRef SourceLine, unsigned MaxLineNoDisplayWidth,
                    FileID FID, const SourceManager &SM, unsigned LineNo,
-                   unsigned DisplayLineNo);
+                   unsigned DisplayLineNo, const char *LineStart);
 
   void emitParseableFixits(ArrayRef<FixItHint> Hints, const SourceManager &SM);
 };
diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h
index 4ec21a8b6be2c8..07c44794520f66 100644
--- a/clang/include/clang/Lex/Preprocessor.h
+++ b/clang/include/clang/Lex/Preprocessor.h
@@ -128,6 +128,7 @@ enum MacroUse {
 class Preprocessor {
   friend class VAOptDefinitionContext;
   friend class VariadicMacroScopeGuard;
+  friend class CodeSnippetHighlighter;
 
   llvm::unique_function<void(const clang::Token &)> OnToken;
   std::shared_ptr<PreprocessorOptions> PPOpts;
@@ -141,6 +142,10 @@ class Preprocessor {
   HeaderSearch      &HeaderInfo;
   ModuleLoader      &TheModuleLoader;
 
+  llvm::SmallVector<const char *> CheckPoints;
+  void saveCheckPoint(const char *P);
+  const char *getSaveFor(const char *S) const;
+
   /// External source of macros.
   ExternalPreprocessorSource *ExternalSource;
 
diff --git a/clang/lib/Frontend/CodeSnippetHighlighter.cpp b/clang/lib/Frontend/CodeSnippetHighlighter.cpp
index 7663155c6c8392..28b66d4c05b8c8 100644
--- a/clang/lib/Frontend/CodeSnippetHighlighter.cpp
+++ b/clang/lib/Frontend/CodeSnippetHighlighter.cpp
@@ -14,9 +14,9 @@ static constexpr raw_ostream::Colors CommentColor = raw_ostream::GREEN;
 static constexpr raw_ostream::Colors LiteralColor = raw_ostream::CYAN;
 static constexpr raw_ostream::Colors KeywordColor = raw_ostream::BLUE;
 
-std::vector<StyleRange> CodeSnippetHighlighter::highlightLine(
+llvm::SmallVector<StyleRange> CodeSnippetHighlighter::highlightLine(
     unsigned LineNumber, const Preprocessor *PP, const LangOptions &LangOpts,
-    FileID FID, const SourceManager &SM) {
+    FileID FID, const SourceManager &SM, const char *LineStart) {
   std::chrono::steady_clock::time_point begin =
       std::chrono::steady_clock::now();
 
@@ -29,7 +29,7 @@ std::vector<StyleRange> CodeSnippetHighlighter::highlightLine(
 
   size_t NTokens = 0;
   // Classify the given token and append it to the given vector.
-  auto appendStyle = [PP, &LangOpts](std::vector<StyleRange> &Vec,
+  auto appendStyle = [PP, &LangOpts](llvm::SmallVector<StyleRange> &Vec,
                                      const Token &T, unsigned Start,
                                      unsigned Length) -> void {
     if (T.is(tok::raw_identifier)) {
@@ -52,12 +52,23 @@ std::vector<StyleRange> CodeSnippetHighlighter::highlightLine(
     }
   };
 
+  // Figure out where to start lexing from.
   auto Buff = SM.getBufferOrNone(FID);
   assert(Buff);
   Lexer L = Lexer(FID, *Buff, SM, LangOpts);
   L.SetKeepWhitespaceMode(true);
-  std::vector<std::vector<StyleRange>> Lines;
 
+  // Seek to the last save point before the start of the line.
+  if (const char *Save = PP->getSaveFor(LineStart);
+      Buff->getBufferStart() <= Save && Save < Buff->getBufferEnd()) {
+    size_t Offset = Save - Buff->getBufferStart();
+    assert(Save >= Buff->getBufferStart());
+    assert(Save <= Buff->getBufferEnd());
+
+    L.seek(Offset, /*IsAtStartOfLine=*/true);
+  }
+
+  llvm::SmallVector<StyleRange> LineRanges;
   bool Stop = false;
   while (!Stop) {
     ++NTokens;
@@ -93,14 +104,13 @@ std::vector<StyleRange> CodeSnippetHighlighter::highlightLine(
     if (Invalid)
       continue;
 
-    while (Lines.size() <= StartLine)
-      Lines.push_back({});
     // Simple tokens.
     if (StartLine == EndLine) {
-      appendStyle(Lines[StartLine], T, StartCol, T.getLength());
+      appendStyle(LineRanges, T, StartCol, T.getLength());
       continue;
     }
     unsigned NumLines = EndLine - StartLine;
+    assert(NumLines >= 1);
 
     // For tokens that span multiple lines (think multiline comments), we
     // divide them into multiple StyleRanges.
@@ -115,15 +125,17 @@ std::vector<StyleRange> CodeSnippetHighlighter::highlightLine(
     for (unsigned I = 0; I <= Spelling.size(); ++I) {
       // This line is done.
       if (Spelling[I] == '\n' || Spelling[I] == '\r' || I == Spelling.size()) {
-        while (Lines.size() <= StartLine + L)
-          Lines.push_back({});
-
-        if (L == 0) // First line
-          appendStyle(Lines[StartLine + L], T, StartCol, LineLength);
-        else if (L == NumLines) // Last line
-          appendStyle(Lines[StartLine + L], T, 0, EndCol);
-        else
-          appendStyle(Lines[StartLine + L], T, 0, LineLength);
+        if (StartLine + L == LineNumber) {
+          if (L == 0) // First line
+            appendStyle(LineRanges, T, StartCol, LineLength);
+          else if (L == NumLines) // Last line
+            appendStyle(LineRanges, T, 0, EndCol);
+          else
+            appendStyle(LineRanges, T, 0, LineLength);
+
+          // We only do one line, so we're done.
+          break;
+        }
         ++L;
         LineLength = 0;
         continue;
@@ -134,25 +146,21 @@ std::vector<StyleRange> CodeSnippetHighlighter::highlightLine(
 
 #if 0
   llvm::errs() << "--\nLine Style info: \n";
-  int I = 0;
-  for (std::vector<StyleRange> &Line : Lines) {
-    llvm::errs() << I << ": ";
-    for (const auto &R : Line) {
+  //int I = 0;
+  //for (std::vector<StyleRange> &Line : Lines) {
+    //llvm::errs() << I << ": ";
+    for (const auto &R : LineRanges) {
       llvm::errs() << "{" << R.Start << ", " << R.End << "}, ";
     }
     llvm::errs() << "\n";
 
-    ++I;
-  }
+    //++I;
+  //}
 #endif
 
-  while (Lines.size() <= LineNumber)
-    Lines.push_back({});
-
 #if 0
   std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
-  llvm::errs() << "Lexed " << Lines.size() << " lines and " << NTokens
-               << " Tokens\n";
+  llvm::errs() << "Lexed " << NTokens << " Tokens\n";
   llvm::errs() << "That took "
                << std::chrono::duration_cast<std::chrono::microseconds>(end -
                                                                         begin)
@@ -168,5 +176,5 @@ std::vector<StyleRange> CodeSnippetHighlighter::highlightLine(
       << std::chrono::duration_cast<std::chrono::seconds>(end - begin).count()
       << " seconds\n";
 #endif
-  return Lines[LineNumber];
+  return LineRanges;
 }
diff --git a/clang/lib/Frontend/TextDiagnostic.cpp b/clang/lib/Frontend/TextDiagnostic.cpp
index f2793d23522f1a..cbc0cfacec20f0 100644
--- a/clang/lib/Frontend/TextDiagnostic.cpp
+++ b/clang/lib/Frontend/TextDiagnostic.cpp
@@ -13,6 +13,7 @@
 #include "clang/Basic/SourceManager.h"
 #include "clang/Frontend/CodeSnippetHighlighter.h"
 #include "clang/Lex/Lexer.h"
+#include "clang/Lex/Preprocessor.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/ConvertUTF.h"
@@ -1250,7 +1251,7 @@ void TextDiagnostic::emitSnippetAndCaret(
 
     // Emit what we have computed.
     emitSnippet(SourceLine, MaxLineNoDisplayWidth, FID, SM, LineNo,
-                DisplayLineNo);
+                DisplayLineNo, LineStart);
 
     if (!CaretLine.empty()) {
       indentForLineNumbers();
@@ -1281,9 +1282,10 @@ void TextDiagnostic::emitSnippetAndCaret(
 void TextDiagnostic::emitSnippet(StringRef SourceLine,
                                  unsigned MaxLineNoDisplayWidth, FileID FID,
                                  const SourceManager &SM, unsigned LineNo,
-                                 unsigned DisplayLineNo) {
-  std::vector<StyleRange> Styles =
-      SnippetHighlighter.highlightLine(LineNo - 1, PP, LangOpts, FID, SM);
+                                 unsigned DisplayLineNo,
+                                 const char *LineStart) {
+  llvm::SmallVector<StyleRange> Styles = SnippetHighlighter.highlightLine(
+      LineNo - 1, PP, LangOpts, FID, SM, LineStart);
 
   // Emit line number.
   if (MaxLineNoDisplayWidth > 0) {
diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp
index 64f54c6fc6382f..d865326bcfa6dd 100644
--- a/clang/lib/Lex/Preprocessor.cpp
+++ b/clang/lib/Lex/Preprocessor.cpp
@@ -546,6 +546,7 @@ void Preprocessor::EnterMainSourceFile() {
   // information) and predefined macros aren't guaranteed to be set properly.
   assert(NumEnteredSourceFiles == 0 && "Cannot reenter the main file!");
   FileID MainFileID = SourceMgr.getMainFileID();
+  // llvm::errs() << "##### Main source file: " << (int)MainFileID << "\n";
 
   // If MainFileID is loaded it means we loaded an AST file, no need to enter
   // a main file.
@@ -862,6 +863,32 @@ bool Preprocessor::HandleIdentifier(Token &Identifier) {
   return true;
 }
 
+void Preprocessor::saveCheckPoint(const char *P) {
+  static constexpr ptrdiff_t Limit = 1000;
+  if (CheckPoints.empty()) {
+    CheckPoints.push_back(P);
+    return;
+  }
+
+  const char *Cur = CheckPoints.back();
+  if (Cur == P)
+    return;
+  if ((P - Cur) > Limit)
+    CheckPoints.push_back(P);
+}
+
+const char *Preprocessor::getSaveFor(const char *S) const {
+  const char *C = S;
+  // FIXME: Use std::lower_bound or something smart. Aaron knows what I'm
+  // talking about.
+  for (ssize_t I = CheckPoints.size() - 1; I >= 0; --I) {
+    C = CheckPoints[I];
+    if (CheckPoints[I] <= S)
+      break;
+  }
+  return C;
+}
+
 void Preprocessor::Lex(Token &Result) {
   ++LexLevel;
 

>From 1e06f993902701e15dc39014f37f77b79f448e1e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder at redhat.com>
Date: Sat, 7 Oct 2023 12:35:21 +0200
Subject: [PATCH 11/35] Add missing license header

---
 clang/lib/Frontend/CodeSnippetHighlighter.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/clang/lib/Frontend/CodeSnippetHighlighter.cpp b/clang/lib/Frontend/CodeSnippetHighlighter.cpp
index 28b66d4c05b8c8..042745fc639ec3 100644
--- a/clang/lib/Frontend/CodeSnippetHighlighter.cpp
+++ b/clang/lib/Frontend/CodeSnippetHighlighter.cpp
@@ -1,3 +1,10 @@
+//===-- CodeSnippetHighlighter.cpp - Code snippet highlighting --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
 
 #include "clang/Frontend/CodeSnippetHighlighter.h"
 #include "clang/Basic/DiagnosticOptions.h"

>From 5fbb86e73348cedc91177e818445e823f065ce1a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder at redhat.com>
Date: Tue, 10 Oct 2023 14:04:43 +0200
Subject: [PATCH 12/35] Fewer checkpoints

---
 clang/lib/Lex/Preprocessor.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp
index d865326bcfa6dd..f5366d04a09e5a 100644
--- a/clang/lib/Lex/Preprocessor.cpp
+++ b/clang/lib/Lex/Preprocessor.cpp
@@ -864,7 +864,7 @@ bool Preprocessor::HandleIdentifier(Token &Identifier) {
 }
 
 void Preprocessor::saveCheckPoint(const char *P) {
-  static constexpr ptrdiff_t Limit = 1000;
+  static constexpr ptrdiff_t Limit = 1024 * 8;
   if (CheckPoints.empty()) {
     CheckPoints.push_back(P);
     return;

>From 86a4842303ee31a3ec0cfd7d6635e8cdadf110c6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder at redhat.com>
Date: Fri, 13 Oct 2023 11:11:06 +0200
Subject: [PATCH 13/35] Cleanup

---
 clang/lib/Lex/Preprocessor.cpp | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp
index f5366d04a09e5a..bc57cd663d04fd 100644
--- a/clang/lib/Lex/Preprocessor.cpp
+++ b/clang/lib/Lex/Preprocessor.cpp
@@ -546,7 +546,6 @@ void Preprocessor::EnterMainSourceFile() {
   // information) and predefined macros aren't guaranteed to be set properly.
   assert(NumEnteredSourceFiles == 0 && "Cannot reenter the main file!");
   FileID MainFileID = SourceMgr.getMainFileID();
-  // llvm::errs() << "##### Main source file: " << (int)MainFileID << "\n";
 
   // If MainFileID is loaded it means we loaded an AST file, no need to enter
   // a main file.
@@ -878,15 +877,8 @@ void Preprocessor::saveCheckPoint(const char *P) {
 }
 
 const char *Preprocessor::getSaveFor(const char *S) const {
-  const char *C = S;
-  // FIXME: Use std::lower_bound or something smart. Aaron knows what I'm
-  // talking about.
-  for (ssize_t I = CheckPoints.size() - 1; I >= 0; --I) {
-    C = CheckPoints[I];
-    if (CheckPoints[I] <= S)
-      break;
-  }
-  return C;
+  auto It = llvm::lower_bound(CheckPoints, S, std::less<const char *>());
+  return *It;
 }
 
 void Preprocessor::Lex(Token &Result) {

>From 30a137fe2cb2a74cb3ec004b5d213ce1eb146d4b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder at redhat.com>
Date: Sun, 15 Oct 2023 17:00:36 +0200
Subject: [PATCH 14/35] Address some review comments

---
 clang/include/clang/Frontend/CodeSnippetHighlighter.h | 2 ++
 clang/lib/Frontend/CodeSnippetHighlighter.cpp         | 9 +++++----
 clang/lib/Frontend/TextDiagnostic.cpp                 | 1 -
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/clang/include/clang/Frontend/CodeSnippetHighlighter.h b/clang/include/clang/Frontend/CodeSnippetHighlighter.h
index 451a182b3e3531..cb3c96f6929379 100644
--- a/clang/include/clang/Frontend/CodeSnippetHighlighter.h
+++ b/clang/include/clang/Frontend/CodeSnippetHighlighter.h
@@ -19,6 +19,8 @@ struct StyleRange {
   unsigned Start;
   unsigned End;
   enum llvm::raw_ostream::Colors Color;
+  StyleRange(unsigned S, unsigned E, enum llvm::raw_ostream::Colors C)
+      : Start(S), End(E), Color(C){};
 };
 
 class Preprocessor;
diff --git a/clang/lib/Frontend/CodeSnippetHighlighter.cpp b/clang/lib/Frontend/CodeSnippetHighlighter.cpp
index 042745fc639ec3..30c4c791cb4f82 100644
--- a/clang/lib/Frontend/CodeSnippetHighlighter.cpp
+++ b/clang/lib/Frontend/CodeSnippetHighlighter.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "clang/Frontend/CodeSnippetHighlighter.h"
+#include "clang/Basic/CharInfo.h"
 #include "clang/Basic/DiagnosticOptions.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Lex/Lexer.h"
@@ -49,13 +50,13 @@ llvm::SmallVector<StyleRange> CodeSnippetHighlighter::highlightLine(
         const IdentifierInfo *II = PP->getIdentifierInfo(RawIdent);
         assert(II);
         if (II->isKeyword(LangOpts))
-          Vec.push_back(StyleRange{Start, Start + Length, KeywordColor});
+          Vec.emplace_back(Start, Start + Length, KeywordColor);
       }
     } else if (tok::isLiteral(T.getKind())) {
-      Vec.push_back(StyleRange{Start, Start + Length, LiteralColor});
+      Vec.emplace_back(Start, Start + Length, LiteralColor);
     } else {
       assert(T.is(tok::comment));
-      Vec.push_back(StyleRange{Start, Start + Length, CommentColor});
+      Vec.emplace_back(Start, Start + Length, CommentColor);
     }
   };
 
@@ -131,7 +132,7 @@ llvm::SmallVector<StyleRange> CodeSnippetHighlighter::highlightLine(
     unsigned LineLength = 0;
     for (unsigned I = 0; I <= Spelling.size(); ++I) {
       // This line is done.
-      if (Spelling[I] == '\n' || Spelling[I] == '\r' || I == Spelling.size()) {
+      if (isVerticalWhitespace(Spelling[I]) || I == Spelling.size()) {
         if (StartLine + L == LineNumber) {
           if (L == 0) // First line
             appendStyle(LineRanges, T, StartCol, LineLength);
diff --git a/clang/lib/Frontend/TextDiagnostic.cpp b/clang/lib/Frontend/TextDiagnostic.cpp
index cbc0cfacec20f0..033d21656b1272 100644
--- a/clang/lib/Frontend/TextDiagnostic.cpp
+++ b/clang/lib/Frontend/TextDiagnostic.cpp
@@ -20,7 +20,6 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Locale.h"
 #include "llvm/Support/Path.h"
-#include "llvm/Support/Process.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 #include <optional>

>From de5dcdb105dfbea904b4992b48b8ae33f3266f3f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder at redhat.com>
Date: Mon, 16 Oct 2023 07:21:41 +0200
Subject: [PATCH 15/35] Fix highlighting and add another assertion

---
 clang/lib/Frontend/CodeSnippetHighlighter.cpp |  1 +
 clang/lib/Lex/Preprocessor.cpp                | 14 ++++++++++++--
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Frontend/CodeSnippetHighlighter.cpp b/clang/lib/Frontend/CodeSnippetHighlighter.cpp
index 30c4c791cb4f82..7a3fdc1e0d16ea 100644
--- a/clang/lib/Frontend/CodeSnippetHighlighter.cpp
+++ b/clang/lib/Frontend/CodeSnippetHighlighter.cpp
@@ -72,6 +72,7 @@ llvm::SmallVector<StyleRange> CodeSnippetHighlighter::highlightLine(
     size_t Offset = Save - Buff->getBufferStart();
     assert(Save >= Buff->getBufferStart());
     assert(Save <= Buff->getBufferEnd());
+    assert(Save <= LineStart);
 
     L.seek(Offset, /*IsAtStartOfLine=*/true);
   }
diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp
index bc57cd663d04fd..bc8302d7fb9489 100644
--- a/clang/lib/Lex/Preprocessor.cpp
+++ b/clang/lib/Lex/Preprocessor.cpp
@@ -876,9 +876,19 @@ void Preprocessor::saveCheckPoint(const char *P) {
     CheckPoints.push_back(P);
 }
 
+/// We want to always return a value lower than \p S.
+/// If there is no such checkpoint, return nullptr.
 const char *Preprocessor::getSaveFor(const char *S) const {
-  auto It = llvm::lower_bound(CheckPoints, S, std::less<const char *>());
-  return *It;
+  const char *Result = nullptr;
+  for (ssize_t I = CheckPoints.size() - 1; I >= 0; --I) {
+    const char *C = CheckPoints[I];
+    if (C <= S) {
+      Result = C;
+      break;
+    }
+  }
+
+  return Result;
 }
 
 void Preprocessor::Lex(Token &Result) {

>From 156fb9bb36e650187b40128a38455291b59e81e9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder at redhat.com>
Date: Mon, 16 Oct 2023 07:51:10 +0200
Subject: [PATCH 16/35] Change colors one last time

To match those used in LLDB
---
 clang/lib/Frontend/CodeSnippetHighlighter.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/clang/lib/Frontend/CodeSnippetHighlighter.cpp b/clang/lib/Frontend/CodeSnippetHighlighter.cpp
index 7a3fdc1e0d16ea..316a151c9943c9 100644
--- a/clang/lib/Frontend/CodeSnippetHighlighter.cpp
+++ b/clang/lib/Frontend/CodeSnippetHighlighter.cpp
@@ -18,9 +18,9 @@
 
 using namespace clang;
 
-static constexpr raw_ostream::Colors CommentColor = raw_ostream::GREEN;
-static constexpr raw_ostream::Colors LiteralColor = raw_ostream::CYAN;
-static constexpr raw_ostream::Colors KeywordColor = raw_ostream::BLUE;
+static constexpr raw_ostream::Colors CommentColor = raw_ostream::MAGENTA;
+static constexpr raw_ostream::Colors LiteralColor = raw_ostream::RED;
+static constexpr raw_ostream::Colors KeywordColor = raw_ostream::GREEN;
 
 llvm::SmallVector<StyleRange> CodeSnippetHighlighter::highlightLine(
     unsigned LineNumber, const Preprocessor *PP, const LangOptions &LangOpts,
@@ -45,7 +45,7 @@ llvm::SmallVector<StyleRange> CodeSnippetHighlighter::highlightLine(
       // Special case true/false/nullptr literals, since they will otherwise be
       // treated as keywords.
       if (RawIdent == "true" || RawIdent == "false" || RawIdent == "nullptr") {
-        Vec.push_back(StyleRange{Start, Start + Length, LiteralColor});
+        Vec.emplace_back(Start, Start + Length, LiteralColor);
       } else {
         const IdentifierInfo *II = PP->getIdentifierInfo(RawIdent);
         assert(II);

>From 1577a7648ab3ad21f0929df7be84d91d5b082281 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder at redhat.com>
Date: Tue, 17 Oct 2023 06:13:12 +0200
Subject: [PATCH 17/35] Address review comments

---
 clang/lib/Frontend/CodeSnippetHighlighter.cpp |  2 +-
 clang/lib/Frontend/TextDiagnostic.cpp         | 15 +++++----------
 clang/lib/Lex/Preprocessor.cpp                | 16 ++++++----------
 3 files changed, 12 insertions(+), 21 deletions(-)

diff --git a/clang/lib/Frontend/CodeSnippetHighlighter.cpp b/clang/lib/Frontend/CodeSnippetHighlighter.cpp
index 316a151c9943c9..715c113d519438 100644
--- a/clang/lib/Frontend/CodeSnippetHighlighter.cpp
+++ b/clang/lib/Frontend/CodeSnippetHighlighter.cpp
@@ -20,7 +20,7 @@ using namespace clang;
 
 static constexpr raw_ostream::Colors CommentColor = raw_ostream::MAGENTA;
 static constexpr raw_ostream::Colors LiteralColor = raw_ostream::RED;
-static constexpr raw_ostream::Colors KeywordColor = raw_ostream::GREEN;
+static constexpr raw_ostream::Colors KeywordColor = raw_ostream::BLUE;
 
 llvm::SmallVector<StyleRange> CodeSnippetHighlighter::highlightLine(
     unsigned LineNumber, const Preprocessor *PP, const LangOptions &LangOpts,
diff --git a/clang/lib/Frontend/TextDiagnostic.cpp b/clang/lib/Frontend/TextDiagnostic.cpp
index 033d21656b1272..1378b3a0812e59 100644
--- a/clang/lib/Frontend/TextDiagnostic.cpp
+++ b/clang/lib/Frontend/TextDiagnostic.cpp
@@ -1306,18 +1306,13 @@ void TextDiagnostic::emitSnippet(StringRef SourceLine,
     if (!WasPrintable)
       HighlightingEnabled = false;
 
-    // FIXME: I hope we can do this in some nicer way.
     if (HighlightingEnabled) {
-      std::optional<enum raw_ostream::Colors> H;
-      for (auto &P : Styles) {
-        if (P.Start < I && P.End >= I) {
-          H = P.Color;
-          break;
-        }
-      }
+      const auto *CharStyle = llvm::find_if(Styles, [I](const StyleRange &R) {
+        return (R.Start < I && R.End >= I);
+      });
 
-      if (H)
-        OS.changeColor(*H, false);
+      if (CharStyle != Styles.end())
+        OS.changeColor(CharStyle->Color, false);
       else
         OS.resetColor();
     }
diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp
index bc8302d7fb9489..c7ee33e7b31bd0 100644
--- a/clang/lib/Lex/Preprocessor.cpp
+++ b/clang/lib/Lex/Preprocessor.cpp
@@ -163,6 +163,8 @@ Preprocessor::Preprocessor(std::shared_ptr<PreprocessorOptions> PPOpts,
     PreambleConditionalStack.startRecording();
 
   MaxTokens = LangOpts.MaxTokens;
+
+  CheckPoints.push_back(nullptr);
 }
 
 Preprocessor::~Preprocessor() {
@@ -862,17 +864,11 @@ bool Preprocessor::HandleIdentifier(Token &Identifier) {
   return true;
 }
 
+static constexpr ptrdiff_t CheckPointLimit = 1024 * 8;
 void Preprocessor::saveCheckPoint(const char *P) {
-  static constexpr ptrdiff_t Limit = 1024 * 8;
-  if (CheckPoints.empty()) {
-    CheckPoints.push_back(P);
-    return;
-  }
-
-  const char *Cur = CheckPoints.back();
-  if (Cur == P)
-    return;
-  if ((P - Cur) > Limit)
+  assert(!CheckPoints.empty());
+  assert(CheckPoints.back() != P);
+  if ((P - CheckPoints.back()) > CheckPointLimit)
     CheckPoints.push_back(P);
 }
 

>From ef1a8d5eac0c0f7665d3fea2eb9c885bb64ebc76 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder at redhat.com>
Date: Tue, 17 Oct 2023 07:44:11 +0200
Subject: [PATCH 18/35] Rename lexer API

---
 clang/include/clang/Lex/Preprocessor.h        | 9 ++++++---
 clang/lib/Frontend/CodeSnippetHighlighter.cpp | 2 +-
 clang/lib/Lex/Preprocessor.cpp                | 4 ++--
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h
index 07c44794520f66..05bf87d584b8db 100644
--- a/clang/include/clang/Lex/Preprocessor.h
+++ b/clang/include/clang/Lex/Preprocessor.h
@@ -141,10 +141,7 @@ class Preprocessor {
   std::unique_ptr<ScratchBuffer> ScratchBuf;
   HeaderSearch      &HeaderInfo;
   ModuleLoader      &TheModuleLoader;
-
   llvm::SmallVector<const char *> CheckPoints;
-  void saveCheckPoint(const char *P);
-  const char *getSaveFor(const char *S) const;
 
   /// External source of macros.
   ExternalPreprocessorSource *ExternalSource;
@@ -1323,6 +1320,11 @@ class Preprocessor {
     OnToken = std::move(F);
   }
 
+  /// Returns a pointer into the main file's buffer that's guaranteed to be
+  /// after a fully lexed token. This can be used to partially lex a file
+  /// without starting in the middle of a token.
+  const char *getCompleteTokenCheckpoint(const char *P) const;
+
   void setPreprocessToken(bool Preprocess) { PreprocessToken = Preprocess; }
 
   bool isMacroDefined(StringRef Id) {
@@ -2263,6 +2265,7 @@ class Preprocessor {
 
   const char *getCurLexerEndPos();
   void diagnoseMissingHeaderInUmbrellaDir(const Module &Mod);
+  void saveCheckPoint(const char *P);
 
 public:
   void PoisonSEHIdentifiers(bool Poison = true); // Borland
diff --git a/clang/lib/Frontend/CodeSnippetHighlighter.cpp b/clang/lib/Frontend/CodeSnippetHighlighter.cpp
index 715c113d519438..1292469f80a073 100644
--- a/clang/lib/Frontend/CodeSnippetHighlighter.cpp
+++ b/clang/lib/Frontend/CodeSnippetHighlighter.cpp
@@ -67,7 +67,7 @@ llvm::SmallVector<StyleRange> CodeSnippetHighlighter::highlightLine(
   L.SetKeepWhitespaceMode(true);
 
   // Seek to the last save point before the start of the line.
-  if (const char *Save = PP->getSaveFor(LineStart);
+  if (const char *Save = PP->getCompleteTokenCheckpoint(LineStart);
       Buff->getBufferStart() <= Save && Save < Buff->getBufferEnd()) {
     size_t Offset = Save - Buff->getBufferStart();
     assert(Save >= Buff->getBufferStart());
diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp
index c7ee33e7b31bd0..7dc96e686d0619 100644
--- a/clang/lib/Lex/Preprocessor.cpp
+++ b/clang/lib/Lex/Preprocessor.cpp
@@ -874,11 +874,11 @@ void Preprocessor::saveCheckPoint(const char *P) {
 
 /// We want to always return a value lower than \p S.
 /// If there is no such checkpoint, return nullptr.
-const char *Preprocessor::getSaveFor(const char *S) const {
+const char *Preprocessor::getCompleteTokenCheckpoint(const char *P) const {
   const char *Result = nullptr;
   for (ssize_t I = CheckPoints.size() - 1; I >= 0; --I) {
     const char *C = CheckPoints[I];
-    if (C <= S) {
+    if (C <= P) {
       Result = C;
       break;
     }

>From 20d849a9f200b71c2e552fdd53b568026be782a8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder at redhat.com>
Date: Fri, 20 Oct 2023 09:43:46 +0200
Subject: [PATCH 19/35] Just don't highlight in files >1MB

---
 clang/include/clang/Lex/Preprocessor.h        |  7 ----
 clang/lib/Frontend/CodeSnippetHighlighter.cpp | 34 ++++++++-----------
 clang/lib/Lex/Preprocessor.cpp                | 25 --------------
 3 files changed, 15 insertions(+), 51 deletions(-)

diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h
index 05bf87d584b8db..b1c2807e35a314 100644
--- a/clang/include/clang/Lex/Preprocessor.h
+++ b/clang/include/clang/Lex/Preprocessor.h
@@ -141,7 +141,6 @@ class Preprocessor {
   std::unique_ptr<ScratchBuffer> ScratchBuf;
   HeaderSearch      &HeaderInfo;
   ModuleLoader      &TheModuleLoader;
-  llvm::SmallVector<const char *> CheckPoints;
 
   /// External source of macros.
   ExternalPreprocessorSource *ExternalSource;
@@ -1320,11 +1319,6 @@ class Preprocessor {
     OnToken = std::move(F);
   }
 
-  /// Returns a pointer into the main file's buffer that's guaranteed to be
-  /// after a fully lexed token. This can be used to partially lex a file
-  /// without starting in the middle of a token.
-  const char *getCompleteTokenCheckpoint(const char *P) const;
-
   void setPreprocessToken(bool Preprocess) { PreprocessToken = Preprocess; }
 
   bool isMacroDefined(StringRef Id) {
@@ -2265,7 +2259,6 @@ class Preprocessor {
 
   const char *getCurLexerEndPos();
   void diagnoseMissingHeaderInUmbrellaDir(const Module &Mod);
-  void saveCheckPoint(const char *P);
 
 public:
   void PoisonSEHIdentifiers(bool Poison = true); // Borland
diff --git a/clang/lib/Frontend/CodeSnippetHighlighter.cpp b/clang/lib/Frontend/CodeSnippetHighlighter.cpp
index 1292469f80a073..a1ca6822732346 100644
--- a/clang/lib/Frontend/CodeSnippetHighlighter.cpp
+++ b/clang/lib/Frontend/CodeSnippetHighlighter.cpp
@@ -18,9 +18,15 @@
 
 using namespace clang;
 
-static constexpr raw_ostream::Colors CommentColor = raw_ostream::MAGENTA;
-static constexpr raw_ostream::Colors LiteralColor = raw_ostream::RED;
+// Magenta is taken for 'warning'. Red is already 'error' and 'cya'
+// is already taken for 'note'. Green is already used to underline
+// source ranges. White and black are bad because of the usual
+// terminal backgrounds. Which leaves us only with TWO options.
+static constexpr raw_ostream::Colors CommentColor = raw_ostream::YELLOW;
+static constexpr raw_ostream::Colors LiteralColor = raw_ostream::GREEN;
 static constexpr raw_ostream::Colors KeywordColor = raw_ostream::BLUE;
+/// Maximum size of file we still highlight.
+static constexpr size_t MaxBufferSize = 1024 * 1024; // 1MB.
 
 llvm::SmallVector<StyleRange> CodeSnippetHighlighter::highlightLine(
     unsigned LineNumber, const Preprocessor *PP, const LangOptions &LangOpts,
@@ -35,6 +41,13 @@ llvm::SmallVector<StyleRange> CodeSnippetHighlighter::highlightLine(
   if (PP->getIdentifierTable().getExternalIdentifierLookup())
     return {};
 
+  auto Buff = SM.getBufferOrNone(FID);
+  if (!Buff || Buff->getBufferSize() > MaxBufferSize)
+    return {};
+
+  Lexer L = Lexer(FID, *Buff, SM, LangOpts);
+  L.SetKeepWhitespaceMode(true);
+
   size_t NTokens = 0;
   // Classify the given token and append it to the given vector.
   auto appendStyle = [PP, &LangOpts](llvm::SmallVector<StyleRange> &Vec,
@@ -60,23 +73,6 @@ llvm::SmallVector<StyleRange> CodeSnippetHighlighter::highlightLine(
     }
   };
 
-  // Figure out where to start lexing from.
-  auto Buff = SM.getBufferOrNone(FID);
-  assert(Buff);
-  Lexer L = Lexer(FID, *Buff, SM, LangOpts);
-  L.SetKeepWhitespaceMode(true);
-
-  // Seek to the last save point before the start of the line.
-  if (const char *Save = PP->getCompleteTokenCheckpoint(LineStart);
-      Buff->getBufferStart() <= Save && Save < Buff->getBufferEnd()) {
-    size_t Offset = Save - Buff->getBufferStart();
-    assert(Save >= Buff->getBufferStart());
-    assert(Save <= Buff->getBufferEnd());
-    assert(Save <= LineStart);
-
-    L.seek(Offset, /*IsAtStartOfLine=*/true);
-  }
-
   llvm::SmallVector<StyleRange> LineRanges;
   bool Stop = false;
   while (!Stop) {
diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp
index 7dc96e686d0619..64f54c6fc6382f 100644
--- a/clang/lib/Lex/Preprocessor.cpp
+++ b/clang/lib/Lex/Preprocessor.cpp
@@ -163,8 +163,6 @@ Preprocessor::Preprocessor(std::shared_ptr<PreprocessorOptions> PPOpts,
     PreambleConditionalStack.startRecording();
 
   MaxTokens = LangOpts.MaxTokens;
-
-  CheckPoints.push_back(nullptr);
 }
 
 Preprocessor::~Preprocessor() {
@@ -864,29 +862,6 @@ bool Preprocessor::HandleIdentifier(Token &Identifier) {
   return true;
 }
 
-static constexpr ptrdiff_t CheckPointLimit = 1024 * 8;
-void Preprocessor::saveCheckPoint(const char *P) {
-  assert(!CheckPoints.empty());
-  assert(CheckPoints.back() != P);
-  if ((P - CheckPoints.back()) > CheckPointLimit)
-    CheckPoints.push_back(P);
-}
-
-/// We want to always return a value lower than \p S.
-/// If there is no such checkpoint, return nullptr.
-const char *Preprocessor::getCompleteTokenCheckpoint(const char *P) const {
-  const char *Result = nullptr;
-  for (ssize_t I = CheckPoints.size() - 1; I >= 0; --I) {
-    const char *C = CheckPoints[I];
-    if (C <= P) {
-      Result = C;
-      break;
-    }
-  }
-
-  return Result;
-}
-
 void Preprocessor::Lex(Token &Result) {
   ++LexLevel;
 

>From a7fa7a33fcd93a2c6e7986df8c78fd37f58f17b0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder at redhat.com>
Date: Sun, 29 Oct 2023 09:22:47 +0100
Subject: [PATCH 20/35] Fix a typo

---
 clang/lib/Frontend/CodeSnippetHighlighter.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/Frontend/CodeSnippetHighlighter.cpp b/clang/lib/Frontend/CodeSnippetHighlighter.cpp
index a1ca6822732346..90ab5a4927efb3 100644
--- a/clang/lib/Frontend/CodeSnippetHighlighter.cpp
+++ b/clang/lib/Frontend/CodeSnippetHighlighter.cpp
@@ -18,7 +18,7 @@
 
 using namespace clang;
 
-// Magenta is taken for 'warning'. Red is already 'error' and 'cya'
+// Magenta is taken for 'warning'. Red is already 'error' and 'cyan'
 // is already taken for 'note'. Green is already used to underline
 // source ranges. White and black are bad because of the usual
 // terminal backgrounds. Which leaves us only with TWO options.

>From 6783eff333124886a01a7d9771b0bb8c93adbc3a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder at redhat.com>
Date: Wed, 8 Nov 2023 11:36:51 +0100
Subject: [PATCH 21/35] Address review comment

---
 clang/lib/Frontend/CodeSnippetHighlighter.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/Frontend/CodeSnippetHighlighter.cpp b/clang/lib/Frontend/CodeSnippetHighlighter.cpp
index 90ab5a4927efb3..73d3b9f195bdce 100644
--- a/clang/lib/Frontend/CodeSnippetHighlighter.cpp
+++ b/clang/lib/Frontend/CodeSnippetHighlighter.cpp
@@ -45,7 +45,7 @@ llvm::SmallVector<StyleRange> CodeSnippetHighlighter::highlightLine(
   if (!Buff || Buff->getBufferSize() > MaxBufferSize)
     return {};
 
-  Lexer L = Lexer(FID, *Buff, SM, LangOpts);
+  Lexer L{FID, *Buff, SM, LangOpts};
   L.SetKeepWhitespaceMode(true);
 
   size_t NTokens = 0;

>From 4f38506c083d18c10adf1f47029fe1c30b279bbf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder at redhat.com>
Date: Sun, 12 Nov 2023 08:01:25 +0100
Subject: [PATCH 22/35] Highlight all requested lines in one go.

---
 .../clang/Frontend/CodeSnippetHighlighter.h   |   9 +-
 clang/include/clang/Frontend/TextDiagnostic.h |   2 +-
 clang/lib/Frontend/CodeSnippetHighlighter.cpp | 100 +++++++-----------
 clang/lib/Frontend/TextDiagnostic.cpp         |  12 ++-
 4 files changed, 49 insertions(+), 74 deletions(-)

diff --git a/clang/include/clang/Frontend/CodeSnippetHighlighter.h b/clang/include/clang/Frontend/CodeSnippetHighlighter.h
index cb3c96f6929379..89cdb27bccb574 100644
--- a/clang/include/clang/Frontend/CodeSnippetHighlighter.h
+++ b/clang/include/clang/Frontend/CodeSnippetHighlighter.h
@@ -34,10 +34,11 @@ class CodeSnippetHighlighter final {
   /// Produce StyleRanges for the given line.
   /// The returned vector contains non-overlapping style ranges. They are sorted
   /// from beginning of the line to the end.
-  llvm::SmallVector<StyleRange>
-  highlightLine(unsigned LineNumber, const Preprocessor *PP,
-                const LangOptions &LangOpts, FileID FID,
-                const SourceManager &SM, const char *LineStart);
+  // llvm::SmallVector<StyleRange>
+  std::unique_ptr<llvm::SmallVector<StyleRange>[]>
+  highlightLines(unsigned StartLineNumber, unsigned EndLineNumber,
+                 const Preprocessor *PP, const LangOptions &LangOpts,
+                 FileID FID, const SourceManager &SM);
 };
 
 } // namespace clang
diff --git a/clang/include/clang/Frontend/TextDiagnostic.h b/clang/include/clang/Frontend/TextDiagnostic.h
index ecd5bb4a4f568d..7d1cebabf4c15e 100644
--- a/clang/include/clang/Frontend/TextDiagnostic.h
+++ b/clang/include/clang/Frontend/TextDiagnostic.h
@@ -106,7 +106,7 @@ class TextDiagnostic : public DiagnosticRenderer {
 
   void emitSnippet(StringRef SourceLine, unsigned MaxLineNoDisplayWidth,
                    FileID FID, const SourceManager &SM, unsigned LineNo,
-                   unsigned DisplayLineNo, const char *LineStart);
+                   unsigned DisplayLineNo, ArrayRef<StyleRange> Styles);
 
   void emitParseableFixits(ArrayRef<FixItHint> Hints, const SourceManager &SM);
 };
diff --git a/clang/lib/Frontend/CodeSnippetHighlighter.cpp b/clang/lib/Frontend/CodeSnippetHighlighter.cpp
index 73d3b9f195bdce..6d6958b10c05c4 100644
--- a/clang/lib/Frontend/CodeSnippetHighlighter.cpp
+++ b/clang/lib/Frontend/CodeSnippetHighlighter.cpp
@@ -28,27 +28,30 @@ static constexpr raw_ostream::Colors KeywordColor = raw_ostream::BLUE;
 /// Maximum size of file we still highlight.
 static constexpr size_t MaxBufferSize = 1024 * 1024; // 1MB.
 
-llvm::SmallVector<StyleRange> CodeSnippetHighlighter::highlightLine(
-    unsigned LineNumber, const Preprocessor *PP, const LangOptions &LangOpts,
-    FileID FID, const SourceManager &SM, const char *LineStart) {
-  std::chrono::steady_clock::time_point begin =
-      std::chrono::steady_clock::now();
+std::unique_ptr<llvm::SmallVector<StyleRange>[]>
+CodeSnippetHighlighter::highlightLines(unsigned StartLineNumber,
+                                       unsigned EndLineNumber,
+                                       const Preprocessor *PP,
+                                       const LangOptions &LangOpts, FileID FID,
+                                       const SourceManager &SM) {
+  assert(StartLineNumber <= EndLineNumber);
+  auto SnippetRanges = std::make_unique<llvm::SmallVector<StyleRange>[]>(
+      EndLineNumber - StartLineNumber + 1);
 
   if (!PP)
-    return {};
+    return SnippetRanges;
 
   // Might cause emission of another diagnostic.
   if (PP->getIdentifierTable().getExternalIdentifierLookup())
-    return {};
+    return SnippetRanges;
 
   auto Buff = SM.getBufferOrNone(FID);
   if (!Buff || Buff->getBufferSize() > MaxBufferSize)
-    return {};
+    return SnippetRanges;
 
   Lexer L{FID, *Buff, SM, LangOpts};
   L.SetKeepWhitespaceMode(true);
 
-  size_t NTokens = 0;
   // Classify the given token and append it to the given vector.
   auto appendStyle = [PP, &LangOpts](llvm::SmallVector<StyleRange> &Vec,
                                      const Token &T, unsigned Start,
@@ -73,10 +76,9 @@ llvm::SmallVector<StyleRange> CodeSnippetHighlighter::highlightLine(
     }
   };
 
-  llvm::SmallVector<StyleRange> LineRanges;
+
   bool Stop = false;
   while (!Stop) {
-    ++NTokens;
     Token T;
     Stop = L.LexFromRawLexer(T);
     if (T.is(tok::unknown))
@@ -88,34 +90,33 @@ llvm::SmallVector<StyleRange> CodeSnippetHighlighter::highlightLine(
       continue;
 
     bool Invalid = false;
-    unsigned EndLine = SM.getSpellingLineNumber(T.getEndLoc(), &Invalid) - 1;
-    if (Invalid)
+    unsigned TokenEndLine = SM.getSpellingLineNumber(T.getEndLoc(), &Invalid);
+    if (Invalid || TokenEndLine < StartLineNumber)
       continue;
 
-    if (EndLine < LineNumber)
-      continue;
-    unsigned StartLine =
-        SM.getSpellingLineNumber(T.getLocation(), &Invalid) - 1;
+    assert(TokenEndLine >= StartLineNumber);
+
+    unsigned TokenStartLine =
+        SM.getSpellingLineNumber(T.getLocation(), &Invalid);
     if (Invalid)
       continue;
-    if (StartLine > LineNumber)
+    // If this happens, we're done.
+    if (TokenStartLine > EndLineNumber)
       break;
 
-    // Must have an intersection at this point
-    assert(StartLine <= LineNumber && EndLine >= LineNumber);
-
     unsigned StartCol =
         SM.getSpellingColumnNumber(T.getLocation(), &Invalid) - 1;
     if (Invalid)
       continue;
 
     // Simple tokens.
-    if (StartLine == EndLine) {
+    if (TokenStartLine == TokenEndLine) {
+      llvm::SmallVector<StyleRange> &LineRanges =
+          SnippetRanges[TokenStartLine - StartLineNumber];
       appendStyle(LineRanges, T, StartCol, T.getLength());
       continue;
     }
-    unsigned NumLines = EndLine - StartLine;
-    assert(NumLines >= 1);
+    assert((TokenEndLine - TokenStartLine) >= 1);
 
     // For tokens that span multiple lines (think multiline comments), we
     // divide them into multiple StyleRanges.
@@ -125,23 +126,26 @@ llvm::SmallVector<StyleRange> CodeSnippetHighlighter::highlightLine(
 
     std::string Spelling = Lexer::getSpelling(T, SM, LangOpts);
 
-    unsigned L = 0;
+    unsigned L = TokenStartLine;
     unsigned LineLength = 0;
     for (unsigned I = 0; I <= Spelling.size(); ++I) {
       // This line is done.
       if (isVerticalWhitespace(Spelling[I]) || I == Spelling.size()) {
-        if (StartLine + L == LineNumber) {
-          if (L == 0) // First line
+        llvm::SmallVector<StyleRange> &LineRanges =
+            SnippetRanges[L - StartLineNumber];
+
+        if (L == StartLineNumber) {
+          if (L == TokenStartLine) // First line
             appendStyle(LineRanges, T, StartCol, LineLength);
-          else if (L == NumLines) // Last line
+          else if (L == TokenEndLine) // Last line
             appendStyle(LineRanges, T, 0, EndCol);
           else
             appendStyle(LineRanges, T, 0, LineLength);
-
-          // We only do one line, so we're done.
-          break;
         }
+
         ++L;
+        if (L > EndLineNumber)
+          break;
         LineLength = 0;
         continue;
       }
@@ -149,37 +153,5 @@ llvm::SmallVector<StyleRange> CodeSnippetHighlighter::highlightLine(
     }
   }
 
-#if 0
-  llvm::errs() << "--\nLine Style info: \n";
-  //int I = 0;
-  //for (std::vector<StyleRange> &Line : Lines) {
-    //llvm::errs() << I << ": ";
-    for (const auto &R : LineRanges) {
-      llvm::errs() << "{" << R.Start << ", " << R.End << "}, ";
-    }
-    llvm::errs() << "\n";
-
-    //++I;
-  //}
-#endif
-
-#if 0
-  std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
-  llvm::errs() << "Lexed " << NTokens << " Tokens\n";
-  llvm::errs() << "That took "
-               << std::chrono::duration_cast<std::chrono::microseconds>(end -
-                                                                        begin)
-                      .count()
-               << " microseconds\n";
-  llvm::errs() << "That took "
-               << std::chrono::duration_cast<std::chrono::milliseconds>(end -
-                                                                        begin)
-                      .count()
-               << " milliseconds\n";
-  llvm::errs()
-      << "That took "
-      << std::chrono::duration_cast<std::chrono::seconds>(end - begin).count()
-      << " seconds\n";
-#endif
-  return LineRanges;
+  return SnippetRanges;
 }
diff --git a/clang/lib/Frontend/TextDiagnostic.cpp b/clang/lib/Frontend/TextDiagnostic.cpp
index 1378b3a0812e59..8230979c62161a 100644
--- a/clang/lib/Frontend/TextDiagnostic.cpp
+++ b/clang/lib/Frontend/TextDiagnostic.cpp
@@ -1186,6 +1186,11 @@ void TextDiagnostic::emitSnippetAndCaret(
   SmallVector<LineRange> LineRanges =
       prepareAndFilterRanges(Ranges, SM, Lines, FID, LangOpts);
 
+  // Prepare source highlighting information for the lines we're about to emit.
+  std::unique_ptr<llvm::SmallVector<StyleRange>[]> SourceStyles =
+      SnippetHighlighter.highlightLines(Lines.first, Lines.second, PP, LangOpts,
+                                        FID, SM);
+
   for (unsigned LineNo = Lines.first; LineNo != Lines.second + 1;
        ++LineNo, ++DisplayLineNo) {
     // Rewind from the current position to the start of the line.
@@ -1250,7 +1255,7 @@ void TextDiagnostic::emitSnippetAndCaret(
 
     // Emit what we have computed.
     emitSnippet(SourceLine, MaxLineNoDisplayWidth, FID, SM, LineNo,
-                DisplayLineNo, LineStart);
+                DisplayLineNo, SourceStyles[LineNo - Lines.first]);
 
     if (!CaretLine.empty()) {
       indentForLineNumbers();
@@ -1282,10 +1287,7 @@ void TextDiagnostic::emitSnippet(StringRef SourceLine,
                                  unsigned MaxLineNoDisplayWidth, FileID FID,
                                  const SourceManager &SM, unsigned LineNo,
                                  unsigned DisplayLineNo,
-                                 const char *LineStart) {
-  llvm::SmallVector<StyleRange> Styles = SnippetHighlighter.highlightLine(
-      LineNo - 1, PP, LangOpts, FID, SM, LineStart);
-
+                                 ArrayRef<StyleRange> Styles) {
   // Emit line number.
   if (MaxLineNoDisplayWidth > 0) {
     unsigned LineNoDisplayWidth = getNumDisplayWidth(DisplayLineNo);

>From 0abb842ce7ac1c64249ff64fc60ee59d526e71a9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder at redhat.com>
Date: Sun, 12 Nov 2023 08:19:30 +0100
Subject: [PATCH 23/35] Remove a leftover comment

---
 clang/include/clang/Frontend/CodeSnippetHighlighter.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/clang/include/clang/Frontend/CodeSnippetHighlighter.h b/clang/include/clang/Frontend/CodeSnippetHighlighter.h
index 89cdb27bccb574..6aa4497182c84a 100644
--- a/clang/include/clang/Frontend/CodeSnippetHighlighter.h
+++ b/clang/include/clang/Frontend/CodeSnippetHighlighter.h
@@ -34,7 +34,6 @@ class CodeSnippetHighlighter final {
   /// Produce StyleRanges for the given line.
   /// The returned vector contains non-overlapping style ranges. They are sorted
   /// from beginning of the line to the end.
-  // llvm::SmallVector<StyleRange>
   std::unique_ptr<llvm::SmallVector<StyleRange>[]>
   highlightLines(unsigned StartLineNumber, unsigned EndLineNumber,
                  const Preprocessor *PP, const LangOptions &LangOpts,

>From bba5aa938291104c8e4ac17b4931c35f79010b4d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder at redhat.com>
Date: Sun, 12 Nov 2023 09:28:23 +0100
Subject: [PATCH 24/35] Remove CodeSnippetHighlighter again

It was a class with only one function, used in only one place. Just
merge it into TextDiagnostic.
---
 .../include/clang/Basic/DiagnosticOptions.def |   3 +
 clang/include/clang/Basic/DiagnosticOptions.h |   1 +
 clang/include/clang/Driver/Options.td         |   4 +
 .../clang/Frontend/CodeSnippetHighlighter.h   |  45 -----
 clang/include/clang/Frontend/TextDiagnostic.h |  15 +-
 clang/lib/Frontend/CMakeLists.txt             |   1 -
 clang/lib/Frontend/CodeSnippetHighlighter.cpp | 157 ------------------
 clang/lib/Frontend/TextDiagnostic.cpp         | 149 ++++++++++++++++-
 8 files changed, 160 insertions(+), 215 deletions(-)
 delete mode 100644 clang/include/clang/Frontend/CodeSnippetHighlighter.h
 delete mode 100644 clang/lib/Frontend/CodeSnippetHighlighter.cpp

diff --git a/clang/include/clang/Basic/DiagnosticOptions.def b/clang/include/clang/Basic/DiagnosticOptions.def
index 6d0c1b14acc120..553f6476f07b33 100644
--- a/clang/include/clang/Basic/DiagnosticOptions.def
+++ b/clang/include/clang/Basic/DiagnosticOptions.def
@@ -96,6 +96,9 @@ VALUE_DIAGOPT(ShowLineNumbers, 1, DefaultShowLineNumbers)
 VALUE_DIAGOPT(TabStop, 32, DefaultTabStop) /// The distance between tab stops.
 /// Column limit for formatting message diagnostics, or 0 if unused.
 VALUE_DIAGOPT(MessageLength, 32, 0)
+/// Default maximum file size to highlight code snippets for, in bytes.
+VALUE_DIAGOPT(MaxHighlightFileSize, 32, DefaultMaxHighlightFileSize)
+
 
 DIAGOPT(ShowSafeBufferUsageSuggestions, 1, 0)
 
diff --git a/clang/include/clang/Basic/DiagnosticOptions.h b/clang/include/clang/Basic/DiagnosticOptions.h
index 099982c3bdd5a0..4be540b559a3ae 100644
--- a/clang/include/clang/Basic/DiagnosticOptions.h
+++ b/clang/include/clang/Basic/DiagnosticOptions.h
@@ -87,6 +87,7 @@ class DiagnosticOptions : public RefCountedBase<DiagnosticOptions>{
     DefaultSpellCheckingLimit = 50,
     DefaultSnippetLineLimit = 16,
     DefaultShowLineNumbers = 1,
+    DefaultMaxHighlightFileSize = 1024 * 1024
   };
 
   // Define simple diagnostic options (with no accessors).
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index d9d6ce81b4d84a..3ab8f733bc6229 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -7099,6 +7099,10 @@ def fno_diagnostics_use_presumed_location : Flag<["-"], "fno-diagnostics-use-pre
 def ftabstop : Separate<["-"], "ftabstop">, MetaVarName<"<N>">,
   HelpText<"Set the tab stop distance.">,
   MarshallingInfoInt<DiagnosticOpts<"TabStop">, "DiagnosticOptions::DefaultTabStop">;
+def fmax_highlight_file_size : Separate<["-"], "fmax-highlight-file-size">, MetaVarName<"<N>">,
+  HelpText<"Set the tab stop distance.">,
+  MarshallingInfoInt<DiagnosticOpts<"MaxHighlightFileSize">, "DiagnosticOptions::DefaultMaxHighlightFileSize">;
+
 def ferror_limit : Separate<["-"], "ferror-limit">, MetaVarName<"<N>">,
   HelpText<"Set the maximum number of errors to emit before stopping (0 = no limit).">,
   MarshallingInfoInt<DiagnosticOpts<"ErrorLimit">>;
diff --git a/clang/include/clang/Frontend/CodeSnippetHighlighter.h b/clang/include/clang/Frontend/CodeSnippetHighlighter.h
deleted file mode 100644
index 6aa4497182c84a..00000000000000
--- a/clang/include/clang/Frontend/CodeSnippetHighlighter.h
+++ /dev/null
@@ -1,45 +0,0 @@
-//===--- CodeSnippetHighlighter.h - Code snippet highlighting ---*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_CLANG_FRONTEND_CODESNIPPETHIGHLIGHTER_H
-#define LLVM_CLANG_FRONTEND_CODESNIPPETHIGHLIGHTER_H
-
-#include "clang/Basic/LangOptions.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/raw_ostream.h"
-
-namespace clang {
-
-struct StyleRange {
-  unsigned Start;
-  unsigned End;
-  enum llvm::raw_ostream::Colors Color;
-  StyleRange(unsigned S, unsigned E, enum llvm::raw_ostream::Colors C)
-      : Start(S), End(E), Color(C){};
-};
-
-class Preprocessor;
-class FileID;
-class SourceManager;
-
-class CodeSnippetHighlighter final {
-public:
-  CodeSnippetHighlighter() = default;
-
-  /// Produce StyleRanges for the given line.
-  /// The returned vector contains non-overlapping style ranges. They are sorted
-  /// from beginning of the line to the end.
-  std::unique_ptr<llvm::SmallVector<StyleRange>[]>
-  highlightLines(unsigned StartLineNumber, unsigned EndLineNumber,
-                 const Preprocessor *PP, const LangOptions &LangOpts,
-                 FileID FID, const SourceManager &SM);
-};
-
-} // namespace clang
-
-#endif
diff --git a/clang/include/clang/Frontend/TextDiagnostic.h b/clang/include/clang/Frontend/TextDiagnostic.h
index 7d1cebabf4c15e..05ec753289d14f 100644
--- a/clang/include/clang/Frontend/TextDiagnostic.h
+++ b/clang/include/clang/Frontend/TextDiagnostic.h
@@ -15,8 +15,8 @@
 #ifndef LLVM_CLANG_FRONTEND_TEXTDIAGNOSTIC_H
 #define LLVM_CLANG_FRONTEND_TEXTDIAGNOSTIC_H
 
-#include "clang/Frontend/CodeSnippetHighlighter.h"
 #include "clang/Frontend/DiagnosticRenderer.h"
+#include "llvm/Support/raw_ostream.h"
 
 namespace clang {
 /// Class to encapsulate the logic for formatting and printing a textual
@@ -34,7 +34,6 @@ namespace clang {
 class TextDiagnostic : public DiagnosticRenderer {
   raw_ostream &OS;
   const Preprocessor *PP;
-  CodeSnippetHighlighter SnippetHighlighter;
 
 public:
   TextDiagnostic(raw_ostream &OS, const LangOptions &LangOpts,
@@ -42,6 +41,14 @@ class TextDiagnostic : public DiagnosticRenderer {
 
   ~TextDiagnostic() override;
 
+  struct StyleRange {
+    unsigned Start;
+    unsigned End;
+    enum llvm::raw_ostream::Colors Color;
+    StyleRange(unsigned S, unsigned E, enum llvm::raw_ostream::Colors C)
+        : Start(S), End(E), Color(C){};
+  };
+
   /// Print the diagonstic level to a raw_ostream.
   ///
   /// This is a static helper that handles colorizing the level and formatting
@@ -105,8 +112,8 @@ class TextDiagnostic : public DiagnosticRenderer {
                            ArrayRef<FixItHint> Hints);
 
   void emitSnippet(StringRef SourceLine, unsigned MaxLineNoDisplayWidth,
-                   FileID FID, const SourceManager &SM, unsigned LineNo,
-                   unsigned DisplayLineNo, ArrayRef<StyleRange> Styles);
+                   unsigned LineNo, unsigned DisplayLineNo,
+                   ArrayRef<StyleRange> Styles);
 
   void emitParseableFixits(ArrayRef<FixItHint> Hints, const SourceManager &SM);
 };
diff --git a/clang/lib/Frontend/CMakeLists.txt b/clang/lib/Frontend/CMakeLists.txt
index db0dea04514b0c..a9166672088459 100644
--- a/clang/lib/Frontend/CMakeLists.txt
+++ b/clang/lib/Frontend/CMakeLists.txt
@@ -42,7 +42,6 @@ add_clang_library(clangFrontend
   TextDiagnosticPrinter.cpp
   VerifyDiagnosticConsumer.cpp
   InterfaceStubFunctionsConsumer.cpp
-  CodeSnippetHighlighter.cpp
 
   DEPENDS
   ClangDriverOptions
diff --git a/clang/lib/Frontend/CodeSnippetHighlighter.cpp b/clang/lib/Frontend/CodeSnippetHighlighter.cpp
deleted file mode 100644
index 6d6958b10c05c4..00000000000000
--- a/clang/lib/Frontend/CodeSnippetHighlighter.cpp
+++ /dev/null
@@ -1,157 +0,0 @@
-//===-- CodeSnippetHighlighter.cpp - Code snippet highlighting --*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "clang/Frontend/CodeSnippetHighlighter.h"
-#include "clang/Basic/CharInfo.h"
-#include "clang/Basic/DiagnosticOptions.h"
-#include "clang/Basic/SourceManager.h"
-#include "clang/Lex/Lexer.h"
-#include "clang/Lex/Preprocessor.h"
-#include "clang/Lex/PreprocessorOptions.h"
-#include "llvm/Support/raw_ostream.h"
-#include <chrono>
-
-using namespace clang;
-
-// Magenta is taken for 'warning'. Red is already 'error' and 'cyan'
-// is already taken for 'note'. Green is already used to underline
-// source ranges. White and black are bad because of the usual
-// terminal backgrounds. Which leaves us only with TWO options.
-static constexpr raw_ostream::Colors CommentColor = raw_ostream::YELLOW;
-static constexpr raw_ostream::Colors LiteralColor = raw_ostream::GREEN;
-static constexpr raw_ostream::Colors KeywordColor = raw_ostream::BLUE;
-/// Maximum size of file we still highlight.
-static constexpr size_t MaxBufferSize = 1024 * 1024; // 1MB.
-
-std::unique_ptr<llvm::SmallVector<StyleRange>[]>
-CodeSnippetHighlighter::highlightLines(unsigned StartLineNumber,
-                                       unsigned EndLineNumber,
-                                       const Preprocessor *PP,
-                                       const LangOptions &LangOpts, FileID FID,
-                                       const SourceManager &SM) {
-  assert(StartLineNumber <= EndLineNumber);
-  auto SnippetRanges = std::make_unique<llvm::SmallVector<StyleRange>[]>(
-      EndLineNumber - StartLineNumber + 1);
-
-  if (!PP)
-    return SnippetRanges;
-
-  // Might cause emission of another diagnostic.
-  if (PP->getIdentifierTable().getExternalIdentifierLookup())
-    return SnippetRanges;
-
-  auto Buff = SM.getBufferOrNone(FID);
-  if (!Buff || Buff->getBufferSize() > MaxBufferSize)
-    return SnippetRanges;
-
-  Lexer L{FID, *Buff, SM, LangOpts};
-  L.SetKeepWhitespaceMode(true);
-
-  // Classify the given token and append it to the given vector.
-  auto appendStyle = [PP, &LangOpts](llvm::SmallVector<StyleRange> &Vec,
-                                     const Token &T, unsigned Start,
-                                     unsigned Length) -> void {
-    if (T.is(tok::raw_identifier)) {
-      StringRef RawIdent = T.getRawIdentifier();
-      // Special case true/false/nullptr literals, since they will otherwise be
-      // treated as keywords.
-      if (RawIdent == "true" || RawIdent == "false" || RawIdent == "nullptr") {
-        Vec.emplace_back(Start, Start + Length, LiteralColor);
-      } else {
-        const IdentifierInfo *II = PP->getIdentifierInfo(RawIdent);
-        assert(II);
-        if (II->isKeyword(LangOpts))
-          Vec.emplace_back(Start, Start + Length, KeywordColor);
-      }
-    } else if (tok::isLiteral(T.getKind())) {
-      Vec.emplace_back(Start, Start + Length, LiteralColor);
-    } else {
-      assert(T.is(tok::comment));
-      Vec.emplace_back(Start, Start + Length, CommentColor);
-    }
-  };
-
-
-  bool Stop = false;
-  while (!Stop) {
-    Token T;
-    Stop = L.LexFromRawLexer(T);
-    if (T.is(tok::unknown))
-      continue;
-
-    // We are only interested in identifiers, literals and comments.
-    if (!T.is(tok::raw_identifier) && !T.is(tok::comment) &&
-        !tok::isLiteral(T.getKind()))
-      continue;
-
-    bool Invalid = false;
-    unsigned TokenEndLine = SM.getSpellingLineNumber(T.getEndLoc(), &Invalid);
-    if (Invalid || TokenEndLine < StartLineNumber)
-      continue;
-
-    assert(TokenEndLine >= StartLineNumber);
-
-    unsigned TokenStartLine =
-        SM.getSpellingLineNumber(T.getLocation(), &Invalid);
-    if (Invalid)
-      continue;
-    // If this happens, we're done.
-    if (TokenStartLine > EndLineNumber)
-      break;
-
-    unsigned StartCol =
-        SM.getSpellingColumnNumber(T.getLocation(), &Invalid) - 1;
-    if (Invalid)
-      continue;
-
-    // Simple tokens.
-    if (TokenStartLine == TokenEndLine) {
-      llvm::SmallVector<StyleRange> &LineRanges =
-          SnippetRanges[TokenStartLine - StartLineNumber];
-      appendStyle(LineRanges, T, StartCol, T.getLength());
-      continue;
-    }
-    assert((TokenEndLine - TokenStartLine) >= 1);
-
-    // For tokens that span multiple lines (think multiline comments), we
-    // divide them into multiple StyleRanges.
-    unsigned EndCol = SM.getSpellingColumnNumber(T.getEndLoc(), &Invalid) - 1;
-    if (Invalid)
-      continue;
-
-    std::string Spelling = Lexer::getSpelling(T, SM, LangOpts);
-
-    unsigned L = TokenStartLine;
-    unsigned LineLength = 0;
-    for (unsigned I = 0; I <= Spelling.size(); ++I) {
-      // This line is done.
-      if (isVerticalWhitespace(Spelling[I]) || I == Spelling.size()) {
-        llvm::SmallVector<StyleRange> &LineRanges =
-            SnippetRanges[L - StartLineNumber];
-
-        if (L == StartLineNumber) {
-          if (L == TokenStartLine) // First line
-            appendStyle(LineRanges, T, StartCol, LineLength);
-          else if (L == TokenEndLine) // Last line
-            appendStyle(LineRanges, T, 0, EndCol);
-          else
-            appendStyle(LineRanges, T, 0, LineLength);
-        }
-
-        ++L;
-        if (L > EndLineNumber)
-          break;
-        LineLength = 0;
-        continue;
-      }
-      ++LineLength;
-    }
-  }
-
-  return SnippetRanges;
-}
diff --git a/clang/lib/Frontend/TextDiagnostic.cpp b/clang/lib/Frontend/TextDiagnostic.cpp
index 8230979c62161a..3b012c90ec8966 100644
--- a/clang/lib/Frontend/TextDiagnostic.cpp
+++ b/clang/lib/Frontend/TextDiagnostic.cpp
@@ -11,7 +11,6 @@
 #include "clang/Basic/DiagnosticOptions.h"
 #include "clang/Basic/FileManager.h"
 #include "clang/Basic/SourceManager.h"
-#include "clang/Frontend/CodeSnippetHighlighter.h"
 #include "clang/Lex/Lexer.h"
 #include "clang/Lex/Preprocessor.h"
 #include "llvm/ADT/SmallString.h"
@@ -43,6 +42,16 @@ static const enum raw_ostream::Colors fatalColor = raw_ostream::RED;
 static const enum raw_ostream::Colors savedColor =
   raw_ostream::SAVEDCOLOR;
 
+// Magenta is taken for 'warning'. Red is already 'error' and 'cyan'
+// is already taken for 'note'. Green is already used to underline
+// source ranges. White and black are bad because of the usual
+// terminal backgrounds. Which leaves us only with TWO options.
+static constexpr raw_ostream::Colors CommentColor = raw_ostream::YELLOW;
+static constexpr raw_ostream::Colors LiteralColor = raw_ostream::GREEN;
+static constexpr raw_ostream::Colors KeywordColor = raw_ostream::BLUE;
+/// Maximum size of file we still highlight.
+static constexpr size_t MaxBufferSize = 1024 * 1024; // 1MB.
+
 /// Add highlights to differences in template strings.
 static void applyTemplateHighlighting(raw_ostream &OS, StringRef Str,
                                       bool &Normal, bool Bold) {
@@ -1114,6 +1123,132 @@ prepareAndFilterRanges(const SmallVectorImpl<CharSourceRange> &Ranges,
   return LineRanges;
 }
 
+std::unique_ptr<llvm::SmallVector<TextDiagnostic::StyleRange>[]>
+highlightLines(unsigned StartLineNumber, unsigned EndLineNumber,
+               const Preprocessor *PP, const LangOptions &LangOpts, FileID FID,
+               const SourceManager &SM) {
+  assert(StartLineNumber <= EndLineNumber);
+  auto SnippetRanges =
+      std::make_unique<llvm::SmallVector<TextDiagnostic::StyleRange>[]>(
+          EndLineNumber - StartLineNumber + 1);
+
+  if (!PP)
+    return SnippetRanges;
+
+  // Might cause emission of another diagnostic.
+  if (PP->getIdentifierTable().getExternalIdentifierLookup())
+    return SnippetRanges;
+
+  auto Buff = SM.getBufferOrNone(FID);
+  if (!Buff || Buff->getBufferSize() > MaxBufferSize)
+    return SnippetRanges;
+
+  Lexer L{FID, *Buff, SM, LangOpts};
+  L.SetKeepWhitespaceMode(true);
+
+  // Classify the given token and append it to the given vector.
+  auto appendStyle =
+      [PP, &LangOpts](llvm::SmallVector<TextDiagnostic::StyleRange> &Vec,
+                      const Token &T, unsigned Start, unsigned Length) -> void {
+    if (T.is(tok::raw_identifier)) {
+      StringRef RawIdent = T.getRawIdentifier();
+      // Special case true/false/nullptr literals, since they will otherwise be
+      // treated as keywords.
+      if (RawIdent == "true" || RawIdent == "false" || RawIdent == "nullptr") {
+        Vec.emplace_back(Start, Start + Length, LiteralColor);
+      } else {
+        const IdentifierInfo *II = PP->getIdentifierInfo(RawIdent);
+        assert(II);
+        if (II->isKeyword(LangOpts))
+          Vec.emplace_back(Start, Start + Length, KeywordColor);
+      }
+    } else if (tok::isLiteral(T.getKind())) {
+      Vec.emplace_back(Start, Start + Length, LiteralColor);
+    } else {
+      assert(T.is(tok::comment));
+      Vec.emplace_back(Start, Start + Length, CommentColor);
+    }
+  };
+
+  bool Stop = false;
+  while (!Stop) {
+    Token T;
+    Stop = L.LexFromRawLexer(T);
+    if (T.is(tok::unknown))
+      continue;
+
+    // We are only interested in identifiers, literals and comments.
+    if (!T.is(tok::raw_identifier) && !T.is(tok::comment) &&
+        !tok::isLiteral(T.getKind()))
+      continue;
+
+    bool Invalid = false;
+    unsigned TokenEndLine = SM.getSpellingLineNumber(T.getEndLoc(), &Invalid);
+    if (Invalid || TokenEndLine < StartLineNumber)
+      continue;
+
+    assert(TokenEndLine >= StartLineNumber);
+
+    unsigned TokenStartLine =
+        SM.getSpellingLineNumber(T.getLocation(), &Invalid);
+    if (Invalid)
+      continue;
+    // If this happens, we're done.
+    if (TokenStartLine > EndLineNumber)
+      break;
+
+    unsigned StartCol =
+        SM.getSpellingColumnNumber(T.getLocation(), &Invalid) - 1;
+    if (Invalid)
+      continue;
+
+    // Simple tokens.
+    if (TokenStartLine == TokenEndLine) {
+      llvm::SmallVector<TextDiagnostic::StyleRange> &LineRanges =
+          SnippetRanges[TokenStartLine - StartLineNumber];
+      appendStyle(LineRanges, T, StartCol, T.getLength());
+      continue;
+    }
+    assert((TokenEndLine - TokenStartLine) >= 1);
+
+    // For tokens that span multiple lines (think multiline comments), we
+    // divide them into multiple StyleRanges.
+    unsigned EndCol = SM.getSpellingColumnNumber(T.getEndLoc(), &Invalid) - 1;
+    if (Invalid)
+      continue;
+
+    std::string Spelling = Lexer::getSpelling(T, SM, LangOpts);
+
+    unsigned L = TokenStartLine;
+    unsigned LineLength = 0;
+    for (unsigned I = 0; I <= Spelling.size(); ++I) {
+      // This line is done.
+      if (isVerticalWhitespace(Spelling[I]) || I == Spelling.size()) {
+        llvm::SmallVector<TextDiagnostic::StyleRange> &LineRanges =
+            SnippetRanges[L - StartLineNumber];
+
+        if (L == StartLineNumber) {
+          if (L == TokenStartLine) // First line
+            appendStyle(LineRanges, T, StartCol, LineLength);
+          else if (L == TokenEndLine) // Last line
+            appendStyle(LineRanges, T, 0, EndCol);
+          else
+            appendStyle(LineRanges, T, 0, LineLength);
+        }
+
+        ++L;
+        if (L > EndLineNumber)
+          break;
+        LineLength = 0;
+        continue;
+      }
+      ++LineLength;
+    }
+  }
+
+  return SnippetRanges;
+}
+
 /// Emit a code snippet and caret line.
 ///
 /// This routine emits a single line's code snippet and caret line..
@@ -1188,8 +1323,7 @@ void TextDiagnostic::emitSnippetAndCaret(
 
   // Prepare source highlighting information for the lines we're about to emit.
   std::unique_ptr<llvm::SmallVector<StyleRange>[]> SourceStyles =
-      SnippetHighlighter.highlightLines(Lines.first, Lines.second, PP, LangOpts,
-                                        FID, SM);
+      highlightLines(Lines.first, Lines.second, PP, LangOpts, FID, SM);
 
   for (unsigned LineNo = Lines.first; LineNo != Lines.second + 1;
        ++LineNo, ++DisplayLineNo) {
@@ -1254,8 +1388,8 @@ void TextDiagnostic::emitSnippetAndCaret(
     }
 
     // Emit what we have computed.
-    emitSnippet(SourceLine, MaxLineNoDisplayWidth, FID, SM, LineNo,
-                DisplayLineNo, SourceStyles[LineNo - Lines.first]);
+    emitSnippet(SourceLine, MaxLineNoDisplayWidth, LineNo, DisplayLineNo,
+                SourceStyles[LineNo - Lines.first]);
 
     if (!CaretLine.empty()) {
       indentForLineNumbers();
@@ -1284,9 +1418,8 @@ void TextDiagnostic::emitSnippetAndCaret(
 }
 
 void TextDiagnostic::emitSnippet(StringRef SourceLine,
-                                 unsigned MaxLineNoDisplayWidth, FileID FID,
-                                 const SourceManager &SM, unsigned LineNo,
-                                 unsigned DisplayLineNo,
+                                 unsigned MaxLineNoDisplayWidth,
+                                 unsigned LineNo, unsigned DisplayLineNo,
                                  ArrayRef<StyleRange> Styles) {
   // Emit line number.
   if (MaxLineNoDisplayWidth > 0) {

>From 2fcf3c4c970dba3a1d34cf8790609a562e61f7d9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder at redhat.com>
Date: Sun, 12 Nov 2023 19:07:13 +0100
Subject: [PATCH 25/35] Whitespace cleanup

---
 clang/include/clang/Frontend/TextDiagnostic.h | 1 +
 clang/include/clang/Lex/Preprocessor.h        | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/include/clang/Frontend/TextDiagnostic.h b/clang/include/clang/Frontend/TextDiagnostic.h
index 05ec753289d14f..a2fe8ae995423b 100644
--- a/clang/include/clang/Frontend/TextDiagnostic.h
+++ b/clang/include/clang/Frontend/TextDiagnostic.h
@@ -19,6 +19,7 @@
 #include "llvm/Support/raw_ostream.h"
 
 namespace clang {
+
 /// Class to encapsulate the logic for formatting and printing a textual
 /// diagnostic message.
 ///
diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h
index b1c2807e35a314..4ec21a8b6be2c8 100644
--- a/clang/include/clang/Lex/Preprocessor.h
+++ b/clang/include/clang/Lex/Preprocessor.h
@@ -128,7 +128,6 @@ enum MacroUse {
 class Preprocessor {
   friend class VAOptDefinitionContext;
   friend class VariadicMacroScopeGuard;
-  friend class CodeSnippetHighlighter;
 
   llvm::unique_function<void(const clang::Token &)> OnToken;
   std::shared_ptr<PreprocessorOptions> PPOpts;

>From 35adc7982ea97b36a9425b43181e9145be53e8de Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder at redhat.com>
Date: Sun, 12 Nov 2023 19:14:09 +0100
Subject: [PATCH 26/35] Respect max-highlight-file-size option

---
 clang/include/clang/Driver/Options.td |  8 ++++----
 clang/lib/Driver/ToolChains/Clang.cpp |  1 +
 clang/lib/Frontend/TextDiagnostic.cpp | 10 +++++-----
 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 3ab8f733bc6229..f55ad484e92b0f 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1842,6 +1842,10 @@ def : Flag<["-"], "fdiagnostics-color">, Group<f_Group>,
 def : Flag<["-"], "fno-diagnostics-color">, Group<f_Group>,
   Visibility<[ClangOption, CLOption, DXCOption]>, Alias<fno_color_diagnostics>;
 def fdiagnostics_color_EQ : Joined<["-"], "fdiagnostics-color=">, Group<f_Group>;
+def fmax_highlight_file_size_EQ : Joined<["-"], "fmax-highlight-file-size=">,
+  Visibility<[ClangOption, CC1Option]>,
+  HelpText<"Maximum file size (in bytes) to still highlight code snippets from.">,
+  MarshallingInfoInt<DiagnosticOpts<"MaxHighlightFileSize">, "DiagnosticOptions::DefaultMaxHighlightFileSize">;
 def fansi_escape_codes : Flag<["-"], "fansi-escape-codes">, Group<f_Group>,
   Visibility<[ClangOption, CLOption, DXCOption, CC1Option]>,
   HelpText<"Use ANSI escape codes for diagnostics">,
@@ -7099,10 +7103,6 @@ def fno_diagnostics_use_presumed_location : Flag<["-"], "fno-diagnostics-use-pre
 def ftabstop : Separate<["-"], "ftabstop">, MetaVarName<"<N>">,
   HelpText<"Set the tab stop distance.">,
   MarshallingInfoInt<DiagnosticOpts<"TabStop">, "DiagnosticOptions::DefaultTabStop">;
-def fmax_highlight_file_size : Separate<["-"], "fmax-highlight-file-size">, MetaVarName<"<N>">,
-  HelpText<"Set the tab stop distance.">,
-  MarshallingInfoInt<DiagnosticOpts<"MaxHighlightFileSize">, "DiagnosticOptions::DefaultMaxHighlightFileSize">;
-
 def ferror_limit : Separate<["-"], "ferror-limit">, MetaVarName<"<N>">,
   HelpText<"Set the maximum number of errors to emit before stopping (0 = no limit).">,
   MarshallingInfoInt<DiagnosticOpts<"ErrorLimit">>;
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 9edae3fec91a87..04afc9534801da 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -7189,6 +7189,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   Args.AddLastArg(CmdArgs, options::OPT_dI);
 
   Args.AddLastArg(CmdArgs, options::OPT_fmax_tokens_EQ);
+  Args.AddLastArg(CmdArgs, options::OPT_fmax_highlight_file_size_EQ);
 
   // Handle serialized diagnostics.
   if (Arg *A = Args.getLastArg(options::OPT__serialize_diags)) {
diff --git a/clang/lib/Frontend/TextDiagnostic.cpp b/clang/lib/Frontend/TextDiagnostic.cpp
index 3b012c90ec8966..82a9bdf1511f64 100644
--- a/clang/lib/Frontend/TextDiagnostic.cpp
+++ b/clang/lib/Frontend/TextDiagnostic.cpp
@@ -49,8 +49,6 @@ static const enum raw_ostream::Colors savedColor =
 static constexpr raw_ostream::Colors CommentColor = raw_ostream::YELLOW;
 static constexpr raw_ostream::Colors LiteralColor = raw_ostream::GREEN;
 static constexpr raw_ostream::Colors KeywordColor = raw_ostream::BLUE;
-/// Maximum size of file we still highlight.
-static constexpr size_t MaxBufferSize = 1024 * 1024; // 1MB.
 
 /// Add highlights to differences in template strings.
 static void applyTemplateHighlighting(raw_ostream &OS, StringRef Str,
@@ -1125,7 +1123,8 @@ prepareAndFilterRanges(const SmallVectorImpl<CharSourceRange> &Ranges,
 
 std::unique_ptr<llvm::SmallVector<TextDiagnostic::StyleRange>[]>
 highlightLines(unsigned StartLineNumber, unsigned EndLineNumber,
-               const Preprocessor *PP, const LangOptions &LangOpts, FileID FID,
+               const Preprocessor *PP, const LangOptions &LangOpts,
+               uint32_t MaxHighlightFileSize, FileID FID,
                const SourceManager &SM) {
   assert(StartLineNumber <= EndLineNumber);
   auto SnippetRanges =
@@ -1140,7 +1139,7 @@ highlightLines(unsigned StartLineNumber, unsigned EndLineNumber,
     return SnippetRanges;
 
   auto Buff = SM.getBufferOrNone(FID);
-  if (!Buff || Buff->getBufferSize() > MaxBufferSize)
+  if (!Buff || Buff->getBufferSize() > MaxHighlightFileSize)
     return SnippetRanges;
 
   Lexer L{FID, *Buff, SM, LangOpts};
@@ -1323,7 +1322,8 @@ void TextDiagnostic::emitSnippetAndCaret(
 
   // Prepare source highlighting information for the lines we're about to emit.
   std::unique_ptr<llvm::SmallVector<StyleRange>[]> SourceStyles =
-      highlightLines(Lines.first, Lines.second, PP, LangOpts, FID, SM);
+      highlightLines(Lines.first, Lines.second, PP, LangOpts,
+                     DiagOpts->MaxHighlightFileSize, FID, SM);
 
   for (unsigned LineNo = Lines.first; LineNo != Lines.second + 1;
        ++LineNo, ++DisplayLineNo) {

>From 761b714462ad85e4ea5bb716777f09aface66454 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder at redhat.com>
Date: Sun, 12 Nov 2023 19:39:05 +0100
Subject: [PATCH 27/35] Fix multiline token in the middle of the line range

---
 clang/lib/Frontend/TextDiagnostic.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/Frontend/TextDiagnostic.cpp b/clang/lib/Frontend/TextDiagnostic.cpp
index 82a9bdf1511f64..8a9e860a4b1ae0 100644
--- a/clang/lib/Frontend/TextDiagnostic.cpp
+++ b/clang/lib/Frontend/TextDiagnostic.cpp
@@ -1226,7 +1226,7 @@ highlightLines(unsigned StartLineNumber, unsigned EndLineNumber,
         llvm::SmallVector<TextDiagnostic::StyleRange> &LineRanges =
             SnippetRanges[L - StartLineNumber];
 
-        if (L == StartLineNumber) {
+        if (L >= StartLineNumber) {
           if (L == TokenStartLine) // First line
             appendStyle(LineRanges, T, StartCol, LineLength);
           else if (L == TokenEndLine) // Last line

>From 15242baa14a74c891d73b4ea8aaa7010e0016d5f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder at redhat.com>
Date: Mon, 13 Nov 2023 09:26:11 +0100
Subject: [PATCH 28/35] Add a doc comment to highlightLines()

---
 clang/lib/Frontend/TextDiagnostic.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/clang/lib/Frontend/TextDiagnostic.cpp b/clang/lib/Frontend/TextDiagnostic.cpp
index 8a9e860a4b1ae0..bd515215e71761 100644
--- a/clang/lib/Frontend/TextDiagnostic.cpp
+++ b/clang/lib/Frontend/TextDiagnostic.cpp
@@ -1121,6 +1121,13 @@ prepareAndFilterRanges(const SmallVectorImpl<CharSourceRange> &Ranges,
   return LineRanges;
 }
 
+/// Creates syntax highlighting information in form of StyleRanges.
+///
+/// The returned unique ptr has always exactly size
+/// (\p EndLineNumber - \p StartLineNumber + 1). Each SmallVector in there
+/// corresponds to syntax highlighting information in one line. In each line,
+/// the StyleRanges are non-overlapping and sorted from start to end of the
+/// line.
 std::unique_ptr<llvm::SmallVector<TextDiagnostic::StyleRange>[]>
 highlightLines(unsigned StartLineNumber, unsigned EndLineNumber,
                const Preprocessor *PP, const LangOptions &LangOpts,

>From c7ed5f74bd082e0ccf56060dee78ef5ba03c604f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder at redhat.com>
Date: Mon, 13 Nov 2023 17:06:22 +0100
Subject: [PATCH 29/35] Take MemoryBuffer from existing StringRef

---
 clang/lib/Frontend/TextDiagnostic.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/clang/lib/Frontend/TextDiagnostic.cpp b/clang/lib/Frontend/TextDiagnostic.cpp
index bd515215e71761..a28ce1e942af49 100644
--- a/clang/lib/Frontend/TextDiagnostic.cpp
+++ b/clang/lib/Frontend/TextDiagnostic.cpp
@@ -1129,10 +1129,10 @@ prepareAndFilterRanges(const SmallVectorImpl<CharSourceRange> &Ranges,
 /// the StyleRanges are non-overlapping and sorted from start to end of the
 /// line.
 std::unique_ptr<llvm::SmallVector<TextDiagnostic::StyleRange>[]>
-highlightLines(unsigned StartLineNumber, unsigned EndLineNumber,
-               const Preprocessor *PP, const LangOptions &LangOpts,
-               uint32_t MaxHighlightFileSize, FileID FID,
-               const SourceManager &SM) {
+highlightLines(StringRef FileData, unsigned StartLineNumber,
+               unsigned EndLineNumber, const Preprocessor *PP,
+               const LangOptions &LangOpts, uint32_t MaxHighlightFileSize,
+               FileID FID, const SourceManager &SM) {
   assert(StartLineNumber <= EndLineNumber);
   auto SnippetRanges =
       std::make_unique<llvm::SmallVector<TextDiagnostic::StyleRange>[]>(
@@ -1145,8 +1145,8 @@ highlightLines(unsigned StartLineNumber, unsigned EndLineNumber,
   if (PP->getIdentifierTable().getExternalIdentifierLookup())
     return SnippetRanges;
 
-  auto Buff = SM.getBufferOrNone(FID);
-  if (!Buff || Buff->getBufferSize() > MaxHighlightFileSize)
+  auto Buff = llvm::MemoryBuffer::getMemBuffer(FileData);
+  if (Buff->getBufferSize() > MaxHighlightFileSize)
     return SnippetRanges;
 
   Lexer L{FID, *Buff, SM, LangOpts};
@@ -1329,7 +1329,7 @@ void TextDiagnostic::emitSnippetAndCaret(
 
   // Prepare source highlighting information for the lines we're about to emit.
   std::unique_ptr<llvm::SmallVector<StyleRange>[]> SourceStyles =
-      highlightLines(Lines.first, Lines.second, PP, LangOpts,
+      highlightLines(BufStart, Lines.first, Lines.second, PP, LangOpts,
                      DiagOpts->MaxHighlightFileSize, FID, SM);
 
   for (unsigned LineNo = Lines.first; LineNo != Lines.second + 1;

>From 664e924b69d0f60d16e0a6581ad106beaf368623 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder at redhat.com>
Date: Tue, 14 Nov 2023 08:37:13 +0100
Subject: [PATCH 30/35] Test

---
 clang/include/clang/Basic/CharInfo.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/clang/include/clang/Basic/CharInfo.h b/clang/include/clang/Basic/CharInfo.h
index 7d41193835089a..594b15038c0286 100644
--- a/clang/include/clang/Basic/CharInfo.h
+++ b/clang/include/clang/Basic/CharInfo.h
@@ -53,6 +53,10 @@ LLVM_READNONE inline bool isASCII(int64_t c) { return 0 <= c && c <= 127; }
 /// which is [a-zA-Z_].
 LLVM_READONLY inline bool isAsciiIdentifierStart(unsigned char c,
                                                  bool AllowDollar = false) {
+  if (!AllowDollar) {
+    return c == '_' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
+  }
+
   using namespace charinfo;
   if (InfoTable[c] & (CHAR_UPPER|CHAR_LOWER|CHAR_UNDER))
     return true;

>From 81f01fc98e3ef2b18de6f0f0716cfe77e922fa5f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder at redhat.com>
Date: Tue, 14 Nov 2023 10:23:43 +0100
Subject: [PATCH 31/35] Remove some llvm:: namespace specifiers

---
 clang/lib/Frontend/TextDiagnostic.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/clang/lib/Frontend/TextDiagnostic.cpp b/clang/lib/Frontend/TextDiagnostic.cpp
index a28ce1e942af49..d3891c9bc4e86d 100644
--- a/clang/lib/Frontend/TextDiagnostic.cpp
+++ b/clang/lib/Frontend/TextDiagnostic.cpp
@@ -1135,7 +1135,7 @@ highlightLines(StringRef FileData, unsigned StartLineNumber,
                FileID FID, const SourceManager &SM) {
   assert(StartLineNumber <= EndLineNumber);
   auto SnippetRanges =
-      std::make_unique<llvm::SmallVector<TextDiagnostic::StyleRange>[]>(
+      std::make_unique<SmallVector<TextDiagnostic::StyleRange>[]>(
           EndLineNumber - StartLineNumber + 1);
 
   if (!PP)
@@ -1154,7 +1154,7 @@ highlightLines(StringRef FileData, unsigned StartLineNumber,
 
   // Classify the given token and append it to the given vector.
   auto appendStyle =
-      [PP, &LangOpts](llvm::SmallVector<TextDiagnostic::StyleRange> &Vec,
+      [PP, &LangOpts](SmallVector<TextDiagnostic::StyleRange> &Vec,
                       const Token &T, unsigned Start, unsigned Length) -> void {
     if (T.is(tok::raw_identifier)) {
       StringRef RawIdent = T.getRawIdentifier();
@@ -1210,7 +1210,7 @@ highlightLines(StringRef FileData, unsigned StartLineNumber,
 
     // Simple tokens.
     if (TokenStartLine == TokenEndLine) {
-      llvm::SmallVector<TextDiagnostic::StyleRange> &LineRanges =
+      SmallVector<TextDiagnostic::StyleRange> &LineRanges =
           SnippetRanges[TokenStartLine - StartLineNumber];
       appendStyle(LineRanges, T, StartCol, T.getLength());
       continue;
@@ -1230,7 +1230,7 @@ highlightLines(StringRef FileData, unsigned StartLineNumber,
     for (unsigned I = 0; I <= Spelling.size(); ++I) {
       // This line is done.
       if (isVerticalWhitespace(Spelling[I]) || I == Spelling.size()) {
-        llvm::SmallVector<TextDiagnostic::StyleRange> &LineRanges =
+        SmallVector<TextDiagnostic::StyleRange> &LineRanges =
             SnippetRanges[L - StartLineNumber];
 
         if (L >= StartLineNumber) {
@@ -1328,7 +1328,7 @@ void TextDiagnostic::emitSnippetAndCaret(
       prepareAndFilterRanges(Ranges, SM, Lines, FID, LangOpts);
 
   // Prepare source highlighting information for the lines we're about to emit.
-  std::unique_ptr<llvm::SmallVector<StyleRange>[]> SourceStyles =
+  std::unique_ptr<SmallVector<StyleRange>[]> SourceStyles =
       highlightLines(BufStart, Lines.first, Lines.second, PP, LangOpts,
                      DiagOpts->MaxHighlightFileSize, FID, SM);
 

>From 0bcc155b380759d5b077f879d72898ded76d6427 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder at redhat.com>
Date: Tue, 14 Nov 2023 10:52:19 +0100
Subject: [PATCH 32/35] Don't stop highlighting on unprintable characters

---
 clang/lib/Frontend/TextDiagnostic.cpp | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/clang/lib/Frontend/TextDiagnostic.cpp b/clang/lib/Frontend/TextDiagnostic.cpp
index d3891c9bc4e86d..30b9968e2e87e8 100644
--- a/clang/lib/Frontend/TextDiagnostic.cpp
+++ b/clang/lib/Frontend/TextDiagnostic.cpp
@@ -1443,11 +1443,18 @@ void TextDiagnostic::emitSnippet(StringRef SourceLine,
     auto [Str, WasPrintable] =
         printableTextForNextCharacter(SourceLine, &I, DiagOpts->TabStop);
 
-    // Just stop highlighting anything for this line if we found a non-printable
-    // character.
-    if (!WasPrintable)
-      HighlightingEnabled = false;
+    // Toggle inverted colors on or off for this character.
+    if (DiagOpts->ShowColors) {
+      if (WasPrintable == PrintReversed) {
+        PrintReversed = !PrintReversed;
+        if (PrintReversed)
+          OS.reverseColor();
+        else
+          OS.resetColor();
+      }
+    }
 
+    // Apply syntax highlighting information if requested.
     if (HighlightingEnabled) {
       const auto *CharStyle = llvm::find_if(Styles, [I](const StyleRange &R) {
         return (R.Start < I && R.End >= I);
@@ -1459,16 +1466,6 @@ void TextDiagnostic::emitSnippet(StringRef SourceLine,
         OS.resetColor();
     }
 
-    // Toggle inverted colors on or off for this character.
-    if (DiagOpts->ShowColors) {
-      if (WasPrintable == PrintReversed) {
-        PrintReversed = !PrintReversed;
-        if (PrintReversed)
-          OS.reverseColor();
-        else
-          OS.resetColor();
-      }
-    }
     OS << Str;
   }
 

>From e695754c51a888634b8679fab56c54eac0248281 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder at redhat.com>
Date: Tue, 14 Nov 2023 11:05:36 +0100
Subject: [PATCH 33/35] Minimize llvm::raw_ostream color changes

---
 clang/lib/Frontend/TextDiagnostic.cpp | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/clang/lib/Frontend/TextDiagnostic.cpp b/clang/lib/Frontend/TextDiagnostic.cpp
index 30b9968e2e87e8..a098c247daf279 100644
--- a/clang/lib/Frontend/TextDiagnostic.cpp
+++ b/clang/lib/Frontend/TextDiagnostic.cpp
@@ -1437,7 +1437,7 @@ void TextDiagnostic::emitSnippet(StringRef SourceLine,
 
   // Print the source line one character at a time.
   bool PrintReversed = false;
-  bool HighlightingEnabled = DiagOpts->ShowColors;
+  std::optional<llvm::raw_ostream::Colors> CurrentColor;
   size_t I = 0;
   while (I < SourceLine.size()) {
     auto [Str, WasPrintable] =
@@ -1449,21 +1449,27 @@ void TextDiagnostic::emitSnippet(StringRef SourceLine,
         PrintReversed = !PrintReversed;
         if (PrintReversed)
           OS.reverseColor();
-        else
+        else {
           OS.resetColor();
+          CurrentColor = std::nullopt;
+        }
       }
-    }
 
-    // Apply syntax highlighting information if requested.
-    if (HighlightingEnabled) {
+      // Apply syntax highlighting information.
       const auto *CharStyle = llvm::find_if(Styles, [I](const StyleRange &R) {
         return (R.Start < I && R.End >= I);
       });
 
-      if (CharStyle != Styles.end())
-        OS.changeColor(CharStyle->Color, false);
-      else
+      if (CharStyle != Styles.end()) {
+        if (!CurrentColor ||
+            (CurrentColor && *CurrentColor != CharStyle->Color)) {
+          OS.changeColor(CharStyle->Color, false);
+          CurrentColor = CharStyle->Color;
+        }
+      } else if (CurrentColor) {
         OS.resetColor();
+        CurrentColor = std::nullopt;
+      }
     }
 
     OS << Str;

>From f01043890c403c7a2462518148e687731a2615fd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder at redhat.com>
Date: Tue, 16 Jan 2024 16:18:04 +0100
Subject: [PATCH 34/35] Re-introduce pp check points

---
 clang/include/clang/Lex/Preprocessor.h |  6 +++++
 clang/lib/Frontend/TextDiagnostic.cpp  | 26 ++++++++++++--------
 clang/lib/Lex/Preprocessor.cpp         | 33 ++++++++++++++++++++++++++
 3 files changed, 55 insertions(+), 10 deletions(-)

diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h
index 4ec21a8b6be2c8..e58092849deedb 100644
--- a/clang/include/clang/Lex/Preprocessor.h
+++ b/clang/include/clang/Lex/Preprocessor.h
@@ -284,6 +284,8 @@ class Preprocessor {
   /// The kind of translation unit we are processing.
   const TranslationUnitKind TUKind;
 
+  const char *getCheckPoint(FileID FID, const char *Start) const;
+
 private:
   /// The code-completion handler.
   CodeCompletionHandler *CodeComplete = nullptr;
@@ -311,6 +313,10 @@ class Preprocessor {
   /// The import path for named module that we're currently processing.
   SmallVector<std::pair<IdentifierInfo *, SourceLocation>, 2> NamedModuleImportPath;
 
+  llvm::DenseMap<FileID, SmallVector<const char *>> CheckPoints;
+  unsigned CheckPointCounter = 0;
+  void saveCheckPoint();
+
   /// Whether the import is an `@import` or a standard c++ modules import.
   bool IsAtImport = false;
 
diff --git a/clang/lib/Frontend/TextDiagnostic.cpp b/clang/lib/Frontend/TextDiagnostic.cpp
index a098c247daf279..4cdafca4e2601d 100644
--- a/clang/lib/Frontend/TextDiagnostic.cpp
+++ b/clang/lib/Frontend/TextDiagnostic.cpp
@@ -1128,11 +1128,11 @@ prepareAndFilterRanges(const SmallVectorImpl<CharSourceRange> &Ranges,
 /// corresponds to syntax highlighting information in one line. In each line,
 /// the StyleRanges are non-overlapping and sorted from start to end of the
 /// line.
-std::unique_ptr<llvm::SmallVector<TextDiagnostic::StyleRange>[]>
+static std::unique_ptr<llvm::SmallVector<TextDiagnostic::StyleRange>[]>
 highlightLines(StringRef FileData, unsigned StartLineNumber,
                unsigned EndLineNumber, const Preprocessor *PP,
                const LangOptions &LangOpts, uint32_t MaxHighlightFileSize,
-               FileID FID, const SourceManager &SM) {
+               FileID FID, const SourceManager &SM, const char *LineStart) {
   assert(StartLineNumber <= EndLineNumber);
   auto SnippetRanges =
       std::make_unique<SmallVector<TextDiagnostic::StyleRange>[]>(
@@ -1146,12 +1146,17 @@ highlightLines(StringRef FileData, unsigned StartLineNumber,
     return SnippetRanges;
 
   auto Buff = llvm::MemoryBuffer::getMemBuffer(FileData);
-  if (Buff->getBufferSize() > MaxHighlightFileSize)
-    return SnippetRanges;
-
   Lexer L{FID, *Buff, SM, LangOpts};
   L.SetKeepWhitespaceMode(true);
 
+  if (const char *CheckPoint = PP->getCheckPoint(FID, LineStart)) {
+    assert(CheckPoint >= Buff->getBufferStart() &&
+           CheckPoint <= Buff->getBufferEnd());
+    assert(CheckPoint <= LineStart);
+    size_t Offset = CheckPoint - Buff->getBufferStart();
+    L.seek(Offset, /*IsAtStartOfLine=*/false);
+  }
+
   // Classify the given token and append it to the given vector.
   auto appendStyle =
       [PP, &LangOpts](SmallVector<TextDiagnostic::StyleRange> &Vec,
@@ -1327,11 +1332,6 @@ void TextDiagnostic::emitSnippetAndCaret(
   SmallVector<LineRange> LineRanges =
       prepareAndFilterRanges(Ranges, SM, Lines, FID, LangOpts);
 
-  // Prepare source highlighting information for the lines we're about to emit.
-  std::unique_ptr<SmallVector<StyleRange>[]> SourceStyles =
-      highlightLines(BufStart, Lines.first, Lines.second, PP, LangOpts,
-                     DiagOpts->MaxHighlightFileSize, FID, SM);
-
   for (unsigned LineNo = Lines.first; LineNo != Lines.second + 1;
        ++LineNo, ++DisplayLineNo) {
     // Rewind from the current position to the start of the line.
@@ -1351,6 +1351,12 @@ void TextDiagnostic::emitSnippetAndCaret(
     if (size_t(LineEnd - LineStart) > MaxLineLengthToPrint)
       return;
 
+    // Prepare source highlighting information for the lines we're about to
+    // emit.
+    std::unique_ptr<SmallVector<StyleRange>[]> SourceStyles =
+        highlightLines(BufStart, Lines.first, Lines.second, PP, LangOpts,
+                       DiagOpts->MaxHighlightFileSize, FID, SM, LineStart);
+
     // Copy the line of code into an std::string for ease of manipulation.
     std::string SourceLine(LineStart, LineEnd);
     // Remove trailing null bytes.
diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp
index 64f54c6fc6382f..c9f83517276da6 100644
--- a/clang/lib/Lex/Preprocessor.cpp
+++ b/clang/lib/Lex/Preprocessor.cpp
@@ -953,6 +953,7 @@ void Preprocessor::Lex(Token &Result) {
       break;
     }
   }
+  saveCheckPoint();
 
   LastTokenWasAt = Result.is(tok::at);
   --LexLevel;
@@ -1553,3 +1554,35 @@ void Preprocessor::createPreprocessingRecord() {
   Record = new PreprocessingRecord(getSourceManager());
   addPPCallbacks(std::unique_ptr<PPCallbacks>(Record));
 }
+
+void Preprocessor::saveCheckPoint() {
+
+  if (!CurLexer)
+    return;
+
+  if (CheckPointCounter < 1024) {
+    ++CheckPointCounter;
+    return;
+  }
+
+  auto FID = CurLexer->getFileID();
+  auto P = CurLexer->BufferPtr;
+
+  CheckPoints[FID].push_back(P);
+  CheckPointCounter = 0;
+}
+
+/// Returns the last check point saved for \p FID that does not lie after
+/// \p Start, or nullptr if there is none (assumes ascending insertion order).
+const char *Preprocessor::getCheckPoint(FileID FID, const char *Start) const {
+  const char *Last = nullptr;
+  if (auto It = CheckPoints.find(FID); It != CheckPoints.end()) {
+    // FIXME: Do better than a linear search.
+    for (const char *P : It->second) {
+      if (P > Start)
+        break;
+      Last = P;
+    }
+  }
+  return Last;
+}

>From 6eabc6997f0bfc0aba6466ad6037ca973905eb6b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder at redhat.com>
Date: Tue, 16 Jan 2024 16:59:05 +0100
Subject: [PATCH 35/35] Pull preconditions to saveCheckPoint() call site

---
 clang/lib/Lex/Preprocessor.cpp | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp
index c9f83517276da6..4abb916625a899 100644
--- a/clang/lib/Lex/Preprocessor.cpp
+++ b/clang/lib/Lex/Preprocessor.cpp
@@ -953,7 +953,9 @@ void Preprocessor::Lex(Token &Result) {
       break;
     }
   }
-  saveCheckPoint();
+
+  if (CurLexer && ++CheckPointCounter == 1024)
+    saveCheckPoint();
 
   LastTokenWasAt = Result.is(tok::at);
   --LexLevel;
@@ -1556,15 +1558,6 @@ void Preprocessor::createPreprocessingRecord() {
 }
 
 void Preprocessor::saveCheckPoint() {
-
-  if (!CurLexer)
-    return;
-
-  if (CheckPointCounter < 1024) {
-    ++CheckPointCounter;
-    return;
-  }
-
   auto FID = CurLexer->getFileID();
   auto P = CurLexer->BufferPtr;
 



More information about the cfe-commits mailing list