[clang] [clang][Diagnostics] Highlight code snippets (PR #66514)

Timm Baeder via cfe-commits cfe-commits at lists.llvm.org
Thu Jan 25 21:20:37 PST 2024


https://github.com/tbaederr updated https://github.com/llvm/llvm-project/pull/66514

>From 29c174e595dd839cf96ca54778712ab3bc543103 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?= <tbaeder at redhat.com>
Date: Fri, 15 Sep 2023 15:51:39 +0200
Subject: [PATCH] [clang][Diagnostics] Highlight code snippets

Add some primitive syntax highlighting to our code snippet output.
---
 clang/docs/ReleaseNotes.rst                   |   2 +
 clang/include/clang/Frontend/TextDiagnostic.h |  18 +-
 clang/include/clang/Lex/Preprocessor.h        |  10 +
 clang/lib/Frontend/TextDiagnostic.cpp         | 211 +++++++++++++++++-
 clang/lib/Frontend/TextDiagnosticPrinter.cpp  |   2 +-
 clang/lib/Lex/Preprocessor.cpp                |  24 ++
 clang/test/Frontend/diagnostic-pipe.c         |   9 +
 7 files changed, 262 insertions(+), 14 deletions(-)
 create mode 100644 clang/test/Frontend/diagnostic-pipe.c

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index db3d74e124e7d1d..6f5fe2049e242fc 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -97,6 +97,8 @@ Attribute Changes in Clang
 
 Improvements to Clang's diagnostics
 -----------------------------------
+- Clang now applies syntax highlighting to the code snippets it
+  prints.
 
 Improvements to Clang's time-trace
 ----------------------------------
diff --git a/clang/include/clang/Frontend/TextDiagnostic.h b/clang/include/clang/Frontend/TextDiagnostic.h
index 7eb0ab0cdc9bca8..a2fe8ae995423b9 100644
--- a/clang/include/clang/Frontend/TextDiagnostic.h
+++ b/clang/include/clang/Frontend/TextDiagnostic.h
@@ -16,6 +16,7 @@
 #define LLVM_CLANG_FRONTEND_TEXTDIAGNOSTIC_H
 
 #include "clang/Frontend/DiagnosticRenderer.h"
+#include "llvm/Support/raw_ostream.h"
 
 namespace clang {
 
@@ -33,14 +34,22 @@ namespace clang {
 /// printing coming out of libclang.
 class TextDiagnostic : public DiagnosticRenderer {
   raw_ostream &OS;
+  const Preprocessor *PP;
 
 public:
-  TextDiagnostic(raw_ostream &OS,
-                 const LangOptions &LangOpts,
-                 DiagnosticOptions *DiagOpts);
+  TextDiagnostic(raw_ostream &OS, const LangOptions &LangOpts,
+                 DiagnosticOptions *DiagOpts, const Preprocessor *PP = nullptr);
 
   ~TextDiagnostic() override;
 
+  struct StyleRange {
+    unsigned Start;
+    unsigned End;
+    enum llvm::raw_ostream::Colors Color;
+    StyleRange(unsigned S, unsigned E, enum llvm::raw_ostream::Colors C)
+        : Start(S), End(E), Color(C){};
+  };
+
   /// Print the diagonstic level to a raw_ostream.
   ///
   /// This is a static helper that handles colorizing the level and formatting
@@ -104,7 +113,8 @@ class TextDiagnostic : public DiagnosticRenderer {
                            ArrayRef<FixItHint> Hints);
 
   void emitSnippet(StringRef SourceLine, unsigned MaxLineNoDisplayWidth,
-                   unsigned LineNo);
+                   unsigned LineNo, unsigned DisplayLineNo,
+                   ArrayRef<StyleRange> Styles);
 
   void emitParseableFixits(ArrayRef<FixItHint> Hints, const SourceManager &SM);
 };
diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h
index 2d9c53cdf5bde8e..9d0d53129a12dd9 100644
--- a/clang/include/clang/Lex/Preprocessor.h
+++ b/clang/include/clang/Lex/Preprocessor.h
@@ -284,6 +284,13 @@ class Preprocessor {
   /// The kind of translation unit we are processing.
   const TranslationUnitKind TUKind;
 
+  /// Returns a pointer into the given file's buffer that's guaranteed
+  /// to be between tokens. The returned pointer is always before \p Start.
+  /// The maximum distance betweenthe returned pointer and \p Start is
+  /// limited by a constant value, but also an implementation detail.
+  /// If no such check point exists, \c nullptr is returned.
+  const char *getCheckPoint(FileID FID, const char *Start) const;
+
 private:
   /// The code-completion handler.
   CodeCompletionHandler *CodeComplete = nullptr;
@@ -311,6 +318,9 @@ class Preprocessor {
   /// The import path for named module that we're currently processing.
   SmallVector<std::pair<IdentifierInfo *, SourceLocation>, 2> NamedModuleImportPath;
 
+  llvm::DenseMap<FileID, SmallVector<const char *>> CheckPoints;
+  unsigned CheckPointCounter = 0;
+
   /// Whether the import is an `@import` or a standard c++ modules import.
   bool IsAtImport = false;
 
diff --git a/clang/lib/Frontend/TextDiagnostic.cpp b/clang/lib/Frontend/TextDiagnostic.cpp
index 779dead5d058d1a..291d71f6db61f17 100644
--- a/clang/lib/Frontend/TextDiagnostic.cpp
+++ b/clang/lib/Frontend/TextDiagnostic.cpp
@@ -12,6 +12,7 @@
 #include "clang/Basic/FileManager.h"
 #include "clang/Basic/SourceManager.h"
 #include "clang/Lex/Lexer.h"
+#include "clang/Lex/Preprocessor.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/ConvertUTF.h"
@@ -41,6 +42,14 @@ static const enum raw_ostream::Colors fatalColor = raw_ostream::RED;
 static const enum raw_ostream::Colors savedColor =
   raw_ostream::SAVEDCOLOR;
 
+// Magenta is taken for 'warning'. Red is already 'error' and 'cyan'
+// is already taken for 'note'. Green is already used to underline
+// source ranges. White and black are bad because of the usual
+// terminal backgrounds. Which leaves us only with TWO options.
+static constexpr raw_ostream::Colors CommentColor = raw_ostream::YELLOW;
+static constexpr raw_ostream::Colors LiteralColor = raw_ostream::GREEN;
+static constexpr raw_ostream::Colors KeywordColor = raw_ostream::BLUE;
+
 /// Add highlights to differences in template strings.
 static void applyTemplateHighlighting(raw_ostream &OS, StringRef Str,
                                       bool &Normal, bool Bold) {
@@ -644,10 +653,10 @@ static bool printWordWrapped(raw_ostream &OS, StringRef Str, unsigned Columns,
   return Wrapped;
 }
 
-TextDiagnostic::TextDiagnostic(raw_ostream &OS,
-                               const LangOptions &LangOpts,
-                               DiagnosticOptions *DiagOpts)
-  : DiagnosticRenderer(LangOpts, DiagOpts), OS(OS) {}
+TextDiagnostic::TextDiagnostic(raw_ostream &OS, const LangOptions &LangOpts,
+                               DiagnosticOptions *DiagOpts,
+                               const Preprocessor *PP)
+    : DiagnosticRenderer(LangOpts, DiagOpts), OS(OS), PP(PP) {}
 
 TextDiagnostic::~TextDiagnostic() {}
 
@@ -1112,6 +1121,162 @@ prepareAndFilterRanges(const SmallVectorImpl<CharSourceRange> &Ranges,
   return LineRanges;
 }
 
+/// Creates syntax highlighting information in form of StyleRanges.
+///
+/// The returned unique ptr has always exactly size
+/// (\p EndLineNumber - \p StartLineNumber + 1). Each SmallVector in there
+/// corresponds to syntax highlighting information in one line. In each line,
+/// the StyleRanges are non-overlapping and sorted from start to end of the
+/// line.
+static std::unique_ptr<llvm::SmallVector<TextDiagnostic::StyleRange>[]>
+highlightLines(StringRef FileData, unsigned StartLineNumber,
+               unsigned EndLineNumber, const Preprocessor *PP,
+               const LangOptions &LangOpts, bool ShowColors, FileID FID,
+               const SourceManager &SM) {
+  assert(StartLineNumber <= EndLineNumber);
+  auto SnippetRanges =
+      std::make_unique<SmallVector<TextDiagnostic::StyleRange>[]>(
+          EndLineNumber - StartLineNumber + 1);
+
+  if (!PP || !ShowColors)
+    return SnippetRanges;
+
+  // Might cause emission of another diagnostic.
+  if (PP->getIdentifierTable().getExternalIdentifierLookup())
+    return SnippetRanges;
+
+  auto Buff = llvm::MemoryBuffer::getMemBuffer(FileData);
+  Lexer L{FID, *Buff, SM, LangOpts};
+  L.SetKeepWhitespaceMode(true);
+
+  const char *FirstLineStart =
+      FileData.data() +
+      SM.getDecomposedLoc(SM.translateLineCol(FID, StartLineNumber, 1)).second;
+  if (const char *CheckPoint = PP->getCheckPoint(FID, FirstLineStart)) {
+    assert(CheckPoint >= Buff->getBufferStart() &&
+           CheckPoint <= Buff->getBufferEnd());
+    assert(CheckPoint <= FirstLineStart);
+    size_t Offset = CheckPoint - Buff->getBufferStart();
+    L.seek(Offset, /*IsAtStartOfLine=*/false);
+  }
+
+  // Classify the given token and append it to the given vector.
+  auto appendStyle =
+      [PP, &LangOpts](SmallVector<TextDiagnostic::StyleRange> &Vec,
+                      const Token &T, unsigned Start, unsigned Length) -> void {
+    if (T.is(tok::raw_identifier)) {
+      StringRef RawIdent = T.getRawIdentifier();
+      // Special case true/false/nullptr/... literals, since they will otherwise
+      // be treated as keywords.
+      // FIXME: It would be good to have a programmatic way of getting this
+      // list.
+      if (llvm::StringSwitch<bool>(RawIdent)
+              .Case("true", true)
+              .Case("false", true)
+              .Case("nullptr", true)
+              .Case("__func__", true)
+              .Case("__objc_yes__", true)
+              .Case("__objc_no__", true)
+              .Case("__null", true)
+              .Case("__FUNCDNAME__", true)
+              .Case("__FUNCSIG__", true)
+              .Case("__FUNCTION__", true)
+              .Case("__FUNCSIG__", true)
+              .Default(false)) {
+        Vec.emplace_back(Start, Start + Length, LiteralColor);
+      } else {
+        const IdentifierInfo *II = PP->getIdentifierInfo(RawIdent);
+        assert(II);
+        if (II->isKeyword(LangOpts))
+          Vec.emplace_back(Start, Start + Length, KeywordColor);
+      }
+    } else if (tok::isLiteral(T.getKind())) {
+      Vec.emplace_back(Start, Start + Length, LiteralColor);
+    } else {
+      assert(T.is(tok::comment));
+      Vec.emplace_back(Start, Start + Length, CommentColor);
+    }
+  };
+
+  bool Stop = false;
+  while (!Stop) {
+    Token T;
+    Stop = L.LexFromRawLexer(T);
+    if (T.is(tok::unknown))
+      continue;
+
+    // We are only interested in identifiers, literals and comments.
+    if (!T.is(tok::raw_identifier) && !T.is(tok::comment) &&
+        !tok::isLiteral(T.getKind()))
+      continue;
+
+    bool Invalid = false;
+    unsigned TokenEndLine = SM.getSpellingLineNumber(T.getEndLoc(), &Invalid);
+    if (Invalid || TokenEndLine < StartLineNumber)
+      continue;
+
+    assert(TokenEndLine >= StartLineNumber);
+
+    unsigned TokenStartLine =
+        SM.getSpellingLineNumber(T.getLocation(), &Invalid);
+    if (Invalid)
+      continue;
+    // If this happens, we're done.
+    if (TokenStartLine > EndLineNumber)
+      break;
+
+    unsigned StartCol =
+        SM.getSpellingColumnNumber(T.getLocation(), &Invalid) - 1;
+    if (Invalid)
+      continue;
+
+    // Simple tokens.
+    if (TokenStartLine == TokenEndLine) {
+      SmallVector<TextDiagnostic::StyleRange> &LineRanges =
+          SnippetRanges[TokenStartLine - StartLineNumber];
+      appendStyle(LineRanges, T, StartCol, T.getLength());
+      continue;
+    }
+    assert((TokenEndLine - TokenStartLine) >= 1);
+
+    // For tokens that span multiple lines (think multiline comments), we
+    // divide them into multiple StyleRanges.
+    unsigned EndCol = SM.getSpellingColumnNumber(T.getEndLoc(), &Invalid) - 1;
+    if (Invalid)
+      continue;
+
+    std::string Spelling = Lexer::getSpelling(T, SM, LangOpts);
+
+    unsigned L = TokenStartLine;
+    unsigned LineLength = 0;
+    for (unsigned I = 0; I <= Spelling.size(); ++I) {
+      // This line is done.
+      if (isVerticalWhitespace(Spelling[I]) || I == Spelling.size()) {
+        SmallVector<TextDiagnostic::StyleRange> &LineRanges =
+            SnippetRanges[L - StartLineNumber];
+
+        if (L >= StartLineNumber) {
+          if (L == TokenStartLine) // First line
+            appendStyle(LineRanges, T, StartCol, LineLength);
+          else if (L == TokenEndLine) // Last line
+            appendStyle(LineRanges, T, 0, EndCol);
+          else
+            appendStyle(LineRanges, T, 0, LineLength);
+        }
+
+        ++L;
+        if (L > EndLineNumber)
+          break;
+        LineLength = 0;
+        continue;
+      }
+      ++LineLength;
+    }
+  }
+
+  return SnippetRanges;
+}
+
 /// Emit a code snippet and caret line.
 ///
 /// This routine emits a single line's code snippet and caret line..
@@ -1181,6 +1346,12 @@ void TextDiagnostic::emitSnippetAndCaret(
       OS.indent(MaxLineNoDisplayWidth + 2) << "| ";
   };
 
+  // Prepare source highlighting information for the lines we're about to
+  // emit, starting from the first line.
+  std::unique_ptr<SmallVector<StyleRange>[]> SourceStyles =
+      highlightLines(BufStart, Lines.first, Lines.second, PP, LangOpts,
+                     DiagOpts->ShowColors, FID, SM);
+
   SmallVector<LineRange> LineRanges =
       prepareAndFilterRanges(Ranges, SM, Lines, FID, LangOpts);
 
@@ -1247,7 +1418,8 @@ void TextDiagnostic::emitSnippetAndCaret(
     }
 
     // Emit what we have computed.
-    emitSnippet(SourceLine, MaxLineNoDisplayWidth, DisplayLineNo);
+    emitSnippet(SourceLine, MaxLineNoDisplayWidth, LineNo, DisplayLineNo,
+                SourceStyles[LineNo - Lines.first]);
 
     if (!CaretLine.empty()) {
       indentForLineNumbers();
@@ -1277,16 +1449,18 @@ void TextDiagnostic::emitSnippetAndCaret(
 
 void TextDiagnostic::emitSnippet(StringRef SourceLine,
                                  unsigned MaxLineNoDisplayWidth,
-                                 unsigned LineNo) {
+                                 unsigned LineNo, unsigned DisplayLineNo,
+                                 ArrayRef<StyleRange> Styles) {
   // Emit line number.
   if (MaxLineNoDisplayWidth > 0) {
-    unsigned LineNoDisplayWidth = getNumDisplayWidth(LineNo);
+    unsigned LineNoDisplayWidth = getNumDisplayWidth(DisplayLineNo);
     OS.indent(MaxLineNoDisplayWidth - LineNoDisplayWidth + 1)
-        << LineNo << " | ";
+        << DisplayLineNo << " | ";
   }
 
   // Print the source line one character at a time.
   bool PrintReversed = false;
+  std::optional<llvm::raw_ostream::Colors> CurrentColor;
   size_t I = 0;
   while (I < SourceLine.size()) {
     auto [Str, WasPrintable] =
@@ -1298,10 +1472,29 @@ void TextDiagnostic::emitSnippet(StringRef SourceLine,
         PrintReversed = !PrintReversed;
         if (PrintReversed)
           OS.reverseColor();
-        else
+        else {
           OS.resetColor();
+          CurrentColor = std::nullopt;
+        }
+      }
+
+      // Apply syntax highlighting information.
+      const auto *CharStyle = llvm::find_if(Styles, [I](const StyleRange &R) {
+        return (R.Start < I && R.End >= I);
+      });
+
+      if (CharStyle != Styles.end()) {
+        if (!CurrentColor ||
+            (CurrentColor && *CurrentColor != CharStyle->Color)) {
+          OS.changeColor(CharStyle->Color, false);
+          CurrentColor = CharStyle->Color;
+        }
+      } else if (CurrentColor) {
+        OS.resetColor();
+        CurrentColor = std::nullopt;
       }
     }
+
     OS << Str;
   }
 
diff --git a/clang/lib/Frontend/TextDiagnosticPrinter.cpp b/clang/lib/Frontend/TextDiagnosticPrinter.cpp
index 0ff5376098ffe8d..b2fb762537573ef 100644
--- a/clang/lib/Frontend/TextDiagnosticPrinter.cpp
+++ b/clang/lib/Frontend/TextDiagnosticPrinter.cpp
@@ -36,7 +36,7 @@ TextDiagnosticPrinter::~TextDiagnosticPrinter() {
 void TextDiagnosticPrinter::BeginSourceFile(const LangOptions &LO,
                                             const Preprocessor *PP) {
   // Build the TextDiagnostic utility.
-  TextDiag.reset(new TextDiagnostic(OS, LO, &*DiagOpts));
+  TextDiag.reset(new TextDiagnostic(OS, LO, &*DiagOpts, PP));
 }
 
 void TextDiagnosticPrinter::EndSourceFile() {
diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp
index 7fdb5d4c0d7b82f..031ed1e16bb8fce 100644
--- a/clang/lib/Lex/Preprocessor.cpp
+++ b/clang/lib/Lex/Preprocessor.cpp
@@ -72,6 +72,9 @@
 
 using namespace clang;
 
+/// Minimum distance between two check points, in tokens.
+static constexpr unsigned CheckPointStepSize = 1024;
+
 LLVM_INSTANTIATE_REGISTRY(PragmaHandlerRegistry)
 
 ExternalPreprocessorSource::~ExternalPreprocessorSource() = default;
@@ -954,6 +957,11 @@ void Preprocessor::Lex(Token &Result) {
     }
   }
 
+  if (CurLexer && ++CheckPointCounter == CheckPointStepSize) {
+    CheckPoints[CurLexer->getFileID()].push_back(CurLexer->BufferPtr);
+    CheckPointCounter = 0;
+  }
+
   LastTokenWasAt = Result.is(tok::at);
   --LexLevel;
 
@@ -1558,3 +1566,19 @@ void Preprocessor::createPreprocessingRecord() {
   Record = new PreprocessingRecord(getSourceManager());
   addPPCallbacks(std::unique_ptr<PPCallbacks>(Record));
 }
+
+const char *Preprocessor::getCheckPoint(FileID FID, const char *Start) const {
+  if (auto It = CheckPoints.find(FID); It != CheckPoints.end()) {
+    const SmallVector<const char *> &FileCheckPoints = It->second;
+    const char *Last = nullptr;
+    // FIXME: Do better than a linear search.
+    for (const char *P : FileCheckPoints) {
+      if (P > Start)
+        break;
+      Last = P;
+    }
+    return Last;
+  }
+
+  return nullptr;
+}
diff --git a/clang/test/Frontend/diagnostic-pipe.c b/clang/test/Frontend/diagnostic-pipe.c
new file mode 100644
index 000000000000000..61dbdcfb2e0e7c6
--- /dev/null
+++ b/clang/test/Frontend/diagnostic-pipe.c
@@ -0,0 +1,9 @@
+
+_Static_assert(0, "");
+
+/// Test that piping the output into another process disables syntax
+/// highlighting of code snippets.
+
+// RUN: not %clang_cc1 %s -o /dev/null 2>&1 | FileCheck %s
+// CHECK: error: static assertion failed:
+// CHECK-NEXT: {{^}}   2 | _Static_assert(0, "");{{$}}



More information about the cfe-commits mailing list