[clang] [analyzer][HTMLRewriter] Cache partial rewrite results. (PR #80220)

Artem Dergachev via cfe-commits cfe-commits at lists.llvm.org
Wed Jan 31 16:34:57 PST 2024


https://github.com/haoNoQ updated https://github.com/llvm/llvm-project/pull/80220

>From 13fbeb5411f1b877dc86dd6bebd3bf607c7fba7e Mon Sep 17 00:00:00 2001
From: Artem Dergachev <adergachev at apple.com>
Date: Tue, 30 Jan 2024 17:37:26 -0800
Subject: [PATCH 1/2] [analyzer][HTMLRewriter] Cache partial rewrite results.

This is a follow-up for 721dd3bc2 [analyzer] NFC: Don't regenerate duplicate
HTML reports.

Because HTMLRewriter re-runs the Lexer for syntax highlighting and
macro expansion purposes, it may get fairly expensive when the rewriter
is invoked multiple times on the same file. In the static analyzer
(which uses HTMLRewriter for HTML output mode) we only get away with this
because there are usually very few reports emitted per file.
But if loud checkers are enabled, such as `webkit.*`, this may explode
in complexity and even cause the compiler to run over the 32-bit
SourceLocation addressing limit.

This patch caches intermediate results so that re-lexing only needed
to happen once.

As the clever __COUNTER__ test demonstrates, "once" is still too many.
Ideally we shouldn't re-lex anything at all, which remains a TODO.

rdar://120801986
---
 .../include/clang/Rewrite/Core/HTMLRewrite.h  |  16 ++-
 clang/lib/Rewrite/HTMLRewrite.cpp             | 135 +++++++++++++++---
 .../StaticAnalyzer/Core/HTMLDiagnostics.cpp   |  10 +-
 .../test/Analysis/html_diagnostics/counter.c  |  29 ++++
 4 files changed, 164 insertions(+), 26 deletions(-)
 create mode 100644 clang/test/Analysis/html_diagnostics/counter.c

diff --git a/clang/include/clang/Rewrite/Core/HTMLRewrite.h b/clang/include/clang/Rewrite/Core/HTMLRewrite.h
index 340411e553479..e1252dc4e8272 100644
--- a/clang/include/clang/Rewrite/Core/HTMLRewrite.h
+++ b/clang/include/clang/Rewrite/Core/HTMLRewrite.h
@@ -16,6 +16,7 @@
 
 #include "clang/Basic/SourceLocation.h"
 #include <string>
+#include <map>
 
 namespace clang {
 
@@ -24,6 +25,15 @@ class RewriteBuffer;
 class Preprocessor;
 
 namespace html {
+  struct RelexRewriteCache;
+  using RelexRewriteCacheRef = std::shared_ptr<RelexRewriteCache>;
+
+  /// If you need to rewrite the same file multiple times, you can instantiate
+  /// a RelexRewriteCache and refer functions such as SyntaxHighlight()
+  /// and HighlightMacros() to it so that to avoid re-lexing the file each time.
+  /// The cache may outlive the rewriter as long as cached FileIDs and source
+  /// locations continue to make sense for the translation unit as a whole.
+  RelexRewriteCacheRef instantiateRelexRewriteCache();
 
   /// HighlightRange - Highlight a range in the source code with the specified
   /// start/end tags.  B/E must be in the same file.  This ensures that
@@ -67,13 +77,15 @@ namespace html {
 
   /// SyntaxHighlight - Relex the specified FileID and annotate the HTML with
   /// information about keywords, comments, etc.
-  void SyntaxHighlight(Rewriter &R, FileID FID, const Preprocessor &PP);
+  void SyntaxHighlight(Rewriter &R, FileID FID, const Preprocessor &PP,
+                       RelexRewriteCacheRef Cache = nullptr);
 
   /// HighlightMacros - This uses the macro table state from the end of the
   /// file, to reexpand macros and insert (into the HTML) information about the
   /// macro expansions.  This won't be perfectly perfect, but it will be
   /// reasonably close.
-  void HighlightMacros(Rewriter &R, FileID FID, const Preprocessor &PP);
+  void HighlightMacros(Rewriter &R, FileID FID, const Preprocessor &PP,
+                       RelexRewriteCacheRef Cache = nullptr);
 
 } // end html namespace
 } // end clang namespace
diff --git a/clang/lib/Rewrite/HTMLRewrite.cpp b/clang/lib/Rewrite/HTMLRewrite.cpp
index 083a9c09297e1..50cdacfe33eae 100644
--- a/clang/lib/Rewrite/HTMLRewrite.cpp
+++ b/clang/lib/Rewrite/HTMLRewrite.cpp
@@ -21,7 +21,10 @@
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/raw_ostream.h"
 #include <memory>
+
 using namespace clang;
+using namespace llvm;
+using namespace html;
 
 
 /// HighlightRange - Highlight a range in the source code with the specified
@@ -104,6 +107,32 @@ void html::HighlightRange(RewriteBuffer &RB, unsigned B, unsigned E,
   }
 }
 
+namespace clang::html {
+struct RelexRewriteCache {
+  // These structs mimic input arguments of HighlightRange().
+  struct Highlight {
+    SourceLocation B, E;
+    std::string StartTag, EndTag;
+    bool IsTokenRange;
+  };
+  struct RawHighlight {
+    unsigned B, E;
+    std::string StartTag, EndTag;
+  };
+
+  // SmallVector isn't appropriate because these vectors are almost never small.
+  using HighlightList = std::vector<Highlight>;
+  using RawHighlightList = std::vector<RawHighlight>;
+
+  DenseMap<FileID, RawHighlightList> SyntaxHighlights;
+  DenseMap<FileID, HighlightList> MacroHighlights;
+};
+} // namespace clang::html
+
+html::RelexRewriteCacheRef html::instantiateRelexRewriteCache() {
+  return std::make_shared<RelexRewriteCache>();
+}
+
 void html::EscapeText(Rewriter &R, FileID FID,
                       bool EscapeSpaces, bool ReplaceTabs) {
 
@@ -442,13 +471,18 @@ input.spoilerhider:checked + label + .spoiler{
 /// information about keywords, macro expansions etc.  This uses the macro
 /// table state from the end of the file, so it won't be perfectly perfect,
 /// but it will be reasonably close.
-void html::SyntaxHighlight(Rewriter &R, FileID FID, const Preprocessor &PP) {
-  RewriteBuffer &RB = R.getEditBuffer(FID);
+static void SyntaxHighlightImpl(
+    Rewriter &R, FileID FID, const Preprocessor &PP,
+    llvm::function_ref<void(RewriteBuffer &, unsigned, unsigned, const char *,
+                            const char *, const char *)>
+        HighlightRangeCallback) {
 
+  RewriteBuffer &RB = R.getEditBuffer(FID);
   const SourceManager &SM = PP.getSourceManager();
   llvm::MemoryBufferRef FromFile = SM.getBufferOrFake(FID);
+  const char *BufferStart = FromFile.getBuffer().data();
+
   Lexer L(FID, FromFile, SM, PP.getLangOpts());
-  const char *BufferStart = L.getBuffer().data();
 
   // Inform the preprocessor that we want to retain comments as tokens, so we
   // can highlight them.
@@ -475,13 +509,13 @@ void html::SyntaxHighlight(Rewriter &R, FileID FID, const Preprocessor &PP) {
 
       // If this is a pp-identifier, for a keyword, highlight it as such.
       if (Tok.isNot(tok::identifier))
-        HighlightRange(RB, TokOffs, TokOffs+TokLen, BufferStart,
-                       "<span class='keyword'>", "</span>");
+        HighlightRangeCallback(RB, TokOffs, TokOffs + TokLen, BufferStart,
+                               "<span class='keyword'>", "</span>");
       break;
     }
     case tok::comment:
-      HighlightRange(RB, TokOffs, TokOffs+TokLen, BufferStart,
-                     "<span class='comment'>", "</span>");
+      HighlightRangeCallback(RB, TokOffs, TokOffs + TokLen, BufferStart,
+                             "<span class='comment'>", "</span>");
       break;
     case tok::utf8_string_literal:
       // Chop off the u part of u8 prefix
@@ -498,8 +532,8 @@ void html::SyntaxHighlight(Rewriter &R, FileID FID, const Preprocessor &PP) {
       [[fallthrough]];
     case tok::string_literal:
       // FIXME: Exclude the optional ud-suffix from the highlighted range.
-      HighlightRange(RB, TokOffs, TokOffs+TokLen, BufferStart,
-                     "<span class='string_literal'>", "</span>");
+      HighlightRangeCallback(RB, TokOffs, TokOffs + TokLen, BufferStart,
+                             "<span class='string_literal'>", "</span>");
       break;
     case tok::hash: {
       // If this is a preprocessor directive, all tokens to end of line are too.
@@ -516,8 +550,8 @@ void html::SyntaxHighlight(Rewriter &R, FileID FID, const Preprocessor &PP) {
       }
 
       // Find end of line.  This is a hack.
-      HighlightRange(RB, TokOffs, TokEnd, BufferStart,
-                     "<span class='directive'>", "</span>");
+      HighlightRangeCallback(RB, TokOffs, TokEnd, BufferStart,
+                             "<span class='directive'>", "</span>");
 
       // Don't skip the next token.
       continue;
@@ -527,12 +561,43 @@ void html::SyntaxHighlight(Rewriter &R, FileID FID, const Preprocessor &PP) {
     L.LexFromRawLexer(Tok);
   }
 }
+void html::SyntaxHighlight(Rewriter &R, FileID FID, const Preprocessor &PP,
+                           RelexRewriteCacheRef Cache) {
+  RewriteBuffer &RB = R.getEditBuffer(FID);
+  const SourceManager &SM = PP.getSourceManager();
+  llvm::MemoryBufferRef FromFile = SM.getBufferOrFake(FID);
+  const char *BufferStart = FromFile.getBuffer().data();
+
+  if (Cache) {
+    auto CacheIt = Cache->SyntaxHighlights.find(FID);
+    if (CacheIt != Cache->SyntaxHighlights.end()) {
+      for (const RelexRewriteCache::RawHighlight &H : CacheIt->second) {
+        HighlightRange(RB, H.B, H.E, BufferStart, H.StartTag.data(),
+                       H.EndTag.data());
+      }
+      return;
+    }
+  }
+
+  // "Every time you would call HighlightRange, cache the inputs as well."
+  auto HighlightRangeCallback = [&](RewriteBuffer &RB, unsigned B, unsigned E,
+                                    const char *BufferStart,
+                                    const char *StartTag, const char *EndTag) {
+    HighlightRange(RB, B, E, BufferStart, StartTag, EndTag);
+
+    if (Cache)
+      Cache->SyntaxHighlights[FID].push_back({B, E, StartTag, EndTag});
+  };
+
+  SyntaxHighlightImpl(R, FID, PP, HighlightRangeCallback);
+}
+
+static void HighlightMacrosImpl(
+    Rewriter &R, FileID FID, const Preprocessor &PP,
+    llvm::function_ref<void(Rewriter &, SourceLocation, SourceLocation,
+                            const char *, const char *, bool)>
+        HighlightRangeCallback) {
 
-/// HighlightMacros - This uses the macro table state from the end of the
-/// file, to re-expand macros and insert (into the HTML) information about the
-/// macro expansions.  This won't be perfectly perfect, but it will be
-/// reasonably close.
-void html::HighlightMacros(Rewriter &R, FileID FID, const Preprocessor& PP) {
   // Re-lex the raw token stream into a token buffer.
   const SourceManager &SM = PP.getSourceManager();
   std::vector<Token> TokenStream;
@@ -659,11 +724,45 @@ void html::HighlightMacros(Rewriter &R, FileID FID, const Preprocessor& PP) {
     // get highlighted.
     Expansion = "<span class='macro_popup'>" + Expansion + "</span></span>";
 
-    HighlightRange(R, LLoc.getBegin(), LLoc.getEnd(), "<span class='macro'>",
-                   Expansion.c_str(), LLoc.isTokenRange());
+    HighlightRangeCallback(R, LLoc.getBegin(), LLoc.getEnd(),
+                           "<span class='macro'>", Expansion.c_str(),
+                           LLoc.isTokenRange());
   }
 
   // Restore the preprocessor's old state.
   TmpPP.setDiagnostics(*OldDiags);
   TmpPP.setPragmasEnabled(PragmasPreviouslyEnabled);
 }
+
+/// HighlightMacros - This uses the macro table state from the end of the
+/// file, to re-expand macros and insert (into the HTML) information about the
+/// macro expansions.  This won't be perfectly perfect, but it will be
+/// reasonably close.
+void html::HighlightMacros(Rewriter &R, FileID FID, const Preprocessor &PP,
+                           RelexRewriteCacheRef Cache) {
+  if (Cache) {
+    auto CacheIt = Cache->MacroHighlights.find(FID);
+    if (CacheIt != Cache->MacroHighlights.end()) {
+      for (const RelexRewriteCache::Highlight &H : CacheIt->second) {
+        HighlightRange(R, H.B, H.E, H.StartTag.data(), H.EndTag.data(),
+                       H.IsTokenRange);
+      }
+      return;
+    }
+  }
+
+  // "Every time you would call HighlightRange, cache the inputs as well."
+  auto HighlightRangeCallback = [&](Rewriter &R, SourceLocation B,
+                                    SourceLocation E, const char *StartTag,
+                                    const char *EndTag, bool isTokenRange) {
+    HighlightRange(R, B, E, StartTag,
+                   EndTag, isTokenRange);
+
+    if (Cache) {
+      Cache->MacroHighlights[FID].push_back(
+          {B, E, StartTag, EndTag, isTokenRange});
+    }
+  };
+
+  HighlightMacrosImpl(R, FID, PP, HighlightRangeCallback);
+}
diff --git a/clang/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp b/clang/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp
index 86947b7929e9b..fb5030d373c2f 100644
--- a/clang/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp
+++ b/clang/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp
@@ -69,6 +69,8 @@ class HTMLDiagnostics : public PathDiagnosticConsumer {
   const Preprocessor &PP;
   const bool SupportsCrossFileDiagnostics;
   llvm::StringSet<> EmittedHashes;
+  html::RelexRewriteCacheRef RewriterCache =
+      html::instantiateRelexRewriteCache();
 
 public:
   HTMLDiagnostics(PathDiagnosticConsumerOptions DiagOpts,
@@ -309,10 +311,6 @@ void HTMLDiagnostics::ReportDiag(const PathDiagnostic& D,
     return;
   }
 
-  // FIXME: This causes each file to be re-parsed and syntax-highlighted
-  // and macro-expanded separately for each report. We could cache such rewrites
-  // across all reports and only re-do the part that's actually different:
-  // the warning/note bubbles.
   std::string report = GenerateHTML(D, R, SMgr, path, declName.c_str());
   if (report.empty()) {
     llvm::errs() << "warning: no diagnostics generated for main file.\n";
@@ -882,8 +880,8 @@ void HTMLDiagnostics::RewriteFile(Rewriter &R, const PathPieces &path,
   // If we have a preprocessor, relex the file and syntax highlight.
   // We might not have a preprocessor if we come from a deserialized AST file,
   // for example.
-  html::SyntaxHighlight(R, FID, PP);
-  html::HighlightMacros(R, FID, PP);
+  html::SyntaxHighlight(R, FID, PP, RewriterCache);
+  html::HighlightMacros(R, FID, PP, RewriterCache);
 }
 
 void HTMLDiagnostics::HandlePiece(Rewriter &R, FileID BugFileID,
diff --git a/clang/test/Analysis/html_diagnostics/counter.c b/clang/test/Analysis/html_diagnostics/counter.c
new file mode 100644
index 0000000000000..22f722424372f
--- /dev/null
+++ b/clang/test/Analysis/html_diagnostics/counter.c
@@ -0,0 +1,29 @@
+// RUN: rm -fR %t
+// RUN: mkdir %t
+// RUN: %clang_analyze_cc1 -analyzer-checker=core \
+// RUN:                    -analyzer-output=html -o %t -verify %s
+// RUN: grep -v CHECK %t/report-*.html | FileCheck %s
+
+
+void foo() {
+  int *x = 0;
+  *x = __COUNTER__; // expected-warning{{Dereference of null pointer (loaded from variable 'x')}}
+}
+
+void bar() {
+  int *y;
+  *y = __COUNTER__; // expected-warning{{Dereference of undefined pointer value (loaded from variable 'y')}}
+}
+
+// The checks below confirm that both reports have the same values for __COUNTER__.
+//
+// FIXME: The correct values are (0, 1, 0, 1). Because we re-lex the file in order
+// to detect macro expansions for HTML report purposes, they turn into (2, 3, 2, 3)
+// by the time we emit HTML. But at least it's better than (2, 3, 4, 5)
+// which would have been the case if we re-lexed the file *each* time we
+// emitted an HTML report.
+
+// CHECK: <span class='macro'>__COUNTER__<span class='macro_popup'>2</span></span>
+// CHECK: <span class='macro'>__COUNTER__<span class='macro_popup'>3</span></span>
+// CHECK: <span class='macro'>__COUNTER__<span class='macro_popup'>2</span></span>
+// CHECK: <span class='macro'>__COUNTER__<span class='macro_popup'>3</span></span>

>From a16035f1f867ef2c9d600f6916085c40f9bbbd1b Mon Sep 17 00:00:00 2001
From: Artem Dergachev <adergachev at apple.com>
Date: Wed, 31 Jan 2024 16:34:33 -0800
Subject: [PATCH 2/2] Fix formatting, except the indent inside namespace html.

Which was already broken.
---
 clang/include/clang/Rewrite/Core/HTMLRewrite.h | 1 -
 clang/lib/Rewrite/HTMLRewrite.cpp              | 4 +---
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/clang/include/clang/Rewrite/Core/HTMLRewrite.h b/clang/include/clang/Rewrite/Core/HTMLRewrite.h
index e1252dc4e8272..eecf589632746 100644
--- a/clang/include/clang/Rewrite/Core/HTMLRewrite.h
+++ b/clang/include/clang/Rewrite/Core/HTMLRewrite.h
@@ -16,7 +16,6 @@
 
 #include "clang/Basic/SourceLocation.h"
 #include <string>
-#include <map>
 
 namespace clang {
 
diff --git a/clang/lib/Rewrite/HTMLRewrite.cpp b/clang/lib/Rewrite/HTMLRewrite.cpp
index 50cdacfe33eae..a96ca0764ae73 100644
--- a/clang/lib/Rewrite/HTMLRewrite.cpp
+++ b/clang/lib/Rewrite/HTMLRewrite.cpp
@@ -26,7 +26,6 @@ using namespace clang;
 using namespace llvm;
 using namespace html;
 
-
 /// HighlightRange - Highlight a range in the source code with the specified
 /// start/end tags.  B/E must be in the same file.  This ensures that
 /// start/end tags are placed at the start/end of each line if the range is
@@ -755,8 +754,7 @@ void html::HighlightMacros(Rewriter &R, FileID FID, const Preprocessor &PP,
   auto HighlightRangeCallback = [&](Rewriter &R, SourceLocation B,
                                     SourceLocation E, const char *StartTag,
                                     const char *EndTag, bool isTokenRange) {
-    HighlightRange(R, B, E, StartTag,
-                   EndTag, isTokenRange);
+    HighlightRange(R, B, E, StartTag, EndTag, isTokenRange);
 
     if (Cache) {
       Cache->MacroHighlights[FID].push_back(



More information about the cfe-commits mailing list