[clang] Implement `-dump-minimization-hints` flag. (PR #133910)

Wed Apr 9 08:53:22 PDT 2025

================
@@ -49,6 +54,185 @@ LLVM_INSTANTIATE_REGISTRY(FrontendPluginRegistry)
 
 namespace {
 
+/// DeserializedDeclsLineRangePrinter dumps ranges of deserialized declarations
+/// to aid debugging and bug minimization. It implements ASTConsumer and
+/// ASTDeserializationListener, so that an object of
+/// DeserializedDeclsLineRangePrinter registers as its own listener. The
+/// ASTDeserializationListener interface provides the DeclRead callback that we
+/// use to collect the deserialized Decls. Note that printing or otherwise
+/// processing them at this point is dangerous, since that could trigger
+/// additional deserialization and crash compilation. Therefore, we process the
+/// collected Decls in the HandleTranslationUnit method of ASTConsumer. This is
+/// a safe point, since we know that by this point all the Decls needed by the
+/// compiler frontend have been deserialized. In case our processing causes
+/// further deserialization, DeclRead from the listener might be called again.
+/// However, at that point we don't accept any more Decls for processing.
+class DeserializedDeclsLineRangePrinter : public ASTConsumer,
+                                          ASTDeserializationListener {
+public:
+  /// Creates a printer that resolves source locations through \p SM and takes
+  /// ownership of the output stream \p OS.
+  explicit DeserializedDeclsLineRangePrinter(
+      SourceManager &SM, std::unique_ptr<llvm::raw_fd_ostream> OS)
+      : SM(SM), OS(std::move(OS)) {}
+
+  /// Returns this object, so that it is registered as its own deserialization
+  /// listener and receives the DeclRead callbacks.
+  ASTDeserializationListener *GetASTDeserializationListener() override {
+    return this;
+  }
+
+  /// Listener callback: records \p D for later processing, unless collection
+  /// has already stopped or \p D is filtered out below.
+  void DeclRead(GlobalDeclID ID, const Decl *D) override {
+    if (!IsCollectingDecls || !D)
+      return;
+
+    // These decls cover a lot of nested declarations that might not be used,
+    // reducing the granularity and making the output less useful.
+    if (isa<TranslationUnitDecl, LinkageSpecDecl, NamespaceDecl>(D))
+      return;
+
+    // We choose to work at namespace level to reduce complexity and the
+    // number of cases we care about.
+    const auto *Parent = D->getDeclContext();
+    if (!Parent || !Parent->isFileContext())
+      return;
+
+    PendingDecls.push_back(D);
+  }
+
+  /// A (line, column) pair computed from a spelling location. Positions are
+  /// ordered by line first and by column second.
+  struct Position {
+    unsigned Line;
+    unsigned Column;
+
+    /// Lexicographic comparison on (Line, Column).
+    bool operator<(const Position &other) const {
+      if (Line != other.Line)
+        return Line < other.Line;
+      return Column < other.Column;
+    }
+
+    /// Returns the spelling line and column of the beginning of \p R.
+    static Position GetBeginSpelling(const SourceManager &SM,
+                                     const CharSourceRange &R) {
+      SourceLocation Begin = R.getBegin();
+      return {SM.getSpellingLineNumber(Begin),
+              SM.getSpellingColumnNumber(Begin)};
+    }
+
+    /// Returns the spelling line and column of the (exclusive) end of
+    /// \p Range. If the next token after the range is a semicolon, the end is
+    /// moved past that semicolon.
+    static Position GetEndSpelling(const SourceManager &SM,
+                                   const CharSourceRange &Range,
+                                   const LangOptions &LangOpts) {
+      // For token ranges, compute end location for end character of the range.
+      // The end location of returned range is exclusive.
+      CharSourceRange R = Lexer::getAsCharRange(Range, SM, LangOpts);
+      SourceLocation End = R.getEnd();
+      // Relex the token past the end location of the last token in the source
+      // range. If it's a semicolon, advance the location by one token.
+      Token PossiblySemi;
+      // getRawToken returns true on failure, in which case PossiblySemi has
+      // not been filled in and must not be inspected.
+      const bool Failed = Lexer::getRawToken(End, PossiblySemi, SM, LangOpts,
+                                             /*IgnoreWhiteSpace=*/true);
+      // Whitespace is skipped while relexing, so the semicolon does not
+      // necessarily start at End; use the token's own end location instead of
+      // a fixed offset from End.
+      if (!Failed && PossiblySemi.is(tok::semi))
+        End = PossiblySemi.getEndLoc();
+      return {SM.getSpellingLineNumber(End), SM.getSpellingColumnNumber(End)};
+    }
+  };
+
+  /// The merged ranges of a single file, as handed to printJson.
+  struct RequiredRanges {
+    /// Name of the file, taken from its FileEntryRef.
+    StringRef Filename;
+    /// (begin, end) position pairs within that file.
+    std::vector<std::pair<Position, Position>> FromTo;
+  };
+  /// Stops collecting Decls, converts the ones gathered by DeclRead into
+  /// per-file position ranges, merges touching or overlapping ranges within
+  /// each file and passes the result to printJson.
+  void HandleTranslationUnit(ASTContext &Context) override {
+    assert(IsCollectingDecls && "HandleTranslationUnit called twice?");
+    IsCollectingDecls = false;
+
+    // Group the ranges of the collected Decls by the file they belong to.
+    struct FileData {
+      std::vector<std::pair<Position, Position>> FromTo;
+      OptionalFileEntryRef Ref;
+    };
+    llvm::DenseMap<const FileEntry *, FileData> FileToLines;
+    for (const Decl *D : PendingDecls) {
+      CharSourceRange R = SM.getExpansionRange(D->getSourceRange());
+      if (!R.isValid())
+        continue;
+
+      const auto BeginFID = SM.getFileID(R.getBegin());
+      auto *F = SM.getFileEntryForID(BeginFID);
+      if (F != SM.getFileEntryForID(SM.getFileID(R.getEnd()))) {
+        // Such cases are rare and difficult to handle.
+        continue;
+      }
+
+      auto &Data = FileToLines[F];
+      if (!Data.Ref)
+        Data.Ref = SM.getFileEntryRefForID(BeginFID);
+      Data.FromTo.push_back(
+          {Position::GetBeginSpelling(SM, R),
+           Position::GetEndSpelling(SM, R, D->getLangOpts())});
+    }
+
+    // To simplify output, merge consecutive and intersecting ranges.
+    std::vector<RequiredRanges> Result;
+    Result.reserve(FileToLines.size());
+    for (auto &[F, Data] : FileToLines) {
+      auto &FromTo = Data.FromTo;
+      assert(!FromTo.empty());
+
+      if (!Data.Ref)
+        continue;
+
+      llvm::sort(FromTo);
+
+      std::vector<std::pair<Position, Position>> MergedRanges;
+      MergedRanges.push_back(FromTo.front());
+      for (const auto &Range : llvm::drop_begin(FromTo)) {
+        auto &Last = MergedRanges.back();
+        if (Last.second < Range.first) {
+          // Disjoint from everything merged so far: start a new range.
+          MergedRanges.push_back(Range);
+          continue;
+        }
+        // Touching or overlapping: extend the last range if needed.
+        if (Last.second < Range.second)
+          Last.second = Range.second;
+      }
+      Result.push_back({Data.Ref->getName(), std::move(MergedRanges)});
+    }
+
+    // FileToLines is keyed by pointer, so its iteration order may differ from
+    // run to run. Order the files by name to keep the output reproducible.
+    llvm::sort(Result, [](const RequiredRanges &A, const RequiredRanges &B) {
+      return A.Filename < B.Filename;
+    });
+    printJson(Result);
+  }
+
+private:
+  /// Decls recorded by DeclRead, awaiting processing in HandleTranslationUnit.
+  std::vector<const Decl *> PendingDecls;
+  /// True until HandleTranslationUnit runs; DeclRead ignores Decls afterwards.
+  bool IsCollectingDecls = true;
+  /// Used to map source locations to files, lines and columns.
+  const SourceManager &SM;
+  /// Output stream that printJson writes to.
+  std::unique_ptr<llvm::raw_ostream> OS;
+
+  void printJson(llvm::ArrayRef<RequiredRanges> Result) {
+    *OS << "{\n";
----------------
VitaNuo wrote:

I've run some tests, and the maximum I'm getting is 4.4 MB of JSON for a very large compilation. I think this should be acceptable. For a smaller compilation, it's often under 1 MB.

https://github.com/llvm/llvm-project/pull/133910