[clang] [clang][analyzer] Make per-entry-point metric rows uniquely identifiable (PR #161663)

via cfe-commits cfe-commits at lists.llvm.org
Thu Oct 2 06:23:06 PDT 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-clang

Author: Arseniy Zaostrovnykh (necto)

<details>
<summary>Changes</summary>

Also remove the gratuitous spaces after "," that break strict CSV compliance.

As the debug function name does not uniquely identify an entry point, add the main-TU name and the USR values for each entry point snapshot to reduce the likelihood of collisions between declarations across large projects.

While adding a filename to each row increases the file size substantially, the difference in size for the compressed file is acceptable.

I evaluated it on our set of 200+ open source C and C++ projects with 3M entry points, and got the following results when adding these two columns:
- Raw CSV file increased from 530MB to 1.1GB
- Compressed file (XZ) increased from 54 MB to 78 MB

--

CPP-7098

Co-authored-by: Balazs Benics <balazs.benics@<!-- -->sonarsource.com>

---
Full diff: https://github.com/llvm/llvm-project/pull/161663.diff


4 Files Affected:

- (modified) clang/include/clang/StaticAnalyzer/Core/PathSensitive/EntryPointStats.h (+1-1) 
- (modified) clang/lib/StaticAnalyzer/Core/EntryPointStats.cpp (+24-7) 
- (modified) clang/lib/StaticAnalyzer/Frontend/AnalysisConsumer.cpp (+11-1) 
- (modified) clang/test/Analysis/analyzer-stats/entry-point-stats.cpp (+6-2) 


``````````diff
diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/EntryPointStats.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/EntryPointStats.h
index 633fb7aa8f72d..448e40269ca2d 100644
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/EntryPointStats.h
+++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/EntryPointStats.h
@@ -25,7 +25,7 @@ class EntryPointStat {
 public:
   llvm::StringLiteral name() const { return Name; }
 
-  static void lockRegistry();
+  static void lockRegistry(llvm::StringRef CPPFileName);
 
   static void takeSnapshot(const Decl *EntryPoint);
   static void dumpStatsAsCSV(llvm::raw_ostream &OS);
diff --git a/clang/lib/StaticAnalyzer/Core/EntryPointStats.cpp b/clang/lib/StaticAnalyzer/Core/EntryPointStats.cpp
index b7f9044f65308..62ae62f2f2154 100644
--- a/clang/lib/StaticAnalyzer/Core/EntryPointStats.cpp
+++ b/clang/lib/StaticAnalyzer/Core/EntryPointStats.cpp
@@ -9,7 +9,9 @@
 #include "clang/StaticAnalyzer/Core/PathSensitive/EntryPointStats.h"
 #include "clang/AST/DeclBase.h"
 #include "clang/Analysis/AnalysisDeclContext.h"
+#include "clang/Index/USRGeneration.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/FileSystem.h"
@@ -38,6 +40,7 @@ struct Registry {
   };
 
   std::vector<Snapshot> Snapshots;
+  std::string EscapedCPPFileName;
 };
 } // namespace
 
@@ -69,7 +72,7 @@ static void checkStatName(const EntryPointStat *M) {
   }
 }
 
-void EntryPointStat::lockRegistry() {
+void EntryPointStat::lockRegistry(llvm::StringRef CPPFileName) {
   auto CmpByNames = [](const EntryPointStat *L, const EntryPointStat *R) {
     return L->name() < R->name();
   };
@@ -78,6 +81,8 @@ void EntryPointStat::lockRegistry() {
   enumerateStatVectors(
       [](const auto &Stats) { llvm::for_each(Stats, checkStatName); });
   StatsRegistry->IsLocked = true;
+  llvm::raw_string_ostream OS(StatsRegistry->EscapedCPPFileName);
+  llvm::printEscapedString(CPPFileName, OS);
 }
 
 [[maybe_unused]] static bool isRegistered(llvm::StringLiteral Name) {
@@ -144,15 +149,27 @@ static std::vector<llvm::StringLiteral> getStatNames() {
   return Ret;
 }
 
+static std::string getUSR(const Decl *D) {
+  llvm::SmallVector<char> Buf;
+  if (index::generateUSRForDecl(D, Buf)) {
+    assert(false && "This should never fail");
+    return AnalysisDeclContext::getFunctionName(D);
+  }
+  return llvm::toStringRef(Buf).str();
+}
+
 void Registry::Snapshot::dumpAsCSV(llvm::raw_ostream &OS) const {
   OS << '"';
+  llvm::printEscapedString(getUSR(EntryPoint), OS);
+  OS << "\",\"";
+  OS << StatsRegistry->EscapedCPPFileName << "\",\"";
   llvm::printEscapedString(
       clang::AnalysisDeclContext::getFunctionName(EntryPoint), OS);
-  OS << "\", ";
+  OS << "\",";
   auto PrintAsBool = [&OS](bool B) { OS << (B ? "true" : "false"); };
-  llvm::interleaveComma(BoolStatValues, OS, PrintAsBool);
-  OS << ((BoolStatValues.empty() || UnsignedStatValues.empty()) ? "" : ", ");
-  llvm::interleaveComma(UnsignedStatValues, OS);
+  llvm::interleave(BoolStatValues, OS, PrintAsBool, ",");
+  OS << ((BoolStatValues.empty() || UnsignedStatValues.empty()) ? "" : ",");
+  llvm::interleave(UnsignedStatValues, OS, [&OS](unsigned U) { OS << U; }, ",");
 }
 
 static std::vector<bool> consumeBoolStats() {
@@ -181,8 +198,8 @@ void EntryPointStat::dumpStatsAsCSV(llvm::StringRef FileName) {
 }
 
 void EntryPointStat::dumpStatsAsCSV(llvm::raw_ostream &OS) {
-  OS << "EntryPoint, ";
-  llvm::interleaveComma(getStatNames(), OS);
+  OS << "USR,File,DebugName,";
+  llvm::interleave(getStatNames(), OS, [&OS](const auto &a) { OS << a; }, ",");
   OS << "\n";
 
   std::vector<std::string> Rows;
diff --git a/clang/lib/StaticAnalyzer/Frontend/AnalysisConsumer.cpp b/clang/lib/StaticAnalyzer/Frontend/AnalysisConsumer.cpp
index 53466e7a75b0f..9e7538eb34600 100644
--- a/clang/lib/StaticAnalyzer/Frontend/AnalysisConsumer.cpp
+++ b/clang/lib/StaticAnalyzer/Frontend/AnalysisConsumer.cpp
@@ -65,6 +65,15 @@ STAT_MAX(MaxCFGSize, "The maximum number of basic blocks in a function.");
 
 namespace {
 
+StringRef getMainFileName(const CompilerInvocation &Invocation) {
+  if (!Invocation.getFrontendOpts().Inputs.empty()) {
+    const FrontendInputFile &Input = Invocation.getFrontendOpts().Inputs[0];
+    return Input.isFile() ? Input.getFile()
+                          : Input.getBuffer().getBufferIdentifier();
+  }
+  return {};
+}
+
 class AnalysisConsumer : public AnalysisASTConsumer,
                          public DynamicRecursiveASTVisitor {
   enum {
@@ -125,7 +134,8 @@ class AnalysisConsumer : public AnalysisASTConsumer,
         PP(CI.getPreprocessor()), OutDir(outdir), Opts(opts), Plugins(plugins),
         Injector(std::move(injector)), CTU(CI),
         MacroExpansions(CI.getLangOpts()) {
-    EntryPointStat::lockRegistry();
+
+    EntryPointStat::lockRegistry(getMainFileName(CI.getInvocation()));
     DigestAnalyzerOptions();
 
     if (Opts.AnalyzerDisplayProgress || Opts.PrintStats ||
diff --git a/clang/test/Analysis/analyzer-stats/entry-point-stats.cpp b/clang/test/Analysis/analyzer-stats/entry-point-stats.cpp
index 1ff31d114ee99..9cbe04550a8d3 100644
--- a/clang/test/Analysis/analyzer-stats/entry-point-stats.cpp
+++ b/clang/test/Analysis/analyzer-stats/entry-point-stats.cpp
@@ -5,7 +5,9 @@
 // RUN: %csv2json "%t.csv" | FileCheck --check-prefix=CHECK %s
 //
 // CHECK:      {
-// CHECK-NEXT:   "fib(unsigned int)": {
+// CHECK-NEXT:   "c:@F@fib#i#": {
+// CHECK-NEXT:     "File": "{{.*}}entry-point-stats.cpp",
+// CHECK-NEXT:     "DebugName": "fib(unsigned int)",
 // CHECK-NEXT:     "NumBlocks": "{{[0-9]+}}",
 // CHECK-NEXT:     "NumBlocksUnreachable": "{{[0-9]+}}",
 // CHECK-NEXT:     "NumCTUSteps": "{{[0-9]+}}",
@@ -40,7 +42,9 @@
 // CHECK-NEXT:     "MaxValidBugClassSize": "{{[0-9]+}}",
 // CHECK-NEXT:     "PathRunningTime": "{{[0-9]+}}"
 // CHECK-NEXT:   },
-// CHECK-NEXT:   "main(int, char **)": {
+// CHECK-NEXT:   "c:@F@main#I#**C#": {
+// CHECK-NEXT:     "File": "{{.*}}entry-point-stats.cpp",
+// CHECK-NEXT:     "DebugName": "main(int, char **)",
 // CHECK-NEXT:     "NumBlocks": "{{[0-9]+}}",
 // CHECK-NEXT:     "NumBlocksUnreachable": "{{[0-9]+}}",
 // CHECK-NEXT:     "NumCTUSteps": "{{[0-9]+}}",

``````````

</details>


https://github.com/llvm/llvm-project/pull/161663


More information about the cfe-commits mailing list