[clang] cf86ef9 - [clang][analyzer] Make per-entry-point metric rows uniquely identifiable

via cfe-commits cfe-commits at lists.llvm.org
Fri Oct 3 07:29:46 PDT 2025


Author: Arseniy Zaostrovnykh
Date: 2025-10-03T16:29:41+02:00
New Revision: cf86ef925d75ef08526fc399a2888673338298b3

URL: https://github.com/llvm/llvm-project/commit/cf86ef925d75ef08526fc399a2888673338298b3
DIFF: https://github.com/llvm/llvm-project/commit/cf86ef925d75ef08526fc399a2888673338298b3.diff

LOG: [clang][analyzer] Make per-entry-point metric rows uniquely identifiable

Also remove the gratuitous spaces after "," that break strict CSV
compliance.

As the debug function name does not uniquely identify an entry point,
add the main-TU name and the USR values for each entry point snapshot to
reduce the likelihood of collisions between declarations across large
projects.

While adding a filename to each row increases the file size
substantially, the difference in size for the compressed file is acceptable.

I evaluated it on our set of 200+ open source C and C++ projects with 3M
entry points, and got the following results when adding these two
columns:
- Raw CSV file increased from 530MB to 1.1GB
- Compressed file (XZ) increased from 54 MB to 78 MB

--

CPP-7098

---------

Co-authored-by: Balazs Benics <balazs.benics at sonarsource.com>

Added: 
    

Modified: 
    clang/include/clang/StaticAnalyzer/Core/PathSensitive/EntryPointStats.h
    clang/lib/StaticAnalyzer/Core/EntryPointStats.cpp
    clang/lib/StaticAnalyzer/Frontend/AnalysisConsumer.cpp
    clang/test/Analysis/analyzer-stats/entry-point-stats.cpp

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/EntryPointStats.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/EntryPointStats.h
index 633fb7aa8f72d..448e40269ca2d 100644
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/EntryPointStats.h
+++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/EntryPointStats.h
@@ -25,7 +25,7 @@ class EntryPointStat {
 public:
   llvm::StringLiteral name() const { return Name; }
 
-  static void lockRegistry();
+  static void lockRegistry(llvm::StringRef CPPFileName);
 
   static void takeSnapshot(const Decl *EntryPoint);
   static void dumpStatsAsCSV(llvm::raw_ostream &OS);

diff  --git a/clang/lib/StaticAnalyzer/Core/EntryPointStats.cpp b/clang/lib/StaticAnalyzer/Core/EntryPointStats.cpp
index b7f9044f65308..62ae62f2f2154 100644
--- a/clang/lib/StaticAnalyzer/Core/EntryPointStats.cpp
+++ b/clang/lib/StaticAnalyzer/Core/EntryPointStats.cpp
@@ -9,7 +9,9 @@
 #include "clang/StaticAnalyzer/Core/PathSensitive/EntryPointStats.h"
 #include "clang/AST/DeclBase.h"
 #include "clang/Analysis/AnalysisDeclContext.h"
+#include "clang/Index/USRGeneration.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/FileSystem.h"
@@ -38,6 +40,7 @@ struct Registry {
   };
 
   std::vector<Snapshot> Snapshots;
+  std::string EscapedCPPFileName;
 };
 } // namespace
 
@@ -69,7 +72,7 @@ static void checkStatName(const EntryPointStat *M) {
   }
 }
 
-void EntryPointStat::lockRegistry() {
+void EntryPointStat::lockRegistry(llvm::StringRef CPPFileName) {
   auto CmpByNames = [](const EntryPointStat *L, const EntryPointStat *R) {
     return L->name() < R->name();
   };
@@ -78,6 +81,8 @@ void EntryPointStat::lockRegistry() {
   enumerateStatVectors(
       [](const auto &Stats) { llvm::for_each(Stats, checkStatName); });
   StatsRegistry->IsLocked = true;
+  llvm::raw_string_ostream OS(StatsRegistry->EscapedCPPFileName);
+  llvm::printEscapedString(CPPFileName, OS);
 }
 
 [[maybe_unused]] static bool isRegistered(llvm::StringLiteral Name) {
@@ -144,15 +149,27 @@ static std::vector<llvm::StringLiteral> getStatNames() {
   return Ret;
 }
 
+static std::string getUSR(const Decl *D) {
+  llvm::SmallVector<char> Buf;
+  if (index::generateUSRForDecl(D, Buf)) {
+    assert(false && "This should never fail");
+    return AnalysisDeclContext::getFunctionName(D);
+  }
+  return llvm::toStringRef(Buf).str();
+}
+
 void Registry::Snapshot::dumpAsCSV(llvm::raw_ostream &OS) const {
   OS << '"';
+  llvm::printEscapedString(getUSR(EntryPoint), OS);
+  OS << "\",\"";
+  OS << StatsRegistry->EscapedCPPFileName << "\",\"";
   llvm::printEscapedString(
       clang::AnalysisDeclContext::getFunctionName(EntryPoint), OS);
-  OS << "\", ";
+  OS << "\",";
   auto PrintAsBool = [&OS](bool B) { OS << (B ? "true" : "false"); };
-  llvm::interleaveComma(BoolStatValues, OS, PrintAsBool);
-  OS << ((BoolStatValues.empty() || UnsignedStatValues.empty()) ? "" : ", ");
-  llvm::interleaveComma(UnsignedStatValues, OS);
+  llvm::interleave(BoolStatValues, OS, PrintAsBool, ",");
+  OS << ((BoolStatValues.empty() || UnsignedStatValues.empty()) ? "" : ",");
+  llvm::interleave(UnsignedStatValues, OS, [&OS](unsigned U) { OS << U; }, ",");
 }
 
 static std::vector<bool> consumeBoolStats() {
@@ -181,8 +198,8 @@ void EntryPointStat::dumpStatsAsCSV(llvm::StringRef FileName) {
 }
 
 void EntryPointStat::dumpStatsAsCSV(llvm::raw_ostream &OS) {
-  OS << "EntryPoint, ";
-  llvm::interleaveComma(getStatNames(), OS);
+  OS << "USR,File,DebugName,";
+  llvm::interleave(getStatNames(), OS, [&OS](const auto &a) { OS << a; }, ",");
   OS << "\n";
 
   std::vector<std::string> Rows;

diff  --git a/clang/lib/StaticAnalyzer/Frontend/AnalysisConsumer.cpp b/clang/lib/StaticAnalyzer/Frontend/AnalysisConsumer.cpp
index e9ba374851726..e9b9b0bc187bc 100644
--- a/clang/lib/StaticAnalyzer/Frontend/AnalysisConsumer.cpp
+++ b/clang/lib/StaticAnalyzer/Frontend/AnalysisConsumer.cpp
@@ -65,6 +65,15 @@ STAT_MAX(MaxCFGSize, "The maximum number of basic blocks in a function.");
 
 namespace {
 
+StringRef getMainFileName(const CompilerInvocation &Invocation) {
+  if (!Invocation.getFrontendOpts().Inputs.empty()) {
+    const FrontendInputFile &Input = Invocation.getFrontendOpts().Inputs[0];
+    return Input.isFile() ? Input.getFile()
+                          : Input.getBuffer().getBufferIdentifier();
+  }
+  return "<no input>";
+}
+
 class AnalysisConsumer : public AnalysisASTConsumer,
                          public DynamicRecursiveASTVisitor {
   enum {
@@ -125,7 +134,8 @@ class AnalysisConsumer : public AnalysisASTConsumer,
         PP(CI.getPreprocessor()), OutDir(outdir), Opts(opts), Plugins(plugins),
         Injector(std::move(injector)), CTU(CI),
         MacroExpansions(CI.getLangOpts()) {
-    EntryPointStat::lockRegistry();
+
+    EntryPointStat::lockRegistry(getMainFileName(CI.getInvocation()));
     DigestAnalyzerOptions();
 
     if (Opts.AnalyzerDisplayProgress || Opts.PrintStats ||

diff  --git a/clang/test/Analysis/analyzer-stats/entry-point-stats.cpp b/clang/test/Analysis/analyzer-stats/entry-point-stats.cpp
index 1ff31d114ee99..9cbe04550a8d3 100644
--- a/clang/test/Analysis/analyzer-stats/entry-point-stats.cpp
+++ b/clang/test/Analysis/analyzer-stats/entry-point-stats.cpp
@@ -5,7 +5,9 @@
 // RUN: %csv2json "%t.csv" | FileCheck --check-prefix=CHECK %s
 //
 // CHECK:      {
-// CHECK-NEXT:   "fib(unsigned int)": {
+// CHECK-NEXT:   "c:@F@fib#i#": {
+// CHECK-NEXT:     "File": "{{.*}}entry-point-stats.cpp",
+// CHECK-NEXT:     "DebugName": "fib(unsigned int)",
 // CHECK-NEXT:     "NumBlocks": "{{[0-9]+}}",
 // CHECK-NEXT:     "NumBlocksUnreachable": "{{[0-9]+}}",
 // CHECK-NEXT:     "NumCTUSteps": "{{[0-9]+}}",
@@ -40,7 +42,9 @@
 // CHECK-NEXT:     "MaxValidBugClassSize": "{{[0-9]+}}",
 // CHECK-NEXT:     "PathRunningTime": "{{[0-9]+}}"
 // CHECK-NEXT:   },
-// CHECK-NEXT:   "main(int, char **)": {
+// CHECK-NEXT:   "c:@F@main#I#**C#": {
+// CHECK-NEXT:     "File": "{{.*}}entry-point-stats.cpp",
+// CHECK-NEXT:     "DebugName": "main(int, char **)",
 // CHECK-NEXT:     "NumBlocks": "{{[0-9]+}}",
 // CHECK-NEXT:     "NumBlocksUnreachable": "{{[0-9]+}}",
 // CHECK-NEXT:     "NumCTUSteps": "{{[0-9]+}}",


        


More information about the cfe-commits mailing list