[clang] 7309359 - [analyzer] Fix scan-build report deduplication.

Artem Dergachev via cfe-commits cfe-commits at lists.llvm.org
Thu Aug 26 13:39:26 PDT 2021


Author: Artem Dergachev
Date: 2021-08-26T13:34:29-07:00
New Revision: 73093599287cc6d546ac46652ee781789d7de61f

URL: https://github.com/llvm/llvm-project/commit/73093599287cc6d546ac46652ee781789d7de61f
DIFF: https://github.com/llvm/llvm-project/commit/73093599287cc6d546ac46652ee781789d7de61f.diff

LOG: [analyzer] Fix scan-build report deduplication.

The previous behavior was to deduplicate reports based on md5 of the
html file. This algorithm might have worked originally but right now
HTML reports contain information rich enough to make them virtually
always distinct which breaks deduplication entirely.

The new strategy is to (finally) take advantage of IssueHash - the
stable report identifier provided by clang that is the same if and only if
the reports are duplicates of each other.

Additionally, scan-build no longer performs deduplication on its own.
Instead, the report file name is now based on the issue hash,
and clang instances will silently refuse to produce a new html file
when a duplicate already exists. This eliminates the problem entirely.

The '-analyzer-config stable-report-filename' option is deprecated
because report filenames are no longer unstable. A new option is
introduced, '-analyzer-config verbose-report-filename', to produce
verbose file names that look similar to the old "stable" file names.
The old option acts as an alias to the new option.

Differential Revision: https://reviews.llvm.org/D105167

Added: 
    clang/test/Analysis/scan-build/Inputs/deduplication/1.c
    clang/test/Analysis/scan-build/Inputs/deduplication/2.c
    clang/test/Analysis/scan-build/Inputs/deduplication/header.h
    clang/test/Analysis/scan-build/deduplication.test
    clang/test/Analysis/scan-build/rebuild_index/subdirectory/report-3.html

Modified: 
    clang/include/clang/Analysis/PathDiagnostic.h
    clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def
    clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h
    clang/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp
    clang/test/Analysis/analyzer-config.c
    clang/test/Analysis/scan-build/rebuild_index/rebuild_index.test
    clang/tools/scan-build/bin/scan-build

Removed: 
    clang/test/Analysis/scan-build/rebuild_index/report-3.html
    clang/test/Analysis/scan-build/rebuild_index/subdirectory/report-4.html


################################################################################
diff  --git a/clang/include/clang/Analysis/PathDiagnostic.h b/clang/include/clang/Analysis/PathDiagnostic.h
index 04bef1fa5334d..14860c1168e21 100644
--- a/clang/include/clang/Analysis/PathDiagnostic.h
+++ b/clang/include/clang/Analysis/PathDiagnostic.h
@@ -75,14 +75,8 @@ struct PathDiagnosticConsumerOptions {
   bool ShouldSerializeStats = false;
 
   /// If the consumer intends to produce multiple output files, should it
-  /// use randomly generated file names for these files (with the tiny risk of
-  /// having random collisions) or deterministic human-readable file names
-  /// (with a larger risk of deterministic collisions or invalid characters
-  /// in the file name). We should not really give this choice to the users
-  /// because deterministic mode is always superior when done right, but
-  /// for some consumers this mode is experimental and needs to be
-  /// off by default.
-  bool ShouldWriteStableReportFilename = false;
+  /// use a pseudo-random file name name or a human-readable file name.
+  bool ShouldWriteVerboseReportFilename = false;
 
   /// Whether the consumer should treat consumed diagnostics as hard errors.
   /// Useful for breaking your build when issues are found.

diff  --git a/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def b/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def
index f0359d2dbb3c2..e97e0a6892a93 100644
--- a/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def
+++ b/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def
@@ -190,7 +190,13 @@ ANALYZER_OPTION(bool, ShouldReportIssuesInMainSourceFile,
                 false)
 
 ANALYZER_OPTION(bool, ShouldWriteStableReportFilename, "stable-report-filename",
-                "Whether or not the report filename should be random or not.",
+                "Deprecated: report filenames are now always stable. "
+                "See also 'verbose-report-filename'.",
+                false)
+
+ANALYZER_OPTION(bool, ShouldWriteVerboseReportFilename, "verbose-report-filename",
+                "Whether or not the report filename should contain extra "
+                "information about the issue.",
                 false)
 
 ANALYZER_OPTION(

diff  --git a/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h b/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h
index ccf35e0a81eca..7514eee7244f8 100644
--- a/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h
+++ b/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h
@@ -395,7 +395,11 @@ class AnalyzerOptions : public RefCountedBase<AnalyzerOptions> {
     return {FullCompilerInvocation,
             ShouldDisplayMacroExpansions,
             ShouldSerializeStats,
-            ShouldWriteStableReportFilename,
+            // The stable report filename option is deprecated because
+            // file names are now always stable. Now the old option acts as
+            // an alias to the new verbose filename option because this
+            // closely mimics the behavior under the old option.
+            ShouldWriteStableReportFilename || ShouldWriteVerboseReportFilename,
             AnalyzerWerror,
             ShouldApplyFixIts,
             ShouldDisplayCheckerNameForText};

diff  --git a/clang/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp b/clang/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp
index c90046ffb4131..e7df9a70839de 100644
--- a/clang/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp
+++ b/clang/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp
@@ -245,6 +245,18 @@ void HTMLDiagnostics::FlushDiagnosticsImpl(
     ReportDiag(*Diag, filesMade);
 }
 
+static llvm::SmallString<32> getIssueHash(const PathDiagnostic &D,
+                                          const Preprocessor &PP) {
+  SourceManager &SMgr = PP.getSourceManager();
+  PathDiagnosticLocation UPDLoc = D.getUniqueingLoc();
+  FullSourceLoc L(SMgr.getExpansionLoc(UPDLoc.isValid()
+                                           ? UPDLoc.asLocation()
+                                           : D.getLocation().asLocation()),
+                  SMgr);
+  return getIssueHash(L, D.getCheckerName(), D.getBugType(),
+                      D.getDeclWithIssue(), PP.getLangOpts());
+}
+
 void HTMLDiagnostics::ReportDiag(const PathDiagnostic& D,
                                  FilesMade *filesMade) {
   // Create the HTML directory if it is missing.
@@ -271,11 +283,6 @@ void HTMLDiagnostics::ReportDiag(const PathDiagnostic& D,
   // Create a new rewriter to generate HTML.
   Rewriter R(const_cast<SourceManager&>(SMgr), PP.getLangOpts());
 
-  // The file for the first path element is considered the main report file, it
-  // will usually be equivalent to SMgr.getMainFileID(); however, it might be a
-  // header when -analyzer-opt-analyze-headers is used.
-  FileID ReportFile = path.front()->getLocation().asLocation().getExpansionLoc().getFileID();
-
   // Get the function/method name
   SmallString<128> declName("unknown");
   int offsetDecl = 0;
@@ -302,46 +309,52 @@ void HTMLDiagnostics::ReportDiag(const PathDiagnostic& D,
 
   // Create a path for the target HTML file.
   int FD;
-  SmallString<128> Model, ResultPath;
-
-  if (!DiagOpts.ShouldWriteStableReportFilename) {
-      llvm::sys::path::append(Model, Directory, "report-%%%%%%.html");
-      if (std::error_code EC =
-          llvm::sys::fs::make_absolute(Model)) {
-          llvm::errs() << "warning: could not make '" << Model
-                       << "' absolute: " << EC.message() << '\n';
-        return;
-      }
-      if (std::error_code EC = llvm::sys::fs::createUniqueFile(
-              Model, FD, ResultPath, llvm::sys::fs::OF_Text)) {
-        llvm::errs() << "warning: could not create file in '" << Directory
-                     << "': " << EC.message() << '\n';
-        return;
-      }
-  } else {
-      int i = 1;
-      std::error_code EC;
-      do {
-          // Find a filename which is not already used
-          const FileEntry* Entry = SMgr.getFileEntryForID(ReportFile);
-          std::stringstream filename;
-          Model = "";
-          filename << "report-"
-                   << llvm::sys::path::filename(Entry->getName()).str()
-                   << "-" << declName.c_str()
-                   << "-" << offsetDecl
-                   << "-" << i << ".html";
-          llvm::sys::path::append(Model, Directory,
-                                  filename.str());
-          EC = llvm::sys::fs::openFileForReadWrite(
-              Model, FD, llvm::sys::fs::CD_CreateNew, llvm::sys::fs::OF_None);
-          if (EC && EC != llvm::errc::file_exists) {
-              llvm::errs() << "warning: could not create file '" << Model
-                           << "': " << EC.message() << '\n';
-              return;
-          }
-          i++;
-      } while (EC);
+
+  SmallString<128> FileNameStr;
+  llvm::raw_svector_ostream FileName(FileNameStr);
+  FileName << "report-";
+
+  // Historically, neither the stable report filename nor the unstable report
+  // filename were actually stable. That said, the stable report filename
+  // was more stable because it was mostly composed of information
+  // about the bug report instead of being completely random.
+  // Now both stable and unstable report filenames are in fact stable
+  // but the stable report filename is still more verbose.
+  if (DiagOpts.ShouldWriteVerboseReportFilename) {
+    // FIXME: This code relies on knowing what constitutes the issue hash.
+    // Otherwise deduplication won't work correctly.
+    FileID ReportFile =
+        path.back()->getLocation().asLocation().getExpansionLoc().getFileID();
+
+    const FileEntry *Entry = SMgr.getFileEntryForID(ReportFile);
+
+    FileName << llvm::sys::path::filename(Entry->getName()).str() << "-"
+             << declName.c_str() << "-" << offsetDecl << "-";
+  }
+
+  FileName << StringRef(getIssueHash(D, PP)).substr(0, 6).str() << ".html";
+
+  SmallString<128> ResultPath;
+  llvm::sys::path::append(ResultPath, Directory, FileName.str());
+  if (std::error_code EC = llvm::sys::fs::make_absolute(ResultPath)) {
+    llvm::errs() << "warning: could not make '" << ResultPath
+                 << "' absolute: " << EC.message() << '\n';
+    return;
+  }
+
+  if (std::error_code EC = llvm::sys::fs::openFileForReadWrite(
+          ResultPath, FD, llvm::sys::fs::CD_CreateNew,
+          llvm::sys::fs::OF_None)) {
+    // Existence of the file corresponds to the situation where a 
diff erent
+    // Clang instance has emitted a bug report with the same issue hash.
+    // This is an entirely normal situation that does not deserve a warning,
+    // as apart from hash collisions this can happen because the reports
+    // are in fact similar enough to be considered duplicates of each other.
+    if (EC != llvm::errc::file_exists) {
+      llvm::errs() << "warning: could not create file in '" << Directory
+                   << "': " << EC.message() << '\n';
+    }
+    return;
   }
 
   llvm::raw_fd_ostream os(FD, true);
@@ -638,7 +651,6 @@ void HTMLDiagnostics::FinalizeHTML(const PathDiagnostic& D, Rewriter &R,
                                              ? UPDLoc.asLocation()
                                              : D.getLocation().asLocation()),
                     SMgr);
-    const Decl *DeclWithIssue = D.getDeclWithIssue();
 
     StringRef BugCategory = D.getCategory();
     if (!BugCategory.empty())
@@ -650,9 +662,7 @@ void HTMLDiagnostics::FinalizeHTML(const PathDiagnostic& D, Rewriter &R,
 
     os  << "\n<!-- FUNCTIONNAME " <<  declName << " -->\n";
 
-    os << "\n<!-- ISSUEHASHCONTENTOFLINEINCONTEXT "
-       << getIssueHash(L, D.getCheckerName(), D.getBugType(), DeclWithIssue,
-                       PP.getLangOpts())
+    os << "\n<!-- ISSUEHASHCONTENTOFLINEINCONTEXT " << getIssueHash(D, PP)
        << " -->\n";
 
     os << "\n<!-- BUGLINE "

diff  --git a/clang/test/Analysis/analyzer-config.c b/clang/test/Analysis/analyzer-config.c
index 2a41e692dd59b..c9f9b438bfbb7 100644
--- a/clang/test/Analysis/analyzer-config.c
+++ b/clang/test/Analysis/analyzer-config.c
@@ -119,4 +119,5 @@
 // CHECK-NEXT: unix.DynamicMemoryModeling:AddNoOwnershipChangeNotes = false
 // CHECK-NEXT: unix.DynamicMemoryModeling:Optimistic = false
 // CHECK-NEXT: unroll-loops = false
+// CHECK-NEXT: verbose-report-filename = false
 // CHECK-NEXT: widen-loops = false

diff  --git a/clang/test/Analysis/scan-build/Inputs/deduplication/1.c b/clang/test/Analysis/scan-build/Inputs/deduplication/1.c
new file mode 100644
index 0000000000000..ec2f9ec95e100
--- /dev/null
+++ b/clang/test/Analysis/scan-build/Inputs/deduplication/1.c
@@ -0,0 +1,5 @@
+#include "header.h"
+
+void bar() {
+  foo();
+}

diff  --git a/clang/test/Analysis/scan-build/Inputs/deduplication/2.c b/clang/test/Analysis/scan-build/Inputs/deduplication/2.c
new file mode 100644
index 0000000000000..ec2f9ec95e100
--- /dev/null
+++ b/clang/test/Analysis/scan-build/Inputs/deduplication/2.c
@@ -0,0 +1,5 @@
+#include "header.h"
+
+void bar() {
+  foo();
+}

diff  --git a/clang/test/Analysis/scan-build/Inputs/deduplication/header.h b/clang/test/Analysis/scan-build/Inputs/deduplication/header.h
new file mode 100644
index 0000000000000..e94e65d6b4e0b
--- /dev/null
+++ b/clang/test/Analysis/scan-build/Inputs/deduplication/header.h
@@ -0,0 +1,4 @@
+int foo() {
+  int x = 0;
+  return 1 / x;
+}

diff  --git a/clang/test/Analysis/scan-build/deduplication.test b/clang/test/Analysis/scan-build/deduplication.test
new file mode 100644
index 0000000000000..56d888e5fc12a
--- /dev/null
+++ b/clang/test/Analysis/scan-build/deduplication.test
@@ -0,0 +1,40 @@
+// FIXME: Actually, "perl".
+REQUIRES: shell
+
+RUN: rm -rf %t.output_dir && mkdir %t.output_dir
+RUN: %scan-build -o %t.output_dir \
+RUN:             %clang -S %S/Inputs/deduplication/1.c \
+RUN:                       %S/Inputs/deduplication/2.c \
+RUN:     | FileCheck %s -check-prefix CHECK-STDOUT
+
+RUN: ls %t.output_dir/*/ | FileCheck %s -check-prefix CHECK-FILENAMES
+
+RUN: rm -rf %t.output_dir && mkdir %t.output_dir
+RUN: %scan-build -o %t.output_dir \
+RUN:             -analyzer-config stable-report-filename=true \
+RUN:             %clang -S %S/Inputs/deduplication/1.c \
+RUN:                       %S/Inputs/deduplication/2.c \
+RUN:     | FileCheck %s -check-prefix CHECK-STDOUT
+
+RUN: ls %t.output_dir/*/ | FileCheck %s -check-prefix CHECK-FILENAMES
+
+RUN: rm -rf %t.output_dir && mkdir %t.output_dir
+RUN: %scan-build -o %t.output_dir \
+RUN:             -analyzer-config verbose-report-filename=true \
+RUN:             %clang -S %S/Inputs/deduplication/1.c \
+RUN:                       %S/Inputs/deduplication/2.c \
+RUN:     | FileCheck %s -check-prefix CHECK-STDOUT
+
+RUN: ls %t.output_dir/*/ | FileCheck %s -check-prefix CHECK-FILENAMES
+
+// Only one report file should be generated.
+
+CHECK-STDOUT: scan-build: Using '{{.*}}' for static analysis
+CHECK-STDOUT: scan-build: 1 bug found.
+CHECK-STDOUT: scan-build: Run 'scan-view {{.*}}' to examine bug reports.
+
+
+CHECK-FILENAMES: index.html
+CHECK-FILENAMES-NEXT: report-{{.*}}.html
+CHECK-FILENAMES-NEXT: scanview.css
+CHECK-FILENAMES-NEXT: sorttable.js

diff  --git a/clang/test/Analysis/scan-build/rebuild_index/rebuild_index.test b/clang/test/Analysis/scan-build/rebuild_index/rebuild_index.test
index df07a2618d498..ab70435c60542 100644
--- a/clang/test/Analysis/scan-build/rebuild_index/rebuild_index.test
+++ b/clang/test/Analysis/scan-build/rebuild_index/rebuild_index.test
@@ -4,9 +4,8 @@ REQUIRES: shell
 RUN: rm -rf %t.output_dir && mkdir %t.output_dir
 RUN: cp %S/report-1.html %t.output_dir
 RUN: cp %S/report-2.html %t.output_dir
-RUN: cp %S/report-3.html %t.output_dir
 RUN: mkdir %t.output_dir/subdirectory
-RUN: cp %S/subdirectory/report-4.html %t.output_dir/subdirectory
+RUN: cp %S/subdirectory/report-3.html %t.output_dir/subdirectory
 
 RUN: %scan-build --generate-index-only %t.output_dir
 
@@ -15,16 +14,13 @@ RUN: ls %t.output_dir | FileCheck -check-prefix CHECK-FILES %s
 CHECK-FILES:      index.html
 CHECK-FILES-NEXT: report-1.html
 CHECK-FILES-NEXT: report-2.html
-
-// report-3.html is a duplicate of report-1.html so it's not present.
-CHECK-FILES-NOT:  report-3.html
 CHECK-FILES-NEXT: scanview.css
 CHECK-FILES-NEXT: sorttable.js
 CHECK-FILES-NEXT: subdirectory
 
 RUN: ls %t.output_dir/subdirectory | FileCheck -check-prefix CHECK-SUB %s
 
-CHECK-SUB: report-4.html
+CHECK-SUB: report-3.html
 
 RUN: cat %t.output_dir/index.html | FileCheck -check-prefix CHECK-INDEX %s
 
@@ -32,10 +28,9 @@ CHECK-INDEX:      cat1
 CHECK-INDEX-NEXT: bug1
 CHECK-INDEX-NEXT: cat2
 CHECK-INDEX-NEXT: bug2
-CHECK-INDEX-NEXT: cat4
-CHECK-INDEX-NEXT: bug4
+CHECK-INDEX-NEXT: cat3
+CHECK-INDEX-NEXT: bug3
 
 CHECK-INDEX:     report-1.html#EndPath
 CHECK-INDEX:     report-2.html#EndPath
-CHECK-INDEX-NOT: report-3.html#EndPath
-CHECK-INDEX:     subdirectory/report-4.html#EndPath
+CHECK-INDEX:     subdirectory/report-3.html#EndPath

diff  --git a/clang/test/Analysis/scan-build/rebuild_index/report-3.html b/clang/test/Analysis/scan-build/rebuild_index/report-3.html
deleted file mode 100644
index ba446ae51ff9b..0000000000000
--- a/clang/test/Analysis/scan-build/rebuild_index/report-3.html
+++ /dev/null
@@ -1,8 +0,0 @@
-<!-- BUGTYPE bug1 -->
-<!-- BUGFILE file1 -->
-<!-- BUGPATHLENGTH 1 -->
-<!-- BUGLINE 1 -->
-<!-- BUGCATEGORY cat1 -->
-<!-- BUGDESC desc1 -->
-<!-- FUNCTIONNAME func1 -->
-<!-- BUGMETAEND -->

diff  --git a/clang/test/Analysis/scan-build/rebuild_index/subdirectory/report-3.html b/clang/test/Analysis/scan-build/rebuild_index/subdirectory/report-3.html
new file mode 100644
index 0000000000000..63ad78536959d
--- /dev/null
+++ b/clang/test/Analysis/scan-build/rebuild_index/subdirectory/report-3.html
@@ -0,0 +1,8 @@
+<!-- BUGTYPE bug3 -->
+<!-- BUGFILE file3 -->
+<!-- BUGPATHLENGTH 3 -->
+<!-- BUGLINE 3 -->
+<!-- BUGCATEGORY cat3 -->
+<!-- BUGDESC desc3 -->
+<!-- FUNCTIONNAME func3 -->
+<!-- BUGMETAEND -->

diff  --git a/clang/test/Analysis/scan-build/rebuild_index/subdirectory/report-4.html b/clang/test/Analysis/scan-build/rebuild_index/subdirectory/report-4.html
deleted file mode 100644
index 63c7e28c2d16c..0000000000000
--- a/clang/test/Analysis/scan-build/rebuild_index/subdirectory/report-4.html
+++ /dev/null
@@ -1,8 +0,0 @@
-<!-- BUGTYPE bug4 -->
-<!-- BUGFILE file4 -->
-<!-- BUGPATHLENGTH 4 -->
-<!-- BUGLINE 4 -->
-<!-- BUGCATEGORY cat4 -->
-<!-- BUGDESC desc4 -->
-<!-- FUNCTIONNAME func4 -->
-<!-- BUGMETAEND -->

diff  --git a/clang/tools/scan-build/bin/scan-build b/clang/tools/scan-build/bin/scan-build
index 645f5507b6fa0..71e7ced087199 100755
--- a/clang/tools/scan-build/bin/scan-build
+++ b/clang/tools/scan-build/bin/scan-build
@@ -14,7 +14,6 @@
 use strict;
 use warnings;
 use FindBin qw($RealBin);
-use Digest::MD5;
 use File::Basename;
 use File::Find;
 use File::Copy qw(copy);
@@ -268,27 +267,6 @@ sub SetHtmlEnv {
   $ENV{'CCC_ANALYZER_HTML'} = $Dir;
 }
 
-##----------------------------------------------------------------------------##
-# ComputeDigest - Compute a digest of the specified file.
-##----------------------------------------------------------------------------##
-
-sub ComputeDigest {
-  my $FName = shift;
-  DieDiag("Cannot read $FName to compute Digest.\n") if (! -r $FName);
-
-  # Use Digest::MD5.  We don't have to be cryptographically secure.  We're
-  # just looking for duplicate files that come from a non-malicious source.
-  # We use Digest::MD5 because it is a standard Perl module that should
-  # come bundled on most systems.
-  open(FILE, $FName) or DieDiag("Cannot open $FName when computing Digest.\n");
-  binmode FILE;
-  my $Result = Digest::MD5->new->addfile(*FILE)->hexdigest;
-  close(FILE);
-
-  # Return the digest.
-  return $Result;
-}
-
 ##----------------------------------------------------------------------------##
 #  UpdatePrefix - Compute the common prefix of files.
 ##----------------------------------------------------------------------------##
@@ -374,8 +352,6 @@ sub AddStatLine {
 # Sometimes a source file is scanned more than once, and thus produces
 # multiple error reports.  We use a cache to solve this problem.
 
-my %AlreadyScanned;
-
 sub ScanFile {
 
   my $Index = shift;
@@ -383,19 +359,6 @@ sub ScanFile {
   my $FName = shift;
   my $Stats = shift;
 
-  # Compute a digest for the report file.  Determine if we have already
-  # scanned a file that looks just like it.
-
-  my $digest = ComputeDigest("$Dir/$FName");
-
-  if (defined $AlreadyScanned{$digest}) {
-    # Redundant file.  Remove it.
-    unlink("$Dir/$FName");
-    return;
-  }
-
-  $AlreadyScanned{$digest} = 1;
-
   # At this point the report file is not world readable.  Make it happen.
   chmod(0644, "$Dir/$FName");
 


        


More information about the cfe-commits mailing list