r279056 - [analyzer] Teach CloneDetector to find clones that look like copy-paste errors.

Artem Dergachev via cfe-commits cfe-commits at lists.llvm.org
Thu Aug 18 05:29:42 PDT 2016


Author: dergachev
Date: Thu Aug 18 07:29:41 2016
New Revision: 279056

URL: http://llvm.org/viewvc/llvm-project?rev=279056&view=rev
Log:
[analyzer] Teach CloneDetector to find clones that look like copy-paste errors.

The original clone checker tries to find copy-pasted code that is exactly
identical to the original code, up to minor details.

As an example, if the copy-pasted code has all references to variable 'a'
replaced with references to variable 'b', it is still considered to be
an exact clone.

The new check finds copy-pasted code in which exactly one variable seems
out of place compared to the original code, which likely indicates
a copy-paste error (a variable was forgotten to be renamed in one place).

Patch by Raphael Isemann!

Differential Revision: https://reviews.llvm.org/D23314

Added:
    cfe/trunk/test/Analysis/copypaste/suspicious-clones.cpp
Modified:
    cfe/trunk/include/clang/Analysis/CloneDetection.h
    cfe/trunk/lib/Analysis/CloneDetection.cpp
    cfe/trunk/lib/StaticAnalyzer/Checkers/CloneChecker.cpp

Modified: cfe/trunk/include/clang/Analysis/CloneDetection.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Analysis/CloneDetection.h?rev=279056&r1=279055&r2=279056&view=diff
==============================================================================
--- cfe/trunk/include/clang/Analysis/CloneDetection.h (original)
+++ cfe/trunk/include/clang/Analysis/CloneDetection.h Thu Aug 18 07:29:41 2016
@@ -24,6 +24,7 @@ namespace clang {
 
 class Stmt;
 class Decl;
+class VarDecl;
 class ASTContext;
 class CompoundStmt;
 
@@ -222,7 +223,43 @@ public:
   ///               that were identified to be clones of each other.
   /// \param MinGroupComplexity Only return clones which have at least this
   ///                           complexity value.
-  void findClones(std::vector<CloneGroup> &Result, unsigned MinGroupComplexity);
+  /// \param CheckPatterns Returns only clone groups in which the referenced
+  ///                      variables follow the same pattern.
+  void findClones(std::vector<CloneGroup> &Result, unsigned MinGroupComplexity,
+                  bool CheckPatterns = true);
+
+  /// \brief Describes two clones that reference their variables in a different
+  ///        pattern which could indicate a programming error.
+  struct SuspiciousClonePair {
+    /// \brief Utility class holding the relevant information about a single
+    ///        clone in this pair.
+    struct SuspiciousCloneInfo {
+      /// The variable which referencing in this clone was against the pattern.
+      const VarDecl *Variable;
+      /// Where the variable was referenced.
+      SourceRange VarRange;
+      /// The variable that should have been referenced to follow the pattern.
+      /// If Suggestion is a nullptr then it's not possible to fix the pattern
+      /// by referencing a different variable in this clone.
+      const VarDecl *Suggestion;
+      SuspiciousCloneInfo(const VarDecl *Variable, SourceRange Range,
+                          const VarDecl *Suggestion)
+          : Variable(Variable), VarRange(Range), Suggestion(Suggestion) {}
+      SuspiciousCloneInfo() {}
+    };
+    /// The first clone in the pair which always has a suggested variable.
+    SuspiciousCloneInfo FirstCloneInfo;
+    /// This other clone in the pair which can have a suggested variable.
+    SuspiciousCloneInfo SecondCloneInfo;
+  };
+
+  /// \brief Searches the provided statements for pairs of clones that don't
+  ///        follow the same pattern when referencing variables.
+  /// \param Result Output parameter that will contain the clone pairs.
+  /// \param MinGroupComplexity Only clone pairs in which the clones have at
+  ///                           least this complexity value.
+  void findSuspiciousClones(std::vector<SuspiciousClonePair> &Result,
+                            unsigned MinGroupComplexity);
 
 private:
   /// Stores all found clone groups including invalid groups with only a single

Modified: cfe/trunk/lib/Analysis/CloneDetection.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Analysis/CloneDetection.cpp?rev=279056&r1=279055&r2=279056&view=diff
==============================================================================
--- cfe/trunk/lib/Analysis/CloneDetection.cpp (original)
+++ cfe/trunk/lib/Analysis/CloneDetection.cpp Thu Aug 18 07:29:41 2016
@@ -88,8 +88,11 @@ class VariablePattern {
   struct VariableOccurence {
     /// The index of the associated VarDecl in the Variables vector.
     size_t KindID;
+    /// The source range in the code where the variable was referenced.
+    SourceRange Range;
 
-    VariableOccurence(size_t KindID) : KindID(KindID) {}
+    VariableOccurence(size_t KindID, SourceRange Range)
+        : KindID(KindID), Range(Range) {}
   };
 
   /// All occurences of referenced variables in the order of appearance.
@@ -100,19 +103,20 @@ class VariablePattern {
 
   /// \brief Adds a new variable referenced to this pattern.
   /// \param VarDecl The declaration of the variable that is referenced.
-  void addVariableOccurence(const VarDecl *VarDecl) {
+  /// \param Range The SourceRange where this variable is referenced.
+  void addVariableOccurence(const VarDecl *VarDecl, SourceRange Range) {
     // First check if we already reference this variable
     for (size_t KindIndex = 0; KindIndex < Variables.size(); ++KindIndex) {
       if (Variables[KindIndex] == VarDecl) {
         // If yes, add a new occurence that points to the existing entry in
         // the Variables vector.
-        Occurences.emplace_back(KindIndex);
+        Occurences.emplace_back(KindIndex, Range);
         return;
       }
     }
     // If this variable wasn't already referenced, add it to the list of
     // referenced variables and add a occurence that points to this new entry.
-    Occurences.emplace_back(Variables.size());
+    Occurences.emplace_back(Variables.size(), Range);
     Variables.push_back(VarDecl);
   }
 
@@ -127,7 +131,7 @@ class VariablePattern {
     // Check if S is a reference to a variable. If yes, add it to the pattern.
     if (auto D = dyn_cast<DeclRefExpr>(S)) {
       if (auto VD = dyn_cast<VarDecl>(D->getDecl()->getCanonicalDecl()))
-        addVariableOccurence(VD);
+        addVariableOccurence(VD, D->getSourceRange());
     }
 
     // Recursively check all children of the given statement.
@@ -144,33 +148,93 @@ public:
       addVariables(S);
   }
 
-  /// \brief Compares this pattern with the given one.
+  /// \brief Counts the differences between this pattern and the given one.
   /// \param Other The given VariablePattern to compare with.
-  /// \return Returns true if and only if the references variables in this
-  ///         object follow the same pattern than the ones in the given
-  ///         VariablePattern.
+  /// \param FirstMismatch Output parameter that will be filled with information
+  ///        about the first difference between the two patterns. This parameter
+  ///        can be a nullptr, in which case it will be ignored.
+  /// \return Returns the number of differences between the pattern this object
+  ///         is following and the given VariablePattern.
   ///
-  /// For example, the following statements all have the same pattern:
+  /// For example, the following statements all have the same pattern and this
+  /// function would return zero:
   ///
   ///   if (a < b) return a; return b;
   ///   if (x < y) return x; return y;
   ///   if (u2 < u1) return u2; return u1;
   ///
-  /// but the following statement has a different pattern (note the changed
-  /// variables in the return statements).
+  /// But the following statement has a different pattern (note the changed
+  /// variables in the return statements) and would have two differences when
+  /// compared with one of the statements above.
   ///
   ///   if (a < b) return b; return a;
   ///
   /// This function should only be called if the related statements of the given
   /// pattern and the statements of this objects are clones of each other.
-  bool comparePattern(const VariablePattern &Other) {
+  unsigned countPatternDifferences(
+      const VariablePattern &Other,
+      CloneDetector::SuspiciousClonePair *FirstMismatch = nullptr) {
+    unsigned NumberOfDifferences = 0;
+
     assert(Other.Occurences.size() == Occurences.size());
     for (unsigned i = 0; i < Occurences.size(); ++i) {
-      if (Occurences[i].KindID != Other.Occurences[i].KindID) {
-        return false;
-      }
+      auto ThisOccurence = Occurences[i];
+      auto OtherOccurence = Other.Occurences[i];
+      if (ThisOccurence.KindID == OtherOccurence.KindID)
+        continue;
+
+      ++NumberOfDifferences;
+
+      // If FirstMismatch is not a nullptr, we need to store information about
+      // the first difference between the two patterns.
+      if (FirstMismatch == nullptr)
+        continue;
+
+      // Only proceed if we just found the first difference as we only store
+      // information about the first difference.
+      if (NumberOfDifferences != 1)
+        continue;
+
+      const VarDecl *FirstSuggestion = nullptr;
+      // If there is a variable available in the list of referenced variables
+      // which wouldn't break the pattern if it is used in place of the
+      // current variable, we provide this variable as the suggested fix.
+      if (OtherOccurence.KindID < Variables.size())
+        FirstSuggestion = Variables[OtherOccurence.KindID];
+
+      // Store information about the first clone.
+      FirstMismatch->FirstCloneInfo =
+          CloneDetector::SuspiciousClonePair::SuspiciousCloneInfo(
+              Variables[ThisOccurence.KindID], ThisOccurence.Range,
+              FirstSuggestion);
+
+      // Same as above but with the other clone. We do this for both clones as
+      // we don't know which clone is the one containing the unintended
+      // pattern error.
+      const VarDecl *SecondSuggestion = nullptr;
+      if (ThisOccurence.KindID < Other.Variables.size())
+        SecondSuggestion = Other.Variables[ThisOccurence.KindID];
+
+      // Store information about the second clone.
+      FirstMismatch->SecondCloneInfo =
+          CloneDetector::SuspiciousClonePair::SuspiciousCloneInfo(
+              Variables[ThisOccurence.KindID], OtherOccurence.Range,
+              SecondSuggestion);
+
+      // SuspiciousClonePair guarantees that the first clone always has a
+      // suggested variable associated with it. As we know that one of the two
+      // clones in the pair always has suggestion, we swap the two clones
+      // in case the first clone has no suggested variable which means that
+      // the second clone has a suggested variable and should be first.
+      if (!FirstMismatch->FirstCloneInfo.Suggestion)
+        std::swap(FirstMismatch->FirstCloneInfo,
+                  FirstMismatch->SecondCloneInfo);
+
+      // This ensures that we always have at least one suggestion in a pair.
+      assert(FirstMismatch->FirstCloneInfo.Suggestion);
     }
-    return true;
+
+    return NumberOfDifferences;
   }
 };
 }
@@ -529,7 +593,8 @@ static void createCloneGroups(std::vecto
     // and assign them to our new clone group.
     auto I = UnassignedSequences.begin(), E = UnassignedSequences.end();
     while (I != E) {
-      if (VariablePattern(*I).comparePattern(PrototypeFeatures)) {
+
+      if (VariablePattern(*I).countPatternDifferences(PrototypeFeatures) == 0) {
         FilteredGroup.Sequences.push_back(*I);
         I = UnassignedSequences.erase(I);
         continue;
@@ -546,11 +611,15 @@ static void createCloneGroups(std::vecto
 }
 
 void CloneDetector::findClones(std::vector<CloneGroup> &Result,
-                               unsigned MinGroupComplexity) {
+                               unsigned MinGroupComplexity,
+                               bool CheckPatterns) {
   // Add every valid clone group that fulfills the complexity requirement.
   for (const CloneGroup &Group : CloneGroups) {
     if (Group.isValid() && Group.Complexity >= MinGroupComplexity) {
-      createCloneGroups(Result, Group);
+      if (CheckPatterns)
+        createCloneGroups(Result, Group);
+      else
+        Result.push_back(Group);
     }
   }
 
@@ -580,3 +649,37 @@ void CloneDetector::findClones(std::vect
     Result.erase(Result.begin() + *I);
   }
 }
+
+void CloneDetector::findSuspiciousClones(
+    std::vector<CloneDetector::SuspiciousClonePair> &Result,
+    unsigned MinGroupComplexity) {
+  std::vector<CloneGroup> Clones;
+  // Reuse the normal search for clones but specify that the clone groups don't
+  // need to have a common referenced variable pattern so that we can manually
+  // search for the kind of pattern errors this function is supposed to find.
+  findClones(Clones, MinGroupComplexity, false);
+
+  for (const CloneGroup &Group : Clones) {
+    for (unsigned i = 0; i < Group.Sequences.size(); ++i) {
+      VariablePattern PatternA(Group.Sequences[i]);
+
+      for (unsigned j = i + 1; j < Group.Sequences.size(); ++j) {
+        VariablePattern PatternB(Group.Sequences[j]);
+
+        CloneDetector::SuspiciousClonePair ClonePair;
+        // For now, we only report clones which break the variable pattern just
+        // once because multiple differences in a pattern are an indicator that
+        // those differences are maybe intended (e.g. because it's actually
+        // a different algorithm).
+        // TODO: In very big clones even multiple variables can be unintended,
+        // so replacing this number with a percentage could better handle such
+        // cases. On the other hand it could increase the false-positive rate
+        // for all clones if the percentage is too high.
+        if (PatternA.countPatternDifferences(PatternB, &ClonePair) == 1) {
+          Result.push_back(ClonePair);
+          break;
+        }
+      }
+    }
+  }
+}

Modified: cfe/trunk/lib/StaticAnalyzer/Checkers/CloneChecker.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/StaticAnalyzer/Checkers/CloneChecker.cpp?rev=279056&r1=279055&r2=279056&view=diff
==============================================================================
--- cfe/trunk/lib/StaticAnalyzer/Checkers/CloneChecker.cpp (original)
+++ cfe/trunk/lib/StaticAnalyzer/Checkers/CloneChecker.cpp Thu Aug 18 07:29:41 2016
@@ -34,6 +34,15 @@ public:
 
   void checkEndOfTranslationUnit(const TranslationUnitDecl *TU,
                                  AnalysisManager &Mgr, BugReporter &BR) const;
+
+  /// \brief Reports all clones to the user.
+  void reportClones(SourceManager &SM, AnalysisManager &Mgr,
+                    int MinComplexity) const;
+
+  /// \brief Reports only suspicious clones to the user along with informaton
+  ///        that explain why they are suspicious.
+  void reportSuspiciousClones(SourceManager &SM, AnalysisManager &Mgr,
+                              int MinComplexity) const;
 };
 } // end anonymous namespace
 
@@ -52,10 +61,23 @@ void CloneChecker::checkEndOfTranslation
 
   int MinComplexity = Mgr.getAnalyzerOptions().getOptionAsInteger(
       "MinimumCloneComplexity", 10, this);
-
   assert(MinComplexity >= 0);
 
-  SourceManager &SM = BR.getSourceManager();
+  bool ReportSuspiciousClones = Mgr.getAnalyzerOptions().getBooleanOption(
+      "ReportSuspiciousClones", true, this);
+
+  bool ReportNormalClones = Mgr.getAnalyzerOptions().getBooleanOption(
+      "ReportNormalClones", true, this);
+
+  if (ReportSuspiciousClones)
+    reportSuspiciousClones(BR.getSourceManager(), Mgr, MinComplexity);
+
+  if (ReportNormalClones)
+    reportClones(BR.getSourceManager(), Mgr, MinComplexity);
+}
+
+void CloneChecker::reportClones(SourceManager &SM, AnalysisManager &Mgr,
+                                int MinComplexity) const {
 
   std::vector<CloneDetector::CloneGroup> CloneGroups;
   Detector.findClones(CloneGroups, MinComplexity);
@@ -87,6 +109,52 @@ void CloneChecker::checkEndOfTranslation
   }
 }
 
+void CloneChecker::reportSuspiciousClones(SourceManager &SM,
+                                          AnalysisManager &Mgr,
+                                          int MinComplexity) const {
+
+  std::vector<CloneDetector::SuspiciousClonePair> Clones;
+  Detector.findSuspiciousClones(Clones, MinComplexity);
+
+  DiagnosticsEngine &DiagEngine = Mgr.getDiagnostic();
+
+  auto SuspiciousCloneWarning = DiagEngine.getCustomDiagID(
+      DiagnosticsEngine::Warning, "suspicious code clone detected; did you "
+                                  "mean to use %0?");
+
+  auto RelatedCloneNote = DiagEngine.getCustomDiagID(
+      DiagnosticsEngine::Note, "suggestion is based on the usage of this "
+                               "variable in a similar piece of code");
+
+  auto RelatedSuspiciousCloneNote = DiagEngine.getCustomDiagID(
+      DiagnosticsEngine::Note, "suggestion is based on the usage of this "
+                               "variable in a similar piece of code; did you "
+                               "mean to use %0?");
+
+  for (CloneDetector::SuspiciousClonePair &Pair : Clones) {
+    // The first clone always has a suggestion and we report it to the user
+    // along with the place where the suggestion should be used.
+    DiagEngine.Report(Pair.FirstCloneInfo.VarRange.getBegin(),
+                      SuspiciousCloneWarning)
+        << Pair.FirstCloneInfo.VarRange << Pair.FirstCloneInfo.Suggestion;
+
+    // The second clone can have a suggestion and if there is one, we report
+    // that suggestion to the user.
+    if (Pair.SecondCloneInfo.Suggestion) {
+      DiagEngine.Report(Pair.SecondCloneInfo.VarRange.getBegin(),
+                        RelatedSuspiciousCloneNote)
+          << Pair.SecondCloneInfo.VarRange << Pair.SecondCloneInfo.Suggestion;
+      continue;
+    }
+
+    // If there isn't a suggestion in the second clone, we only inform the
+    // user where we got the idea that his code could contain an error.
+    DiagEngine.Report(Pair.SecondCloneInfo.VarRange.getBegin(),
+                      RelatedCloneNote)
+        << Pair.SecondCloneInfo.VarRange;
+  }
+}
+
 //===----------------------------------------------------------------------===//
 // Register CloneChecker
 //===----------------------------------------------------------------------===//

Added: cfe/trunk/test/Analysis/copypaste/suspicious-clones.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/Analysis/copypaste/suspicious-clones.cpp?rev=279056&view=auto
==============================================================================
--- cfe/trunk/test/Analysis/copypaste/suspicious-clones.cpp (added)
+++ cfe/trunk/test/Analysis/copypaste/suspicious-clones.cpp Thu Aug 18 07:29:41 2016
@@ -0,0 +1,97 @@
+// RUN: %clang_cc1 -analyze -analyzer-checker=alpha.clone.CloneChecker -analyzer-config alpha.clone.CloneChecker:ReportSuspiciousClones=true  -analyzer-config alpha.clone.CloneChecker:ReportNormalClones=false -verify %s
+
+// Tests finding a suspicious clone that references local variables.
+
+void log();
+
+int max(int a, int b) {
+  log();
+  if (a > b)
+    return a;
+  return b; // expected-note{{suggestion is based on the usage of this variable in a similar piece of code}}
+}
+
+int maxClone(int x, int y, int z) {
+  log();
+  if (x > y)
+    return x;
+  return z; // expected-warning{{suspicious code clone detected; did you mean to use 'y'?}}
+}
+
+// Tests finding a suspicious clone that references global variables.
+
+struct mutex {
+  bool try_lock();
+  void unlock();
+};
+
+mutex m1;
+mutex m2;
+int i;
+
+void busyIncrement() {
+  while (true) {
+    if (m1.try_lock()) {
+      ++i;
+      m1.unlock(); // expected-note{{suggestion is based on the usage of this variable in a similar piece of code}}
+      if (i > 1000) {
+        return;
+      }
+    }
+  }
+}
+
+void faultyBusyIncrement() {
+  while (true) {
+    if (m1.try_lock()) {
+      ++i;
+      m2.unlock();  // expected-warning{{suspicious code clone detected; did you mean to use 'm1'?}}
+      if (i > 1000) {
+        return;
+      }
+    }
+  }
+}
+
+// Tests that we provide two suggestions in cases where two fixes are possible.
+
+int foo(int a, int b, int c) {
+  a += b + c;
+  b /= a + b;
+  c -= b * a; // expected-warning{{suspicious code clone detected; did you mean to use 'a'?}}
+  return c;
+}
+
+int fooClone(int a, int b, int c) {
+  a += b + c;
+  b /= a + b;
+  c -= a * a; // expected-note{{suggestion is based on the usage of this variable in a similar piece of code; did you mean to use 'b'?}}
+  return c;
+}
+
+
+// Tests that for clone groups with a many possible suspicious clone pairs, at
+// most one warning per clone group is generated and every relevant clone is
+// reported through either a warning or a note.
+
+long bar1(long a, long b, long c, long d) {
+  c = a - b;
+  c = c / d * a;
+  d = b * b - c; // expected-warning{{suspicious code clone detected; did you mean to use 'c'?}}
+  return d;
+}
+
+long bar2(long a, long b, long c, long d) {
+  c = a - b;
+  c = c / d * a;
+  d = c * b - c; // expected-note{{suggestion is based on the usage of this variable in a similar piece of code; did you mean to use 'b'?}} \
+                 // expected-warning{{suspicious code clone detected; did you mean to use 'a'?}}
+  return d;
+}
+
+long bar3(long a, long b, long c, long d) {
+  c = a - b;
+  c = c / d * a;
+  d = a * b - c; // expected-note{{suggestion is based on the usage of this variable in a similar piece of code; did you mean to use 'c'?}}
+  return d;
+}




More information about the cfe-commits mailing list