r279378 - [analyzer] Use faster hashing (MD5) in CloneDetector.

Artem Dergachev via cfe-commits cfe-commits at lists.llvm.org
Sat Aug 20 10:35:54 PDT 2016


Author: dergachev
Date: Sat Aug 20 12:35:53 2016
New Revision: 279378

URL: http://llvm.org/viewvc/llvm-project?rev=279378&view=rev
Log:
[analyzer] Use faster hashing (MD5) in CloneDetector.

This replaces the old approach of fingerprinting every AST node into a string,
which avoided collisions and was simple to implement, but turned out to be
extremely ineffective with respect to both performance and memory.

The collisions are now dealt with in a separate pass, which no longer causes
performance problems because collisions are rare.

Patch by Raphael Isemann!

Differential Revision: https://reviews.llvm.org/D22515

Modified:
    cfe/trunk/include/clang/Analysis/CloneDetection.h
    cfe/trunk/lib/Analysis/CloneDetection.cpp
    cfe/trunk/lib/StaticAnalyzer/Checkers/CloneChecker.cpp

Modified: cfe/trunk/include/clang/Analysis/CloneDetection.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Analysis/CloneDetection.h?rev=279378&r1=279377&r2=279378&view=diff
==============================================================================
--- cfe/trunk/include/clang/Analysis/CloneDetection.h (original)
+++ cfe/trunk/include/clang/Analysis/CloneDetection.h Sat Aug 20 12:35:53 2016
@@ -16,6 +16,7 @@
 #define LLVM_CLANG_AST_CLONEDETECTION_H
 
 #include "clang/Basic/SourceLocation.h"
+#include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/StringMap.h"
 
 #include <vector>
@@ -163,11 +164,20 @@ public:
   /// Holds the data about a StmtSequence that is needed during the search for
   /// code clones.
   struct CloneSignature {
-    /// \brief Holds all relevant data of a StmtSequence.
+    /// \brief The hash code of the StmtSequence.
     ///
-    /// If this variable is equal for two different StmtSequences, then they can
-    /// be considered clones of each other.
-    std::vector<DataPiece> Data;
+    /// The initial clone groups that are formed during the search for clones
+    /// consist only of Sequences that share the same hash code. This makes this
+    /// value the central part of this heuristic that is needed to find clones
+    /// in a performant way. For this to work, the type of this variable
+    /// always needs to be small and fast to compare.
+    ///
+    /// Also, StmtSequences that are clones of each others have to share
+    /// the same hash code. StmtSequences that are not clones of each other
+    /// shouldn't share the same hash code, but if they do, it will only
+    /// degrade the performance of the hash search but doesn't influence
+    /// the correctness of the result.
+    size_t Hash;
 
     /// \brief The complexity of the StmtSequence.
     ///
@@ -186,14 +196,8 @@ public:
     /// \brief Creates an empty CloneSignature without any data.
     CloneSignature() : Complexity(1) {}
 
-    CloneSignature(const std::vector<unsigned> &Data, unsigned Complexity)
-        : Data(Data), Complexity(Complexity) {}
-
-    /// \brief Adds the data from the given CloneSignature to this one.
-    void add(const CloneSignature &Other) {
-      Data.insert(Data.end(), Other.Data.begin(), Other.Data.end());
-      Complexity += Other.Complexity;
-    }
+    CloneSignature(llvm::hash_code Hash, unsigned Complexity)
+        : Hash(Hash), Complexity(Complexity) {}
   };
 
   /// Holds group of StmtSequences that are clones of each other and the
@@ -201,10 +205,12 @@ public:
   /// StmtSequences have in common.
   struct CloneGroup {
     std::vector<StmtSequence> Sequences;
-    unsigned Complexity;
+    CloneSignature Signature;
+
+    CloneGroup() {}
 
-    CloneGroup(const StmtSequence &Seq, unsigned Complexity)
-        : Complexity(Complexity) {
+    CloneGroup(const StmtSequence &Seq, CloneSignature Signature)
+        : Signature(Signature) {
       Sequences.push_back(Seq);
     }
 
@@ -269,11 +275,8 @@ public:
                             unsigned MinGroupComplexity);
 
 private:
-  /// Stores all found clone groups including invalid groups with only a single
-  /// statement.
-  std::vector<CloneGroup> CloneGroups;
-  /// Maps search data to its related index in the \p CloneGroups vector.
-  llvm::StringMap<std::size_t> CloneGroupIndexes;
+  /// Stores all encountered StmtSequences alongside their CloneSignature.
+  std::vector<std::pair<CloneSignature, StmtSequence>> Sequences;
 };
 
 } // end namespace clang

Modified: cfe/trunk/lib/Analysis/CloneDetection.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Analysis/CloneDetection.cpp?rev=279378&r1=279377&r2=279378&view=diff
==============================================================================
--- cfe/trunk/lib/Analysis/CloneDetection.cpp (original)
+++ cfe/trunk/lib/Analysis/CloneDetection.cpp Sat Aug 20 12:35:53 2016
@@ -19,6 +19,7 @@
 #include "clang/AST/StmtVisitor.h"
 #include "clang/Lex/Lexer.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/Support/MD5.h"
 #include "llvm/Support/raw_ostream.h"
 
 using namespace clang;
@@ -279,47 +280,35 @@ namespace {
 /// This class defines what a code clone is: If it collects for two statements
 /// the same data, then those two statements are considered to be clones of each
 /// other.
-class StmtDataCollector : public ConstStmtVisitor<StmtDataCollector> {
+///
+/// All collected data is forwarded to the given data consumer of the type T.
+/// The data consumer class needs to provide a member method with the signature:
+///   update(StringRef Str)
+template <typename T>
+class StmtDataCollector : public ConstStmtVisitor<StmtDataCollector<T>> {
 
   ASTContext &Context;
-  std::vector<CloneDetector::DataPiece> &CollectedData;
+  /// \brief The data sink to which all data is forwarded.
+  T &DataConsumer;
 
 public:
   /// \brief Collects data of the given Stmt.
   /// \param S The given statement.
   /// \param Context The ASTContext of S.
-  /// \param D The given data vector to which all collected data is appended.
-  StmtDataCollector(const Stmt *S, ASTContext &Context,
-                    std::vector<CloneDetector::DataPiece> &D)
-      : Context(Context), CollectedData(D) {
-    Visit(S);
+  /// \param D The data sink to which all data is forwarded.
+  StmtDataCollector(const Stmt *S, ASTContext &Context, T &DataConsumer)
+      : Context(Context), DataConsumer(DataConsumer) {
+    this->Visit(S);
   }
 
   // Below are utility methods for appending different data to the vector.
 
   void addData(CloneDetector::DataPiece Integer) {
-    CollectedData.push_back(Integer);
+    DataConsumer.update(
+        StringRef(reinterpret_cast<char *>(&Integer), sizeof(Integer)));
   }
 
-  // FIXME: The functions below add long strings to the data vector which are
-  // probably not good for performance. Replace the strings with pointer values
-  // or a some other unique integer.
-
-  void addData(llvm::StringRef Str) {
-    if (Str.empty())
-      return;
-
-    const size_t OldSize = CollectedData.size();
-
-    const size_t PieceSize = sizeof(CloneDetector::DataPiece);
-    // Calculate how many vector units we need to accomodate all string bytes.
-    size_t RoundedUpPieceNumber = (Str.size() + PieceSize - 1) / PieceSize;
-    // Allocate space for the string in the data vector.
-    CollectedData.resize(CollectedData.size() + RoundedUpPieceNumber);
-
-    // Copy the string to the allocated space at the end of the vector.
-    std::memcpy(CollectedData.data() + OldSize, Str.data(), Str.size());
-  }
+  void addData(llvm::StringRef Str) { DataConsumer.update(Str); }
 
   void addData(const QualType &QT) { addData(QT.getAsString()); }
 
@@ -484,8 +473,10 @@ class CloneSignatureGenerator {
     // Create an empty signature that will be filled in this method.
     CloneDetector::CloneSignature Signature;
 
-    // Collect all relevant data from S and put it into the empty signature.
-    StmtDataCollector(S, Context, Signature.Data);
+    llvm::MD5 Hash;
+
+    // Collect all relevant data from S and hash it.
+    StmtDataCollector<llvm::MD5>(S, Context, Hash);
 
     // Look up what macros expanded into the current statement.
     std::string StartMacroStack = getMacroStack(S->getLocStart(), Context);
@@ -523,7 +514,9 @@ class CloneSignatureGenerator {
       auto ChildSignature = generateSignatures(Child, StartMacroStack);
 
       // Add the collected data to the signature of the current statement.
-      Signature.add(ChildSignature);
+      Signature.Complexity += ChildSignature.Complexity;
+      Hash.update(StringRef(reinterpret_cast<char *>(&ChildSignature.Hash),
+                            sizeof(ChildSignature.Hash)));
 
       // If the current statement is a CompoundStatement, we need to store the
       // signature for the generation of the sub-sequences.
@@ -536,6 +529,14 @@ class CloneSignatureGenerator {
     if (CS)
       handleSubSequences(CS, ChildSignatures);
 
+    // Create the final hash code for the current signature.
+    llvm::MD5::MD5Result HashResult;
+    Hash.final(HashResult);
+
+    // Copy as much of the generated hash code to the signature's hash code.
+    std::memcpy(&Signature.Hash, &HashResult,
+                std::min(sizeof(Signature.Hash), sizeof(HashResult)));
+
     // Save the signature for the current statement in the CloneDetector object.
     CD.add(StmtSequence(S, Context), Signature);
 
@@ -564,11 +565,24 @@ class CloneSignatureGenerator {
         // Create an empty signature and add the signatures of all selected
         // child statements to it.
         CloneDetector::CloneSignature SubSignature;
+        llvm::MD5 SubHash;
 
         for (unsigned i = Pos; i < Pos + Length; ++i) {
-          SubSignature.add(ChildSignatures[i]);
+          SubSignature.Complexity += ChildSignatures[i].Complexity;
+          size_t ChildHash = ChildSignatures[i].Hash;
+
+          SubHash.update(StringRef(reinterpret_cast<char *>(&ChildHash),
+                                sizeof(ChildHash)));
         }
 
+        // Create the final hash code for the current signature.
+        llvm::MD5::MD5Result HashResult;
+        SubHash.final(HashResult);
+
+        // Copy as much of the generated hash code to the signature's hash code.
+        std::memcpy(&SubSignature.Hash, &HashResult,
+                    std::min(sizeof(SubSignature.Hash), sizeof(HashResult)));
+
         // Save the signature together with the information about what children
         // sequence we selected.
         CD.add(StmtSequence(CS, Context, Pos, Pos + Length), SubSignature);
@@ -594,25 +608,7 @@ void CloneDetector::analyzeCodeBody(cons
 
 void CloneDetector::add(const StmtSequence &S,
                         const CloneSignature &Signature) {
-  // StringMap only works with StringRefs, so we create one for our data vector.
-  auto &Data = Signature.Data;
-  StringRef DataRef = StringRef(reinterpret_cast<const char *>(Data.data()),
-                                Data.size() * sizeof(unsigned));
-
-  // Search with the help of the signature if we already have encountered a
-  // clone of the given StmtSequence.
-  auto I = CloneGroupIndexes.find(DataRef);
-  if (I == CloneGroupIndexes.end()) {
-    // We haven't found an existing clone group, so we create a new clone group
-    // for this StmtSequence and store the index of it in our search map.
-    CloneGroupIndexes[DataRef] = CloneGroups.size();
-    CloneGroups.emplace_back(S, Signature.Complexity);
-    return;
-  }
-
-  // We have found an existing clone group and can expand it with the given
-  // StmtSequence.
-  CloneGroups[I->getValue()].Sequences.push_back(S);
+  Sequences.push_back(std::make_pair(Signature, S));
 }
 
 namespace {
@@ -645,14 +641,66 @@ bool containsGroup(CloneDetector::CloneG
 }
 } // end anonymous namespace
 
+namespace {
+/// \brief Wrapper around FoldingSetNodeID that it can be used as the template
+///        argument of the StmtDataCollector.
+class FoldingSetNodeIDWrapper {
+
+  llvm::FoldingSetNodeID &FS;
+
+public:
+  FoldingSetNodeIDWrapper(llvm::FoldingSetNodeID &FS) : FS(FS) {}
+
+  void update(StringRef Str) { FS.AddString(Str); }
+};
+} // end anonymous namespace
+
+/// \brief Writes the relevant data from all statements and child statements
+///        in the given StmtSequence into the given FoldingSetNodeID.
+static void CollectStmtSequenceData(const StmtSequence &Sequence,
+                                    FoldingSetNodeIDWrapper &OutputData) {
+  for (const Stmt *S : Sequence) {
+    StmtDataCollector<FoldingSetNodeIDWrapper>(S, Sequence.getASTContext(),
+                                               OutputData);
+
+    for (const Stmt *Child : S->children()) {
+      if (!Child)
+        continue;
+
+      CollectStmtSequenceData(StmtSequence(Child, Sequence.getASTContext()),
+                              OutputData);
+    }
+  }
+}
+
+/// \brief Returns true if both sequences are clones of each other.
+static bool areSequencesClones(const StmtSequence &LHS,
+                               const StmtSequence &RHS) {
+  // We collect the data from all statements in the sequence as we did before
+  // when generating a hash value for each sequence. But this time we don't
+  // hash the collected data and compare the whole data set instead. This
+  // prevents any false-positives due to hash code collisions.
+  llvm::FoldingSetNodeID DataLHS, DataRHS;
+  FoldingSetNodeIDWrapper LHSWrapper(DataLHS);
+  FoldingSetNodeIDWrapper RHSWrapper(DataRHS);
+
+  CollectStmtSequenceData(LHS, LHSWrapper);
+  CollectStmtSequenceData(RHS, RHSWrapper);
+
+  return DataLHS == DataRHS;
+}
+
 /// \brief Finds all actual clone groups in a single group of presumed clones.
-/// \param Result Output parameter to which all found groups are added. Every
-///               clone in a group that was added this way follows the same
-///               variable pattern as the other clones in its group.
-/// \param Group A group of clones. The clones are allowed to have a different
-///              variable pattern.
+/// \param Result Output parameter to which all found groups are added.
+/// \param Group A group of presumed clones. The clones are allowed to have a
+///              different variable pattern and may not be actual clones of each
+///              other.
+/// \param CheckVariablePatterns If true, every clone in a group that was added
+///              to the output follows the same variable pattern as the other
+///              clones in its group.
 static void createCloneGroups(std::vector<CloneDetector::CloneGroup> &Result,
-                              const CloneDetector::CloneGroup &Group) {
+                              const CloneDetector::CloneGroup &Group,
+                              bool CheckVariablePattern) {
   // We remove the Sequences one by one, so a list is more appropriate.
   std::list<StmtSequence> UnassignedSequences(Group.Sequences.begin(),
                                               Group.Sequences.end());
@@ -664,7 +712,7 @@ static void createCloneGroups(std::vecto
     StmtSequence Prototype = UnassignedSequences.front();
     UnassignedSequences.pop_front();
 
-    CloneDetector::CloneGroup FilteredGroup(Prototype, Group.Complexity);
+    CloneDetector::CloneGroup FilteredGroup(Prototype, Group.Signature);
 
     // Analyze the variable pattern of the prototype. Every other StmtSequence
     // needs to have the same pattern to get into the new clone group.
@@ -674,12 +722,27 @@ static void createCloneGroups(std::vecto
     // and assign them to our new clone group.
     auto I = UnassignedSequences.begin(), E = UnassignedSequences.end();
     while (I != E) {
+      // If the sequence doesn't fit to the prototype, we have encountered
+      // an unintended hash code collision and we skip it.
+      if (!areSequencesClones(Prototype, *I)) {
+        ++I;
+        continue;
+      }
 
-      if (VariablePattern(*I).countPatternDifferences(PrototypeFeatures) == 0) {
+      // If we weren't asked to check for a matching variable pattern in clone
+      // groups we can add the sequence now to the new clone group.
+      // If we were asked to check for matching variable pattern, we first have
+      // to check that there are no differences between the two patterns and
+      // only proceed if they match.
+      if (!CheckVariablePattern ||
+          VariablePattern(*I).countPatternDifferences(PrototypeFeatures) == 0) {
         FilteredGroup.Sequences.push_back(*I);
         I = UnassignedSequences.erase(I);
         continue;
       }
+
+      // We didn't found a matching variable pattern, so we continue with the
+      // next sequence.
       ++I;
     }
 
@@ -694,14 +757,76 @@ static void createCloneGroups(std::vecto
 void CloneDetector::findClones(std::vector<CloneGroup> &Result,
                                unsigned MinGroupComplexity,
                                bool CheckPatterns) {
+  // A shortcut (and necessary for the for-loop later in this function).
+  if (Sequences.empty())
+    return;
+
+  // We need to search for groups of StmtSequences with the same hash code to
+  // create our initial clone groups. By sorting all known StmtSequences by
+  // their hash value we make sure that StmtSequences with the same hash code
+  // are grouped together in the Sequences vector.
+  // Note: We stable sort here because the StmtSequences are added in the order
+  // in which they appear in the source file. We want to preserve that order
+  // because we also want to report them in that order in the CloneChecker.
+  std::stable_sort(Sequences.begin(), Sequences.end(),
+                   [](std::pair<CloneSignature, StmtSequence> LHS,
+                      std::pair<CloneSignature, StmtSequence> RHS) {
+                     return LHS.first.Hash < RHS.first.Hash;
+                   });
+
+  std::vector<CloneGroup> CloneGroups;
+
+  // Check for each CloneSignature if its successor has the same hash value.
+  // We don't check the last CloneSignature as it has no successor.
+  // Note: The 'size - 1' in the condition is safe because we check for an empty
+  // Sequences vector at the beginning of this function.
+  for (unsigned i = 0; i < Sequences.size() - 1; ++i) {
+    const auto Current = Sequences[i];
+    const auto Next = Sequences[i + 1];
+
+    if (Current.first.Hash != Next.first.Hash)
+      continue;
+
+    // It's likely that we just found an sequence of CloneSignatures that
+    // represent a CloneGroup, so we create a new group and start checking and
+    // adding the CloneSignatures in this sequence.
+    CloneGroup Group;
+    Group.Signature = Current.first;
+
+    for (; i < Sequences.size(); ++i) {
+      const auto &Signature = Sequences[i];
+
+      // A different hash value means we have reached the end of the sequence.
+      if (Current.first.Hash != Signature.first.Hash) {
+        // The current Signature could be the start of a new CloneGroup. So we
+        // decrement i so that we visit it again in the outer loop.
+        // Note: i can never be 0 at this point because we are just comparing
+        // the hash of the Current CloneSignature with itself in the 'if' above.
+        assert(i != 0);
+        --i;
+        break;
+      }
+
+      // Skip CloneSignatures that won't pass the complexity requirement.
+      if (Signature.first.Complexity < MinGroupComplexity)
+        continue;
+
+      Group.Sequences.push_back(Signature.second);
+    }
+
+    // There is a chance that we haven't found more than two fitting
+    // CloneSignature because not enough CloneSignatures passed the complexity
+    // requirement. As a CloneGroup with less than two members makes no sense,
+    // we ignore this CloneGroup and won't add it to the result.
+    if (!Group.isValid())
+      continue;
+
+    CloneGroups.push_back(Group);
+  }
+
   // Add every valid clone group that fulfills the complexity requirement.
   for (const CloneGroup &Group : CloneGroups) {
-    if (Group.isValid() && Group.Complexity >= MinGroupComplexity) {
-      if (CheckPatterns)
-        createCloneGroups(Result, Group);
-      else
-        Result.push_back(Group);
-    }
+    createCloneGroups(Result, Group, CheckPatterns);
   }
 
   std::vector<unsigned> IndexesToRemove;

Modified: cfe/trunk/lib/StaticAnalyzer/Checkers/CloneChecker.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/StaticAnalyzer/Checkers/CloneChecker.cpp?rev=279378&r1=279377&r2=279378&view=diff
==============================================================================
--- cfe/trunk/lib/StaticAnalyzer/Checkers/CloneChecker.cpp (original)
+++ cfe/trunk/lib/StaticAnalyzer/Checkers/CloneChecker.cpp Sat Aug 20 12:35:53 2016
@@ -91,15 +91,6 @@ void CloneChecker::reportClones(SourceMa
                                                "Related code clone is here.");
 
   for (CloneDetector::CloneGroup &Group : CloneGroups) {
-    // For readability reasons we sort the clones by line numbers.
-    std::sort(Group.Sequences.begin(), Group.Sequences.end(),
-              [&SM](const StmtSequence &LHS, const StmtSequence &RHS) {
-                return SM.isBeforeInTranslationUnit(LHS.getStartLoc(),
-                                                    RHS.getStartLoc()) &&
-                       SM.isBeforeInTranslationUnit(LHS.getEndLoc(),
-                                                    RHS.getEndLoc());
-              });
-
     // We group the clones by printing the first as a warning and all others
     // as a note.
     DiagEngine.Report(Group.Sequences.front().getStartLoc(), WarnID);




More information about the cfe-commits mailing list