[llvm] 28134a2 - [NFCi][MergeFunctions] Consolidate Hashing Functions

Fri Aug 18 13:19:23 PDT 2023

Author: Aiden Grossman
Date: 2023-08-18T13:15:12-07:00
New Revision: 28134a29fdedd8972acdfb39223571ddcc15dc59

URL: https://github.com/llvm/llvm-project/commit/28134a29fdedd8972acdfb39223571ddcc15dc59
DIFF: https://github.com/llvm/llvm-project/commit/28134a29fdedd8972acdfb39223571ddcc15dc59.diff

LOG: [NFCi][MergeFunctions] Consolidate Hashing Functions

A couple years ago, StructuralHash was created, copying the exact
hashing implementation from FunctionComparator (minus a couple small
details/refactorings). Since then, the hashing implementation has not
diverged, but several other areas, like unit testing, have diverged
significantly, with StructuralHash getting more attention in these
areas. This patch aims to consolidate the two hashing functions into
StructuralHash given they do the exact same thing and having less
divergence in areas like unit testing would be beneficial.

The original aim at creating a separate StructuralHash was to make the
implementation divergent and capture additional details like instruction
operands (which neither hashing implementation does currently). The
MergeFunctions pass doesn't need these detaisl, but verification of pass
return values would benefit from this additional data. Setting an option
to calculate these values would allow for divergent behavior where
appropriate while reducing code duplication with little runtime
overhead.

Reviewed By: aeubanks

Differential Revision: https://reviews.llvm.org/D158217

Added: 
    

Modified: 
    llvm/include/llvm/IR/StructuralHash.h
    llvm/include/llvm/Transforms/Utils/FunctionComparator.h
    llvm/lib/IR/StructuralHash.cpp
    llvm/lib/Transforms/IPO/MergeFunctions.cpp
    llvm/lib/Transforms/Utils/FunctionComparator.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/IR/StructuralHash.h b/llvm/include/llvm/IR/StructuralHash.h
index 1bdeb85afa3c50..8739c599a7ac33 100644

--- a/llvm/include/llvm/IR/StructuralHash.h
+++ b/llvm/include/llvm/IR/StructuralHash.h
@@ -21,8 +21,10 @@ namespace llvm {
 class Function;
 class Module;
 
-uint64_t StructuralHash(const Function &F);
-uint64_t StructuralHash(const Module &M);
+using IRHash = uint64_t;
+
+IRHash StructuralHash(const Function &F);
+IRHash StructuralHash(const Module &M);
 
 } // end namespace llvm
 

diff  --git a/llvm/include/llvm/Transforms/Utils/FunctionComparator.h b/llvm/include/llvm/Transforms/Utils/FunctionComparator.h
index 78761fc78fee8c..c28f868039a1f7 100644
--- a/llvm/include/llvm/Transforms/Utils/FunctionComparator.h
+++ b/llvm/include/llvm/Transforms/Utils/FunctionComparator.h
@@ -99,11 +99,6 @@ class FunctionComparator {
   /// Test whether the two functions have equivalent behaviour.
   int compare();
 
-  /// Hash a function. Equivalent functions will have the same hash, and unequal
-  /// functions will have 
diff erent hashes with high probability.
-  using FunctionHash = uint64_t;
-  static FunctionHash functionHash(Function &);
-
 protected:
   /// Start the comparison.
   void beginCompare() {

diff  --git a/llvm/lib/IR/StructuralHash.cpp b/llvm/lib/IR/StructuralHash.cpp
index 6ea108d831a165..5ba64e0cc0d35f 100644
--- a/llvm/lib/IR/StructuralHash.cpp
+++ b/llvm/lib/IR/StructuralHash.cpp
@@ -27,12 +27,28 @@ class StructuralHashImpl {
 public:
   StructuralHashImpl() : Hash(4) {}
 
+  // A function hash is calculated by considering only the number of arguments
+  // and whether a function is varargs, the order of basic blocks (given by the
+  // successors of each basic block in depth first order), and the order of
+  // opcodes of each instruction within each of these basic blocks. This mirrors
+  // the strategy FunctionComparator::compare() uses to compare functions by
+  // walking the BBs in depth first order and comparing each instruction in
+  // sequence. Because this hash currently does not look at the operands, it is
+  // insensitive to things such as the target of calls and the constants used in
+  // the function, which makes it useful when possibly merging functions which
+  // are the same modulo constants and call targets.
+  //
+  // Note that 
diff erent users of StructuralHash will want 
diff erent behavior
+  // out of it (i.e., MergeFunctions will want something 
diff erent from PM
+  // expensive checks for pass modification status). When modifying this
+  // function, most changes should be gated behind an option and enabled
+  // selectively.
   void update(const Function &F) {
     // Declarations don't affect analyses.
     if (F.isDeclaration())
       return;
 
-    hash(12345); // Function header
+    hash(0x6acaa36bef8325c5ULL); // Function header
 
     hash(F.isVarArg());
     hash(F.arg_size());
@@ -40,11 +56,18 @@ class StructuralHashImpl {
     SmallVector<const BasicBlock *, 8> BBs;
     SmallPtrSet<const BasicBlock *, 16> VisitedBBs;
 
+    // Walk the blocks in the same order as
+    // FunctionComparator::cmpBasicBlocks(), accumulating the hash of the
+    // function "structure." (BB and opcode sequence)
     BBs.push_back(&F.getEntryBlock());
     VisitedBBs.insert(BBs[0]);
     while (!BBs.empty()) {
       const BasicBlock *BB = BBs.pop_back_val();
-      hash(45798); // Block header
+
+      // This random value acts as a block header, as otherwise the partition of
+      // opcodes into BBs wouldn't affect the hash, only the order of the
+      // opcodes
+      hash(45798);
       for (auto &Inst : *BB)
         hash(Inst.getOpcode());
 
@@ -79,13 +102,13 @@ class StructuralHashImpl {
 
 } // namespace
 
-uint64_t llvm::StructuralHash(const Function &F) {
+IRHash llvm::StructuralHash(const Function &F) {
   StructuralHashImpl H;
   H.update(F);
   return H.getHash();
 }
 
-uint64_t llvm::StructuralHash(const Module &M) {
+IRHash llvm::StructuralHash(const Module &M) {
   StructuralHashImpl H;
   H.update(M);
   return H.getHash();

diff  --git a/llvm/lib/Transforms/IPO/MergeFunctions.cpp b/llvm/lib/Transforms/IPO/MergeFunctions.cpp
index feda5d6459cb47..4a64580b3cb329 100644
--- a/llvm/lib/Transforms/IPO/MergeFunctions.cpp
+++ b/llvm/lib/Transforms/IPO/MergeFunctions.cpp
@@ -107,6 +107,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/StructuralHash.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Use.h"
 #include "llvm/IR/User.h"
@@ -171,15 +172,14 @@ namespace {
 
 class FunctionNode {
   mutable AssertingVH<Function> F;
-  FunctionComparator::FunctionHash Hash;
+  IRHash Hash;
 
 public:
   // Note the hash is recalculated potentially multiple times, but it is cheap.
-  FunctionNode(Function *F)
-    : F(F), Hash(FunctionComparator::functionHash(*F))  {}
+  FunctionNode(Function *F) : F(F), Hash(StructuralHash(*F)) {}
 
   Function *getFunc() const { return F; }
-  FunctionComparator::FunctionHash getHash() const { return Hash; }
+  IRHash getHash() const { return Hash; }
 
   /// Replace the reference to the function F by the function G, assuming their
   /// implementations are equal.
@@ -390,11 +390,10 @@ bool MergeFunctions::runOnModule(Module &M) {
 
   // All functions in the module, ordered by hash. Functions with a unique
   // hash value are easily eliminated.
-  std::vector<std::pair<FunctionComparator::FunctionHash, Function *>>
-    HashedFuncs;
+  std::vector<std::pair<IRHash, Function *>> HashedFuncs;
   for (Function &Func : M) {
     if (isEligibleForMerging(Func)) {
-      HashedFuncs.push_back({FunctionComparator::functionHash(Func), &Func});
+      HashedFuncs.push_back({StructuralHash(Func), &Func});
     }
   }
 

diff  --git a/llvm/lib/Transforms/Utils/FunctionComparator.cpp b/llvm/lib/Transforms/Utils/FunctionComparator.cpp
index 8daeb92130ba3c..b1d74f67377e27 100644
--- a/llvm/lib/Transforms/Utils/FunctionComparator.cpp
+++ b/llvm/lib/Transforms/Utils/FunctionComparator.cpp
@@ -958,67 +958,3 @@ int FunctionComparator::compare() {
   }
   return 0;
 }
-
-namespace {
-
-// Accumulate the hash of a sequence of 64-bit integers. This is similar to a
-// hash of a sequence of 64bit ints, but the entire input does not need to be
-// available at once. This interface is necessary for functionHash because it
-// needs to accumulate the hash as the structure of the function is traversed
-// without saving these values to an intermediate buffer. This form of hashing
-// is not often needed, as usually the object to hash is just read from a
-// buffer.
-class HashAccumulator64 {
-  uint64_t Hash;
-
-public:
-  // Initialize to random constant, so the state isn't zero.
-  HashAccumulator64() { Hash = 0x6acaa36bef8325c5ULL; }
-
-  void add(uint64_t V) { Hash = hashing::detail::hash_16_bytes(Hash, V); }
-
-  // No finishing is required, because the entire hash value is used.
-  uint64_t getHash() { return Hash; }
-};
-
-} // end anonymous namespace
-
-// A function hash is calculated by considering only the number of arguments and
-// whether a function is varargs, the order of basic blocks (given by the
-// successors of each basic block in depth first order), and the order of
-// opcodes of each instruction within each of these basic blocks. This mirrors
-// the strategy compare() uses to compare functions by walking the BBs in depth
-// first order and comparing each instruction in sequence. Because this hash
-// does not look at the operands, it is insensitive to things such as the
-// target of calls and the constants used in the function, which makes it useful
-// when possibly merging functions which are the same modulo constants and call
-// targets.
-FunctionComparator::FunctionHash FunctionComparator::functionHash(Function &F) {
-  HashAccumulator64 H;
-  H.add(F.isVarArg());
-  H.add(F.arg_size());
-
-  SmallVector<const BasicBlock *, 8> BBs;
-  SmallPtrSet<const BasicBlock *, 16> VisitedBBs;
-
-  // Walk the blocks in the same order as FunctionComparator::cmpBasicBlocks(),
-  // accumulating the hash of the function "structure." (BB and opcode sequence)
-  BBs.push_back(&F.getEntryBlock());
-  VisitedBBs.insert(BBs[0]);
-  while (!BBs.empty()) {
-    const BasicBlock *BB = BBs.pop_back_val();
-    // This random value acts as a block header, as otherwise the partition of
-    // opcodes into BBs wouldn't affect the hash, only the order of the opcodes
-    H.add(45798);
-    for (const auto &Inst : *BB) {
-      H.add(Inst.getOpcode());
-    }
-    const Instruction *Term = BB->getTerminator();
-    for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) {
-      if (!VisitedBBs.insert(Term->getSuccessor(i)).second)
-        continue;
-      BBs.push_back(Term->getSuccessor(i));
-    }
-  }
-  return H.getHash();
-}