[llvm] [CGData] Global Merge Functions (PR #112671)

Kyungwoo Lee via llvm-commits llvm-commits at lists.llvm.org
Sun Nov 10 08:02:28 PST 2024


https://github.com/kyulee-com updated https://github.com/llvm/llvm-project/pull/112671

>From 4e2d83d8675799da1a5c6b57d285ecefc0285c71 Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Fri, 30 Aug 2024 00:09:09 -0700
Subject: [PATCH 1/4] [CGData] Global Merge Functions

---
 llvm/include/llvm/CGData/CodeGenData.h        |  11 +
 llvm/include/llvm/InitializePasses.h          |   1 +
 llvm/include/llvm/LinkAllPasses.h             |   1 +
 llvm/include/llvm/Transforms/IPO.h            |   2 +
 .../Transforms/IPO/GlobalMergeFunctions.h     |  77 ++
 llvm/lib/CodeGen/TargetPassConfig.cpp         |   3 +
 llvm/lib/LTO/LTO.cpp                          |   1 +
 llvm/lib/Transforms/IPO/CMakeLists.txt        |   2 +
 .../Transforms/IPO/GlobalMergeFunctions.cpp   | 747 ++++++++++++++++++
 .../ThinLTO/AArch64/cgdata-merge-local.ll     |  62 ++
 .../test/ThinLTO/AArch64/cgdata-merge-read.ll |  82 ++
 .../AArch64/cgdata-merge-two-rounds.ll        |  68 ++
 .../ThinLTO/AArch64/cgdata-merge-write.ll     |  97 +++
 llvm/tools/llvm-lto2/CMakeLists.txt           |   1 +
 llvm/tools/llvm-lto2/llvm-lto2.cpp            |   6 +
 15 files changed, 1161 insertions(+)
 create mode 100644 llvm/include/llvm/Transforms/IPO/GlobalMergeFunctions.h
 create mode 100644 llvm/lib/Transforms/IPO/GlobalMergeFunctions.cpp
 create mode 100644 llvm/test/ThinLTO/AArch64/cgdata-merge-local.ll
 create mode 100644 llvm/test/ThinLTO/AArch64/cgdata-merge-read.ll
 create mode 100644 llvm/test/ThinLTO/AArch64/cgdata-merge-two-rounds.ll
 create mode 100644 llvm/test/ThinLTO/AArch64/cgdata-merge-write.ll

diff --git a/llvm/include/llvm/CGData/CodeGenData.h b/llvm/include/llvm/CGData/CodeGenData.h
index 5d7c74725ccef1..da0e412f2a0e03 100644
--- a/llvm/include/llvm/CGData/CodeGenData.h
+++ b/llvm/include/llvm/CGData/CodeGenData.h
@@ -145,6 +145,9 @@ class CodeGenData {
   const OutlinedHashTree *getOutlinedHashTree() {
     return PublishedHashTree.get();
   }
+  const StableFunctionMap *getStableFunctionMap() {
+    return PublishedStableFunctionMap.get();
+  }
 
   /// Returns true if we should write codegen data.
   bool emitCGData() { return EmitCGData; }
@@ -169,10 +172,18 @@ inline bool hasOutlinedHashTree() {
   return CodeGenData::getInstance().hasOutlinedHashTree();
 }
 
+inline bool hasStableFunctionMap() {
+  return CodeGenData::getInstance().hasStableFunctionMap();
+}
+
 inline const OutlinedHashTree *getOutlinedHashTree() {
   return CodeGenData::getInstance().getOutlinedHashTree();
 }
 
+inline const StableFunctionMap *getStableFunctionMap() {
+  return CodeGenData::getInstance().getStableFunctionMap();
+}
+
 inline bool emitCGData() { return CodeGenData::getInstance().emitCGData(); }
 
 inline void
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index 54c070401ec8a4..a43a96e90543cc 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -123,6 +123,7 @@ void initializeGCEmptyBasicBlocksPass(PassRegistry &);
 void initializeGCMachineCodeAnalysisPass(PassRegistry &);
 void initializeGCModuleInfoPass(PassRegistry &);
 void initializeGVNLegacyPassPass(PassRegistry &);
+void initializeGlobalMergeFuncPass(PassRegistry &);
 void initializeGlobalMergePass(PassRegistry &);
 void initializeGlobalsAAWrapperPassPass(PassRegistry &);
 void initializeHardwareLoopsLegacyPass(PassRegistry &);
diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h
index 3516b47d29ef36..d1219d6ee2a13e 100644
--- a/llvm/include/llvm/LinkAllPasses.h
+++ b/llvm/include/llvm/LinkAllPasses.h
@@ -79,6 +79,7 @@ struct ForcePassLinking {
     (void)llvm::createDomOnlyViewerWrapperPassPass();
     (void)llvm::createDomViewerWrapperPassPass();
     (void)llvm::createAlwaysInlinerLegacyPass();
+    (void)llvm::createGlobalMergeFuncPass();
     (void)llvm::createGlobalsAAWrapperPass();
     (void)llvm::createInstSimplifyLegacyPass();
     (void)llvm::createInstructionCombiningPass();
diff --git a/llvm/include/llvm/Transforms/IPO.h b/llvm/include/llvm/Transforms/IPO.h
index ee0e35aa618325..86a8654f56997c 100644
--- a/llvm/include/llvm/Transforms/IPO.h
+++ b/llvm/include/llvm/Transforms/IPO.h
@@ -55,6 +55,8 @@ enum class PassSummaryAction {
   Export, ///< Export information to summary.
 };
 
+Pass *createGlobalMergeFuncPass();
+
 } // End llvm namespace
 
 #endif
diff --git a/llvm/include/llvm/Transforms/IPO/GlobalMergeFunctions.h b/llvm/include/llvm/Transforms/IPO/GlobalMergeFunctions.h
new file mode 100644
index 00000000000000..565be54f89e882
--- /dev/null
+++ b/llvm/include/llvm/Transforms/IPO/GlobalMergeFunctions.h
@@ -0,0 +1,77 @@
+//===------ GlobalMergeFunctions.h - Global merge functions -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// This file defines global merge functions pass and related data structure.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef PIKA_TRANSFORMS_UTILS_GLOBALMERGEFUNCTIONS_H
+#define PIKA_TRANSFORMS_UTILS_GLOBALMERGEFUNCTIONS_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StableHashing.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CGData/StableFunctionMap.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include <map>
+#include <mutex>
+
+enum class HashFunctionMode {
+  Local,
+  BuildingHashFuncion,
+  UsingHashFunction,
+};
+
+namespace llvm {
+
+// A vector of locations (the pair of (instruction, operand) indices) reachable
+// from a parameter.
+using ParamLocs = SmallVector<IndexPair, 4>;
+// A vector of parameters
+using ParamLocsVecTy = SmallVector<ParamLocs, 8>;
+// A map of stable hash to a vector of stable functions
+
+/// GlobalMergeFunc finds functions which only differ by constants in
+/// certain instructions, e.g. resulting from specialized functions of layout
+/// compatible types.
+/// Unlike PikaMergeFunc that directly compares IRs, this uses stable function
+/// hash to find the merge candidate. Similar to the global outliner, we can run
+/// codegen twice to collect function merge candidate in the first round, and
+/// merge functions globally in the second round.
+class GlobalMergeFunc : public ModulePass {
+  HashFunctionMode MergerMode = HashFunctionMode::Local;
+
+  std::unique_ptr<StableFunctionMap> LocalFunctionMap;
+
+public:
+  static char ID;
+
+  GlobalMergeFunc();
+
+  StringRef getPassName() const override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+  void initializeMergerMode(const Module &M);
+
+  bool runOnModule(Module &M) override;
+
+  /// Analyze module to create stable function into LocalFunctionMap.
+  void analyze(Module &M);
+
+  /// Emit LocalFunctionMap into __llvm_merge section.
+  void emitFunctionMap(Module &M);
+
+  /// Merge functions in the module using the global function map.
+  bool merge(Module &M, const StableFunctionMap *FunctionMap);
+};
+
+} // end namespace llvm
+#endif // PIKA_TRANSFORMS_UTILS_GLOBALMERGEFUNCTIONS_H
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index aff74104006e5a..beeb8ea2d078d9 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -141,6 +141,9 @@ static cl::opt<RunOutliner> EnableMachineOutliner(
                           "Disable all outlining"),
                // Sentinel value for unspecified option.
                clEnumValN(RunOutliner::AlwaysOutline, "", "")));
+cl::opt<bool> EnableGlobalMergeFunc(
+    "enable-global-merge-func", cl::Hidden,
+    cl::desc("Enable global merge functions that are based on hash function"));
 // Disable the pass to fix unwind information. Whether the pass is included in
 // the pipeline is controlled via the target options, this option serves as
 // manual override.
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 0f53c608512171..3f88c6c475d4d8 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -53,6 +53,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/GlobalMergeFunctions.h"
 #include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
 #include "llvm/Transforms/IPO/WholeProgramDevirt.h"
 #include "llvm/Transforms/Utils/FunctionImportUtils.h"
diff --git a/llvm/lib/Transforms/IPO/CMakeLists.txt b/llvm/lib/Transforms/IPO/CMakeLists.txt
index 15cb57399d2460..6a36ed64149f93 100644
--- a/llvm/lib/Transforms/IPO/CMakeLists.txt
+++ b/llvm/lib/Transforms/IPO/CMakeLists.txt
@@ -21,6 +21,7 @@ add_llvm_component_library(LLVMipo
   GlobalDCE.cpp
   GlobalOpt.cpp
   GlobalSplit.cpp
+  GlobalMergeFunctions.cpp
   HotColdSplitting.cpp
   IPO.cpp
   IROutliner.cpp
@@ -60,6 +61,7 @@ add_llvm_component_library(LLVMipo
   Analysis
   BitReader
   BitWriter
+  CGData
   Core
   FrontendOpenMP
   InstCombine
diff --git a/llvm/lib/Transforms/IPO/GlobalMergeFunctions.cpp b/llvm/lib/Transforms/IPO/GlobalMergeFunctions.cpp
new file mode 100644
index 00000000000000..6fd1a66b5a197c
--- /dev/null
+++ b/llvm/lib/Transforms/IPO/GlobalMergeFunctions.cpp
@@ -0,0 +1,747 @@
+//===---- GlobalMergeFunctions.cpp - Global merge functions -------*- C++ -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO:  This implements a function merge using function hash while tracking
+// differences in Constants. This uses stable function hash to find potential
+// merge candidates. The first codegen round collects stable function hashes,
+// and determines the merge candidates that match the stable function hashes.
+// The set of parameters pointing to different Constants are also computed
+// during the stable function merge. The second codegen round uses this global
+// function info to optimistically create a merged function in each module
+// context to guarantee correct transformation. Similar to the global outliner,
+// the linker's deduplication (ICF) folds the identical merged functions to save
+// the final binary size.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/GlobalMergeFunctions.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ModuleSummaryAnalysis.h"
+#include "llvm/CGData/CodeGenData.h"
+#include "llvm/CGData/StableFunctionMap.h"
+#include "llvm/CodeGen/MachineStableHash.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/StructuralHash.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+#define DEBUG_TYPE "global-merge-func"
+
+using namespace llvm;
+using namespace llvm::support;
+
+static cl::opt<bool>
+    DisableGlobalMerging("disable-global-merging", cl::Hidden,
+                         cl::desc("Disable global merging only by ignoring "
+                                  "the codegen data generation or use. Local "
+                                  "merging is still enabled within a module."),
+                         cl::init(false));
+static cl::opt<unsigned> GlobalMergingMinInstrs(
+    "global-merging-min-instrs",
+    cl::desc("The minimum instruction count required when merging functions."),
+    cl::init(1), cl::Hidden);
+static cl::opt<unsigned> GlobalMergingMaxParams(
+    "global-merging-max-params",
+    cl::desc(
+        "The maximum number of parameters allowed when merging functions."),
+    cl::init(std::numeric_limits<unsigned>::max()), cl::Hidden);
+static cl::opt<unsigned> GlobalMergingParamOverhead(
+    "global-merging-param-overhead",
+    cl::desc("The overhead cost associated with each parameter when merging "
+             "functions."),
+    cl::init(2), cl::Hidden);
+static cl::opt<unsigned>
+    GlobalMergingCallOverhead("global-merging-call-overhead",
+                              cl::desc("The overhead cost associated with each "
+                                       "function call when merging functions."),
+                              cl::init(1), cl::Hidden);
+static cl::opt<unsigned> GlobalMergingExtraThreshold(
+    "global-merging-extra-threshold",
+    cl::desc("An additional cost threshold that must be exceeded for merging "
+             "to be considered beneficial."),
+    cl::init(0), cl::Hidden);
+
+extern cl::opt<bool> EnableGlobalMergeFunc;
+
+STATISTIC(NumMismatchedFunctionHashGlobalMergeFunction,
+          "Number of mismatched function hash for global merge function");
+STATISTIC(NumMismatchedInstCountGlobalMergeFunction,
+          "Number of mismatched instruction count for global merge function");
+STATISTIC(NumMismatchedConstHashGlobalMergeFunction,
+          "Number of mismatched const hash for global merge function");
+STATISTIC(NumMismatchedModuleIdGlobalMergeFunction,
+          "Number of mismatched Module Id for global merge function");
+STATISTIC(NumGlobalMergeFunctions,
+          "Number of functions that are actually merged using function hash");
+STATISTIC(NumAnalyzedModues, "Number of modules that are analyzed");
+STATISTIC(NumAnalyzedFunctions, "Number of functions that are analyzed");
+STATISTIC(NumEligibleFunctions, "Number of functions that are eligible");
+
+/// Returns true if the \opIdx operand of \p CI is the callee operand.
+static bool isCalleeOperand(const CallBase *CI, unsigned OpIdx) {
+  return &CI->getCalledOperandUse() == &CI->getOperandUse(OpIdx);
+}
+
+static bool canParameterizeCallOperand(const CallBase *CI, unsigned OpIdx) {
+  if (CI->isInlineAsm())
+    return false;
+  Function *Callee = CI->getCalledOperand()
+                         ? dyn_cast_or_null<Function>(
+                               CI->getCalledOperand()->stripPointerCasts())
+                         : nullptr;
+  if (Callee) {
+    if (Callee->isIntrinsic())
+      return false;
+    // objc_msgSend stubs must be called, and can't have their address taken.
+    if (Callee->getName().starts_with("objc_msgSend$"))
+      return false;
+  }
+  if (isCalleeOperand(CI, OpIdx) &&
+      CI->getOperandBundle(LLVMContext::OB_ptrauth).has_value()) {
+    // The operand is the callee and it has already been signed. Ignore this
+    // because we cannot add another ptrauth bundle to the call instruction.
+    return false;
+  }
+  return true;
+}
+
+bool isEligibleInstrunctionForConstantSharing(const Instruction *I) {
+  switch (I->getOpcode()) {
+  case Instruction::Load:
+  case Instruction::Store:
+  case Instruction::Call:
+  case Instruction::Invoke:
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool isEligibleOperandForConstantSharing(const Instruction *I, unsigned OpIdx) {
+  assert(OpIdx < I->getNumOperands() && "Invalid operand index");
+
+  if (!isEligibleInstrunctionForConstantSharing(I))
+    return false;
+
+  auto Opnd = I->getOperand(OpIdx);
+  if (!isa<Constant>(Opnd))
+    return false;
+
+  if (const auto *CI = dyn_cast<CallBase>(I))
+    return canParameterizeCallOperand(CI, OpIdx);
+
+  return true;
+}
+
+/// Returns true if function \p F is eligible for merging.
+bool isEligibleFunction(Function *F) {
+  if (F->isDeclaration())
+    return false;
+
+  if (F->hasFnAttribute(llvm::Attribute::NoMerge))
+    return false;
+
+  if (F->hasAvailableExternallyLinkage()) {
+    return false;
+  }
+
+  if (F->getFunctionType()->isVarArg()) {
+    return false;
+  }
+
+  if (F->getCallingConv() == CallingConv::SwiftTail)
+    return false;
+
+  // if function contains callsites with musttail, if we merge
+  // it, the merged function will have the musttail callsite, but
+  // the number of parameters can change, thus the parameter count
+  // of the callsite will mismatch with the function itself.
+  // if (IgnoreMusttailFunction) {
+  for (const BasicBlock &BB : *F) {
+    for (const Instruction &I : BB) {
+      const auto *CB = dyn_cast<CallBase>(&I);
+      if (CB && CB->isMustTailCall())
+        return false;
+    }
+  }
+
+  return true;
+}
+
+static bool
+isEligibleInstrunctionForConstantSharingLocal(const Instruction *I) {
+  switch (I->getOpcode()) {
+  case Instruction::Load:
+  case Instruction::Store:
+  case Instruction::Call:
+  case Instruction::Invoke:
+    return true;
+  default:
+    return false;
+  }
+}
+
+static bool ignoreOp(const Instruction *I, unsigned OpIdx) {
+  assert(OpIdx < I->getNumOperands() && "Invalid operand index");
+
+  if (!isEligibleInstrunctionForConstantSharingLocal(I))
+    return false;
+
+  if (!isa<Constant>(I->getOperand(OpIdx)))
+    return false;
+
+  if (const auto *CI = dyn_cast<CallBase>(I))
+    return canParameterizeCallOperand(CI, OpIdx);
+
+  return true;
+}
+
+// copy from merge functions.cpp
+static Value *createCast(IRBuilder<> &Builder, Value *V, Type *DestTy) {
+  Type *SrcTy = V->getType();
+  if (SrcTy->isStructTy()) {
+    assert(DestTy->isStructTy());
+    assert(SrcTy->getStructNumElements() == DestTy->getStructNumElements());
+    Value *Result = PoisonValue::get(DestTy);
+    for (unsigned int I = 0, E = SrcTy->getStructNumElements(); I < E; ++I) {
+      Value *Element =
+          createCast(Builder, Builder.CreateExtractValue(V, ArrayRef(I)),
+                     DestTy->getStructElementType(I));
+
+      Result = Builder.CreateInsertValue(Result, Element, ArrayRef(I));
+    }
+    return Result;
+  }
+  assert(!DestTy->isStructTy());
+  if (auto *SrcAT = dyn_cast<ArrayType>(SrcTy)) {
+    auto *DestAT = dyn_cast<ArrayType>(DestTy);
+    assert(DestAT);
+    assert(SrcAT->getNumElements() == DestAT->getNumElements());
+    Value *Result = UndefValue::get(DestTy);
+    for (unsigned int I = 0, E = SrcAT->getNumElements(); I < E; ++I) {
+      Value *Element =
+          createCast(Builder, Builder.CreateExtractValue(V, ArrayRef(I)),
+                     DestAT->getElementType());
+
+      Result = Builder.CreateInsertValue(Result, Element, ArrayRef(I));
+    }
+    return Result;
+  }
+  assert(!DestTy->isArrayTy());
+  if (SrcTy->isIntegerTy() && DestTy->isPointerTy())
+    return Builder.CreateIntToPtr(V, DestTy);
+  else if (SrcTy->isPointerTy() && DestTy->isIntegerTy())
+    return Builder.CreatePtrToInt(V, DestTy);
+  else
+    return Builder.CreateBitCast(V, DestTy);
+}
+
+void GlobalMergeFunc::analyze(Module &M) {
+  ++NumAnalyzedModues;
+  for (Function &Func : M) {
+    ++NumAnalyzedFunctions;
+    if (isEligibleFunction(&Func)) {
+      ++NumEligibleFunctions;
+
+      auto FI = llvm::StructuralHashWithDifferences(Func, ignoreOp);
+
+      // Convert the map to a vector for a serialization-friendly format.
+      IndexOperandHashVecType IndexOperandHashes;
+      for (auto &Pair : *FI.IndexOperandHashMap)
+        IndexOperandHashes.emplace_back(Pair);
+
+      StableFunction SF(FI.FunctionHash, get_stable_name(Func.getName()).str(),
+                        M.getModuleIdentifier(), FI.IndexInstruction->size(),
+                        std::move(IndexOperandHashes));
+
+      LocalFunctionMap->insert(SF);
+    }
+  }
+}
+
+/// Tuple to hold function info to process merging.
+struct FuncMergeInfo {
+  StableFunctionMap::StableFunctionEntry *SF;
+  Function *F;
+  std::unique_ptr<IndexInstrMap> IndexInstruction;
+};
+
+// Given the func info, and the parameterized locations, create and return
+// a new merged function by replacing the original constants with the new
+// parameters.
+static Function *createMergedFunction(FuncMergeInfo &FI,
+                                      ArrayRef<Type *> ConstParamTypes,
+                                      const ParamLocsVecTy &ParamLocsVec) {
+  // Synthesize a new merged function name by appending ".Tgm" to the root
+  // function's name.
+  auto *MergedFunc = FI.F;
+  auto NewFunctionName = MergedFunc->getName().str() + ".Tgm";
+  auto *M = MergedFunc->getParent();
+  assert(!M->getFunction(NewFunctionName));
+
+  FunctionType *OrigTy = MergedFunc->getFunctionType();
+  // Get the original params' types.
+  SmallVector<Type *> ParamTypes(OrigTy->param_begin(), OrigTy->param_end());
+  // Append const parameter types that are passed in.
+  ParamTypes.append(ConstParamTypes.begin(), ConstParamTypes.end());
+  FunctionType *FuncType =
+      FunctionType::get(OrigTy->getReturnType(), ParamTypes, false);
+
+  // Declare a new function
+  Function *NewFunction =
+      Function::Create(FuncType, MergedFunc->getLinkage(), NewFunctionName);
+  if (auto *SP = MergedFunc->getSubprogram())
+    NewFunction->setSubprogram(SP);
+  NewFunction->copyAttributesFrom(MergedFunc);
+  NewFunction->setDLLStorageClass(GlobalValue::DefaultStorageClass);
+
+  NewFunction->setLinkage(GlobalValue::InternalLinkage);
+  NewFunction->addFnAttr(Attribute::NoInline);
+
+  // Add the new function before the root function.
+  M->getFunctionList().insert(MergedFunc->getIterator(), NewFunction);
+
+  // Move the body of MergedFunc into the NewFunction.
+  NewFunction->splice(NewFunction->begin(), MergedFunc);
+
+  // Update the original args by the new args.
+  auto NewArgIter = NewFunction->arg_begin();
+  for (Argument &OrigArg : MergedFunc->args()) {
+    Argument &NewArg = *NewArgIter++;
+    OrigArg.replaceAllUsesWith(&NewArg);
+  }
+
+  // Replace the original Constants by the new args.
+  unsigned NumOrigArgs = MergedFunc->arg_size();
+  for (unsigned ParamIdx = 0; ParamIdx < ParamLocsVec.size(); ++ParamIdx) {
+    Argument *NewArg = NewFunction->getArg(NumOrigArgs + ParamIdx);
+    for (auto [InstIndex, OpndIndex] : ParamLocsVec[ParamIdx]) {
+      auto *Inst = FI.IndexInstruction->lookup(InstIndex);
+      auto *OrigC = Inst->getOperand(OpndIndex);
+      if (OrigC->getType() != NewArg->getType()) {
+        IRBuilder<> Builder(Inst->getParent(), Inst->getIterator());
+        Inst->setOperand(OpndIndex,
+                         createCast(Builder, NewArg, OrigC->getType()));
+      } else
+        Inst->setOperand(OpndIndex, NewArg);
+    }
+  }
+
+  return NewFunction;
+}
+
+// Given the original function (Thunk) and the merged function (ToFunc), create
+// a thunk to the merged function.
+
+static void createThunk(FuncMergeInfo &FI, ArrayRef<Constant *> Params,
+                        Function *ToFunc) {
+  auto *Thunk = FI.F;
+
+  assert(Thunk->arg_size() + Params.size() ==
+         ToFunc->getFunctionType()->getNumParams());
+  Thunk->dropAllReferences();
+
+  BasicBlock *BB = BasicBlock::Create(Thunk->getContext(), "", Thunk);
+  IRBuilder<> Builder(BB);
+
+  SmallVector<Value *> Args;
+  unsigned ParamIdx = 0;
+  FunctionType *ToFuncTy = ToFunc->getFunctionType();
+
+  // Add arguments which are passed through Thunk.
+  for (Argument &AI : Thunk->args()) {
+    Args.push_back(createCast(Builder, &AI, ToFuncTy->getParamType(ParamIdx)));
+    ++ParamIdx;
+  }
+
+  // Add new arguments defined by Params.
+  for (auto *Param : Params) {
+    assert(ParamIdx < ToFuncTy->getNumParams());
+    // FIXME: do not support signing
+    Args.push_back(
+        createCast(Builder, Param, ToFuncTy->getParamType(ParamIdx)));
+    ++ParamIdx;
+  }
+
+  CallInst *CI = Builder.CreateCall(ToFunc, Args);
+  bool isSwiftTailCall = ToFunc->getCallingConv() == CallingConv::SwiftTail &&
+                         Thunk->getCallingConv() == CallingConv::SwiftTail;
+  CI->setTailCallKind(isSwiftTailCall ? llvm::CallInst::TCK_MustTail
+                                      : llvm::CallInst::TCK_Tail);
+  CI->setCallingConv(ToFunc->getCallingConv());
+  CI->setAttributes(ToFunc->getAttributes());
+  if (Thunk->getReturnType()->isVoidTy())
+    Builder.CreateRetVoid();
+  else
+    Builder.CreateRet(createCast(Builder, CI, Thunk->getReturnType()));
+}
+
+// Check if the old merged/optimized IndexOperandHashMap is compatible with
+// the current IndexOperandHashMap. An operand hash may not be stable across
+// different builds due to varying modules combined. To address this, we relax
+// the hash check condition by comparing Const hash patterns instead of absolute
+// hash values. For example, let's assume we have three Consts located at idx1,
+// idx3, and idx6, where their corresponding hashes are hash1, hash2, and hash1
+// in the old merged map below:
+//   Old (Merged): [(idx1, hash1), (idx3, hash2), (idx6, hash1)]
+//   Current: [(idx1, hash1'), (idx3, hash2'), (idx6, hash1')]
+// If the current function also has three Consts in the same locations,
+// with hash sequences hash1', hash2', and hash1' where the first and third
+// are the same as the old hash sequences, we consider them matched.
+static bool checkConstHashCompatible(
+    const DenseMap<IndexPair, stable_hash> &OldInstOpndIndexToConstHash,
+    const DenseMap<IndexPair, stable_hash> &CurrInstOpndIndexToConstHash) {
+
+  DenseMap<stable_hash, stable_hash> OldHashToCurrHash;
+  for (const auto &[Index, OldHash] : OldInstOpndIndexToConstHash) {
+    auto It = CurrInstOpndIndexToConstHash.find(Index);
+    if (It == CurrInstOpndIndexToConstHash.end())
+      return false;
+
+    auto CurrHash = It->second;
+    auto J = OldHashToCurrHash.find(OldHash);
+    if (J == OldHashToCurrHash.end())
+      OldHashToCurrHash.insert({OldHash, CurrHash});
+    else if (J->second != CurrHash)
+      return false;
+  }
+
+  return true;
+}
+
+// Validate the locations pointed by a param has the same hash and Constant.
+static bool
+checkConstLocationCompatible(const StableFunctionMap::StableFunctionEntry &SF,
+                             const IndexInstrMap &IndexInstruction,
+                             const ParamLocsVecTy &ParamLocsVec) {
+  for (auto &ParamLocs : ParamLocsVec) {
+    std::optional<stable_hash> OldHash;
+    std::optional<Constant *> OldConst;
+    for (auto &Loc : ParamLocs) {
+      assert(SF.IndexOperandHashMap->count(Loc));
+      auto CurrHash = SF.IndexOperandHashMap.get()->at(Loc);
+      auto [InstIndex, OpndIndex] = Loc;
+      assert(InstIndex < IndexInstruction.size());
+      const auto *Inst = IndexInstruction.lookup(InstIndex);
+      auto *CurrConst = cast<Constant>(Inst->getOperand(OpndIndex));
+      if (!OldHash) {
+        OldHash = CurrHash;
+        OldConst = CurrConst;
+      } else if (CurrConst != *OldConst || CurrHash != *OldHash)
+        return false;
+    }
+  }
+  return true;
+}
+
+static ParamLocsVecTy computeParamInfo(
+    const SmallVector<std::unique_ptr<StableFunctionMap::StableFunctionEntry>>
+        &SFS) {
+  std::map<std::vector<stable_hash>, ParamLocs> HashSeqToLocs;
+  auto &RSF = *SFS[0];
+  unsigned StableFunctionCount = SFS.size();
+
+  for (auto &[IndexPair, Hash] : *RSF.IndexOperandHashMap) {
+    // Const hash sequence across stable functions.
+    // We will allocate a parameter per unique hash squence.
+    // can't use SmallVector as key
+    std::vector<stable_hash> ConstHashSeq;
+    ConstHashSeq.push_back(Hash);
+    bool Identical = true;
+    for (unsigned J = 1; J < StableFunctionCount; ++J) {
+      auto &SF = SFS[J];
+      assert(SF->IndexOperandHashMap->count(IndexPair));
+      auto SHash = (*SF->IndexOperandHashMap)[IndexPair];
+      if (Hash != SHash)
+        Identical = false;
+      ConstHashSeq.push_back(SHash);
+    }
+
+    if (Identical)
+      continue;
+
+    // For each unique Const hash sequence (parameter), add the locations.
+    HashSeqToLocs[ConstHashSeq].push_back(IndexPair);
+  }
+
+  ParamLocsVecTy ParamLocsVec;
+  for (auto &[HashSeq, Locs] : HashSeqToLocs) {
+    ParamLocsVec.push_back(std::move(Locs));
+    std::sort(
+        ParamLocsVec.begin(), ParamLocsVec.end(),
+        [&](const ParamLocs &L, const ParamLocs &R) { return L[0] < R[0]; });
+  }
+  return ParamLocsVec;
+}
+
+static bool isProfitable(
+    const SmallVector<std::unique_ptr<StableFunctionMap::StableFunctionEntry>>
+        &SFS,
+    const Function *F) {
+  // No interest if the number of candidates are less than 2.
+  unsigned StableFunctionCount = SFS.size();
+  if (StableFunctionCount < 2)
+    return false;
+
+  unsigned InstCount = SFS[0]->InstCount;
+  if (InstCount < GlobalMergingMinInstrs)
+    return false;
+
+  unsigned ParamCount = SFS[0]->IndexOperandHashMap->size();
+  unsigned TotalParamCount = ParamCount + F->getFunctionType()->getNumParams();
+  if (TotalParamCount > GlobalMergingMaxParams)
+    return false;
+
+  unsigned Benefit = InstCount * (StableFunctionCount - 1);
+  unsigned Cost =
+      (GlobalMergingParamOverhead * ParamCount + GlobalMergingCallOverhead) *
+          StableFunctionCount +
+      GlobalMergingExtraThreshold;
+
+  bool Result = Benefit > Cost;
+  LLVM_DEBUG(dbgs() << "isProfitable: Function = " << F->getName() << ", "
+                    << "StableFunctionCount = " << StableFunctionCount
+                    << ", InstCount = " << InstCount
+                    << ", ParamCount = " << ParamCount
+                    << ", Benefit = " << Benefit << ", Cost = " << Cost
+                    << ", Result = " << (Result ? "true" : "false") << "\n");
+  return Result;
+}
+
+bool GlobalMergeFunc::merge(Module &M, const StableFunctionMap *FunctionMap) {
+  bool Changed = false;
+
+  // Build a map from stable function name to function.
+  StringMap<Function *> StableNameToFuncMap;
+  for (auto &F : M)
+    StableNameToFuncMap[get_stable_name(F.getName())] = &F;
+  // Track merged functions
+  DenseSet<Function *> MergedFunctions;
+
+  auto ModId = M.getModuleIdentifier();
+  for (auto &[Hash, SFS] : FunctionMap->getFunctionMap()) {
+    // Compute the parameter locations based on the unique hash sequences
+    // across the candidates.
+    auto ParamLocsVec = computeParamInfo(SFS);
+    LLVM_DEBUG({
+      dbgs() << "[GlobalMergeFunc] Merging hash: " << Hash << " with Params "
+             << ParamLocsVec.size() << "\n";
+    });
+
+    Function *MergedFunc = nullptr;
+    std::string MergedModId;
+    SmallVector<FuncMergeInfo> FuncMergeInfos;
+    for (auto &SF : SFS) {
+      // Get the function from the stable name.
+      auto I = StableNameToFuncMap.find(
+          *FunctionMap->getNameForId(SF->FunctionNameId));
+      if (I == StableNameToFuncMap.end())
+        continue;
+      Function *F = I->second;
+      assert(F);
+      // Skip if the function has been merged before.
+      if (MergedFunctions.count(F))
+        continue;
+      // Consider the function if it is eligible for merging.
+      if (!isEligibleFunction(F))
+        continue;
+
+      auto FI = llvm::StructuralHashWithDifferences(*F, ignoreOp);
+      uint64_t FuncHash = FI.FunctionHash;
+      if (Hash != FuncHash) {
+        ++NumMismatchedFunctionHashGlobalMergeFunction;
+        continue;
+      }
+
+      if (SF->InstCount != FI.IndexInstruction->size()) {
+        ++NumMismatchedInstCountGlobalMergeFunction;
+        continue;
+      }
+      bool HasValidSharedConst = true;
+      for (auto &[Index, Hash] : *SF->IndexOperandHashMap) {
+        auto [InstIndex, OpndIndex] = Index;
+        assert(InstIndex < FI.IndexInstruction->size());
+        auto *Inst = FI.IndexInstruction->lookup(InstIndex);
+        if (!isEligibleOperandForConstantSharing(Inst, OpndIndex)) {
+          HasValidSharedConst = false;
+          break;
+        }
+      }
+      if (!HasValidSharedConst) {
+        ++NumMismatchedConstHashGlobalMergeFunction;
+        continue;
+      }
+      if (!checkConstHashCompatible(*SF->IndexOperandHashMap,
+                                    *FI.IndexOperandHashMap)) {
+        ++NumMismatchedConstHashGlobalMergeFunction;
+        continue;
+      }
+      if (!checkConstLocationCompatible(*SF, *FI.IndexInstruction,
+                                        ParamLocsVec)) {
+        ++NumMismatchedConstHashGlobalMergeFunction;
+        continue;
+      }
+
+      if (!isProfitable(SFS, F))
+        break;
+
+      if (MergedFunc) {
+        // Check if the matched functions fall into the same (first) module.
+        // This module check is not strictly necessary as the functions can move
+        // around. We just want to avoid merging functions from different
+        // modules than the first one in the functon map, as they may not end up
+        // with not being ICFed.
+        if (MergedModId != *FunctionMap->getNameForId(SF->ModuleNameId)) {
+          ++NumMismatchedModuleIdGlobalMergeFunction;
+          continue;
+        }
+      } else {
+        MergedFunc = F;
+        MergedModId = *FunctionMap->getNameForId(SF->ModuleNameId);
+      }
+
+      FuncMergeInfos.push_back({SF.get(), F, std::move(FI.IndexInstruction)});
+      MergedFunctions.insert(F);
+    }
+    unsigned FuncMergeInfoSize = FuncMergeInfos.size();
+    if (FuncMergeInfoSize == 0)
+      continue;
+
+    LLVM_DEBUG({
+      dbgs() << "[GlobalMergeFunc] Merging function count " << FuncMergeInfoSize
+             << " in  " << ModId << "\n";
+    });
+    for (auto &FMI : FuncMergeInfos) {
+      Changed = true;
+
+      // We've already validated all locations of constant operands pointed by
+      // the parameters. Just use the first one to bookkeep the original
+      // constants for each parameter
+      SmallVector<Constant *> Params;
+      SmallVector<Type *> ParamTypes;
+      for (auto &ParamLocs : ParamLocsVec) {
+        assert(!ParamLocs.empty());
+        auto &[InstIndex, OpndIndex] = ParamLocs[0];
+        auto *Inst = FMI.IndexInstruction->lookup(InstIndex);
+        auto *Opnd = cast<Constant>(Inst->getOperand(OpndIndex));
+        Params.push_back(Opnd);
+        ParamTypes.push_back(Opnd->getType());
+      }
+
+      // Create a merged function derived from the first function in the current
+      // module context.
+      Function *MergedFunc =
+          createMergedFunction(FMI, ParamTypes, ParamLocsVec);
+
+      LLVM_DEBUG({
+        dbgs() << "[GlobalMergeFunc] Merged function (hash:" << FMI.SF->Hash
+               << ") " << MergedFunc->getName() << " generated from "
+               << FMI.F->getName() << ":\n";
+        MergedFunc->dump();
+      });
+
+      // Create a thunk to the merged function.
+      createThunk(FMI, Params, MergedFunc);
+      LLVM_DEBUG({
+        dbgs() << "[GlobalMergeFunc] Thunk generated: \n";
+        FMI.F->dump();
+      });
+      ++NumGlobalMergeFunctions;
+    }
+  }
+
+  return Changed;
+}
+
+char GlobalMergeFunc::ID = 0;
+INITIALIZE_PASS_BEGIN(GlobalMergeFunc, "global-merge-func",
+                      "Global merge function pass", false, false)
+INITIALIZE_PASS_END(GlobalMergeFunc, "global-merge-func",
+                    "Global merge function pass", false, false)
+
+StringRef GlobalMergeFunc::getPassName() const {
+  return "Global Merge Functions";
+}
+
+void GlobalMergeFunc::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addUsedIfAvailable<ImmutableModuleSummaryIndexWrapperPass>();
+  AU.setPreservesAll();
+  ModulePass::getAnalysisUsage(AU);
+}
+
+GlobalMergeFunc::GlobalMergeFunc() : ModulePass(ID) {
+  initializeGlobalMergeFuncPass(*llvm::PassRegistry::getPassRegistry());
+}
+
+namespace llvm {
+Pass *createGlobalMergeFuncPass() { return new GlobalMergeFunc(); }
+} // namespace llvm
+
+void GlobalMergeFunc::initializeMergerMode(const Module &M) {
+  LocalFunctionMap = std::make_unique<StableFunctionMap>();
+
+  if (DisableGlobalMerging)
+    return;
+
+  if (auto *IndexWrapperPass =
+          getAnalysisIfAvailable<ImmutableModuleSummaryIndexWrapperPass>()) {
+    auto *TheIndex = IndexWrapperPass->getIndex();
+    // (Full)LTO module does not have functions added to the index.
+    // In this case, we run a local merger without using codegen data.
+    if (TheIndex && !TheIndex->hasExportedFunctions(M))
+      return;
+  }
+
+  if (cgdata::emitCGData())
+    MergerMode = HashFunctionMode::BuildingHashFuncion;
+  else if (cgdata::hasStableFunctionMap())
+    MergerMode = HashFunctionMode::UsingHashFunction;
+}
+
+void GlobalMergeFunc::emitFunctionMap(Module &M) {
+  LLVM_DEBUG({
+    dbgs() << "Emit function map. Size: " << LocalFunctionMap->size() << "\n";
+  });
+  // No need to emit the function map if it is empty.
+  if (LocalFunctionMap->empty())
+    return;
+  SmallVector<char> Buf;
+  raw_svector_ostream OS(Buf);
+
+  StableFunctionMapRecord::serialize(OS, LocalFunctionMap.get());
+
+  std::unique_ptr<MemoryBuffer> Buffer = MemoryBuffer::getMemBuffer(
+      OS.str(), "in-memory stable function map", false);
+
+  Triple TT(M.getTargetTriple());
+  embedBufferInModule(M, *Buffer.get(),
+                      getCodeGenDataSectionName(CG_merge, TT.getObjectFormat()),
+                      Align(4));
+}
+
+bool GlobalMergeFunc::runOnModule(Module &M) {
+  initializeMergerMode(M);
+
+  const StableFunctionMap *FuncMap;
+  if (MergerMode == HashFunctionMode::UsingHashFunction) {
+    // Use the prior CG data to optimistically create global merge candidates.
+    FuncMap = cgdata::getStableFunctionMap();
+  } else {
+    analyze(M);
+    // Emit the local function map to the custom section, __llvm_merge before
+    // finalizing it.
+    if (MergerMode == HashFunctionMode::BuildingHashFuncion)
+      emitFunctionMap(M);
+    LocalFunctionMap->finalize();
+    FuncMap = LocalFunctionMap.get();
+  }
+
+  return merge(M, FuncMap);
+}
diff --git a/llvm/test/ThinLTO/AArch64/cgdata-merge-local.ll b/llvm/test/ThinLTO/AArch64/cgdata-merge-local.ll
new file mode 100644
index 00000000000000..4f5f5c0b5b26d9
--- /dev/null
+++ b/llvm/test/ThinLTO/AArch64/cgdata-merge-local.ll
@@ -0,0 +1,62 @@
+; This test checks if two similar functions, f1 and f2, can be merged locally within a single module
+; while parameterizing a difference in their global variables, g1 and g2.
+; To achieve this, we create two instances of the global merging function, f1.Tgm and f2.Tgm,
+; which are tail-called from thunks g1 and g2 respectively.
+; These identical functions, f1.Tgm and f2.Tgm, will be folded by the linker via Identical Code Folding (IFC).
+
+; RUN: opt -module-summary -module-hash %s -o %t
+
+; RUN: llvm-lto2 run -enable-global-merge-func=false %t -o %tout-nomerge \
+; RUN:    -r %t,_f1,px \
+; RUN:    -r %t,_f2,px \
+; RUN:    -r %t,_g,l -r %t,_g1,l -r %t,_g2,l
+; RUN: llvm-nm %tout-nomerge.1 | FileCheck %s --check-prefix=NOMERGE
+; RUN: llvm-lto2 run -enable-global-merge-func=true %t -o %tout-merge \
+; RUN:    -r %t,_f1,px \
+; RUN:    -r %t,_f2,px \
+; RUN:    -r %t,_g,l -r %t,_g1,l -r %t,_g2,l
+; RUN: llvm-nm %tout-merge.1 | FileCheck %s --check-prefix=GLOBALMERGE
+; RUN: llvm-objdump -d %tout-merge.1 | FileCheck %s --check-prefix=THUNK
+
+; NOMERGE-NOT: _f1.Tgm
+; GLOBALMERGE: _f1.Tgm
+; GLOBALMERGE: _f2.Tgm
+
+; THUNK: <_f1>:
+; THUNK-NEXT: adrp x1,
+; THUNK-NEXT: ldr x1, [x1]
+; THUNK-NEXT: b
+
+; THUNK: <_f2>:
+; THUNK-NEXT: adrp x1,
+; THUNK-NEXT: ldr x1, [x1]
+; THUNK-NEXT: b
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-unknown-ios12.0.0"
+
+ at g = external local_unnamed_addr global [0 x i32], align 4
+ at g1 = external global i32, align 4
+ at g2 = external global i32, align 4
+
+define i32 @f1(i32 %a) {
+entry:
+  %idxprom = sext i32 %a to i64
+  %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i64 0, i64 %idxprom
+  %0 = load i32, i32* %arrayidx, align 4
+  %1 = load volatile i32, i32* @g1, align 4
+  %mul = mul nsw i32 %1, %0
+  %add = add nsw i32 %mul, 1
+  ret i32 %add
+}
+
+define i32 @f2(i32 %a) {
+entry:
+  %idxprom = sext i32 %a to i64
+  %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i64 0, i64 %idxprom
+  %0 = load i32, i32* %arrayidx, align 4
+  %1 = load volatile i32, i32* @g2, align 4
+  %mul = mul nsw i32 %1, %0
+  %add = add nsw i32 %mul, 1
+  ret i32 %add
+}
diff --git a/llvm/test/ThinLTO/AArch64/cgdata-merge-read.ll b/llvm/test/ThinLTO/AArch64/cgdata-merge-read.ll
new file mode 100644
index 00000000000000..da756e7f15e68e
--- /dev/null
+++ b/llvm/test/ThinLTO/AArch64/cgdata-merge-read.ll
@@ -0,0 +1,82 @@
+; This test demonstrates how similar functions are handled during global outlining.
+; Currently, we do not attempt to share an merged function for identical sequences.
+; Instead, each merging instance is created uniquely.
+
+; RUN: rm -rf %t; split-file %s %t
+
+; RUN: opt -module-summary -module-hash %t/foo.ll -o %t-foo.bc
+; RUN: opt -module-summary -module-hash %t/goo.ll -o %t-goo.bc
+
+; First, run with -codegen-data-generate=true to generate the cgdata in the object files.
+; Using llvm-cgdata, merge the cg data.
+; RUN: llvm-lto2 run -enable-global-merge-func=true -codegen-data-generate=true %t-foo.bc %t-goo.bc -o %tout-write \
+; RUN:    -r %t-foo.bc,_f1,px \
+; RUN:    -r %t-goo.bc,_f2,px \
+; RUN:    -r %t-foo.bc,_g,l -r %t-foo.bc,_g1,l -r %t-foo.bc,_g2,l \
+; RUN:    -r %t-goo.bc,_g,l -r %t-goo.bc,_g1,l -r %t-goo.bc,_g2,l
+; RUN: llvm-cgdata --merge -o %tout.cgdata %tout-write.1 %tout-write.2
+
+; Now run with -codegen-data-use-path=%tout.cgdata to optimize the binary.
+; Each module has its own merging instance as it is matched against the merged cgdata.
+; RUN: llvm-lto2 run -enable-global-merge-func=true \
+; RUN:    -codegen-data-use-path=%tout.cgdata \
+; RUN:    %t-foo.bc %t-goo.bc -o %tout-read \
+; RUN:    -r %t-foo.bc,_f1,px \
+; RUN:    -r %t-goo.bc,_f2,px \
+; RUN:    -r %t-foo.bc,_g,l -r %t-foo.bc,_g1,l -r %t-foo.bc,_g2,l \
+; RUN:    -r %t-goo.bc,_g,l -r %t-goo.bc,_g1,l -r %t-goo.bc,_g2,l
+; RUN: llvm-nm %tout-read.1 | FileCheck %s --check-prefix=READ1
+; RUN: llvm-nm %tout-read.2 | FileCheck %s --check-prefix=READ2
+; RUN: llvm-objdump -d %tout-read.1 | FileCheck %s --check-prefix=THUNK1
+; RUN: llvm-objdump -d %tout-read.2 | FileCheck %s --check-prefix=THUNK2
+
+; READ1: _f1.Tgm
+; READ2: _f2.Tgm
+
+; THUNK1: <_f1>:
+; THUNK1-NEXT: adrp x1,
+; THUNK1-NEXT: ldr x1, [x1]
+; THUNK1-NEXT: b
+
+; THUNK2: <_f2>:
+; THUNK2-NEXT: adrp x1,
+; THUNK2-NEXT: ldr x1, [x1]
+; THUNK2-NEXT: b
+
+;--- foo.ll
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-unknown-ios12.0.0"
+
+ at g = external local_unnamed_addr global [0 x i32], align 4
+ at g1 = external global i32, align 4
+ at g2 = external global i32, align 4
+
+define i32 @f1(i32 %a) {
+entry:
+  %idxprom = sext i32 %a to i64
+  %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i64 0, i64 %idxprom
+  %0 = load i32, i32* %arrayidx, align 4
+  %1 = load volatile i32, i32* @g1, align 4
+  %mul = mul nsw i32 %1, %0
+  %add = add nsw i32 %mul, 1
+  ret i32 %add
+}
+
+;--- goo.ll
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-unknown-ios12.0.0"
+
+ at g = external local_unnamed_addr global [0 x i32], align 4
+ at g1 = external global i32, align 4
+ at g2 = external global i32, align 4
+
+define i32 @f2(i32 %a) {
+entry:
+  %idxprom = sext i32 %a to i64
+  %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i64 0, i64 %idxprom
+  %0 = load i32, i32* %arrayidx, align 4
+  %1 = load volatile i32, i32* @g2, align 4
+  %mul = mul nsw i32 %1, %0
+  %add = add nsw i32 %mul, 1
+  ret i32 %add
+}
diff --git a/llvm/test/ThinLTO/AArch64/cgdata-merge-two-rounds.ll b/llvm/test/ThinLTO/AArch64/cgdata-merge-two-rounds.ll
new file mode 100644
index 00000000000000..06880e3d268189
--- /dev/null
+++ b/llvm/test/ThinLTO/AArch64/cgdata-merge-two-rounds.ll
@@ -0,0 +1,68 @@
+; TODO: This test checks if the how similar functions are handled during global outlining
+; by repeating the codegen via -codegen-data-thinlto-two-rounds=true.
+
+; RUN: rm -rf %t; split-file %s %t
+
+; RUN: opt -module-summary -module-hash %t/foo.ll -o %t-foo.bc
+; RUN: opt -module-summary -module-hash %t/goo.ll -o %t-goo.bc
+
+; RUN: llvm-lto2 run -enable-global-merge-func=true -codegen-data-thinlto-two-rounds=true %t-foo.bc %t-goo.bc -o %tout \
+; RUN:    -r %t-foo.bc,_f1,px \
+; RUN:    -r %t-goo.bc,_f2,px \
+; RUN:    -r %t-foo.bc,_g,l -r %t-foo.bc,_g1,l -r %t-foo.bc,_g2,l \
+; RUN:    -r %t-goo.bc,_g,l -r %t-goo.bc,_g1,l -r %t-goo.bc,_g2,l
+; RUN: llvm-nm %tout.1 | FileCheck %s --check-prefix=OUT1
+; RUN: llvm-nm %tout.2 | FileCheck %s --check-prefix=OUT2
+; RUN: llvm-objdump -d %tout.1 | FileCheck %s --check-prefix=THUNK1
+; RUN: llvm-objdump -d %tout.2 | FileCheck %s --check-prefix=THUNK2
+
+; OUT1: _f1.Tgm
+; OUT2: _f2.Tgm
+
+; THUNK1: <_f1>:
+; THUNK1-NEXT: adrp x1,
+; THUNK1-NEXT: ldr x1, [x1]
+; THUNK1-NEXT: b
+
+; THUNK2: <_f2>:
+; THUNK2-NEXT: adrp x1,
+; THUNK2-NEXT: ldr x1, [x1]
+; THUNK2-NEXT: b
+
+;--- foo.ll
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-unknown-ios12.0.0"
+
+ at g = external local_unnamed_addr global [0 x i32], align 4
+ at g1 = external global i32, align 4
+ at g2 = external global i32, align 4
+
+define i32 @f1(i32 %a) {
+entry:
+  %idxprom = sext i32 %a to i64
+  %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i64 0, i64 %idxprom
+  %0 = load i32, i32* %arrayidx, align 4
+  %1 = load volatile i32, i32* @g1, align 4
+  %mul = mul nsw i32 %1, %0
+  %add = add nsw i32 %mul, 1
+  ret i32 %add
+}
+
+;--- goo.ll
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-unknown-ios12.0.0"
+
+ at g = external local_unnamed_addr global [0 x i32], align 4
+ at g1 = external global i32, align 4
+ at g2 = external global i32, align 4
+
+define i32 @f2(i32 %a) {
+entry:
+  %idxprom = sext i32 %a to i64
+  %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i64 0, i64 %idxprom
+  %0 = load i32, i32* %arrayidx, align 4
+  %1 = load volatile i32, i32* @g2, align 4
+  %mul = mul nsw i32 %1, %0
+  %add = add nsw i32 %mul, 1
+  ret i32 %add
+}
diff --git a/llvm/test/ThinLTO/AArch64/cgdata-merge-write.ll b/llvm/test/ThinLTO/AArch64/cgdata-merge-write.ll
new file mode 100644
index 00000000000000..a4022eb885b43f
--- /dev/null
+++ b/llvm/test/ThinLTO/AArch64/cgdata-merge-write.ll
@@ -0,0 +1,97 @@
+; This test verifies whether a stable function is encoded into the __llvm_merge section
+; when the -codegen-data-generate flag is used under -enable-global-merge-func=true.
+
+; RUN: rm -rf %t; split-file %s %t
+
+; RUN: opt -module-summary -module-hash %t/foo.ll -o %t-foo.bc
+; RUN: opt -module-summary -module-hash %t/goo.ll -o %t-goo.bc
+
+; RUN: llvm-lto2 run -enable-global-merge-func=true -codegen-data-generate=false %t-foo.bc %t-goo.bc -o %tout-nowrite \
+; RUN:    -r %t-foo.bc,_f1,px \
+; RUN:    -r %t-goo.bc,_f2,px \
+; RUN:    -r %t-foo.bc,_g,l -r %t-foo.bc,_g1,l -r %t-foo.bc,_g2,l \
+; RUN:    -r %t-goo.bc,_g,l -r %t-goo.bc,_g1,l -r %t-goo.bc,_g2,l
+; RUN: llvm-nm %tout-nowrite.1 | FileCheck %s --check-prefix=NOWRITE
+; RUN: llvm-nm %tout-nowrite.2 | FileCheck %s --check-prefix=NOWRITE
+
+; No merge instance is locally created as each module has a singltone function.
+; NOWRITE-NOT: _f1.Tgm
+; NOWRITE-NOT: _f2.Tgm
+
+; RUN: llvm-lto2 run -enable-global-merge-func=true -codegen-data-generate=true %t-foo.bc %t-goo.bc -o %tout-nowrite \
+; RUN:    -r %t-foo.bc,_f1,px \
+; RUN:    -r %t-goo.bc,_f2,px \
+; RUN:    -r %t-foo.bc,_g,l -r %t-foo.bc,_g1,l -r %t-foo.bc,_g2,l \
+; RUN:    -r %t-goo.bc,_g,l -r %t-goo.bc,_g1,l -r %t-goo.bc,_g2,l
+; RUN: llvm-nm %tout-nowrite.1 | FileCheck %s --check-prefix=WRITE
+; RUN: llvm-nm %tout-nowrite.2 | FileCheck %s --check-prefix=WRITE
+; RUN: llvm-objdump -h %tout-nowrite.1 | FileCheck %s --check-prefix=SECTNAME
+; RUN: llvm-objdump -h %tout-nowrite.2 | FileCheck %s --check-prefix=SECTNAME
+
+; On a write mode, no merging happens yet for each module.
+; We only create stable functions and publish them into __llvm_merge section for each object.
+; WRITE-NOT: _f1.Tgm
+; WRITE-NOT: _f2.Tgm
+; SECTNAME: __llvm_merge
+
+; Merge the cgdata using llvm-cgdata.
+; We now validate the content of the merged cgdata.
+; Two functions have the same hash with only one different constnat at a same location.
+; RUN: llvm-cgdata --merge -o %tout.cgdata %tout-nowrite.1 %tout-nowrite.2
+; RUN: llvm-cgdata --convert %tout.cgdata   -o - | FileCheck %s
+
+; CHECK:      - Hash: [[#%d,HASH:]]
+; CHECK-NEXT:   FunctionName: f1
+; CHECK-NEXT:   ModuleName: {{.*}}
+; CHECK-NEXT:   InstCount: [[#%d,INSTCOUNT:]]
+; CHECK-NEXT:   IndexOperandHashes:
+; CHECK-NEXT:     - InstIndex: [[#%d,INSTINDEX:]]
+; CHECK-NEXT:       OpndIndex: [[#%d,OPNDINDEX:]]
+; CHECK-NEXT:       OpndHash: {{.*}}
+
+; CHECK:      - Hash: [[#%d,HASH]]
+; CHECK-NEXT:   FunctionName: f2
+; CHECK-NEXT:   ModuleName: {{.*}}
+; CHECK-NEXT:   InstCount: [[#%d,INSTCOUNT]]
+; CHECK-NEXT:   IndexOperandHashes:
+; CHECK-NEXT:     - InstIndex: [[#%d,INSTINDEX]]
+; CHECK-NEXT:       OpndIndex: [[#%d,OPNDINDEX]]
+; CHECK-NEXT:       OpndHash: {{.*}}
+
+;--- foo.ll
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-unknown-ios12.0.0"
+
+ at g = external local_unnamed_addr global [0 x i32], align 4
+ at g1 = external global i32, align 4
+ at g2 = external global i32, align 4
+
+define i32 @f1(i32 %a) {
+entry:
+  %idxprom = sext i32 %a to i64
+  %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i64 0, i64 %idxprom
+  %0 = load i32, i32* %arrayidx, align 4
+  %1 = load volatile i32, i32* @g1, align 4
+  %mul = mul nsw i32 %1, %0
+  %add = add nsw i32 %mul, 1
+  ret i32 %add
+}
+
+;--- goo.ll
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-unknown-ios12.0.0"
+
+ at g = external local_unnamed_addr global [0 x i32], align 4
+ at g1 = external global i32, align 4
+ at g2 = external global i32, align 4
+
+define i32 @f2(i32 %a) {
+entry:
+  %idxprom = sext i32 %a to i64
+  %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* @g, i64 0, i64 %idxprom
+  %0 = load i32, i32* %arrayidx, align 4
+  %1 = load volatile i32, i32* @g2, align 4
+  %mul = mul nsw i32 %1, %0
+  %add = add nsw i32 %mul, 1
+  ret i32 %add
+}
diff --git a/llvm/tools/llvm-lto2/CMakeLists.txt b/llvm/tools/llvm-lto2/CMakeLists.txt
index 3b4644d6e27715..6ddccc7f17442f 100644
--- a/llvm/tools/llvm-lto2/CMakeLists.txt
+++ b/llvm/tools/llvm-lto2/CMakeLists.txt
@@ -6,6 +6,7 @@ set(LLVM_LINK_COMPONENTS
   BitReader
   CodeGen
   Core
+  IPO
   Linker
   LTO
   MC
diff --git a/llvm/tools/llvm-lto2/llvm-lto2.cpp b/llvm/tools/llvm-lto2/llvm-lto2.cpp
index d4f022ef021a44..0dc6623552a4bd 100644
--- a/llvm/tools/llvm-lto2/llvm-lto2.cpp
+++ b/llvm/tools/llvm-lto2/llvm-lto2.cpp
@@ -28,6 +28,7 @@
 #include "llvm/Support/PluginLoader.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Support/Threading.h"
+#include "llvm/Transforms/IPO.h"
 #include <atomic>
 
 using namespace llvm;
@@ -195,6 +196,7 @@ static cl::opt<bool> TryUseNewDbgInfoFormat(
 extern cl::opt<bool> UseNewDbgInfoFormat;
 extern cl::opt<cl::boolOrDefault> LoadBitcodeIntoNewDbgInfoFormat;
 extern cl::opt<cl::boolOrDefault> PreserveInputDbgFormat;
+extern cl::opt<bool> EnableGlobalMergeFunc;
 
 static void check(Error E, std::string Msg) {
   if (!E)
@@ -374,6 +376,10 @@ static int run(int argc, char **argv) {
     if (DI.getSeverity() == DS_Error)
       HasErrors = true;
   };
+  Conf.PreCodeGenPassesHook = [](legacy::PassManager &pm) {
+    if (EnableGlobalMergeFunc)
+      pm.add(createGlobalMergeFuncPass());
+  };
 
   LTO::LTOKind LTOMode = LTO::LTOK_Default;
 

>From e86d78bfa0961a1de88569e18ef5707aeaf680ea Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Fri, 18 Oct 2024 10:12:21 -0700
Subject: [PATCH 2/4] Address comments from tschuett

---
 llvm/include/llvm/Transforms/IPO.h            |  3 ++
 .../Transforms/IPO/GlobalMergeFunctions.h     | 45 ++++++++--------
 .../Transforms/IPO/GlobalMergeFunctions.cpp   | 53 +++++++++----------
 3 files changed, 51 insertions(+), 50 deletions(-)

diff --git a/llvm/include/llvm/Transforms/IPO.h b/llvm/include/llvm/Transforms/IPO.h
index 86a8654f56997c..28cf330ad20812 100644
--- a/llvm/include/llvm/Transforms/IPO.h
+++ b/llvm/include/llvm/Transforms/IPO.h
@@ -55,6 +55,9 @@ enum class PassSummaryAction {
   Export, ///< Export information to summary.
 };
 
+/// createGlobalMergeFuncPass - This pass generates merged instances by
+/// parameterizing distinct constants across similar functions, utilizing stable
+/// function hash information.
 Pass *createGlobalMergeFuncPass();
 
 } // End llvm namespace
diff --git a/llvm/include/llvm/Transforms/IPO/GlobalMergeFunctions.h b/llvm/include/llvm/Transforms/IPO/GlobalMergeFunctions.h
index 565be54f89e882..395aeda73b2d0f 100644
--- a/llvm/include/llvm/Transforms/IPO/GlobalMergeFunctions.h
+++ b/llvm/include/llvm/Transforms/IPO/GlobalMergeFunctions.h
@@ -5,23 +5,29 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-///
-/// This file defines global merge functions pass and related data structure.
-///
+//
+// This pass defines the implementation of a function merging mechanism
+// that utilizes a stable function hash to track differences in constants and
+// identify potential merge candidates. The process involves two rounds:
+// 1. The first round collects stable function hashes and identifies merge
+//    candidates with matching hashes. It also computes the set of parameters
+//    that point to different constants during the stable function merge.
+// 2. The second round leverages this collected global function information to
+//    optimistically create a merged function in each module context, ensuring
+//    correct transformation.
+// Similar to the global outliner, this approach uses the linker's deduplication
+// (ICF) to fold identical merged functions, thereby reducing the final binary
+// size. The work is inspired by the concepts discussed in the following paper:
+// https://dl.acm.org/doi/pdf/10.1145/3652032.3657575.
+//
 //===----------------------------------------------------------------------===//
 
-#ifndef PIKA_TRANSFORMS_UTILS_GLOBALMERGEFUNCTIONS_H
-#define PIKA_TRANSFORMS_UTILS_GLOBALMERGEFUNCTIONS_H
+#ifndef LLVM_TRANSFORMS_IPO_GLOBALMERGEFUNCTIONS_H
+#define LLVM_TRANSFORMS_IPO_GLOBALMERGEFUNCTIONS_H
 
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/StableHashing.h"
-#include "llvm/ADT/StringRef.h"
 #include "llvm/CGData/StableFunctionMap.h"
-#include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Pass.h"
-#include <map>
-#include <mutex>
 
 enum class HashFunctionMode {
   Local,
@@ -36,15 +42,10 @@ namespace llvm {
 using ParamLocs = SmallVector<IndexPair, 4>;
 // A vector of parameters
 using ParamLocsVecTy = SmallVector<ParamLocs, 8>;
-// A map of stable hash to a vector of stable functions
-
-/// GlobalMergeFunc finds functions which only differ by constants in
-/// certain instructions, e.g. resulting from specialized functions of layout
-/// compatible types.
-/// Unlike PikaMergeFunc that directly compares IRs, this uses stable function
-/// hash to find the merge candidate. Similar to the global outliner, we can run
-/// codegen twice to collect function merge candidate in the first round, and
-/// merge functions globally in the second round.
+
+/// GlobalMergeFunc is a ModulePass that implements a function merging mechanism
+/// using stable function hashes. It identifies and merges functions with
+/// matching hashes across modules to optimize binary size.
 class GlobalMergeFunc : public ModulePass {
   HashFunctionMode MergerMode = HashFunctionMode::Local;
 
@@ -69,9 +70,9 @@ class GlobalMergeFunc : public ModulePass {
   /// Emit LocalFunctionMap into __llvm_merge section.
   void emitFunctionMap(Module &M);
 
-  /// Merge functions in the module using the global function map.
+  /// Merge functions in the module using the given function map.
   bool merge(Module &M, const StableFunctionMap *FunctionMap);
 };
 
 } // end namespace llvm
-#endif // PIKA_TRANSFORMS_UTILS_GLOBALMERGEFUNCTIONS_H
+#endif // LLVM_TRANSFORMS_IPO_GLOBALMERGEFUNCTIONS_H
diff --git a/llvm/lib/Transforms/IPO/GlobalMergeFunctions.cpp b/llvm/lib/Transforms/IPO/GlobalMergeFunctions.cpp
index 6fd1a66b5a197c..5a52c8f3304186 100644
--- a/llvm/lib/Transforms/IPO/GlobalMergeFunctions.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalMergeFunctions.cpp
@@ -6,16 +6,19 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// TODO:  This implements a function merge using function hash while tracking
-// differences in Constants. This uses stable function hash to find potential
-// merge candidates. The first codegen round collects stable function hashes,
-// and determines the merge candidates that match the stable function hashes.
-// The set of parameters pointing to different Constants are also computed
-// during the stable function merge. The second codegen round uses this global
-// function info to optimistically create a merged function in each module
-// context to guarantee correct transformation. Similar to the global outliner,
-// the linker's deduplication (ICF) folds the identical merged functions to save
-// the final binary size.
+// This pass defines the implementation of a function merging mechanism
+// that utilizes a stable function hash to track differences in constants and
+// create potential merge candidates. The process involves two rounds:
+// 1. The first round collects stable function hashes and identifies merge
+//    candidates with matching hashes. It also computes the set of parameters
+//    that point to different constants during the stable function merge.
+// 2. The second round leverages this collected global function information to
+//    optimistically create a merged function in each module context, ensuring
+//    correct transformation.
+// Similar to the global outliner, this approach uses the linker's deduplication
+// (ICF) to fold identical merged functions, thereby reducing the final binary
+// size. The work is inspired by the concepts discussed in the following paper:
+// https://dl.acm.org/doi/pdf/10.1145/3652032.3657575.
 //
 //===----------------------------------------------------------------------===//
 
@@ -23,9 +26,6 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/ModuleSummaryAnalysis.h"
 #include "llvm/CGData/CodeGenData.h"
-#include "llvm/CGData/StableFunctionMap.h"
-#include "llvm/CodeGen/MachineStableHash.h"
-#include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/StructuralHash.h"
 #include "llvm/InitializePasses.h"
@@ -84,7 +84,7 @@ STATISTIC(NumAnalyzedModues, "Number of modules that are analyzed");
 STATISTIC(NumAnalyzedFunctions, "Number of functions that are analyzed");
 STATISTIC(NumEligibleFunctions, "Number of functions that are eligible");
 
-/// Returns true if the \opIdx operand of \p CI is the callee operand.
+/// Returns true if the \OpIdx operand of \p CI is the callee operand.
 static bool isCalleeOperand(const CallBase *CI, unsigned OpIdx) {
   return &CI->getCalledOperandUse() == &CI->getOperandUse(OpIdx);
 }
@@ -148,22 +148,19 @@ bool isEligibleFunction(Function *F) {
   if (F->hasFnAttribute(llvm::Attribute::NoMerge))
     return false;
 
-  if (F->hasAvailableExternallyLinkage()) {
+  if (F->hasAvailableExternallyLinkage())
     return false;
-  }
 
-  if (F->getFunctionType()->isVarArg()) {
+  if (F->getFunctionType()->isVarArg())
     return false;
-  }
 
   if (F->getCallingConv() == CallingConv::SwiftTail)
     return false;
 
-  // if function contains callsites with musttail, if we merge
+  // If function contains callsites with musttail, if we merge
   // it, the merged function will have the musttail callsite, but
   // the number of parameters can change, thus the parameter count
   // of the callsite will mismatch with the function itself.
-  // if (IgnoreMusttailFunction) {
   for (const BasicBlock &BB : *F) {
     for (const Instruction &I : BB) {
       const auto *CB = dyn_cast<CallBase>(&I);
@@ -203,7 +200,6 @@ static bool ignoreOp(const Instruction *I, unsigned OpIdx) {
   return true;
 }
 
-// copy from merge functions.cpp
 static Value *createCast(IRBuilder<> &Builder, Value *V, Type *DestTy) {
   Type *SrcTy = V->getType();
   if (SrcTy->isStructTy()) {
@@ -252,7 +248,8 @@ void GlobalMergeFunc::analyze(Module &M) {
 
       auto FI = llvm::StructuralHashWithDifferences(Func, ignoreOp);
 
-      // Convert the map to a vector for a serialization-friendly format.
+      // Convert the operand map to a vector for a serialization-friendly
+      // format.
       IndexOperandHashVecType IndexOperandHashes;
       for (auto &Pair : *FI.IndexOperandHashMap)
         IndexOperandHashes.emplace_back(Pair);
@@ -597,7 +594,7 @@ bool GlobalMergeFunc::merge(Module &M, const StableFunctionMap *FunctionMap) {
         // This module check is not strictly necessary as the functions can move
         // around. We just want to avoid merging functions from different
         // modules than the first one in the functon map, as they may not end up
-        // with not being ICFed.
+        // with not being ICFed by the linker.
         if (MergedModId != *FunctionMap->getNameForId(SF->ModuleNameId)) {
           ++NumMismatchedModuleIdGlobalMergeFunction;
           continue;
@@ -618,12 +615,12 @@ bool GlobalMergeFunc::merge(Module &M, const StableFunctionMap *FunctionMap) {
       dbgs() << "[GlobalMergeFunc] Merging function count " << FuncMergeInfoSize
              << " in  " << ModId << "\n";
     });
+
     for (auto &FMI : FuncMergeInfos) {
       Changed = true;
 
       // We've already validated all locations of constant operands pointed by
-      // the parameters. Just use the first one to bookkeep the original
-      // constants for each parameter
+      // the parameters. Populate parameters pointing to the original constants.
       SmallVector<Constant *> Params;
       SmallVector<Type *> ParamTypes;
       for (auto &ParamLocs : ParamLocsVec) {
@@ -635,8 +632,7 @@ bool GlobalMergeFunc::merge(Module &M, const StableFunctionMap *FunctionMap) {
         ParamTypes.push_back(Opnd->getType());
       }
 
-      // Create a merged function derived from the first function in the current
-      // module context.
+      // Create a merged function derived from the current function.
       Function *MergedFunc =
           createMergedFunction(FMI, ParamTypes, ParamLocsVec);
 
@@ -647,7 +643,8 @@ bool GlobalMergeFunc::merge(Module &M, const StableFunctionMap *FunctionMap) {
         MergedFunc->dump();
       });
 
-      // Create a thunk to the merged function.
+      // Transform the current function into a thunk that calls the merged
+      // function.
       createThunk(FMI, Params, MergedFunc);
       LLVM_DEBUG({
         dbgs() << "[GlobalMergeFunc] Thunk generated: \n";

>From ac87d92b8bcbb187f24908d0b993c4c39dfb435b Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Sat, 9 Nov 2024 17:27:02 -0800
Subject: [PATCH 3/4] address comments from ellishg

---
 .../Transforms/IPO/GlobalMergeFunctions.h     |   5 +
 .../Transforms/IPO/GlobalMergeFunctions.cpp   | 126 ++++++------------
 2 files changed, 47 insertions(+), 84 deletions(-)

diff --git a/llvm/include/llvm/Transforms/IPO/GlobalMergeFunctions.h b/llvm/include/llvm/Transforms/IPO/GlobalMergeFunctions.h
index 395aeda73b2d0f..bf2ca6c8977580 100644
--- a/llvm/include/llvm/Transforms/IPO/GlobalMergeFunctions.h
+++ b/llvm/include/llvm/Transforms/IPO/GlobalMergeFunctions.h
@@ -54,6 +54,11 @@ class GlobalMergeFunc : public ModulePass {
 public:
   static char ID;
 
+  /// The suffix used to identify the merged function that parameterizes
+  /// the constant values. Note that the original function, without this suffix,
+  /// becomes a thunk supplying contexts to the merged function via parameters.
+  static constexpr const char MergingInstanceSuffix[] = ".Tgm";
+
   GlobalMergeFunc();
 
   StringRef getPassName() const override;
diff --git a/llvm/lib/Transforms/IPO/GlobalMergeFunctions.cpp b/llvm/lib/Transforms/IPO/GlobalMergeFunctions.cpp
index 5a52c8f3304186..bae5c5b3c8193f 100644
--- a/llvm/lib/Transforms/IPO/GlobalMergeFunctions.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalMergeFunctions.cpp
@@ -6,19 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This pass defines the implementation of a function merging mechanism
-// that utilizes a stable function hash to track differences in constants and
-// create potential merge candidates. The process involves two rounds:
-// 1. The first round collects stable function hashes and identifies merge
-//    candidates with matching hashes. It also computes the set of parameters
-//    that point to different constants during the stable function merge.
-// 2. The second round leverages this collected global function information to
-//    optimistically create a merged function in each module context, ensuring
-//    correct transformation.
-// Similar to the global outliner, this approach uses the linker's deduplication
-// (ICF) to fold identical merged functions, thereby reducing the final binary
-// size. The work is inspired by the concepts discussed in the following paper:
-// https://dl.acm.org/doi/pdf/10.1145/3652032.3657575.
+// This pass implements the global merge function pass.
 //
 //===----------------------------------------------------------------------===//
 
@@ -70,15 +58,15 @@ static cl::opt<unsigned> GlobalMergingExtraThreshold(
 
 extern cl::opt<bool> EnableGlobalMergeFunc;
 
-STATISTIC(NumMismatchedFunctionHashGlobalMergeFunction,
+STATISTIC(NumMismatchedFunctionHash,
           "Number of mismatched function hash for global merge function");
-STATISTIC(NumMismatchedInstCountGlobalMergeFunction,
+STATISTIC(NumMismatchedInstCount,
           "Number of mismatched instruction count for global merge function");
-STATISTIC(NumMismatchedConstHashGlobalMergeFunction,
+STATISTIC(NumMismatchedConstHash,
           "Number of mismatched const hash for global merge function");
-STATISTIC(NumMismatchedModuleIdGlobalMergeFunction,
+STATISTIC(NumMismatchedModuleId,
           "Number of mismatched Module Id for global merge function");
-STATISTIC(NumGlobalMergeFunctions,
+STATISTIC(NumMergedFunctions,
           "Number of functions that are actually merged using function hash");
 STATISTIC(NumAnalyzedModues, "Number of modules that are analyzed");
 STATISTIC(NumAnalyzedFunctions, "Number of functions that are analyzed");
@@ -112,34 +100,6 @@ static bool canParameterizeCallOperand(const CallBase *CI, unsigned OpIdx) {
   return true;
 }
 
-bool isEligibleInstrunctionForConstantSharing(const Instruction *I) {
-  switch (I->getOpcode()) {
-  case Instruction::Load:
-  case Instruction::Store:
-  case Instruction::Call:
-  case Instruction::Invoke:
-    return true;
-  default:
-    return false;
-  }
-}
-
-bool isEligibleOperandForConstantSharing(const Instruction *I, unsigned OpIdx) {
-  assert(OpIdx < I->getNumOperands() && "Invalid operand index");
-
-  if (!isEligibleInstrunctionForConstantSharing(I))
-    return false;
-
-  auto Opnd = I->getOperand(OpIdx);
-  if (!isa<Constant>(Opnd))
-    return false;
-
-  if (const auto *CI = dyn_cast<CallBase>(I))
-    return canParameterizeCallOperand(CI, OpIdx);
-
-  return true;
-}
-
 /// Returns true if function \p F is eligible for merging.
 bool isEligibleFunction(Function *F) {
   if (F->isDeclaration())
@@ -172,8 +132,7 @@ bool isEligibleFunction(Function *F) {
   return true;
 }
 
-static bool
-isEligibleInstrunctionForConstantSharingLocal(const Instruction *I) {
+static bool isEligibleInstrunctionForConstantSharing(const Instruction *I) {
   switch (I->getOpcode()) {
   case Instruction::Load:
   case Instruction::Store:
@@ -188,7 +147,7 @@ isEligibleInstrunctionForConstantSharingLocal(const Instruction *I) {
 static bool ignoreOp(const Instruction *I, unsigned OpIdx) {
   assert(OpIdx < I->getNumOperands() && "Invalid operand index");
 
-  if (!isEligibleInstrunctionForConstantSharingLocal(I))
+  if (!isEligibleInstrunctionForConstantSharing(I))
     return false;
 
   if (!isa<Constant>(I->getOperand(OpIdx)))
@@ -233,10 +192,9 @@ static Value *createCast(IRBuilder<> &Builder, Value *V, Type *DestTy) {
   assert(!DestTy->isArrayTy());
   if (SrcTy->isIntegerTy() && DestTy->isPointerTy())
     return Builder.CreateIntToPtr(V, DestTy);
-  else if (SrcTy->isPointerTy() && DestTy->isIntegerTy())
+  if (SrcTy->isPointerTy() && DestTy->isIntegerTy())
     return Builder.CreatePtrToInt(V, DestTy);
-  else
-    return Builder.CreateBitCast(V, DestTy);
+  return Builder.CreateBitCast(V, DestTy);
 }
 
 void GlobalMergeFunc::analyze(Module &M) {
@@ -268,6 +226,9 @@ struct FuncMergeInfo {
   StableFunctionMap::StableFunctionEntry *SF;
   Function *F;
   std::unique_ptr<IndexInstrMap> IndexInstruction;
+  FuncMergeInfo(StableFunctionMap::StableFunctionEntry *SF, Function *F,
+                std::unique_ptr<IndexInstrMap> IndexInstruction)
+      : SF(SF), F(F), IndexInstruction(std::move(IndexInstruction)) {}
 };
 
 // Given the func info, and the parameterized locations, create and return
@@ -279,7 +240,8 @@ static Function *createMergedFunction(FuncMergeInfo &FI,
   // Synthesize a new merged function name by appending ".Tgm" to the root
   // function's name.
   auto *MergedFunc = FI.F;
-  auto NewFunctionName = MergedFunc->getName().str() + ".Tgm";
+  std::string NewFunctionName =
+      MergedFunc->getName().str() + GlobalMergeFunc::MergingInstanceSuffix;
   auto *M = MergedFunc->getParent();
   assert(!M->getFunction(NewFunctionName));
 
@@ -288,8 +250,8 @@ static Function *createMergedFunction(FuncMergeInfo &FI,
   SmallVector<Type *> ParamTypes(OrigTy->param_begin(), OrigTy->param_end());
   // Append const parameter types that are passed in.
   ParamTypes.append(ConstParamTypes.begin(), ConstParamTypes.end());
-  FunctionType *FuncType =
-      FunctionType::get(OrigTy->getReturnType(), ParamTypes, false);
+  FunctionType *FuncType = FunctionType::get(OrigTy->getReturnType(),
+                                             ParamTypes, /*isVarArg=*/false);
 
   // Declare a new function
   Function *NewFunction =
@@ -326,8 +288,9 @@ static Function *createMergedFunction(FuncMergeInfo &FI,
         IRBuilder<> Builder(Inst->getParent(), Inst->getIterator());
         Inst->setOperand(OpndIndex,
                          createCast(Builder, NewArg, OrigC->getType()));
-      } else
+      } else {
         Inst->setOperand(OpndIndex, NewArg);
+      }
     }
   }
 
@@ -336,7 +299,6 @@ static Function *createMergedFunction(FuncMergeInfo &FI,
 
 // Given the original function (Thunk) and the merged function (ToFunc), create
 // a thunk to the merged function.
-
 static void createThunk(FuncMergeInfo &FI, ArrayRef<Constant *> Params,
                         Function *ToFunc) {
   auto *Thunk = FI.F;
@@ -361,7 +323,6 @@ static void createThunk(FuncMergeInfo &FI, ArrayRef<Constant *> Params,
   // Add new arguments defined by Params.
   for (auto *Param : Params) {
     assert(ParamIdx < ToFuncTy->getNumParams());
-    // FIXME: do not support signing
     Args.push_back(
         createCast(Builder, Param, ToFuncTy->getParamType(ParamIdx)));
     ++ParamIdx;
@@ -431,8 +392,9 @@ checkConstLocationCompatible(const StableFunctionMap::StableFunctionEntry &SF,
       if (!OldHash) {
         OldHash = CurrHash;
         OldConst = CurrConst;
-      } else if (CurrConst != *OldConst || CurrHash != *OldHash)
+      } else if (CurrConst != *OldConst || CurrHash != *OldHash) {
         return false;
+      }
     }
   }
   return true;
@@ -454,8 +416,7 @@ static ParamLocsVecTy computeParamInfo(
     bool Identical = true;
     for (unsigned J = 1; J < StableFunctionCount; ++J) {
       auto &SF = SFS[J];
-      assert(SF->IndexOperandHashMap->count(IndexPair));
-      auto SHash = (*SF->IndexOperandHashMap)[IndexPair];
+      auto SHash = SF->IndexOperandHashMap->at(IndexPair);
       if (Hash != SHash)
         Identical = false;
       ConstHashSeq.push_back(SHash);
@@ -471,9 +432,9 @@ static ParamLocsVecTy computeParamInfo(
   ParamLocsVecTy ParamLocsVec;
   for (auto &[HashSeq, Locs] : HashSeqToLocs) {
     ParamLocsVec.push_back(std::move(Locs));
-    std::sort(
-        ParamLocsVec.begin(), ParamLocsVec.end(),
-        [&](const ParamLocs &L, const ParamLocs &R) { return L[0] < R[0]; });
+    llvm::sort(ParamLocsVec, [&](const ParamLocs &L, const ParamLocs &R) {
+      return L[0] < R[0];
+    });
   }
   return ParamLocsVec;
 }
@@ -527,10 +488,8 @@ bool GlobalMergeFunc::merge(Module &M, const StableFunctionMap *FunctionMap) {
     // Compute the parameter locations based on the unique hash sequences
     // across the candidates.
     auto ParamLocsVec = computeParamInfo(SFS);
-    LLVM_DEBUG({
-      dbgs() << "[GlobalMergeFunc] Merging hash: " << Hash << " with Params "
-             << ParamLocsVec.size() << "\n";
-    });
+    LLVM_DEBUG(dbgs() << "[GlobalMergeFunc] Merging hash: " << Hash
+                      << " with Params " << ParamLocsVec.size() << "\n");
 
     Function *MergedFunc = nullptr;
     std::string MergedModId;
@@ -553,12 +512,12 @@ bool GlobalMergeFunc::merge(Module &M, const StableFunctionMap *FunctionMap) {
       auto FI = llvm::StructuralHashWithDifferences(*F, ignoreOp);
       uint64_t FuncHash = FI.FunctionHash;
       if (Hash != FuncHash) {
-        ++NumMismatchedFunctionHashGlobalMergeFunction;
+        ++NumMismatchedFunctionHash;
         continue;
       }
 
       if (SF->InstCount != FI.IndexInstruction->size()) {
-        ++NumMismatchedInstCountGlobalMergeFunction;
+        ++NumMismatchedInstCount;
         continue;
       }
       bool HasValidSharedConst = true;
@@ -566,23 +525,23 @@ bool GlobalMergeFunc::merge(Module &M, const StableFunctionMap *FunctionMap) {
         auto [InstIndex, OpndIndex] = Index;
         assert(InstIndex < FI.IndexInstruction->size());
         auto *Inst = FI.IndexInstruction->lookup(InstIndex);
-        if (!isEligibleOperandForConstantSharing(Inst, OpndIndex)) {
+        if (!ignoreOp(Inst, OpndIndex)) {
           HasValidSharedConst = false;
           break;
         }
       }
       if (!HasValidSharedConst) {
-        ++NumMismatchedConstHashGlobalMergeFunction;
+        ++NumMismatchedConstHash;
         continue;
       }
       if (!checkConstHashCompatible(*SF->IndexOperandHashMap,
                                     *FI.IndexOperandHashMap)) {
-        ++NumMismatchedConstHashGlobalMergeFunction;
+        ++NumMismatchedConstHash;
         continue;
       }
       if (!checkConstLocationCompatible(*SF, *FI.IndexInstruction,
                                         ParamLocsVec)) {
-        ++NumMismatchedConstHashGlobalMergeFunction;
+        ++NumMismatchedConstHash;
         continue;
       }
 
@@ -596,7 +555,7 @@ bool GlobalMergeFunc::merge(Module &M, const StableFunctionMap *FunctionMap) {
         // modules than the first one in the functon map, as they may not end up
         // with not being ICFed by the linker.
         if (MergedModId != *FunctionMap->getNameForId(SF->ModuleNameId)) {
-          ++NumMismatchedModuleIdGlobalMergeFunction;
+          ++NumMismatchedModuleId;
           continue;
         }
       } else {
@@ -604,17 +563,15 @@ bool GlobalMergeFunc::merge(Module &M, const StableFunctionMap *FunctionMap) {
         MergedModId = *FunctionMap->getNameForId(SF->ModuleNameId);
       }
 
-      FuncMergeInfos.push_back({SF.get(), F, std::move(FI.IndexInstruction)});
+      FuncMergeInfos.emplace_back(SF.get(), F, std::move(FI.IndexInstruction));
       MergedFunctions.insert(F);
     }
     unsigned FuncMergeInfoSize = FuncMergeInfos.size();
     if (FuncMergeInfoSize == 0)
       continue;
 
-    LLVM_DEBUG({
-      dbgs() << "[GlobalMergeFunc] Merging function count " << FuncMergeInfoSize
-             << " in  " << ModId << "\n";
-    });
+    LLVM_DEBUG(dbgs() << "[GlobalMergeFunc] Merging function count "
+                      << FuncMergeInfoSize << " in  " << ModId << "\n");
 
     for (auto &FMI : FuncMergeInfos) {
       Changed = true;
@@ -650,7 +607,7 @@ bool GlobalMergeFunc::merge(Module &M, const StableFunctionMap *FunctionMap) {
         dbgs() << "[GlobalMergeFunc] Thunk generated: \n";
         FMI.F->dump();
       });
-      ++NumGlobalMergeFunctions;
+      ++NumMergedFunctions;
     }
   }
 
@@ -682,8 +639,10 @@ Pass *createGlobalMergeFuncPass() { return new GlobalMergeFunc(); }
 } // namespace llvm
 
 void GlobalMergeFunc::initializeMergerMode(const Module &M) {
+  // Initialize the local function map regardless of the merger mode.
   LocalFunctionMap = std::make_unique<StableFunctionMap>();
 
+  // Skip the global merge mode. The local merge is still enabled.
   if (DisableGlobalMerging)
     return;
 
@@ -703,9 +662,8 @@ void GlobalMergeFunc::initializeMergerMode(const Module &M) {
 }
 
 void GlobalMergeFunc::emitFunctionMap(Module &M) {
-  LLVM_DEBUG({
-    dbgs() << "Emit function map. Size: " << LocalFunctionMap->size() << "\n";
-  });
+  LLVM_DEBUG(dbgs() << "Emit function map. Size: " << LocalFunctionMap->size()
+                    << "\n");
   // No need to emit the function map if it is empty.
   if (LocalFunctionMap->empty())
     return;

>From 1d2436e3c578ccf2e06fa708eef23c6dae75412d Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Sat, 9 Nov 2024 23:47:09 -0800
Subject: [PATCH 4/4] Address comments from nocchijiang

---
 .../IPO => CodeGen}/GlobalMergeFunctions.h    |  26 +++--
 llvm/include/llvm/CodeGen/Passes.h            |   3 +
 llvm/include/llvm/InitializePasses.h          |   2 +-
 llvm/include/llvm/Passes/CodeGenPassBuilder.h |   4 +
 .../llvm/Passes/MachinePassRegistry.def       |   1 +
 .../include/llvm/Target/CGPassBuilderOption.h |   1 +
 llvm/include/llvm/Transforms/IPO.h            |   5 -
 llvm/lib/CodeGen/CMakeLists.txt               |   1 +
 .../IPO => CodeGen}/GlobalMergeFunctions.cpp  | 110 +++++++++++-------
 llvm/lib/CodeGen/TargetPassConfig.cpp         |   4 +
 llvm/lib/LTO/LTO.cpp                          |   1 -
 llvm/lib/Passes/PassBuilder.cpp               |   1 +
 llvm/lib/Passes/PassBuilderPipelines.cpp      |   1 +
 llvm/lib/Passes/PassRegistry.def              |   1 +
 llvm/lib/Transforms/IPO/CMakeLists.txt        |   2 -
 .../ThinLTO/AArch64/cgdata-merge-local.ll     |  65 +++++++----
 llvm/tools/llvm-lto2/CMakeLists.txt           |   1 -
 llvm/tools/llvm-lto2/llvm-lto2.cpp            |   6 -
 18 files changed, 142 insertions(+), 93 deletions(-)
 rename llvm/include/llvm/{Transforms/IPO => CodeGen}/GlobalMergeFunctions.h (84%)
 rename llvm/lib/{Transforms/IPO => CodeGen}/GlobalMergeFunctions.cpp (92%)

diff --git a/llvm/include/llvm/Transforms/IPO/GlobalMergeFunctions.h b/llvm/include/llvm/CodeGen/GlobalMergeFunctions.h
similarity index 84%
rename from llvm/include/llvm/Transforms/IPO/GlobalMergeFunctions.h
rename to llvm/include/llvm/CodeGen/GlobalMergeFunctions.h
index bf2ca6c8977580..982a66270f616e 100644
--- a/llvm/include/llvm/Transforms/IPO/GlobalMergeFunctions.h
+++ b/llvm/include/llvm/CodeGen/GlobalMergeFunctions.h
@@ -22,11 +22,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TRANSFORMS_IPO_GLOBALMERGEFUNCTIONS_H
-#define LLVM_TRANSFORMS_IPO_GLOBALMERGEFUNCTIONS_H
+#ifndef LLVM_CODEGEN_GLOBALMERGEFUNCTIONS_H
+#define LLVM_CODEGEN_GLOBALMERGEFUNCTIONS_H
 
 #include "llvm/CGData/StableFunctionMap.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
 #include "llvm/Pass.h"
 
 enum class HashFunctionMode {
@@ -46,28 +47,24 @@ using ParamLocsVecTy = SmallVector<ParamLocs, 8>;
 /// GlobalMergeFunc is a ModulePass that implements a function merging mechanism
 /// using stable function hashes. It identifies and merges functions with
 /// matching hashes across modules to optimize binary size.
-class GlobalMergeFunc : public ModulePass {
+class GlobalMergeFunc {
   HashFunctionMode MergerMode = HashFunctionMode::Local;
 
   std::unique_ptr<StableFunctionMap> LocalFunctionMap;
 
-public:
-  static char ID;
+  const ModuleSummaryIndex *Index;
 
+public:
   /// The suffix used to identify the merged function that parameterizes
   /// the constant values. Note that the original function, without this suffix,
   /// becomes a thunk supplying contexts to the merged function via parameters.
   static constexpr const char MergingInstanceSuffix[] = ".Tgm";
 
-  GlobalMergeFunc();
-
-  StringRef getPassName() const override;
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override;
+  GlobalMergeFunc(const ModuleSummaryIndex *Index) : Index(Index){};
 
   void initializeMergerMode(const Module &M);
 
-  bool runOnModule(Module &M) override;
+  bool run(Module &M);
 
   /// Analyze module to create stable function into LocalFunctionMap.
   void analyze(Module &M);
@@ -79,5 +76,10 @@ class GlobalMergeFunc : public ModulePass {
   bool merge(Module &M, const StableFunctionMap *FunctionMap);
 };
 
+/// Global function merging pass for new pass manager.
+struct GlobalMergeFuncPass : public PassInfoMixin<GlobalMergeFuncPass> {
+  PreservedAnalyses run(Module &M, AnalysisManager<Module> &);
+};
+
 } // end namespace llvm
-#endif // LLVM_TRANSFORMS_IPO_GLOBALMERGEFUNCTIONS_H
+#endif // LLVM_CODEGEN_GLOBALMERGEFUNCTIONS_H
diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index 72054ab1d3c21a..708ff464b38093 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -507,6 +507,9 @@ namespace llvm {
   /// This pass frees the memory occupied by the MachineFunction.
   FunctionPass *createFreeMachineFunctionPass();
 
+  /// This pass performs merging similar functions globally.
+  ModulePass *createGlobalMergeFuncPass();
+
   /// This pass performs outlining on machine instructions directly before
   /// printing assembly.
   ModulePass *createMachineOutlinerPass(bool RunOnAllFunctions = true);
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index a43a96e90543cc..b27ecd8cdcb8c1 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -123,7 +123,7 @@ void initializeGCEmptyBasicBlocksPass(PassRegistry &);
 void initializeGCMachineCodeAnalysisPass(PassRegistry &);
 void initializeGCModuleInfoPass(PassRegistry &);
 void initializeGVNLegacyPassPass(PassRegistry &);
-void initializeGlobalMergeFuncPass(PassRegistry &);
+void initializeGlobalMergeFuncPassWrapperPass(PassRegistry &);
 void initializeGlobalMergePass(PassRegistry &);
 void initializeGlobalsAAWrapperPassPass(PassRegistry &);
 void initializeHardwareLoopsLegacyPass(PassRegistry &);
diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
index 9e95625fd1d881..3f7d226c7cbffb 100644
--- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h
+++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
@@ -35,6 +35,7 @@
 #include "llvm/CodeGen/FinalizeISel.h"
 #include "llvm/CodeGen/GCMetadata.h"
 #include "llvm/CodeGen/GlobalMerge.h"
+#include "llvm/CodeGen/GlobalMergeFunctions.h"
 #include "llvm/CodeGen/IndirectBrExpand.h"
 #include "llvm/CodeGen/InterleavedAccess.h"
 #include "llvm/CodeGen/InterleavedLoadCombine.h"
@@ -713,6 +714,9 @@ void CodeGenPassBuilder<Derived, TargetMachineT>::addIRPasses(
   // Convert conditional moves to conditional jumps when profitable.
   if (getOptLevel() != CodeGenOptLevel::None && !Opt.DisableSelectOptimize)
     addPass(SelectOptimizePass(&TM));
+
+  if (Opt.EnableGlobalMergeFunc)
+    addPass(GlobalMergeFuncPass());
 }
 
 /// Turn exception handling constructs into something the code generators can
diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def
index 9d12a120ff7ac6..3ceb5ca7d18eda 100644
--- a/llvm/include/llvm/Passes/MachinePassRegistry.def
+++ b/llvm/include/llvm/Passes/MachinePassRegistry.def
@@ -29,6 +29,7 @@ MODULE_PASS("jmc-instrumenter", JMCInstrumenterPass())
 MODULE_PASS("lower-emutls", LowerEmuTLSPass())
 MODULE_PASS("pre-isel-intrinsic-lowering", PreISelIntrinsicLoweringPass())
 MODULE_PASS("shadow-stack-gc-lowering", ShadowStackGCLoweringPass())
+MODULE_PASS("global-merge-func", GlobalMergeFuncPass())
 #undef MODULE_PASS
 
 #ifndef FUNCTION_ANALYSIS
diff --git a/llvm/include/llvm/Target/CGPassBuilderOption.h b/llvm/include/llvm/Target/CGPassBuilderOption.h
index 8ab6d63a00056a..29bdb9c1746d3c 100644
--- a/llvm/include/llvm/Target/CGPassBuilderOption.h
+++ b/llvm/include/llvm/Target/CGPassBuilderOption.h
@@ -31,6 +31,7 @@ struct CGPassBuilderOption {
   bool DisableVerify = false;
   bool EnableImplicitNullChecks = false;
   bool EnableBlockPlacementStats = false;
+  bool EnableGlobalMergeFunc = false;
   bool EnableMachineFunctionSplitter = false;
   bool MISchedPostRA = false;
   bool EarlyLiveIntervals = false;
diff --git a/llvm/include/llvm/Transforms/IPO.h b/llvm/include/llvm/Transforms/IPO.h
index 28cf330ad20812..ee0e35aa618325 100644
--- a/llvm/include/llvm/Transforms/IPO.h
+++ b/llvm/include/llvm/Transforms/IPO.h
@@ -55,11 +55,6 @@ enum class PassSummaryAction {
   Export, ///< Export information to summary.
 };
 
-/// createGlobalMergeFuncPass - This pass generates merged instances by
-/// parameterizing distinct constants across similar functions, utilizing stable
-/// function hash information.
-Pass *createGlobalMergeFuncPass();
-
 } // End llvm namespace
 
 #endif
diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt
index 5a17944db0ae03..5f0496a013795d 100644
--- a/llvm/lib/CodeGen/CMakeLists.txt
+++ b/llvm/lib/CodeGen/CMakeLists.txt
@@ -71,6 +71,7 @@ add_llvm_component_library(LLVMCodeGen
   GCMetadataPrinter.cpp
   GCRootLowering.cpp
   GlobalMerge.cpp
+  GlobalMergeFunctions.cpp
   HardwareLoops.cpp
   IfConversion.cpp
   ImplicitNullChecks.cpp
diff --git a/llvm/lib/Transforms/IPO/GlobalMergeFunctions.cpp b/llvm/lib/CodeGen/GlobalMergeFunctions.cpp
similarity index 92%
rename from llvm/lib/Transforms/IPO/GlobalMergeFunctions.cpp
rename to llvm/lib/CodeGen/GlobalMergeFunctions.cpp
index bae5c5b3c8193f..46b4ea76b655ff 100644
--- a/llvm/lib/Transforms/IPO/GlobalMergeFunctions.cpp
+++ b/llvm/lib/CodeGen/GlobalMergeFunctions.cpp
@@ -10,7 +10,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/Transforms/IPO/GlobalMergeFunctions.h"
+#include "llvm/CodeGen/GlobalMergeFunctions.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/ModuleSummaryAnalysis.h"
 #include "llvm/CGData/CodeGenData.h"
@@ -56,8 +56,6 @@ static cl::opt<unsigned> GlobalMergingExtraThreshold(
              "to be considered beneficial."),
     cl::init(0), cl::Hidden);
 
-extern cl::opt<bool> EnableGlobalMergeFunc;
-
 STATISTIC(NumMismatchedFunctionHash,
           "Number of mismatched function hash for global merge function");
 STATISTIC(NumMismatchedInstCount,
@@ -87,8 +85,12 @@ static bool canParameterizeCallOperand(const CallBase *CI, unsigned OpIdx) {
   if (Callee) {
     if (Callee->isIntrinsic())
       return false;
+    auto Name = Callee->getName();
     // objc_msgSend stubs must be called, and can't have their address taken.
-    if (Callee->getName().starts_with("objc_msgSend$"))
+    if (Name.starts_with("objc_msgSend$"))
+      return false;
+    // Calls to dtrace probes must generate unique patchpoints.
+    if (Name.starts_with("__dtrace"))
       return false;
   }
   if (isCalleeOperand(CI, OpIdx) &&
@@ -105,7 +107,8 @@ bool isEligibleFunction(Function *F) {
   if (F->isDeclaration())
     return false;
 
-  if (F->hasFnAttribute(llvm::Attribute::NoMerge))
+  if (F->hasFnAttribute(llvm::Attribute::NoMerge) ||
+      F->hasFnAttribute(llvm::Attribute::AlwaysInline))
     return false;
 
   if (F->hasAvailableExternallyLinkage())
@@ -552,8 +555,8 @@ bool GlobalMergeFunc::merge(Module &M, const StableFunctionMap *FunctionMap) {
         // Check if the matched functions fall into the same (first) module.
         // This module check is not strictly necessary as the functions can move
         // around. We just want to avoid merging functions from different
-        // modules than the first one in the functon map, as they may not end up
-        // with not being ICFed by the linker.
+        // modules than the first one in the function map, as they may not end
+        // up with being ICFed by the linker.
         if (MergedModId != *FunctionMap->getNameForId(SF->ModuleNameId)) {
           ++NumMismatchedModuleId;
           continue;
@@ -614,30 +617,6 @@ bool GlobalMergeFunc::merge(Module &M, const StableFunctionMap *FunctionMap) {
   return Changed;
 }
 
-char GlobalMergeFunc::ID = 0;
-INITIALIZE_PASS_BEGIN(GlobalMergeFunc, "global-merge-func",
-                      "Global merge function pass", false, false)
-INITIALIZE_PASS_END(GlobalMergeFunc, "global-merge-func",
-                    "Global merge function pass", false, false)
-
-StringRef GlobalMergeFunc::getPassName() const {
-  return "Global Merge Functions";
-}
-
-void GlobalMergeFunc::getAnalysisUsage(AnalysisUsage &AU) const {
-  AU.addUsedIfAvailable<ImmutableModuleSummaryIndexWrapperPass>();
-  AU.setPreservesAll();
-  ModulePass::getAnalysisUsage(AU);
-}
-
-GlobalMergeFunc::GlobalMergeFunc() : ModulePass(ID) {
-  initializeGlobalMergeFuncPass(*llvm::PassRegistry::getPassRegistry());
-}
-
-namespace llvm {
-Pass *createGlobalMergeFuncPass() { return new GlobalMergeFunc(); }
-} // namespace llvm
-
 void GlobalMergeFunc::initializeMergerMode(const Module &M) {
   // Initialize the local function map regardless of the merger mode.
   LocalFunctionMap = std::make_unique<StableFunctionMap>();
@@ -646,14 +625,10 @@ void GlobalMergeFunc::initializeMergerMode(const Module &M) {
   if (DisableGlobalMerging)
     return;
 
-  if (auto *IndexWrapperPass =
-          getAnalysisIfAvailable<ImmutableModuleSummaryIndexWrapperPass>()) {
-    auto *TheIndex = IndexWrapperPass->getIndex();
-    // (Full)LTO module does not have functions added to the index.
-    // In this case, we run a local merger without using codegen data.
-    if (TheIndex && !TheIndex->hasExportedFunctions(M))
-      return;
-  }
+  // (Full)LTO module does not have functions added to the index.
+  // In this case, we run a local merger without using codegen data.
+  if (Index && !Index->hasExportedFunctions(M))
+    return;
 
   if (cgdata::emitCGData())
     MergerMode = HashFunctionMode::BuildingHashFuncion;
@@ -681,7 +656,7 @@ void GlobalMergeFunc::emitFunctionMap(Module &M) {
                       Align(4));
 }
 
-bool GlobalMergeFunc::runOnModule(Module &M) {
+bool GlobalMergeFunc::run(Module &M) {
   initializeMergerMode(M);
 
   const StableFunctionMap *FuncMap;
@@ -700,3 +675,58 @@ bool GlobalMergeFunc::runOnModule(Module &M) {
 
   return merge(M, FuncMap);
 }
+
+namespace {
+
+class GlobalMergeFuncPassWrapper : public ModulePass {
+
+public:
+  static char ID;
+
+  GlobalMergeFuncPassWrapper();
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addUsedIfAvailable<ImmutableModuleSummaryIndexWrapperPass>();
+    AU.setPreservesAll();
+    ModulePass::getAnalysisUsage(AU);
+  }
+
+  StringRef getPassName() const override { return "Global Merge Functions"; }
+
+  bool runOnModule(Module &M) override;
+};
+
+} // namespace
+
+char GlobalMergeFuncPassWrapper::ID = 0;
+INITIALIZE_PASS_BEGIN(GlobalMergeFuncPassWrapper, "global-merge-func",
+                      "Global merge function pass", false, false)
+INITIALIZE_PASS_END(GlobalMergeFuncPassWrapper, "global-merge-func",
+                    "Global merge function pass", false, false)
+
+namespace llvm {
+ModulePass *createGlobalMergeFuncPass() {
+  return new GlobalMergeFuncPassWrapper();
+}
+} // namespace llvm
+
+GlobalMergeFuncPassWrapper::GlobalMergeFuncPassWrapper() : ModulePass(ID) {
+  initializeGlobalMergeFuncPassWrapperPass(
+      *llvm::PassRegistry::getPassRegistry());
+}
+
+bool GlobalMergeFuncPassWrapper::runOnModule(Module &M) {
+  const ModuleSummaryIndex *Index = nullptr;
+  if (auto *IndexWrapperPass =
+          getAnalysisIfAvailable<ImmutableModuleSummaryIndexWrapperPass>())
+    Index = IndexWrapperPass->getIndex();
+
+  return GlobalMergeFunc(Index).run(M);
+}
+
+PreservedAnalyses GlobalMergeFuncPass::run(Module &M,
+                                           AnalysisManager<Module> &AM) {
+  ModuleSummaryIndex *Index = &(AM.getResult<ModuleSummaryIndexAnalysis>(M));
+  bool Changed = GlobalMergeFunc(Index).run(M);
+  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index beeb8ea2d078d9..f486351c1d6636 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -492,6 +492,7 @@ CGPassBuilderOption llvm::getCGPassBuilderOption() {
 
   SET_BOOLEAN_OPTION(EarlyLiveIntervals)
   SET_BOOLEAN_OPTION(EnableBlockPlacementStats)
+  SET_BOOLEAN_OPTION(EnableGlobalMergeFunc)
   SET_BOOLEAN_OPTION(EnableImplicitNullChecks)
   SET_BOOLEAN_OPTION(EnableMachineOutliner)
   SET_BOOLEAN_OPTION(MISchedPostRA)
@@ -890,6 +891,9 @@ void TargetPassConfig::addIRPasses() {
   // Convert conditional moves to conditional jumps when profitable.
   if (getOptLevel() != CodeGenOptLevel::None && !DisableSelectOptimize)
     addPass(createSelectOptimizePass());
+
+  if (EnableGlobalMergeFunc)
+    addPass(createGlobalMergeFuncPass());
 }
 
 /// Turn exception handling constructs into something the code generators can
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 3f88c6c475d4d8..0f53c608512171 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -53,7 +53,6 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/IPO/GlobalMergeFunctions.h"
 #include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
 #include "llvm/Transforms/IPO/WholeProgramDevirt.h"
 #include "llvm/Transforms/Utils/FunctionImportUtils.h"
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index a879918005cad8..65eadd5aabbd89 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -90,6 +90,7 @@
 #include "llvm/CodeGen/FinalizeISel.h"
 #include "llvm/CodeGen/GCMetadata.h"
 #include "llvm/CodeGen/GlobalMerge.h"
+#include "llvm/CodeGen/GlobalMergeFunctions.h"
 #include "llvm/CodeGen/HardwareLoops.h"
 #include "llvm/CodeGen/IndirectBrExpand.h"
 #include "llvm/CodeGen/InterleavedAccess.h"
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 16fe9a74bb9c0d..997c23a6bb5a05 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -24,6 +24,7 @@
 #include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/ScopedNoAliasAA.h"
 #include "llvm/Analysis/TypeBasedAliasAnalysis.h"
+#include "llvm/CodeGen/GlobalMergeFunctions.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/Pass.h"
 #include "llvm/Passes/OptimizationLevel.h"
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 017ae311c55eb4..0de532eb232f93 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -69,6 +69,7 @@ MODULE_PASS("extract-blocks", BlockExtractorPass({}, false))
 MODULE_PASS("expand-variadics", ExpandVariadicsPass(ExpandVariadicsMode::Disable))
 MODULE_PASS("forceattrs", ForceFunctionAttrsPass())
 MODULE_PASS("function-import", FunctionImportPass())
+MODULE_PASS("global-merge-func", GlobalMergeFuncPass())
 MODULE_PASS("globalopt", GlobalOptPass())
 MODULE_PASS("globalsplit", GlobalSplitPass())
 MODULE_PASS("hipstdpar-interpose-alloc", HipStdParAllocationInterpositionPass())
diff --git a/llvm/lib/Transforms/IPO/CMakeLists.txt b/llvm/lib/Transforms/IPO/CMakeLists.txt
index 6a36ed64149f93..15cb57399d2460 100644
--- a/llvm/lib/Transforms/IPO/CMakeLists.txt
+++ b/llvm/lib/Transforms/IPO/CMakeLists.txt
@@ -21,7 +21,6 @@ add_llvm_component_library(LLVMipo
   GlobalDCE.cpp
   GlobalOpt.cpp
   GlobalSplit.cpp
-  GlobalMergeFunctions.cpp
   HotColdSplitting.cpp
   IPO.cpp
   IROutliner.cpp
@@ -61,7 +60,6 @@ add_llvm_component_library(LLVMipo
   Analysis
   BitReader
   BitWriter
-  CGData
   Core
   FrontendOpenMP
   InstCombine
diff --git a/llvm/test/ThinLTO/AArch64/cgdata-merge-local.ll b/llvm/test/ThinLTO/AArch64/cgdata-merge-local.ll
index 4f5f5c0b5b26d9..660ffe6109948f 100644
--- a/llvm/test/ThinLTO/AArch64/cgdata-merge-local.ll
+++ b/llvm/test/ThinLTO/AArch64/cgdata-merge-local.ll
@@ -1,36 +1,51 @@
 ; This test checks if two similar functions, f1 and f2, can be merged locally within a single module
 ; while parameterizing a difference in their global variables, g1 and g2.
 ; To achieve this, we create two instances of the global merging function, f1.Tgm and f2.Tgm,
-; which are tail-called from thunks g1 and g2 respectively.
+; which are tail-called from thunks f1 and f2 respectively.
 ; These identical functions, f1.Tgm and f2.Tgm, will be folded by the linker via Identical Code Folding (IFC).
 
-; RUN: opt -module-summary -module-hash %s -o %t
+; RUN: opt -S --passes=global-merge-func %s | FileCheck %s
 
-; RUN: llvm-lto2 run -enable-global-merge-func=false %t -o %tout-nomerge \
-; RUN:    -r %t,_f1,px \
-; RUN:    -r %t,_f2,px \
-; RUN:    -r %t,_g,l -r %t,_g1,l -r %t,_g2,l
-; RUN: llvm-nm %tout-nomerge.1 | FileCheck %s --check-prefix=NOMERGE
-; RUN: llvm-lto2 run -enable-global-merge-func=true %t -o %tout-merge \
-; RUN:    -r %t,_f1,px \
-; RUN:    -r %t,_f2,px \
-; RUN:    -r %t,_g,l -r %t,_g1,l -r %t,_g2,l
-; RUN: llvm-nm %tout-merge.1 | FileCheck %s --check-prefix=GLOBALMERGE
-; RUN: llvm-objdump -d %tout-merge.1 | FileCheck %s --check-prefix=THUNK
+; A merging instance is created with additional parameter.
+; CHECK: define internal i32 @f1.Tgm(i32 %0, ptr %1)
+; CHECK-NEXT: entry:
+; CHECK-NEXT:  %idxprom = sext i32 %0 to i64
+; CHECK-NEXT:  %arrayidx = getelementptr inbounds [0 x i32], ptr @g, i64 0, i64 %idxprom
+; CHECK-NEXT:  %2 = load i32, ptr %arrayidx, align 4
+; CHECK-NEXT:  %3 = load volatile i32, ptr %1, align 4
+; CHECK-NEXT:  %mul = mul nsw i32 %3, %2
+; CHECK-NEXT:  %add = add nsw i32 %mul, 1
+; CHECK-NEXT:  ret i32 %add
+
+; The original function becomes a thunk passing g1.
+; CHECK: define i32 @f1(i32 %a)
+; CHECK-NEXT:  %1 = tail call i32 @f1.Tgm(i32 %a, ptr @g1)
+; CHECK-NEXT:  ret i32 %1
+
+; A same sequence is produced for f2.Tgm.
+; CHECK: define internal i32 @f2.Tgm(i32 %0, ptr %1)
+; CHECK-NEXT: entry:
+; CHECK-NEXT:  %idxprom = sext i32 %0 to i64
+; CHECK-NEXT:  %arrayidx = getelementptr inbounds [0 x i32], ptr @g, i64 0, i64 %idxprom
+; CHECK-NEXT:  %2 = load i32, ptr %arrayidx, align 4
+; CHECK-NEXT:  %3 = load volatile i32, ptr %1, align 4
+; CHECK-NEXT:  %mul = mul nsw i32 %3, %2
+; CHECK-NEXT:  %add = add nsw i32 %mul, 1
+; CHECK-NEXT:  ret i32 %add
+
+; The original function becomes a thunk passing g2.
+; CHECK: define i32 @f2(i32 %a)
+; CHECK-NEXT:  %1 = tail call i32 @f2.Tgm(i32 %a, ptr @g2)
+; CHECK-NEXT:  ret i32 %1
+
+; RUN: llc -enable-global-merge-func=true < %s | FileCheck %s --check-prefix=MERGE
+; RUN: llc -enable-global-merge-func=false < %s | FileCheck %s --check-prefix=NOMERGE
+
+; MERGE: _f1.Tgm
+; MERGE: _f2.Tgm
 
 ; NOMERGE-NOT: _f1.Tgm
-; GLOBALMERGE: _f1.Tgm
-; GLOBALMERGE: _f2.Tgm
-
-; THUNK: <_f1>:
-; THUNK-NEXT: adrp x1,
-; THUNK-NEXT: ldr x1, [x1]
-; THUNK-NEXT: b
-
-; THUNK: <_f2>:
-; THUNK-NEXT: adrp x1,
-; THUNK-NEXT: ldr x1, [x1]
-; THUNK-NEXT: b
+; NOMERGE-NOT: _f2.Tgm
 
 target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
 target triple = "arm64-unknown-ios12.0.0"
diff --git a/llvm/tools/llvm-lto2/CMakeLists.txt b/llvm/tools/llvm-lto2/CMakeLists.txt
index 6ddccc7f17442f..3b4644d6e27715 100644
--- a/llvm/tools/llvm-lto2/CMakeLists.txt
+++ b/llvm/tools/llvm-lto2/CMakeLists.txt
@@ -6,7 +6,6 @@ set(LLVM_LINK_COMPONENTS
   BitReader
   CodeGen
   Core
-  IPO
   Linker
   LTO
   MC
diff --git a/llvm/tools/llvm-lto2/llvm-lto2.cpp b/llvm/tools/llvm-lto2/llvm-lto2.cpp
index 0dc6623552a4bd..d4f022ef021a44 100644
--- a/llvm/tools/llvm-lto2/llvm-lto2.cpp
+++ b/llvm/tools/llvm-lto2/llvm-lto2.cpp
@@ -28,7 +28,6 @@
 #include "llvm/Support/PluginLoader.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Support/Threading.h"
-#include "llvm/Transforms/IPO.h"
 #include <atomic>
 
 using namespace llvm;
@@ -196,7 +195,6 @@ static cl::opt<bool> TryUseNewDbgInfoFormat(
 extern cl::opt<bool> UseNewDbgInfoFormat;
 extern cl::opt<cl::boolOrDefault> LoadBitcodeIntoNewDbgInfoFormat;
 extern cl::opt<cl::boolOrDefault> PreserveInputDbgFormat;
-extern cl::opt<bool> EnableGlobalMergeFunc;
 
 static void check(Error E, std::string Msg) {
   if (!E)
@@ -376,10 +374,6 @@ static int run(int argc, char **argv) {
     if (DI.getSeverity() == DS_Error)
       HasErrors = true;
   };
-  Conf.PreCodeGenPassesHook = [](legacy::PassManager &pm) {
-    if (EnableGlobalMergeFunc)
-      pm.add(createGlobalMergeFuncPass());
-  };
 
   LTO::LTOKind LTOMode = LTO::LTOK_Default;
 



More information about the llvm-commits mailing list