[clang] [lld] [llvm] Stablefunctest2 (PR #109522)

Sat Sep 21 07:48:40 PDT 2024

https://github.com/kyulee-com updated https://github.com/llvm/llvm-project/pull/109522

>From 5bf1d07adc357d2f8497e13857dee544fd0a377f Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Fri, 26 Apr 2024 20:02:52 -0700
Subject: [PATCH 01/10] [ThinLTO][NFC] Prep for two-codegen rounds

---
 clang/lib/CodeGen/BackendUtil.cpp  |  8 ++--
 llvm/include/llvm/LTO/LTOBackend.h |  1 +
 llvm/lib/LTO/LTO.cpp               | 75 ++++++++++++++++--------------
 llvm/lib/LTO/LTOBackend.cpp        |  6 ++-
 4 files changed, 49 insertions(+), 41 deletions(-)

diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index fa49763e312f13..5f32d086904ea2 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -1320,10 +1320,10 @@ static void runThinLTOBackend(
     Conf.CGFileType = getCodeGenFileType(Action);
     break;
   }
-  if (Error E =
-          thinBackend(Conf, -1, AddStream, *M, *CombinedIndex, ImportList,
-                      ModuleToDefinedGVSummaries[M->getModuleIdentifier()],
-                      /* ModuleMap */ nullptr, CGOpts.CmdArgs)) {
+  if (Error E = thinBackend(
+          Conf, -1, AddStream, *M, *CombinedIndex, ImportList,
+          ModuleToDefinedGVSummaries[M->getModuleIdentifier()],
+          /* ModuleMap */ nullptr, Conf.CodeGenOnly, CGOpts.CmdArgs)) {
     handleAllErrors(std::move(E), [&](ErrorInfoBase &EIB) {
       errs() << "Error running ThinLTO backend: " << EIB.message() << '\n';
     });
diff --git a/llvm/include/llvm/LTO/LTOBackend.h b/llvm/include/llvm/LTO/LTOBackend.h
index de89f4bb10dff2..8516398510d4b8 100644
--- a/llvm/include/llvm/LTO/LTOBackend.h
+++ b/llvm/include/llvm/LTO/LTOBackend.h
@@ -56,6 +56,7 @@ Error thinBackend(const Config &C, unsigned Task, AddStreamFn AddStream,
                   const FunctionImporter::ImportMapTy &ImportList,
                   const GVSummaryMapTy &DefinedGlobals,
                   MapVector<StringRef, BitcodeModule> *ModuleMap,
+                  bool CodeGenOnly,
                   const std::vector<uint8_t> &CmdArgs = std::vector<uint8_t>());
 
 Error finalizeOptimizationRemarks(
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index a88124dacfaefd..f4c25f80811a85 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1473,7 +1473,8 @@ class InProcessThinBackend : public ThinBackendProc {
         return MOrErr.takeError();
 
       return thinBackend(Conf, Task, AddStream, **MOrErr, CombinedIndex,
-                         ImportList, DefinedGlobals, &ModuleMap);
+                         ImportList, DefinedGlobals, &ModuleMap,
+                         Conf.CodeGenOnly);
     };
 
     auto ModuleID = BM.getModuleIdentifier();
@@ -1839,45 +1840,49 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache,
 
   TimeTraceScopeExit.release();
 
-  std::unique_ptr<ThinBackendProc> BackendProc =
-      ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries,
-                      AddStream, Cache);
-
   auto &ModuleMap =
       ThinLTO.ModulesToCompile ? *ThinLTO.ModulesToCompile : ThinLTO.ModuleMap;
 
-  auto ProcessOneModule = [&](int I) -> Error {
-    auto &Mod = *(ModuleMap.begin() + I);
-    // Tasks 0 through ParallelCodeGenParallelismLevel-1 are reserved for
-    // combined module and parallel code generation partitions.
-    return BackendProc->start(RegularLTO.ParallelCodeGenParallelismLevel + I,
-                              Mod.second, ImportLists[Mod.first],
-                              ExportLists[Mod.first], ResolvedODR[Mod.first],
-                              ThinLTO.ModuleMap);
+  auto RunBackends = [&](ThinBackendProc *BackendProcess) -> Error {
+    auto ProcessOneModule = [&](int I) -> Error {
+      auto &Mod = *(ModuleMap.begin() + I);
+      // Tasks 0 through ParallelCodeGenParallelismLevel-1 are reserved for
+      // combined module and parallel code generation partitions.
+      return BackendProcess->start(
+          RegularLTO.ParallelCodeGenParallelismLevel + I, Mod.second,
+          ImportLists[Mod.first], ExportLists[Mod.first],
+          ResolvedODR[Mod.first], ThinLTO.ModuleMap);
+    };
+
+    if (BackendProcess->getThreadCount() == 1) {
+      // Process the modules in the order they were provided on the
+      // command-line. It is important for this codepath to be used for
+      // WriteIndexesThinBackend, to ensure the emitted LinkedObjectsFile lists
+      // ThinLTO objects in the same order as the inputs, which otherwise would
+      // affect the final link order.
+      for (int I = 0, E = ModuleMap.size(); I != E; ++I)
+        if (Error E = ProcessOneModule(I))
+          return E;
+    } else {
+      // When executing in parallel, process largest bitsize modules first to
+      // improve parallelism, and avoid starving the thread pool near the end.
+      // This saves about 15 sec on a 36-core machine while link `clang.exe`
+      // (out of 100 sec).
+      std::vector<BitcodeModule *> ModulesVec;
+      ModulesVec.reserve(ModuleMap.size());
+      for (auto &Mod : ModuleMap)
+        ModulesVec.push_back(&Mod.second);
+      for (int I : generateModulesOrdering(ModulesVec))
+        if (Error E = ProcessOneModule(I))
+          return E;
+    }
+    return BackendProcess->wait();
   };
 
-  if (BackendProc->getThreadCount() == 1) {
-    // Process the modules in the order they were provided on the command-line.
-    // It is important for this codepath to be used for WriteIndexesThinBackend,
-    // to ensure the emitted LinkedObjectsFile lists ThinLTO objects in the same
-    // order as the inputs, which otherwise would affect the final link order.
-    for (int I = 0, E = ModuleMap.size(); I != E; ++I)
-      if (Error E = ProcessOneModule(I))
-        return E;
-  } else {
-    // When executing in parallel, process largest bitsize modules first to
-    // improve parallelism, and avoid starving the thread pool near the end.
-    // This saves about 15 sec on a 36-core machine while link `clang.exe` (out
-    // of 100 sec).
-    std::vector<BitcodeModule *> ModulesVec;
-    ModulesVec.reserve(ModuleMap.size());
-    for (auto &Mod : ModuleMap)
-      ModulesVec.push_back(&Mod.second);
-    for (int I : generateModulesOrdering(ModulesVec))
-      if (Error E = ProcessOneModule(I))
-        return E;
-  }
-  return BackendProc->wait();
+  std::unique_ptr<ThinBackendProc> BackendProc =
+      ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries,
+                      AddStream, Cache);
+  return RunBackends(BackendProc.get());
 }
 
 Expected<std::unique_ptr<ToolOutputFile>> lto::setupLLVMOptimizationRemarks(
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index 4e58cd369c3ac9..880567989baffb 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -565,7 +565,7 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
                        const FunctionImporter::ImportMapTy &ImportList,
                        const GVSummaryMapTy &DefinedGlobals,
                        MapVector<StringRef, BitcodeModule> *ModuleMap,
-                       const std::vector<uint8_t> &CmdArgs) {
+                       bool CodeGenOnly, const std::vector<uint8_t> &CmdArgs) {
   Expected<const Target *> TOrErr = initAndLookupTarget(Conf, Mod);
   if (!TOrErr)
     return TOrErr.takeError();
@@ -586,7 +586,9 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
   Mod.setPartialSampleProfileRatio(CombinedIndex);
 
   LLVM_DEBUG(dbgs() << "Running ThinLTO\n");
-  if (Conf.CodeGenOnly) {
+  if (CodeGenOnly) {
+    // If CodeGenOnly is set, we only perform code generation and skip
+    // optimization.
     codegen(Conf, TM.get(), AddStream, Task, Mod, CombinedIndex);
     return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
   }

>From 4463d99f6f4aa2b1ccfb6a4290c600177726d392 Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Fri, 13 Sep 2024 08:51:00 -0700
Subject: [PATCH 02/10] [CGData][ThinLTO] Global Outlining with Two-CodeGen
 Rounds

---
 llvm/include/llvm/CGData/CodeGenData.h        |  16 +++
 llvm/lib/CGData/CodeGenData.cpp               |  81 +++++++++++++-
 llvm/lib/LTO/CMakeLists.txt                   |   1 +
 llvm/lib/LTO/LTO.cpp                          | 103 +++++++++++++++++-
 llvm/lib/LTO/LTOBackend.cpp                   |  11 ++
 .../test/ThinLTO/AArch64/cgdata-two-rounds.ll |  94 ++++++++++++++++
 llvm/test/ThinLTO/AArch64/lit.local.cfg       |   2 +
 7 files changed, 302 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll
 create mode 100644 llvm/test/ThinLTO/AArch64/lit.local.cfg

diff --git a/llvm/include/llvm/CGData/CodeGenData.h b/llvm/include/llvm/CGData/CodeGenData.h
index 84133a433170fe..1e1afe99327650 100644
--- a/llvm/include/llvm/CGData/CodeGenData.h
+++ b/llvm/include/llvm/CGData/CodeGenData.h
@@ -164,6 +164,22 @@ publishOutlinedHashTree(std::unique_ptr<OutlinedHashTree> HashTree) {
   CodeGenData::getInstance().publishOutlinedHashTree(std::move(HashTree));
 }
 
+/// Initialize the two-codegen rounds.
+void initializeTwoCodegenRounds();
+
+/// Save the current module before the first codegen round.
+void saveModuleForTwoRounds(const Module &TheModule, unsigned Task);
+
+/// Load the current module before the second codegen round.
+std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule,
+                                               unsigned Task,
+                                               LLVMContext &Context);
+
+/// Merge the codegen data from the input files in scratch vector in ThinLTO
+/// two-codegen rounds.
+Error mergeCodeGenData(
+    const std::unique_ptr<std::vector<llvm::SmallString<0>>> InputFiles);
+
 void warn(Error E, StringRef Whence = "");
 void warn(Twine Message, std::string Whence = "", std::string Hint = "");
 
diff --git a/llvm/lib/CGData/CodeGenData.cpp b/llvm/lib/CGData/CodeGenData.cpp
index 55d2504231c744..ff8e5dd7c75790 100644
--- a/llvm/lib/CGData/CodeGenData.cpp
+++ b/llvm/lib/CGData/CodeGenData.cpp
@@ -17,6 +17,7 @@
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Path.h"
 #include "llvm/Support/WithColor.h"
 
 #define DEBUG_TYPE "cg-data"
@@ -30,6 +31,14 @@ cl::opt<bool>
 cl::opt<std::string>
     CodeGenDataUsePath("codegen-data-use-path", cl::init(""), cl::Hidden,
                        cl::desc("File path to where .cgdata file is read"));
+cl::opt<bool> CodeGenDataThinLTOTwoRounds(
+    "codegen-data-thinlto-two-rounds", cl::init(false), cl::Hidden,
+    cl::desc("Enable two-round ThinLTO code generation. The first round "
+             "emits codegen data, while the second round uses the emitted "
+             "codegen data for further optimizations."));
+
+// Path to where the optimized bitcodes are saved and restored for ThinLTO.
+static SmallString<128> CodeGenDataThinLTOTwoRoundsPath;
 
 static std::string getCGDataErrString(cgdata_error Err,
                                       const std::string &ErrMsg = "") {
@@ -139,7 +148,7 @@ CodeGenData &CodeGenData::getInstance() {
   std::call_once(CodeGenData::OnceFlag, []() {
     Instance = std::unique_ptr<CodeGenData>(new CodeGenData());
 
-    if (CodeGenDataGenerate)
+    if (CodeGenDataGenerate || CodeGenDataThinLTOTwoRounds)
       Instance->EmitCGData = true;
     else if (!CodeGenDataUsePath.empty()) {
       // Initialize the global CGData if the input file name is given.
@@ -215,6 +224,76 @@ void warn(Error E, StringRef Whence) {
   }
 }
 
+static std::string getPath(StringRef Dir, unsigned Task) {
+  return (Dir + "/" + llvm::Twine(Task) + ".saved_copy.bc").str();
+}
+
+void initializeTwoCodegenRounds() {
+  assert(CodeGenDataThinLTOTwoRounds);
+  if (auto EC = llvm::sys::fs::createUniqueDirectory(
+          "cgdata", CodeGenDataThinLTOTwoRoundsPath))
+    report_fatal_error(Twine("Failed to create directory: ") + EC.message());
+}
+
+void saveModuleForTwoRounds(const Module &TheModule, unsigned Task) {
+  assert(sys::fs::is_directory(CodeGenDataThinLTOTwoRoundsPath));
+  std::string Path = getPath(CodeGenDataThinLTOTwoRoundsPath, Task);
+  std::error_code EC;
+  raw_fd_ostream OS(Path, EC, sys::fs::OpenFlags::OF_None);
+  if (EC)
+    report_fatal_error(Twine("Failed to open ") + Path +
+                       " to save optimized bitcode: " + EC.message());
+  WriteBitcodeToFile(TheModule, OS, /* ShouldPreserveUseListOrder */ true);
+}
+
+std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule,
+                                               unsigned Task,
+                                               LLVMContext &Context) {
+  assert(sys::fs::is_directory(CodeGenDataThinLTOTwoRoundsPath));
+  std::string Path = getPath(CodeGenDataThinLTOTwoRoundsPath, Task);
+  auto FileOrError = MemoryBuffer::getFile(Path);
+  if (auto EC = FileOrError.getError())
+    report_fatal_error(Twine("Failed to open ") + Path +
+                       " to load optimized bitcode: " + EC.message());
+
+  std::unique_ptr<MemoryBuffer> FileBuffer = std::move(*FileOrError);
+  auto RestoredModule = llvm::parseBitcodeFile(*FileBuffer, Context);
+  if (!RestoredModule)
+    report_fatal_error(Twine("Failed to parse optimized bitcode loaded from ") +
+                       Path + "\n");
+
+  // Restore the original module identifier.
+  (*RestoredModule)->setModuleIdentifier(OrigModule.getModuleIdentifier());
+  return std::move(*RestoredModule);
+}
+
+Error mergeCodeGenData(
+    const std::unique_ptr<std::vector<llvm::SmallString<0>>> InputFiles) {
+
+  OutlinedHashTreeRecord GlobalOutlineRecord;
+  for (auto &InputFile : *(InputFiles)) {
+    if (InputFile.empty())
+      continue;
+    StringRef File = StringRef(InputFile.data(), InputFile.size());
+    std::unique_ptr<MemoryBuffer> Buffer = MemoryBuffer::getMemBuffer(
+        File, "in-memory object file", /*RequiresNullTerminator=*/false);
+    Expected<std::unique_ptr<object::ObjectFile>> BinOrErr =
+        object::ObjectFile::createObjectFile(Buffer->getMemBufferRef());
+    if (!BinOrErr)
+      return BinOrErr.takeError();
+
+    std::unique_ptr<object::ObjectFile> &Obj = BinOrErr.get();
+    if (auto E = CodeGenDataReader::mergeFromObjectFile(Obj.get(),
+                                                        GlobalOutlineRecord))
+      return E;
+  }
+
+  if (!GlobalOutlineRecord.empty())
+    cgdata::publishOutlinedHashTree(std::move(GlobalOutlineRecord.HashTree));
+
+  return Error::success();
+}
+
 } // end namespace cgdata
 
 } // end namespace llvm
diff --git a/llvm/lib/LTO/CMakeLists.txt b/llvm/lib/LTO/CMakeLists.txt
index 69ff08e1f374c4..057d73b6349cf1 100644
--- a/llvm/lib/LTO/CMakeLists.txt
+++ b/llvm/lib/LTO/CMakeLists.txt
@@ -21,6 +21,7 @@ add_llvm_component_library(LLVMLTO
   BinaryFormat
   BitReader
   BitWriter
+  CGData
   CodeGen
   CodeGenTypes
   Core
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index f4c25f80811a85..945f8c859365ea 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -21,6 +21,7 @@
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Bitcode/BitcodeReader.h"
 #include "llvm/Bitcode/BitcodeWriter.h"
+#include "llvm/CGData/CodeGenData.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/Config/llvm-config.h"
 #include "llvm/IR/AutoUpgrade.h"
@@ -70,6 +71,8 @@ static cl::opt<bool>
     DumpThinCGSCCs("dump-thin-cg-sccs", cl::init(false), cl::Hidden,
                    cl::desc("Dump the SCCs in the ThinLTO index's callgraph"));
 
+extern cl::opt<bool> CodeGenDataThinLTOTwoRounds;
+
 namespace llvm {
 /// Enable global value internalization in LTO.
 cl::opt<bool> EnableLTOInternalization(
@@ -1458,7 +1461,7 @@ class InProcessThinBackend : public ThinBackendProc {
           GlobalValue::getGUID(GlobalValue::dropLLVMManglingEscape(Name)));
   }
 
-  Error runThinLTOBackendThread(
+  virtual Error runThinLTOBackendThread(
       AddStreamFn AddStream, FileCache Cache, unsigned Task, BitcodeModule BM,
       ModuleSummaryIndex &CombinedIndex,
       const FunctionImporter::ImportMapTy &ImportList,
@@ -1559,6 +1562,60 @@ class InProcessThinBackend : public ThinBackendProc {
     return BackendThreadPool.getMaxConcurrency();
   }
 };
+
+/// This Backend will run ThinBackend process but throw away all the output from
+/// the codegen. This class facilitates the first codegen round.
+class NoOutputThinBackend : public InProcessThinBackend {
+public:
+  NoOutputThinBackend(
+      const Config &Conf, ModuleSummaryIndex &CombinedIndex,
+      ThreadPoolStrategy ThinLTOParallelism,
+      const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
+      std::unique_ptr<std::vector<llvm::SmallString<0>>> Scratch)
+      : InProcessThinBackend(
+            Conf, CombinedIndex, ThinLTOParallelism, ModuleToDefinedGVSummaries,
+            // Allocate a scratch buffer for each task to write output to.
+            [Allocation = &*Scratch](unsigned Task, const Twine &ModuleName) {
+              return std::make_unique<CachedFileStream>(
+                  std::make_unique<raw_svector_ostream>((*Allocation)[Task]));
+            },
+            FileCache(), nullptr, false, false),
+        Scratch(std::move(Scratch)) {}
+
+  /// Scratch space for writing output during the codegen.
+  std::unique_ptr<std::vector<llvm::SmallString<0>>> Scratch;
+};
+
+/// This Backend performs codegen on bitcode that was previously saved after
+/// going through optimization. This class facilitates the second codegen round.
+class OptimizedBitcodeThinBackend : public InProcessThinBackend {
+public:
+  OptimizedBitcodeThinBackend(
+      const Config &Conf, ModuleSummaryIndex &CombinedIndex,
+      ThreadPoolStrategy ThinLTOParallelism,
+      const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
+      AddStreamFn AddStream)
+      : InProcessThinBackend(Conf, CombinedIndex, ThinLTOParallelism,
+                             ModuleToDefinedGVSummaries, AddStream, FileCache(),
+                             nullptr, false, false) {}
+
+  virtual Error runThinLTOBackendThread(
+      AddStreamFn AddStream, FileCache Cache, unsigned Task, BitcodeModule BM,
+      ModuleSummaryIndex &CombinedIndex,
+      const FunctionImporter::ImportMapTy &ImportList,
+      const FunctionImporter::ExportSetTy &ExportList,
+      const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
+      const GVSummaryMapTy &DefinedGlobals,
+      MapVector<StringRef, BitcodeModule> &ModuleMap) override {
+    LTOLLVMContext BackendContext(Conf);
+    std::unique_ptr<Module> LoadedModule =
+        cgdata::loadModuleForTwoRounds(BM, Task, BackendContext);
+
+    return thinBackend(Conf, Task, AddStream, *LoadedModule, CombinedIndex,
+                       ImportList, DefinedGlobals, &ModuleMap,
+                       /*CodeGenOnly=*/true);
+  }
+};
 } // end anonymous namespace
 
 ThinBackend lto::createInProcessThinBackend(ThreadPoolStrategy Parallelism,
@@ -1879,10 +1936,46 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache,
     return BackendProcess->wait();
   };
 
-  std::unique_ptr<ThinBackendProc> BackendProc =
-      ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries,
-                      AddStream, Cache);
-  return RunBackends(BackendProc.get());
+  if (!CodeGenDataThinLTOTwoRounds) {
+    std::unique_ptr<ThinBackendProc> BackendProc =
+        ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries,
+                        AddStream, Cache);
+    return RunBackends(BackendProc.get());
+  }
+
+  // Perform two rounds of code generation for ThinLTO:
+  // 1. First round: Run optimization and code generation with a scratch output.
+  // 2. Merge codegen data extracted from the scratch output.
+  // 3. Second round: Run code generation again using the merged data.
+  LLVM_DEBUG(dbgs() << "Running ThinLTO two-codegen rounds\n");
+
+  // Initialize a temporary path to store and retrieve optimized IRs for
+  // two-round code generation.
+  cgdata::initializeTwoCodegenRounds();
+
+  // Create a scratch output to hold intermediate results.
+  auto Outputs =
+      std::make_unique<std::vector<llvm::SmallString<0>>>(getMaxTasks());
+  auto FirstRoundLTO = std::make_unique<NoOutputThinBackend>(
+      Conf, ThinLTO.CombinedIndex, llvm::heavyweight_hardware_concurrency(),
+      ModuleToDefinedGVSummaries, std::move(Outputs));
+  // First round: Run optimization and code generation with a scratch output.
+  // Before code generation, serialize modules.
+  if (Error E = RunBackends(FirstRoundLTO.get()))
+    return E;
+
+  // Merge codegen data extracted from the scratch output.
+  if (Error E = cgdata::mergeCodeGenData(std::move(FirstRoundLTO->Scratch)))
+    return E;
+
+  // Second round: Run code generation by reading IRs.
+  std::unique_ptr<ThinBackendProc> SecondRoundLTO =
+      std::make_unique<OptimizedBitcodeThinBackend>(
+          Conf, ThinLTO.CombinedIndex, llvm::heavyweight_hardware_concurrency(),
+          ModuleToDefinedGVSummaries, AddStream);
+  Error E = RunBackends(SecondRoundLTO.get());
+
+  return E;
 }
 
 Expected<std::unique_ptr<ToolOutputFile>> lto::setupLLVMOptimizationRemarks(
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index 880567989baffb..d198e8e5102009 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -20,6 +20,7 @@
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Bitcode/BitcodeReader.h"
 #include "llvm/Bitcode/BitcodeWriter.h"
+#include "llvm/CGData/CodeGenData.h"
 #include "llvm/IR/LLVMRemarkStreamer.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/PassManager.h"
@@ -74,6 +75,8 @@ static cl::opt<bool> ThinLTOAssumeMerged(
     cl::desc("Assume the input has already undergone ThinLTO function "
              "importing and the other pre-optimization pipeline changes."));
 
+extern cl::opt<bool> CodeGenDataThinLTOTwoRounds;
+
 namespace llvm {
 extern cl::opt<bool> NoPGOWarnMismatch;
 }
@@ -599,11 +602,19 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
   auto OptimizeAndCodegen =
       [&](Module &Mod, TargetMachine *TM,
           std::unique_ptr<ToolOutputFile> DiagnosticOutputFile) {
+        // Perform optimization and code generation for ThinLTO.
         if (!opt(Conf, TM, Task, Mod, /*IsThinLTO=*/true,
                  /*ExportSummary=*/nullptr, /*ImportSummary=*/&CombinedIndex,
                  CmdArgs))
           return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
 
+        // Save the current module before the first codegen round.
+        // Note that the second codegen round runs only `codegen()` without
+        // running `opt()`. We're not reaching here as it's bailed out earlier
+        // with CodeGenOnly which has been set in `OptimizedBitcodeThinBackend`.
+        if (CodeGenDataThinLTOTwoRounds)
+          cgdata::saveModuleForTwoRounds(Mod, Task);
+
         codegen(Conf, TM, AddStream, Task, Mod, CombinedIndex);
         return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
       };
diff --git a/llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll b/llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll
new file mode 100644
index 00000000000000..0e082cf4e55e54
--- /dev/null
+++ b/llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll
@@ -0,0 +1,94 @@
+; This test verifies whether we can outline a singleton instance (i.e., an instance that does not repeat)
+; by running two codegen rounds.
+
+; RUN: split-file %s %t
+
+; Verify each outlining instance is singleton with the global outlining for thinlto.
+; They will be identical, which can be folded by the linker with ICF.
+; RUN: opt -module-summary %t/thin-one.ll -o %t/thin-one.bc
+; RUN: opt -module-summary %t/thin-two.ll -o %t/thin-two.bc
+; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc -o %t/thinlto \
+; RUN:  -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \
+; RUN:  -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \
+; RUN:  -codegen-data-thinlto-two-rounds
+
+; thin-one.ll will have one outlining instance (matched in the global outlined hash tree)
+; RUN: llvm-objdump -d %t/thinlto.1 | FileCheck %s --check-prefix=THINLTO-1
+; THINLTO-1: _OUTLINED_FUNCTION{{.*}}>:
+; THINLTO-1-NEXT:  mov
+; THINLTO-1-NEXT:  mov
+; THINLTO-1-NEXT:  b
+
+; thin-two.ll will have two outlining instances (matched in the global outlined hash tree)
+; RUN: llvm-objdump -d %t/thinlto.2 | FileCheck %s --check-prefix=THINLTO-2
+; THINLTO-2: _OUTLINED_FUNCTION{{.*}}>:
+; THINLTO-2-NEXT:  mov
+; THINLTO-2-NEXT:  mov
+; THINLTO-2-NEXT:  b
+; THINLTO-2: _OUTLINED_FUNCTION{{.*}}>:
+; THINLTO-2-NEXT:  mov
+; THINLTO-2-NEXT:  mov
+; THINLTO-2-NEXT:  b
+
+; Now add a lto module to the above thinlto modules.
+; Verify the lto module is optimized independent of the global outlining for thinlto.
+; RUN: opt %t/lto.ll -o %t/lto.bc
+; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc %t/lto.bc -o %t/out \
+; RUN:  -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \
+; RUN:  -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \
+; RUN:  -r %t/lto.bc,_f4,px -r %t/lto.bc,_f5,px -r %t/lto.bc,_f6,px -r %t/lto.bc,_g,x \
+; RUN:  -codegen-data-thinlto-two-rounds
+
+; lto.ll will have one outlining instance within the lto module itself (no global outlining).
+; RUN: llvm-objdump -d %t/out.0 | FileCheck %s --check-prefix=LTO-0
+; LTO-0: _OUTLINED_FUNCTION{{.*}}>:
+; LTO-0-NEXT:  mov
+; LTO-0-NEXT:  b
+
+; thin-one.ll will have one outlining instance (matched in the global outlined hash tree)
+; RUN: llvm-objdump -d %t/out.1 | FileCheck %s --check-prefix=THINLTO-1
+
+; thin-two.ll will have two outlining instances (matched in the global outlined hash tree)
+; RUN: llvm-objdump -d %t/out.2 | FileCheck %s --check-prefix=THINLTO-2
+
+;--- thin-one.ll
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-darwin"
+
+declare i32 @g(i32, i32, i32)
+define i32 @f3() minsize {
+  %1 = call i32 @g(i32 30, i32 1, i32 2);
+ ret i32 %1
+}
+
+;--- thin-two.ll
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-darwin"
+
+declare i32 @g(i32, i32, i32)
+define i32 @f1() minsize {
+  %1 = call i32 @g(i32 10, i32 1, i32 2);
+  ret i32 %1
+}
+define i32 @f2() minsize {
+  %1 = call i32 @g(i32 20, i32 1, i32 2);
+  ret i32 %1
+}
+
+;--- lto.ll
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-darwin"
+
+declare i32 @g(i32, i32, i32)
+define i32 @f4() minsize {
+  %1 = call i32 @g(i32 10, i32 30, i32 2);
+  ret i32 %1
+}
+define i32 @f5() minsize {
+  %1 = call i32 @g(i32 20, i32 40, i32 2);
+  ret i32 %1
+}
+define i32 @f6() minsize {
+  %1 = call i32 @g(i32 50, i32 60, i32 2);
+  ret i32 %1
+}
diff --git a/llvm/test/ThinLTO/AArch64/lit.local.cfg b/llvm/test/ThinLTO/AArch64/lit.local.cfg
new file mode 100644
index 00000000000000..10d4a0e953ed47
--- /dev/null
+++ b/llvm/test/ThinLTO/AArch64/lit.local.cfg
@@ -0,0 +1,2 @@
+if not "AArch64" in config.root.targets:
+    config.unsupported = True

>From cf61a07683a1dce042b73cdd70ecac1c8d1074d4 Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Tue, 17 Sep 2024 18:07:49 -0700
Subject: [PATCH 03/10] Address comments from ellishg

---
 llvm/include/llvm/CGData/CodeGenData.h | 7 ++++---
 llvm/include/llvm/LTO/LTOBackend.h     | 3 ++-
 llvm/lib/CGData/CodeGenData.cpp        | 4 +++-
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/llvm/include/llvm/CGData/CodeGenData.h b/llvm/include/llvm/CGData/CodeGenData.h
index 1e1afe99327650..72b52e6e9b8fd1 100644
--- a/llvm/include/llvm/CGData/CodeGenData.h
+++ b/llvm/include/llvm/CGData/CodeGenData.h
@@ -164,13 +164,14 @@ publishOutlinedHashTree(std::unique_ptr<OutlinedHashTree> HashTree) {
   CodeGenData::getInstance().publishOutlinedHashTree(std::move(HashTree));
 }
 
-/// Initialize the two-codegen rounds.
 void initializeTwoCodegenRounds();
 
-/// Save the current module before the first codegen round.
+/// Save \p TheModule before the first codegen round.
+/// \p Task represents the partition number in the parallel code generation
+/// process.
 void saveModuleForTwoRounds(const Module &TheModule, unsigned Task);
 
-/// Load the current module before the second codegen round.
+/// Load the optimized module before the second codegen round.
 std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule,
                                                unsigned Task,
                                                LLVMContext &Context);
diff --git a/llvm/include/llvm/LTO/LTOBackend.h b/llvm/include/llvm/LTO/LTOBackend.h
index 8516398510d4b8..098c0491dfe70a 100644
--- a/llvm/include/llvm/LTO/LTOBackend.h
+++ b/llvm/include/llvm/LTO/LTOBackend.h
@@ -50,7 +50,8 @@ Error backend(const Config &C, AddStreamFn AddStream,
 /// already been mapped to memory and the corresponding BitcodeModule objects
 /// are saved in the ModuleMap. If \p ModuleMap is nullptr, module files will
 /// be mapped to memory on demand and at any given time during importing, only
-/// one source module will be kept open at the most.
+/// one source module will be kept open at the most. If \p CodeGenOnly is true,
+/// the backend will skip optimization and only perform code generation.
 Error thinBackend(const Config &C, unsigned Task, AddStreamFn AddStream,
                   Module &M, const ModuleSummaryIndex &CombinedIndex,
                   const FunctionImporter::ImportMapTy &ImportList,
diff --git a/llvm/lib/CGData/CodeGenData.cpp b/llvm/lib/CGData/CodeGenData.cpp
index ff8e5dd7c75790..58b92b7262957a 100644
--- a/llvm/lib/CGData/CodeGenData.cpp
+++ b/llvm/lib/CGData/CodeGenData.cpp
@@ -225,7 +225,9 @@ void warn(Error E, StringRef Whence) {
 }
 
 static std::string getPath(StringRef Dir, unsigned Task) {
-  return (Dir + "/" + llvm::Twine(Task) + ".saved_copy.bc").str();
+  llvm::SmallString<128> Path(Dir);
+  llvm::sys::path::append(Path, llvm::Twine(Task) + ".saved_copy.bc");
+  return std::string(Path);
 }
 
 void initializeTwoCodegenRounds() {

>From 0df2785f136efc3b7a74b8f5c520c6ce19d0016a Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Tue, 17 Sep 2024 23:37:51 -0700
Subject: [PATCH 04/10] Address comments from NuriAmari

---
 llvm/lib/CGData/CodeGenData.cpp |  4 ++--
 llvm/lib/LTO/LTO.cpp            | 33 +++++++++++++++++++++------------
 llvm/lib/LTO/LTOBackend.cpp     |  2 +-
 3 files changed, 24 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/CGData/CodeGenData.cpp b/llvm/lib/CGData/CodeGenData.cpp
index 58b92b7262957a..4e21045a67cba6 100644
--- a/llvm/lib/CGData/CodeGenData.cpp
+++ b/llvm/lib/CGData/CodeGenData.cpp
@@ -245,7 +245,7 @@ void saveModuleForTwoRounds(const Module &TheModule, unsigned Task) {
   if (EC)
     report_fatal_error(Twine("Failed to open ") + Path +
                        " to save optimized bitcode: " + EC.message());
-  WriteBitcodeToFile(TheModule, OS, /* ShouldPreserveUseListOrder */ true);
+  WriteBitcodeToFile(TheModule, OS, /*ShouldPreserveUseListOrder=*/true);
 }
 
 std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule,
@@ -259,7 +259,7 @@ std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule,
                        " to load optimized bitcode: " + EC.message());
 
   std::unique_ptr<MemoryBuffer> FileBuffer = std::move(*FileOrError);
-  auto RestoredModule = llvm::parseBitcodeFile(*FileBuffer, Context);
+  auto RestoredModule = parseBitcodeFile(*FileBuffer, Context);
   if (!RestoredModule)
     report_fatal_error(Twine("Failed to parse optimized bitcode loaded from ") +
                        Path + "\n");
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 945f8c859365ea..b51b908fb28760 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1563,11 +1563,14 @@ class InProcessThinBackend : public ThinBackendProc {
   }
 };
 
-/// This Backend will run ThinBackend process but throw away all the output from
-/// the codegen. This class facilitates the first codegen round.
-class NoOutputThinBackend : public InProcessThinBackend {
+/// This backend is utilized in the first round of a two-codegen round process.
+/// It first saves optimized bitcode files to disk before the codegen process
+/// begins. After codegen, it stores the resulting object files in a scratch
+/// buffer. Note the codegen data stored in the scratch buffer will be extracted
+/// and merged in the subsequent step.
+class FirstRoundThinBackend : public InProcessThinBackend {
 public:
-  NoOutputThinBackend(
+  FirstRoundThinBackend(
       const Config &Conf, ModuleSummaryIndex &CombinedIndex,
       ThreadPoolStrategy ThinLTOParallelism,
       const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
@@ -1579,25 +1582,31 @@ class NoOutputThinBackend : public InProcessThinBackend {
               return std::make_unique<CachedFileStream>(
                   std::make_unique<raw_svector_ostream>((*Allocation)[Task]));
             },
-            FileCache(), nullptr, false, false),
+            FileCache(), /*OnWrite=*/nullptr, /*ShouldEmitIndexFiles=*/false,
+            /*ShouldEmitImportsFiles=*/false),
         Scratch(std::move(Scratch)) {}
 
   /// Scratch space for writing output during the codegen.
   std::unique_ptr<std::vector<llvm::SmallString<0>>> Scratch;
 };
 
-/// This Backend performs codegen on bitcode that was previously saved after
-/// going through optimization. This class facilitates the second codegen round.
-class OptimizedBitcodeThinBackend : public InProcessThinBackend {
+/// This backend operates in the second round of a two-codegen round process.
+/// It starts by reading the optimized bitcode files that were saved during the
+/// first round. The backend then executes the codegen only to further optimize
+/// the code, utilizing the codegen data merged from the first round. Finally,
+/// it writes the resulting object files as usual.
+class SecondRoundThinBackend : public InProcessThinBackend {
 public:
-  OptimizedBitcodeThinBackend(
+  SecondRoundThinBackend(
       const Config &Conf, ModuleSummaryIndex &CombinedIndex,
       ThreadPoolStrategy ThinLTOParallelism,
       const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
       AddStreamFn AddStream)
       : InProcessThinBackend(Conf, CombinedIndex, ThinLTOParallelism,
                              ModuleToDefinedGVSummaries, AddStream, FileCache(),
-                             nullptr, false, false) {}
+                             /*OnWrite=*/nullptr,
+                             /*ShouldEmitIndexFiles=*/false,
+                             /*ShouldEmitImportsFiles=*/false) {}
 
   virtual Error runThinLTOBackendThread(
       AddStreamFn AddStream, FileCache Cache, unsigned Task, BitcodeModule BM,
@@ -1956,7 +1965,7 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache,
   // Create a scratch output to hold intermediate results.
   auto Outputs =
       std::make_unique<std::vector<llvm::SmallString<0>>>(getMaxTasks());
-  auto FirstRoundLTO = std::make_unique<NoOutputThinBackend>(
+  auto FirstRoundLTO = std::make_unique<FirstRoundThinBackend>(
       Conf, ThinLTO.CombinedIndex, llvm::heavyweight_hardware_concurrency(),
       ModuleToDefinedGVSummaries, std::move(Outputs));
   // First round: Run optimization and code generation with a scratch output.
@@ -1970,7 +1979,7 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache,
 
   // Second round: Run code generation by reading IRs.
   std::unique_ptr<ThinBackendProc> SecondRoundLTO =
-      std::make_unique<OptimizedBitcodeThinBackend>(
+      std::make_unique<SecondRoundThinBackend>(
           Conf, ThinLTO.CombinedIndex, llvm::heavyweight_hardware_concurrency(),
           ModuleToDefinedGVSummaries, AddStream);
   Error E = RunBackends(SecondRoundLTO.get());
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index d198e8e5102009..cf69f4add53a79 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -611,7 +611,7 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
         // Save the current module before the first codegen round.
         // Note that the second codegen round runs only `codegen()` without
         // running `opt()`. We're not reaching here as it's bailed out earlier
-        // with CodeGenOnly which has been set in `OptimizedBitcodeThinBackend`.
+        // with `CodeGenOnly` which has been set in `SecondRoundThinBackend`.
         if (CodeGenDataThinLTOTwoRounds)
           cgdata::saveModuleForTwoRounds(Mod, Task);
 

>From 2facb52dfd660affe4eccb50352b62313a2f2965 Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Thu, 29 Aug 2024 13:12:24 -0700
Subject: [PATCH 05/10] Refactor Structual Hash

---
 llvm/include/llvm/IR/StructuralHash.h         |  59 +++-
 llvm/lib/IR/StructuralHash.cpp                | 288 ++++++++++++++----
 .../MergeFunc/call-and-invoke-with-ranges.ll  |  16 +-
 llvm/unittests/IR/StructuralHashTest.cpp      |  55 ++++
 4 files changed, 356 insertions(+), 62 deletions(-)

diff --git a/llvm/include/llvm/IR/StructuralHash.h b/llvm/include/llvm/IR/StructuralHash.h
index 57fb45db849110..d5fb0017cd3ce0 100644
--- a/llvm/include/llvm/IR/StructuralHash.h
+++ b/llvm/include/llvm/IR/StructuralHash.h
@@ -14,6 +14,10 @@
 #ifndef LLVM_IR_STRUCTURALHASH_H
 #define LLVM_IR_STRUCTURALHASH_H
 
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/StableHashing.h"
+#include "llvm/IR/Instruction.h"
 #include <cstdint>
 
 namespace llvm {
@@ -21,7 +25,7 @@ namespace llvm {
 class Function;
 class Module;
 
-using IRHash = uint64_t;
+using IRHash = stable_hash;
 
 /// Returns a hash of the function \p F.
 /// \param F The function to hash.
@@ -36,6 +40,59 @@ IRHash StructuralHash(const Function &F, bool DetailedHash = false);
 /// composed the module hash.
 IRHash StructuralHash(const Module &M, bool DetailedHash = false);
 
+/// The pair of an instruction index and a operand index.
+using IndexPair = std::pair<unsigned, unsigned>;
+
+/// A map from an instruction index to an instruction pointer.
+using IndexInstrMap = MapVector<unsigned, Instruction *>;
+
+/// A map from an IndexPair to a stable_hash.
+using IndexOperandHashMapType = DenseMap<IndexPair, stable_hash>;
+
+/// A function that takes an instruction and an operand index and returns true
+/// if the operand should be ignored in the function hash computation.
+using IgnoreOperandFunc = std::function<bool(const Instruction *, unsigned)>;
+
+struct FunctionHashInfo {
+  /// A hash value representing the structural content of the function
+  IRHash FunctionHash;
+  /// A mapping from instruction indices to instruction pointers
+  std::unique_ptr<IndexInstrMap> IndexInstruction;
+  /// A mapping from pairs of instruction indices and operand indices
+  /// to the hashes of the operands. This can be used to analyze or
+  /// reconstruct the differences in ignored operands
+  std::unique_ptr<IndexOperandHashMapType> IndexOperandHashMap;
+
+  FunctionHashInfo(IRHash FuntionHash,
+                   std::unique_ptr<IndexInstrMap> IndexInstruction,
+                   std::unique_ptr<IndexOperandHashMapType> IndexOperandHashMap)
+      : FunctionHash(FuntionHash),
+        IndexInstruction(std::move(IndexInstruction)),
+        IndexOperandHashMap(std::move(IndexOperandHashMap)) {}
+#if 0 // TODO: delete
+  // Disable copy operations
+  FunctionHashInfo(const FunctionHashInfo &) = delete;
+  FunctionHashInfo &operator=(const FunctionHashInfo &) = delete;
+  // Enable move operations
+  FunctionHashInfo(FunctionHashInfo &&) noexcept = default;
+  FunctionHashInfo &operator=(FunctionHashInfo &&) noexcept = default;
+#endif
+};
+
+/// Computes a structural hash of a given function, considering the structure
+/// and content of the function's instructions while allowing for selective
+/// ignoring of certain operands based on custom criteria. This hash can be used
+/// to identify functions that are structurally similar or identical, which is
+/// useful in optimizations, deduplication, or analysis tasks.
+/// \param F The function to hash.
+/// \param IgnoreOp A callable that takes an instruction and one of its operands
+/// and returns true if an operand should be ignored in the hash computation.
+/// for the hash computation.
+/// \return A FunctionHashInfo structure
+
+FunctionHashInfo StructuralHashWithDifferences(const Function &F,
+                                               IgnoreOperandFunc IgnoreOp);
+
 } // end namespace llvm
 
 #endif
diff --git a/llvm/lib/IR/StructuralHash.cpp b/llvm/lib/IR/StructuralHash.cpp
index fb4f33a021a96b..db23786036b4b8 100644
--- a/llvm/lib/IR/StructuralHash.cpp
+++ b/llvm/lib/IR/StructuralHash.cpp
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/IR/StructuralHash.h"
-#include "llvm/ADT/Hashing.h"
+#include "llvm/CodeGen/MachineStableHash.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/InstrTypes.h"
@@ -24,61 +24,218 @@ namespace {
 // by the MergeFunctions pass.
 
 class StructuralHashImpl {
-  uint64_t Hash = 4;
+  stable_hash Hash = 4;
 
-  void hash(uint64_t V) { Hash = hashing::detail::hash_16_bytes(Hash, V); }
+  bool DetailedHash;
 
-  // This will produce different values on 32-bit and 64-bit systens as
-  // hash_combine returns a size_t. However, this is only used for
-  // detailed hashing which, in-tree, only needs to distinguish between
-  // differences in functions.
-  template <typename T> void hashArbitaryType(const T &V) {
-    hash(hash_combine(V));
-  }
+  IgnoreOperandFunc IgnoreOp = nullptr;
+  std::unique_ptr<IndexInstrMap> IndexInstruction = nullptr;
+  std::unique_ptr<IndexOperandHashMapType> IndexOperandHashMap = nullptr;
+
+  DenseMap<const Value *, int> ValueToId;
 
-  void hashType(Type *ValueType) {
-    hash(ValueType->getTypeID());
-    if (ValueType->isIntegerTy())
-      hash(ValueType->getIntegerBitWidth());
+  stable_hash hashType(Type *Ty) {
+    SmallVector<stable_hash> Hashes;
+    Hashes.emplace_back(Ty->getTypeID());
+    if (Ty->isIntegerTy())
+      Hashes.emplace_back(Ty->getIntegerBitWidth());
+    return stable_hash_combine(Hashes);
   }
 
 public:
-  StructuralHashImpl() = default;
-
-  void updateOperand(Value *Operand) {
-    hashType(Operand->getType());
-
-    // The cases enumerated below are not exhaustive and are only aimed to
-    // get decent coverage over the function.
-    if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(Operand)) {
-      hashArbitaryType(ConstInt->getValue());
-    } else if (ConstantFP *ConstFP = dyn_cast<ConstantFP>(Operand)) {
-      hashArbitaryType(ConstFP->getValue());
-    } else if (Argument *Arg = dyn_cast<Argument>(Operand)) {
-      hash(Arg->getArgNo());
-    } else if (Function *Func = dyn_cast<Function>(Operand)) {
-      // Hashing the name will be deterministic as LLVM's hashing infrastructure
-      // has explicit support for hashing strings and will not simply hash
-      // the pointer.
-      hashArbitaryType(Func->getName());
+  StructuralHashImpl() = delete;
+  explicit StructuralHashImpl(bool DetailedHash,
+                              IgnoreOperandFunc IgnoreOp = nullptr)
+      : DetailedHash(DetailedHash), IgnoreOp(IgnoreOp) {
+    if (IgnoreOp) {
+      IndexInstruction = std::make_unique<IndexInstrMap>();
+      IndexOperandHashMap = std::make_unique<IndexOperandHashMapType>();
+    }
+  }
+
+  // hash_value for APInt should be stable
+  stable_hash hashAPInt(const APInt &I) { return hash_value(I); }
+
+  stable_hash hashAPFloat(const APFloat &F) {
+    SmallVector<stable_hash> Hashes;
+    const fltSemantics &S = F.getSemantics();
+    Hashes.emplace_back(APFloat::semanticsPrecision(S));
+    Hashes.emplace_back(APFloat::semanticsMaxExponent(S));
+    Hashes.emplace_back(APFloat::semanticsMinExponent(S));
+    Hashes.emplace_back(APFloat::semanticsSizeInBits(S));
+    Hashes.emplace_back(hashAPInt(F.bitcastToAPInt()));
+    return stable_hash_combine(Hashes);
+  }
+
+  stable_hash hashGlobalValue(const GlobalValue *GV) {
+    if (!GV->hasName())
+      return 0;
+    return stable_hash_name(GV->getName());
+  }
+
+  stable_hash hashConstant(const Constant *C) {
+    SmallVector<stable_hash> Hashes;
+
+    Type *Ty = C->getType();
+    Hashes.emplace_back(hashType(Ty));
+
+    if (C->isNullValue()) {
+      Hashes.emplace_back(static_cast<stable_hash>('N'));
+      return stable_hash_combine(Hashes);
+    }
+
+    auto *G = dyn_cast<GlobalValue>(C);
+    if (G) {
+      Hashes.emplace_back(hashGlobalValue(G));
+      return stable_hash_combine(Hashes);
+    }
+
+    if (const auto *Seq = dyn_cast<ConstantDataSequential>(C)) {
+      Hashes.emplace_back(xxh3_64bits(Seq->getRawDataValues()));
+      return stable_hash_combine(Hashes);
+    }
+
+    switch (C->getValueID()) {
+    case Value::UndefValueVal:
+    case Value::PoisonValueVal:
+    case Value::ConstantTokenNoneVal: {
+      return stable_hash_combine(Hashes);
+    }
+    case Value::ConstantIntVal: {
+      const APInt &Int = cast<ConstantInt>(C)->getValue();
+      Hashes.emplace_back(hashAPInt(Int));
+      return stable_hash_combine(Hashes);
+    }
+    case Value::ConstantFPVal: {
+      const APFloat &APF = cast<ConstantFP>(C)->getValueAPF();
+      Hashes.emplace_back(hashAPFloat(APF));
+      return stable_hash_combine(Hashes);
+    }
+    case Value::ConstantArrayVal: {
+      const ConstantArray *A = cast<ConstantArray>(C);
+      uint64_t NumElements = cast<ArrayType>(Ty)->getNumElements();
+      Hashes.emplace_back(NumElements);
+      for (uint64_t i = 0; i < NumElements; ++i) {
+        auto H = hashConstant(cast<Constant>(A->getOperand(i)));
+        Hashes.emplace_back(H);
+      }
+      return stable_hash_combine(Hashes);
+    }
+    case Value::ConstantStructVal: {
+      const ConstantStruct *S = cast<ConstantStruct>(C);
+      unsigned NumElements = cast<StructType>(Ty)->getNumElements();
+      Hashes.emplace_back(NumElements);
+      for (unsigned i = 0; i != NumElements; ++i) {
+        auto H = hashConstant(cast<Constant>(S->getOperand(i)));
+        Hashes.emplace_back(H);
+      }
+      return stable_hash_combine(Hashes);
+    }
+    case Value::ConstantVectorVal: {
+      const ConstantVector *V = cast<ConstantVector>(C);
+      unsigned NumElements = cast<FixedVectorType>(Ty)->getNumElements();
+      Hashes.emplace_back(NumElements);
+      for (unsigned i = 0; i != NumElements; ++i) {
+        auto H = hashConstant(cast<Constant>(V->getOperand(i)));
+        Hashes.emplace_back(H);
+      }
+      return stable_hash_combine(Hashes);
+    }
+    case Value::ConstantExprVal: {
+      const ConstantExpr *E = cast<ConstantExpr>(C);
+      unsigned NumOperands = E->getNumOperands();
+      Hashes.emplace_back(NumOperands);
+      for (unsigned i = 0; i < NumOperands; ++i) {
+        auto H = hashConstant(cast<Constant>(E->getOperand(i)));
+        Hashes.emplace_back(H);
+      }
+      // TODO: GEPOperator
+      return stable_hash_combine(Hashes);
     }
+    case Value::BlockAddressVal: {
+      const BlockAddress *BA = cast<BlockAddress>(C);
+      auto H = hashGlobalValue(BA->getFunction());
+      Hashes.emplace_back(H);
+      // TODO: handle BBs in the same function. can we reference a block
+      // in another TU?
+      return stable_hash_combine(Hashes);
+    }
+    case Value::DSOLocalEquivalentVal: {
+      const auto *Equiv = cast<DSOLocalEquivalent>(C);
+      auto H = hashGlobalValue(Equiv->getGlobalValue());
+      Hashes.emplace_back(H);
+      return stable_hash_combine(Hashes);
+    }
+    default: // Unknown constant, abort.
+      llvm_unreachable("Constant ValueID not recognized.");
+    }
+    return Hash;
+  }
+
+  /// Hash a value in order similar to FunctionCompartor::cmpValue().
+  /// If this is the first time the value are seen, it's added to the mapping
+  /// so that we can use its index for hash computation.
+  stable_hash hashValue(Value *V) {
+    SmallVector<stable_hash> Hashes;
+
+    // Check constant and return its hash.
+    const Constant *C = dyn_cast<Constant>(V);
+    if (C) {
+      Hashes.emplace_back(hashConstant(C));
+      return stable_hash_combine(Hashes);
+    }
+
+    // Hash argument number.
+    if (Argument *Arg = dyn_cast<Argument>(V))
+      Hashes.emplace_back(Arg->getArgNo());
+
+    // Get an index (an insertion order) for the non-constant value.
+    auto I = ValueToId.insert({V, ValueToId.size()});
+    Hashes.emplace_back(I.first->second);
+
+    return stable_hash_combine(Hashes);
+  }
+
+  stable_hash hashOperand(Value *Operand) {
+    SmallVector<stable_hash> Hashes;
+    Hashes.emplace_back(hashType(Operand->getType()));
+    Hashes.emplace_back(hashValue(Operand));
+
+    return stable_hash_combine(Hashes);
   }
 
-  void updateInstruction(const Instruction &Inst, bool DetailedHash) {
-    hash(Inst.getOpcode());
+  stable_hash hashInstruction(const Instruction &Inst) {
+    SmallVector<stable_hash> Hashes;
+    Hashes.emplace_back(Inst.getOpcode());
 
     if (!DetailedHash)
-      return;
+      return stable_hash_combine(Hashes);
 
-    hashType(Inst.getType());
+    Hashes.emplace_back(hashType(Inst.getType()));
 
     // Handle additional properties of specific instructions that cause
     // semantic differences in the IR.
+    // TODO: expand cmpOperations for different type of instructions
     if (const auto *ComparisonInstruction = dyn_cast<CmpInst>(&Inst))
-      hash(ComparisonInstruction->getPredicate());
+      Hashes.emplace_back(ComparisonInstruction->getPredicate());
+
+    unsigned InstIdx = 0;
+    if (IndexInstruction) {
+      InstIdx = IndexInstruction->size();
+      IndexInstruction->insert({InstIdx, const_cast<Instruction *>(&Inst)});
+    }
+
+    for (unsigned OpndIdx = 0; OpndIdx < Inst.getNumOperands(); ++OpndIdx) {
+      auto *Op = Inst.getOperand(OpndIdx);
+      auto OpndHash = hashOperand(Op);
+      if (IgnoreOp && IgnoreOp(&Inst, OpndIdx)) {
+        assert(IndexPairOpndHash);
+        IndexOperandHashMap->insert({{InstIdx, OpndIdx}, OpndHash});
+      } else
+        Hashes.emplace_back(OpndHash);
+    }
 
-    for (const auto &Op : Inst.operands())
-      updateOperand(Op);
+    return stable_hash_combine(Hashes);
   }
 
   // A function hash is calculated by considering only the number of arguments
@@ -97,15 +254,17 @@ class StructuralHashImpl {
   // expensive checks for pass modification status). When modifying this
   // function, most changes should be gated behind an option and enabled
   // selectively.
-  void update(const Function &F, bool DetailedHash) {
+  void update(const Function &F) {
     // Declarations don't affect analyses.
     if (F.isDeclaration())
       return;
 
-    hash(0x62642d6b6b2d6b72); // Function header
+    SmallVector<stable_hash> Hashes;
+    Hashes.emplace_back(Hash);
+    Hashes.emplace_back(0x62642d6b6b2d6b72); // Function header
 
-    hash(F.isVarArg());
-    hash(F.arg_size());
+    Hashes.emplace_back(F.isVarArg());
+    Hashes.emplace_back(F.arg_size());
 
     SmallVector<const BasicBlock *, 8> BBs;
     SmallPtrSet<const BasicBlock *, 16> VisitedBBs;
@@ -121,14 +280,17 @@ class StructuralHashImpl {
       // This random value acts as a block header, as otherwise the partition of
       // opcodes into BBs wouldn't affect the hash, only the order of the
       // opcodes
-      hash(45798);
+      Hashes.emplace_back(45798);
       for (auto &Inst : *BB)
-        updateInstruction(Inst, DetailedHash);
+        Hashes.emplace_back(hashInstruction(Inst));
 
       for (const BasicBlock *Succ : successors(BB))
         if (VisitedBBs.insert(Succ).second)
           BBs.push_back(Succ);
     }
+
+    // Update the combined hash in place.
+    Hash = stable_hash_combine(Hashes);
   }
 
   void update(const GlobalVariable &GV) {
@@ -137,30 +299,50 @@ class StructuralHashImpl {
     // we ignore anything with the `.llvm` prefix
     if (GV.isDeclaration() || GV.getName().starts_with("llvm."))
       return;
-    hash(23456); // Global header
-    hash(GV.getValueType()->getTypeID());
+    SmallVector<stable_hash> Hashes;
+    Hashes.emplace_back(Hash);
+    Hashes.emplace_back(23456); // Global header
+    Hashes.emplace_back(GV.getValueType()->getTypeID());
+
+    // Update the combined hash in place.
+    Hash = stable_hash_combine(Hashes);
   }
 
-  void update(const Module &M, bool DetailedHash) {
+  void update(const Module &M) {
     for (const GlobalVariable &GV : M.globals())
       update(GV);
     for (const Function &F : M)
-      update(F, DetailedHash);
+      update(F);
   }
 
   uint64_t getHash() const { return Hash; }
+  std::unique_ptr<IndexInstrMap> getIndexInstrMap() {
+    return std::move(IndexInstruction);
+  }
+  std::unique_ptr<IndexOperandHashMapType> getIndexPairOpndHashMap() {
+    return std::move(IndexOperandHashMap);
+  }
 };
 
 } // namespace
 
 IRHash llvm::StructuralHash(const Function &F, bool DetailedHash) {
-  StructuralHashImpl H;
-  H.update(F, DetailedHash);
+  StructuralHashImpl H(DetailedHash);
+  H.update(F);
   return H.getHash();
 }
 
 IRHash llvm::StructuralHash(const Module &M, bool DetailedHash) {
-  StructuralHashImpl H;
-  H.update(M, DetailedHash);
+  StructuralHashImpl H(DetailedHash);
+  H.update(M);
   return H.getHash();
 }
+
+FunctionHashInfo
+llvm::StructuralHashWithDifferences(const Function &F,
+                                    IgnoreOperandFunc IgnoreOp) {
+  StructuralHashImpl H(/*DetailedHash=*/true, IgnoreOp);
+  H.update(F);
+  return FunctionHashInfo(H.getHash(), H.getIndexInstrMap(),
+                          H.getIndexPairOpndHashMap());
+}
diff --git a/llvm/test/Transforms/MergeFunc/call-and-invoke-with-ranges.ll b/llvm/test/Transforms/MergeFunc/call-and-invoke-with-ranges.ll
index e7718ca84d3165..3c18a3f7bdefa3 100644
--- a/llvm/test/Transforms/MergeFunc/call-and-invoke-with-ranges.ll
+++ b/llvm/test/Transforms/MergeFunc/call-and-invoke-with-ranges.ll
@@ -63,6 +63,14 @@ lpad:
   resume { ptr, i32 } zeroinitializer
 }
 
+define i8 @call_with_same_range() {
+; CHECK-LABEL: @call_with_same_range
+; CHECK: tail call i8 @call_with_range
+  bitcast i8 0 to i8
+  %out = call i8 @dummy(), !range !0
+  ret i8 %out
+}
+
 define i8 @invoke_with_same_range() personality ptr undef {
 ; CHECK-LABEL: @invoke_with_same_range()
 ; CHECK: tail call i8 @invoke_with_range()
@@ -76,14 +84,6 @@ lpad:
   resume { ptr, i32 } zeroinitializer
 }
 
-define i8 @call_with_same_range() {
-; CHECK-LABEL: @call_with_same_range
-; CHECK: tail call i8 @call_with_range
-  bitcast i8 0 to i8
-  %out = call i8 @dummy(), !range !0
-  ret i8 %out
-}
-
 
 declare i8 @dummy();
 declare i32 @__gxx_personality_v0(...)
diff --git a/llvm/unittests/IR/StructuralHashTest.cpp b/llvm/unittests/IR/StructuralHashTest.cpp
index 64e66aa5f97a6d..e9f18ed40bc65b 100644
--- a/llvm/unittests/IR/StructuralHashTest.cpp
+++ b/llvm/unittests/IR/StructuralHashTest.cpp
@@ -239,4 +239,59 @@ TEST(StructuralHashTest, ArgumentNumber) {
   EXPECT_EQ(StructuralHash(*M1), StructuralHash(*M2));
   EXPECT_NE(StructuralHash(*M1, true), StructuralHash(*M2, true));
 }
+
+TEST(StructuralHashTest, Differences) {
+  LLVMContext Ctx;
+  std::unique_ptr<Module> M1 = parseIR(Ctx, "define i64 @f(i64 %a) {\n"
+                                            "  %c = add i64 %a, 1\n"
+                                            "  %b = call i64 @f1(i64 %c)\n"
+                                            "  ret i64 %b\n"
+                                            "}\n"
+                                            "declare i64 @f1(i64)");
+  auto *F1 = M1->getFunction("f");
+  std::unique_ptr<Module> M2 = parseIR(Ctx, "define i64 @g(i64 %a) {\n"
+                                            "  %c = add i64 %a, 1\n"
+                                            "  %b = call i64 @f2(i64 %c)\n"
+                                            "  ret i64 %b\n"
+                                            "}\n"
+                                            "declare i64 @f2(i64)");
+  auto *F2 = M2->getFunction("g");
+
+  // They are originally different when not ignoring any operand.
+  EXPECT_NE(StructuralHash(*F1, true), StructuralHash(*F2, true));
+  EXPECT_NE(StructuralHashWithDifferences(*F1, nullptr).FunctionHash,
+            StructuralHashWithDifferences(*F2, nullptr).FunctionHash);
+
+  // When we ignore the call target f1 vs f2, they have the same hash.
+  auto IgnoreOp = [&](const Instruction *I, unsigned OpndIdx) {
+    return I->getOpcode() == Instruction::Call && OpndIdx == 1;
+  };
+  auto FuncHashInfo1 = StructuralHashWithDifferences(*F1, IgnoreOp);
+  auto FuncHashInfo2 = StructuralHashWithDifferences(*F2, IgnoreOp);
+  EXPECT_EQ(FuncHashInfo1.FunctionHash, FuncHashInfo2.FunctionHash);
+
+  // There are total 3 instructions.
+  EXPECT_EQ(FuncHashInfo1.IndexInstruction->size(), 3u);
+  EXPECT_EQ(FuncHashInfo2.IndexInstruction->size(), 3u);
+
+  // The only 1 operand (the call target) has been ignored.
+  EXPECT_EQ(FuncHashInfo1.IndexOperandHashMap->size(), 1u);
+  EXPECT_EQ(FuncHashInfo2.IndexOperandHashMap->size(), 1u);
+
+  // The index pair of instruction and operand (1, 1) is a key in the map.
+  ASSERT_TRUE(FuncHashInfo1.IndexOperandHashMap->count({1, 1}));
+  ASSERT_TRUE(FuncHashInfo2.IndexOperandHashMap->count({1, 1}));
+
+  // The indexed instruciton must be the call instruction as shown in the
+  // IgnoreOp above.
+  EXPECT_EQ(FuncHashInfo1.IndexInstruction->lookup(1)->getOpcode(),
+            Instruction::Call);
+  EXPECT_EQ(FuncHashInfo2.IndexInstruction->lookup(1)->getOpcode(),
+            Instruction::Call);
+
+  // The ignored operand hashes (for f1 vs. f2) are different.
+  EXPECT_NE(FuncHashInfo1.IndexOperandHashMap->lookup({1, 1}),
+            FuncHashInfo2.IndexOperandHashMap->lookup({1, 1}));
+}
+
 } // end anonymous namespace

>From 94a1822f100edbb8567849514308b4a539556f82 Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Sat, 7 Sep 2024 22:48:17 -0700
Subject: [PATCH 06/10] [CGData] Stable Function Map

---
 llvm/include/llvm/CGData/StableFunctionMap.h  | 134 +++++++++++++
 .../llvm/CGData/StableFunctionMapRecord.h     |  61 ++++++
 llvm/lib/CGData/CMakeLists.txt                |   2 +
 llvm/lib/CGData/StableFunctionMap.cpp         |  86 +++++++++
 llvm/lib/CGData/StableFunctionMapRecord.cpp   | 182 ++++++++++++++++++
 llvm/unittests/CGData/CMakeLists.txt          |   2 +
 .../CGData/StableFunctionMapRecordTest.cpp    | 131 +++++++++++++
 .../CGData/StableFunctionMapTest.cpp          | 104 ++++++++++
 8 files changed, 702 insertions(+)
 create mode 100644 llvm/include/llvm/CGData/StableFunctionMap.h
 create mode 100644 llvm/include/llvm/CGData/StableFunctionMapRecord.h
 create mode 100644 llvm/lib/CGData/StableFunctionMap.cpp
 create mode 100644 llvm/lib/CGData/StableFunctionMapRecord.cpp
 create mode 100644 llvm/unittests/CGData/StableFunctionMapRecordTest.cpp
 create mode 100644 llvm/unittests/CGData/StableFunctionMapTest.cpp

diff --git a/llvm/include/llvm/CGData/StableFunctionMap.h b/llvm/include/llvm/CGData/StableFunctionMap.h
new file mode 100644
index 00000000000000..81b48cdcf66f58
--- /dev/null
+++ b/llvm/include/llvm/CGData/StableFunctionMap.h
@@ -0,0 +1,134 @@
+//===- StableFunctionMap.h -------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===---------------------------------------------------------------------===//
+//
+// TODO
+//
+//===---------------------------------------------------------------------===//
+
+#ifndef LLVM_CGDATA_STABLEFUNCTIONMAP_H
+#define LLVM_CGDATA_STABLEFUNCTIONMAP_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StableHashing.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/IR/StructuralHash.h"
+#include "llvm/ObjectYAML/YAML.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <unordered_map>
+#include <vector>
+
+namespace llvm {
+
+
+using IndexPairHash = std::pair<IndexPair, stable_hash>;
+using IndexOperandHashVecType = SmallVector<IndexPairHash>;
+
+/// A stable function is a function with a stable hash while tracking the
+/// locations of ignored operands and their hashes.
+struct StableFunction {
+  /// The combined stable hash of the function.
+  stable_hash Hash;
+  /// The name of the function.
+  std::string FunctionName;
+  /// The name of the module the function is in.
+  std::string ModuleName;
+  /// The number of instructions.
+  unsigned InstCount;
+  /// A vector of pairs of IndexPair and operand hash which was skipped.
+  IndexOperandHashVecType IndexOperandHashes;
+
+  StableFunction(stable_hash Hash, std::string FunctionName,
+                 std::string ModuleName, unsigned InstCount,
+                 IndexOperandHashVecType &&IndexOperandHashes)
+      : Hash(Hash), FunctionName(FunctionName), ModuleName(ModuleName),
+        InstCount(InstCount),
+        IndexOperandHashes(std::move(IndexOperandHashes)) {}
+  StableFunction() = default;
+};
+
+/// An efficient form of StableFunction for fast look-up
+struct StableFunctionEntry {
+  /// The combined stable hash of the function.
+  stable_hash Hash;
+  /// Id of the function name.
+  unsigned FunctionNameId;
+  /// Id of the module name.
+  unsigned ModuleNameId;
+  /// The number of instructions.
+  unsigned InstCount;
+  /// A map from an IndexPair to a stable_hash which was skipped.
+  std::unique_ptr<IndexOperandHashMapType> IndexOperandHashMap;
+
+  StableFunctionEntry(
+      stable_hash Hash, unsigned FunctionNameId, unsigned ModuleNameId,
+      unsigned InstCount,
+      std::unique_ptr<IndexOperandHashMapType> IndexOperandHashMap)
+      : Hash(Hash), FunctionNameId(FunctionNameId), ModuleNameId(ModuleNameId),
+        InstCount(InstCount),
+        IndexOperandHashMap(std::move(IndexOperandHashMap)) {}
+};
+
+using HashFuncsMapType =
+    DenseMap<stable_hash, SmallVector<std::unique_ptr<StableFunctionEntry>>>;
+
+class StableFunctionMap {
+  /// A map from a stable_hash to a vector of functions with that hash.
+  HashFuncsMapType HashToFuncs;
+  /// A vector of strings to hold names.
+  SmallVector<std::string> IdToName;
+  /// A map from StringRef (name) to an ID.
+  StringMap<unsigned> NameToId;
+
+public:
+  /// Get the HashToFuncs map for serialization.
+  const HashFuncsMapType &getFunctionMap() { return HashToFuncs; }
+
+  /// Get the NameToId vector for serialization.
+  const SmallVector<std::string> getNames() { return IdToName; }
+
+  /// Get an existing ID associated with the given name or create a new ID if it
+  /// doesn't exist.
+  unsigned getIdOrCreateForName(StringRef Name);
+
+  /// Get the name associated with a given ID
+  std::optional<std::string> getNameForId(unsigned Id) const;
+
+  /// Insert a `StableFunction` object into the function map. This method
+  /// handles the uniquing of string names and create a `StableFunctionEntry`
+  /// for insertion.
+  void insert(const StableFunction &Func);
+
+  /// Insert a `StableFunctionEntry` into the function map directly. This
+  /// method assumes that string names have already been uniqued and the
+  /// `StableFunctionEntry` is ready for insertion.
+  void insert(std::unique_ptr<StableFunctionEntry> FuncEntry) {
+    HashToFuncs[FuncEntry->Hash].emplace_back(std::move(FuncEntry));
+  }
+
+  /// Merge a \p OtherMap into this function map.
+  void merge(const StableFunctionMap &OtherMap);
+
+  /// \returns true if there is no stable function entry.
+  bool empty() { return size() == 0; }
+
+  enum SizeType {
+    UniqueHashCount,        // The number of unique hashes in HashToFuncs.
+    TotalFunctionCount,     // The number of total functions in HashToFuncs.
+    MergeableFunctionCount, // The number of functions that can be merged based
+                            // on their hash.
+  };
+
+  /// \returns the size of StableFunctionMap.
+  /// \p Type is the type of size to return.
+  size_t size(SizeType Type = UniqueHashCount) const;
+};
+
+} // namespace llvm
+
+#endif
diff --git a/llvm/include/llvm/CGData/StableFunctionMapRecord.h b/llvm/include/llvm/CGData/StableFunctionMapRecord.h
new file mode 100644
index 00000000000000..cbe7e295e461b6
--- /dev/null
+++ b/llvm/include/llvm/CGData/StableFunctionMapRecord.h
@@ -0,0 +1,61 @@
+//===- StableFunctionMapRecord.h -------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===---------------------------------------------------------------------===//
+//
+// TODO
+//
+//===---------------------------------------------------------------------===//
+
+#ifndef LLVM_CGDATA_STABLEFUNCTIONMAPRECORD_H
+#define LLVM_CGDATA_STABLEFUNCTIONMAPRECORD_H
+
+#include "llvm/CGData/StableFunctionMap.h"
+
+#include <unordered_map>
+#include <vector>
+
+namespace llvm {
+
+struct StableFunctionMapRecord {
+  std::unique_ptr<StableFunctionMap> FunctionMap;
+
+  StableFunctionMapRecord() {
+    FunctionMap = std::make_unique<StableFunctionMap>();
+  }
+  StableFunctionMapRecord(std::unique_ptr<StableFunctionMap> FunctionMap)
+      : FunctionMap(std::move(FunctionMap)){};
+
+  /// Serialize the stable function map to a raw_ostream.
+  void serialize(raw_ostream &OS) const;
+
+  /// Deserialize the stable function map from a raw_ostream.
+  void deserialize(const unsigned char *&Ptr);
+
+  /// Serialize the stable function map to a YAML stream.
+  void serializeYAML(yaml::Output &YOS) const;
+
+  /// Deserialize the stable function map from a YAML stream.
+  void deserializeYAML(yaml::Input &YIS);
+
+  /// Merge the stable function map into this one.
+  void merge(const StableFunctionMapRecord &Other) {
+    FunctionMap->merge(*Other.FunctionMap);
+  }
+
+  /// \returns true if the stable function map is empty.
+  bool empty() const { return FunctionMap->empty(); }
+
+  /// Print the stable function map in a YAML format.
+  void print(raw_ostream &OS = llvm::errs()) const {
+    yaml::Output YOS(OS);
+    serializeYAML(YOS);
+  }
+};
+
+} // namespace llvm
+
+#endif
diff --git a/llvm/lib/CGData/CMakeLists.txt b/llvm/lib/CGData/CMakeLists.txt
index ff1aab920e7a8c..18f6b81dc1f11a 100644
--- a/llvm/lib/CGData/CMakeLists.txt
+++ b/llvm/lib/CGData/CMakeLists.txt
@@ -4,6 +4,8 @@ add_llvm_component_library(LLVMCGData
   CodeGenDataWriter.cpp
   OutlinedHashTree.cpp
   OutlinedHashTreeRecord.cpp
+  StableFunctionMap.cpp
+  StableFunctionMapRecord.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${LLVM_MAIN_INCLUDE_DIR}/llvm/CGData
diff --git a/llvm/lib/CGData/StableFunctionMap.cpp b/llvm/lib/CGData/StableFunctionMap.cpp
new file mode 100644
index 00000000000000..1215dd48e0921e
--- /dev/null
+++ b/llvm/lib/CGData/StableFunctionMap.cpp
@@ -0,0 +1,86 @@
+//===-- StableFunctionMap.cpp ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CGData/StableFunctionMap.h"
+
+#define DEBUG_TYPE "stable-function-map"
+
+using namespace llvm;
+
+unsigned StableFunctionMap::getIdOrCreateForName(StringRef Name) {
+  auto It = NameToId.find(Name);
+  if (It != NameToId.end()) {
+    return It->second;
+  } else {
+    unsigned Id = IdToName.size();
+    assert(Id == NameToId.size() && "ID collision");
+    IdToName.emplace_back(Name.str());
+    NameToId[IdToName.back()] = Id;
+    return Id;
+  }
+}
+
+std::optional<std::string> StableFunctionMap::getNameForId(unsigned Id) const {
+  if (Id >= IdToName.size())
+    return std::nullopt;
+  return IdToName[Id];
+}
+
+void StableFunctionMap::insert(const StableFunction &Func) {
+  auto FuncNameId = getIdOrCreateForName(Func.FunctionName);
+  auto ModuleNameId = getIdOrCreateForName(Func.ModuleName);
+  auto IndexOperandHashMap = std::make_unique<IndexOperandHashMapType>();
+  for (auto &[Index, Hash] : Func.IndexOperandHashes)
+    (*IndexOperandHashMap)[Index] = Hash;
+  auto FuncEntry = std::make_unique<StableFunctionEntry>(
+      Func.Hash, FuncNameId, ModuleNameId, Func.InstCount,
+      std::move(IndexOperandHashMap));
+  insert(std::move(FuncEntry));
+}
+
+void StableFunctionMap::merge(const StableFunctionMap &OtherMap) {
+  for (auto &[Hash, Funcs] : OtherMap.HashToFuncs) {
+    auto &ThisFuncs = HashToFuncs[Hash];
+    for (auto &Func : Funcs) {
+      auto FuncNameId =
+          getIdOrCreateForName(*OtherMap.getNameForId(Func->FunctionNameId));
+      auto ModuleNameId =
+          getIdOrCreateForName(*OtherMap.getNameForId(Func->ModuleNameId));
+      auto ClonedIndexOperandHashMap =
+          std::make_unique<IndexOperandHashMapType>(*Func->IndexOperandHashMap);
+      ThisFuncs.emplace_back(std::make_unique<StableFunctionEntry>(
+          Func->Hash, FuncNameId, ModuleNameId, Func->InstCount,
+          std::move(ClonedIndexOperandHashMap)));
+    }
+  }
+}
+
+size_t StableFunctionMap::size(SizeType Type) const {
+  switch (Type) {
+  case UniqueHashCount:
+    return HashToFuncs.size();
+  case TotalFunctionCount: {
+    size_t Count = 0;
+    for (auto &Funcs : HashToFuncs)
+      Count += Funcs.second.size();
+    return Count;
+  }
+  case MergeableFunctionCount: {
+    size_t Count = 0;
+    for (auto &[Hash, Funcs] : HashToFuncs)
+      if (Funcs.size() >= 2)
+        Count += Funcs.size();
+    return Count;
+  }
+  }
+  return 0;
+}
diff --git a/llvm/lib/CGData/StableFunctionMapRecord.cpp b/llvm/lib/CGData/StableFunctionMapRecord.cpp
new file mode 100644
index 00000000000000..90615265f4a73d
--- /dev/null
+++ b/llvm/lib/CGData/StableFunctionMapRecord.cpp
@@ -0,0 +1,182 @@
+//===-- StableFunctionMapRecord.cpp ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CGData/StableFunctionMapRecord.h"
+#include "llvm/ObjectYAML/YAML.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/EndianStream.h"
+
+#define DEBUG_TYPE "stable-function-map-record"
+
+using namespace llvm;
+using namespace llvm::support;
+
+LLVM_YAML_IS_SEQUENCE_VECTOR(IndexPairHash)
+LLVM_YAML_IS_SEQUENCE_VECTOR(StableFunction)
+
+namespace llvm {
+namespace yaml {
+
+template <> struct MappingTraits<IndexPairHash> {
+  static void mapping(IO &IO, IndexPairHash &Key) {
+    IO.mapRequired("InstIndex", Key.first.first);
+    IO.mapRequired("OpndIndex", Key.first.second);
+    IO.mapRequired("OpndHash", Key.second);
+  }
+};
+
+template <> struct MappingTraits<StableFunction> {
+  static void mapping(IO &IO, StableFunction &Func) {
+    IO.mapRequired("Hash", Func.Hash);
+    IO.mapRequired("FunctionName", Func.FunctionName);
+    IO.mapRequired("ModuleName", Func.ModuleName);
+    IO.mapRequired("InstCount", Func.InstCount);
+    IO.mapRequired("IndexOperandHashes", Func.IndexOperandHashes);
+  }
+};
+
+} // namespace yaml
+} // namespace llvm
+
+// Get a sorted vector of StableFunctionEntry pointers.
+static SmallVector<const StableFunctionEntry *>
+getStableFunctionEntries(StableFunctionMap &SFM) {
+  SmallVector<const StableFunctionEntry *> FuncEntries;
+  for (const auto &P : SFM.getFunctionMap())
+    for (auto &Func : P.second)
+      FuncEntries.emplace_back(Func.get());
+
+  std::stable_sort(
+      FuncEntries.begin(), FuncEntries.end(), [&](auto &A, auto &B) {
+        return std::tuple(A->Hash, SFM.getNameForId(A->ModuleNameId),
+                          SFM.getNameForId(A->FunctionNameId)) <
+               std::tuple(B->Hash, SFM.getNameForId(B->ModuleNameId),
+                          SFM.getNameForId(B->FunctionNameId));
+      });
+  return FuncEntries;
+}
+
+// Get a sorted vector of IndexOperandHashes.
+static IndexOperandHashVecType
+getStableIndexOperandHashes(const StableFunctionEntry *FuncEntry) {
+  IndexOperandHashVecType IndexOperandHashes;
+  for (auto &[Indices, OpndHash] : *FuncEntry->IndexOperandHashMap)
+    IndexOperandHashes.emplace_back(Indices, OpndHash);
+  std::sort(IndexOperandHashes.begin(), IndexOperandHashes.end(),
+            [](auto &A, auto &B) { return A.first < B.first; });
+  return IndexOperandHashes;
+}
+
+void StableFunctionMapRecord::serialize(raw_ostream &OS) const {
+  support::endian::Writer Writer(OS, endianness::little);
+
+  // Write Names.
+  auto &Names = FunctionMap->getNames();
+  Writer.write<uint32_t>(Names.size());
+  for (auto &Name : Names)
+    Writer.OS << Name << '\0';
+
+  // Write StableFunctionEntries whose pointers are sorted.
+  auto FuncEntries = getStableFunctionEntries(*FunctionMap);
+  Writer.write<uint32_t>(FuncEntries.size());
+
+  for (const auto *FuncRef : FuncEntries) {
+    Writer.write<stable_hash>(FuncRef->Hash);
+    Writer.write<uint32_t>(FuncRef->FunctionNameId);
+    Writer.write<uint32_t>(FuncRef->ModuleNameId);
+    Writer.write<uint32_t>(FuncRef->InstCount);
+
+    // Emit IndexOperandHashes sorted from IndexOperandHashMap.
+    IndexOperandHashVecType IndexOperandHashes =
+        getStableIndexOperandHashes(FuncRef);
+    Writer.write<uint32_t>(IndexOperandHashes.size());
+    for (auto &IndexOperandHash : IndexOperandHashes) {
+      Writer.write<uint32_t>(IndexOperandHash.first.first);
+      Writer.write<uint32_t>(IndexOperandHash.first.second);
+      Writer.write<stable_hash>(IndexOperandHash.second);
+    }
+  }
+}
+
+void StableFunctionMapRecord::deserialize(const unsigned char *&Ptr) {
+
+  // Read Names.
+  auto NumNames =
+      endian::readNext<uint32_t, endianness::little, unaligned>(Ptr);
+  for (unsigned I = 0; I < NumNames; ++I) {
+    std::string Name(reinterpret_cast<const char *>(Ptr));
+    Ptr += Name.size() + 1;
+    FunctionMap->getIdOrCreateForName(Name);
+  }
+
+  // Read StableFunctionEntries.
+  auto NumFuncs =
+      endian::readNext<uint32_t, endianness::little, unaligned>(Ptr);
+  for (unsigned I = 0; I < NumFuncs; ++I) {
+    auto Hash =
+        endian::readNext<stable_hash, endianness::little, unaligned>(Ptr);
+    auto FunctionNameId =
+        endian::readNext<uint32_t, endianness::little, unaligned>(Ptr);
+    assert(FunctionNameId < IdToName.size() && "FunctionNameId out of range");
+    auto ModuleNameId =
+        endian::readNext<uint32_t, endianness::little, unaligned>(Ptr);
+    assert(ModuleNameId < IdToName.size() && "ModuleNameId out of range");
+    auto InstCount =
+        endian::readNext<uint32_t, endianness::little, unaligned>(Ptr);
+
+    // Read IndexOperandHashes to build IndexOperandHashMap
+    auto NumIndexOperandHashes =
+        endian::readNext<uint32_t, endianness::little, unaligned>(Ptr);
+    auto IndexOperandHashMap = std::make_unique<IndexOperandHashMapType>();
+    for (unsigned J = 0; J < NumIndexOperandHashes; ++J) {
+      auto InstIndex =
+          endian::readNext<uint32_t, endianness::little, unaligned>(Ptr);
+      auto OpndIndex =
+          endian::readNext<uint32_t, endianness::little, unaligned>(Ptr);
+      auto OpndHash =
+          endian::readNext<stable_hash, endianness::little, unaligned>(Ptr);
+      assert(InstIndex < InstCount && "InstIndex out of range");
+
+      auto Indices = std::make_pair(InstIndex, OpndIndex);
+      (*IndexOperandHashMap)[Indices] = OpndHash;
+    }
+
+    // Insert a new StableFunctionEntry into the map.
+    auto FuncEntry = std::make_unique<StableFunctionEntry>(
+        Hash, FunctionNameId, ModuleNameId, InstCount,
+        std::move(IndexOperandHashMap));
+
+    FunctionMap->insert(std::move(FuncEntry));
+  }
+}
+
+void StableFunctionMapRecord::serializeYAML(yaml::Output &YOS) const {
+  auto FuncEntries = getStableFunctionEntries(*FunctionMap);
+  SmallVector<StableFunction> Functions;
+  for (const auto *FuncEntry : FuncEntries) {
+    auto IndexOperandHashes = getStableIndexOperandHashes(FuncEntry);
+    Functions.emplace_back(
+        FuncEntry->Hash, *FunctionMap->getNameForId(FuncEntry->FunctionNameId),
+        *FunctionMap->getNameForId(FuncEntry->ModuleNameId),
+        FuncEntry->InstCount, std::move(IndexOperandHashes));
+  }
+
+  YOS << Functions;
+}
+
+void StableFunctionMapRecord::deserializeYAML(yaml::Input &YIS) {
+  std::vector<StableFunction> Funcs;
+  YIS >> Funcs;
+  for (auto &Func : Funcs)
+    FunctionMap->insert(Func);
+  YIS.nextDocument();
+}
diff --git a/llvm/unittests/CGData/CMakeLists.txt b/llvm/unittests/CGData/CMakeLists.txt
index 792b323130b474..0bdb9e1f08c702 100644
--- a/llvm/unittests/CGData/CMakeLists.txt
+++ b/llvm/unittests/CGData/CMakeLists.txt
@@ -9,6 +9,8 @@ set(LLVM_LINK_COMPONENTS
 add_llvm_unittest(CGDataTests
   OutlinedHashTreeRecordTest.cpp
   OutlinedHashTreeTest.cpp
+  StableFunctionMapRecordTest.cpp
+  StableFunctionMapTest.cpp
   )
 
 target_link_libraries(CGDataTests PRIVATE LLVMTestingSupport)
diff --git a/llvm/unittests/CGData/StableFunctionMapRecordTest.cpp b/llvm/unittests/CGData/StableFunctionMapRecordTest.cpp
new file mode 100644
index 00000000000000..77d7e85bb7973b
--- /dev/null
+++ b/llvm/unittests/CGData/StableFunctionMapRecordTest.cpp
@@ -0,0 +1,131 @@
+//===- StableFunctionMapRecordTest.cpp ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CGData/StableFunctionMapRecord.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+namespace {
+
+TEST(StableFunctionMapRecordTest, Print) {
+  StableFunctionMapRecord MapRecord;
+  StableFunction Func1{1, "Func1", "Mod1", 2, {{{0, 1}, 3}}};
+  MapRecord.FunctionMap->insert(Func1);
+
+  const char *ExpectedMapStr = R"(---
+- Hash:            1
+  FunctionName:    Func1
+  ModuleName:      Mod1
+  InstCount:       2
+  IndexOperandHashes:
+    - InstIndex:       0
+      OpndIndex:       1
+      OpndHash:        3
+...
+)";
+  std::string MapDump;
+  raw_string_ostream OS(MapDump);
+  MapRecord.print(OS);
+  EXPECT_EQ(ExpectedMapStr, MapDump);
+}
+
+TEST(OutlinedHashTreeRecordTest, Stable) {
+  StableFunction Func1{1, "Func2", "Mod1", 1, {}};
+  StableFunction Func2{1, "Func3", "Mod1", 1, {}};
+  StableFunction Func3{1, "Func1", "Mod2", 1, {}};
+  StableFunction Func4{2, "Func4", "Mod3", 1, {}};
+
+  StableFunctionMapRecord MapRecord1;
+  MapRecord1.FunctionMap->insert(Func1);
+  MapRecord1.FunctionMap->insert(Func2);
+  MapRecord1.FunctionMap->insert(Func3);
+  MapRecord1.FunctionMap->insert(Func4);
+
+  StableFunctionMapRecord MapRecord2;
+  MapRecord2.FunctionMap->insert(Func4);
+  MapRecord2.FunctionMap->insert(Func3);
+  MapRecord2.FunctionMap->insert(Func2);
+  MapRecord2.FunctionMap->insert(Func1);
+
+  // Output is sorted by hash (1 < 2), module name (Mod1 < Mod2), and function
+  // name (Func2 < Func3).
+  std::string MapDump1;
+  raw_string_ostream OS1(MapDump1);
+  MapRecord1.print(OS1);
+  std::string MapDump2;
+  raw_string_ostream OS2(MapDump2);
+  MapRecord2.print(OS2);
+  EXPECT_EQ(MapDump1, MapDump2);
+}
+
+TEST(StableFunctionMapRecordTest, Serialize) {
+  StableFunctionMapRecord MapRecord1;
+  StableFunction Func1{1, "Func1", "Mod1", 2, {{{0, 1}, 3}, {{1, 2}, 4}}};
+  StableFunction Func2{2, "Func2", "Mod1", 3, {{{0, 1}, 2}}};
+  StableFunction Func3{2, "Func3", "Mod1", 3, {{{0, 1}, 3}}};
+  MapRecord1.FunctionMap->insert(Func1);
+  MapRecord1.FunctionMap->insert(Func2);
+  MapRecord1.FunctionMap->insert(Func3);
+
+  // Serialize and deserialize the tree.
+  SmallVector<char> Out;
+  raw_svector_ostream OS(Out);
+  MapRecord1.serialize(OS);
+
+  StableFunctionMapRecord MapRecord2;
+  const uint8_t *Data = reinterpret_cast<const uint8_t *>(Out.data());
+  MapRecord2.deserialize(Data);
+
+  // Two maps should be identical.
+  std::string MapDump1;
+  raw_string_ostream OS1(MapDump1);
+  MapRecord1.print(OS1);
+  MapRecord1.print();
+  std::string MapDump2;
+  raw_string_ostream OS2(MapDump2);
+  MapRecord2.print(OS2);
+  MapRecord2.print();
+
+  EXPECT_EQ(MapDump1, MapDump2);
+}
+
+TEST(StableFunctionMapRecordTest, SerializeYAML) {
+  StableFunctionMapRecord MapRecord1;
+  StableFunction Func1{1, "Func1", "Mod1", 2, {{{0, 1}, 3}, {{1, 2}, 4}}};
+  StableFunction Func2{2, "Func2", "Mod1", 3, {{{0, 1}, 2}}};
+  StableFunction Func3{2, "Func3", "Mod1", 3, {{{0, 1}, 3}}};
+  MapRecord1.FunctionMap->insert(Func1);
+  MapRecord1.FunctionMap->insert(Func2);
+  MapRecord1.FunctionMap->insert(Func3);
+
+  // Serialize and deserialize the map in a YAML format.
+  std::string Out;
+  raw_string_ostream OS(Out);
+  yaml::Output YOS(OS);
+  MapRecord1.serializeYAML(YOS);
+
+  StableFunctionMapRecord MapRecord2;
+  yaml::Input YIS(StringRef(Out.data(), Out.size()));
+  MapRecord2.deserializeYAML(YIS);
+
+  // Two maps should be identical.
+  std::string MapDump1;
+  raw_string_ostream OS1(MapDump1);
+  MapRecord1.print(OS1);
+  MapRecord1.print();
+  std::string MapDump2;
+  raw_string_ostream OS2(MapDump2);
+  MapRecord2.print(OS2);
+  MapRecord2.print();
+
+  EXPECT_EQ(MapDump1, MapDump2);
+}
+
+} // end namespace
diff --git a/llvm/unittests/CGData/StableFunctionMapTest.cpp b/llvm/unittests/CGData/StableFunctionMapTest.cpp
new file mode 100644
index 00000000000000..e9bdbcd91b0f2a
--- /dev/null
+++ b/llvm/unittests/CGData/StableFunctionMapTest.cpp
@@ -0,0 +1,104 @@
+//===- StableFunctionMapTest.cpp ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CGData/StableFunctionMap.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+namespace {
+
+TEST(StableFunctionMap, Name) {
+  StableFunctionMap Map;
+  EXPECT_TRUE(Map.empty());
+  EXPECT_TRUE(Map.getNames().empty());
+  unsigned ID1 = Map.getIdOrCreateForName("Func1");
+  unsigned ID2 = Map.getIdOrCreateForName("Func2");
+  unsigned ID3 = Map.getIdOrCreateForName("Func1");
+
+  EXPECT_EQ(Map.getNames().size(), 2u);
+  // The different names should return different IDs.
+  EXPECT_NE(ID1, ID2);
+  // The same name should return the same ID.
+  EXPECT_EQ(ID1, ID3);
+  // The IDs should be valid.
+  EXPECT_EQ(*Map.getNameForId(ID1), "Func1");
+  EXPECT_EQ(*Map.getNameForId(ID2), "Func2");
+}
+
+TEST(StableFunctionMap, Insert) {
+  StableFunctionMap Map;
+
+  StableFunction Func1{1, "Func1", "Mod1", 2, {{{0, 1}, 3}}};
+  StableFunction Func2{1, "Func2", "Mod1", 2, {{{0, 1}, 2}}};
+  Map.insert(Func1);
+  Map.insert(Func2);
+  // We only have a unique hash, 1
+  EXPECT_EQ(Map.size(), 1u);
+  // We have two functions with the same hash which are potentially mergeable.
+  EXPECT_EQ(Map.size(StableFunctionMap::SizeType::TotalFunctionCount), 2u);
+  EXPECT_EQ(Map.size(StableFunctionMap::SizeType::MergeableFunctionCount), 2u);
+}
+
+TEST(StableFunctionMap, InsertEntry) {
+  StableFunctionMap Map;
+
+  unsigned ID1 = Map.getIdOrCreateForName("Func1");
+  unsigned ID2 = Map.getIdOrCreateForName("Mod1");
+  unsigned ID3 = Map.getIdOrCreateForName("Func2");
+
+  // Create a function entry and insert it into the map.
+  auto IndexOperandHashMap1 = std::make_unique<IndexOperandHashMapType>();
+  IndexOperandHashMap1->insert({{0, 1}, 3});
+  auto FuncEntry1 = std::make_unique<StableFunctionEntry>(
+      1, ID1, ID2, 2, std::move(IndexOperandHashMap1));
+  Map.insert(std::move(FuncEntry1));
+
+  // Create another function entry and insert it into the map.
+  auto IndexOperandHashMap2 = std::make_unique<IndexOperandHashMapType>();
+  IndexOperandHashMap2->insert({{0, 1}, 2});
+  auto FuncEntry2 = std::make_unique<StableFunctionEntry>(
+      1, ID3, ID2, 2, std::move(IndexOperandHashMap2));
+  Map.insert(std::move(FuncEntry2));
+
+  // We only have a unique hash, 1
+  EXPECT_EQ(Map.size(), 1u);
+  // We have two functions with the same hash which are potentially mergeable.
+  EXPECT_EQ(Map.size(StableFunctionMap::SizeType::TotalFunctionCount), 2u);
+}
+
+TEST(OutlinedHashTreeTest, Merge) {
+  StableFunctionMap Map1;
+  StableFunction Func1{1, "Func1", "Mod1", 2, {{{0, 1}, 3}}};
+  StableFunction Func2{1, "Func2", "Mod1", 2, {{{0, 1}, 2}}};
+  StableFunction Func3{2, "Func3", "Mod1", 2, {{{1, 1}, 2}}};
+  Map1.insert(Func1);
+  Map1.insert(Func2);
+  Map1.insert(Func3);
+
+  StableFunctionMap Map2;
+  StableFunction Func4{1, "Func4", "Mod2", 2, {{{0, 1}, 4}}};
+  StableFunction Func5{2, "Func5", "Mod2", 2, {{{1, 1}, 5}}};
+  StableFunction Func6{3, "Func6", "Mod2", 2, {{{1, 1}, 6}}};
+  Map2.insert(Func4);
+  Map2.insert(Func5);
+  Map2.insert(Func6);
+
+  // Merge two maps.
+  Map1.merge(Map2);
+
+  // We only have two unique hashes, 1, 2 and 3
+  EXPECT_EQ(Map1.size(), 3u);
+  // We have total 6 functions.
+  EXPECT_EQ(Map1.size(StableFunctionMap::SizeType::TotalFunctionCount), 6u);
+  // We have 5 mergeable functions. Func6 only has a unique hash, 3.
+  EXPECT_EQ(Map1.size(StableFunctionMap::SizeType::MergeableFunctionCount), 5u);
+}
+
+} // end namespace

>From 2b3df2c29db3e643495d3ad2e3180cc51573c1d3 Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Mon, 9 Sep 2024 19:38:05 -0700
Subject: [PATCH 07/10] [CGData][llvm-cgdata]

---
 llvm/include/llvm/CGData/CodeGenData.h        | 24 +++++-
 llvm/include/llvm/CGData/CodeGenData.inc      | 12 ++-
 llvm/include/llvm/CGData/CodeGenDataReader.h  | 27 +++++-
 llvm/include/llvm/CGData/CodeGenDataWriter.h  | 17 +++-
 llvm/lib/CGData/CodeGenData.cpp               | 29 ++++---
 llvm/lib/CGData/CodeGenDataReader.cpp         | 36 +++++---
 llvm/lib/CGData/CodeGenDataWriter.cpp         | 31 +++++--
 llvm/lib/CGData/StableFunctionMapRecord.cpp   | 15 +++-
 llvm/test/tools/llvm-cgdata/empty.test        |  8 +-
 llvm/test/tools/llvm-cgdata/error.test        | 13 +--
 .../merge-combined-funcmap-hashtree.test      | 66 +++++++++++++++
 .../llvm-cgdata/merge-funcmap-archive.test    | 83 +++++++++++++++++++
 .../llvm-cgdata/merge-funcmap-concat.test     | 78 +++++++++++++++++
 .../llvm-cgdata/merge-funcmap-double.test     | 79 ++++++++++++++++++
 .../llvm-cgdata/merge-funcmap-single.test     | 36 ++++++++
 ...chive.test => merge-hashtree-archive.test} |  8 +-
 ...concat.test => merge-hashtree-concat.test} |  6 +-
 ...double.test => merge-hashtree-double.test} |  8 +-
 ...single.test => merge-hashtree-single.test} |  4 +-
 llvm/tools/llvm-cgdata/llvm-cgdata.cpp        | 46 +++++++---
 20 files changed, 556 insertions(+), 70 deletions(-)
 create mode 100644 llvm/test/tools/llvm-cgdata/merge-combined-funcmap-hashtree.test
 create mode 100644 llvm/test/tools/llvm-cgdata/merge-funcmap-archive.test
 create mode 100644 llvm/test/tools/llvm-cgdata/merge-funcmap-concat.test
 create mode 100644 llvm/test/tools/llvm-cgdata/merge-funcmap-double.test
 create mode 100644 llvm/test/tools/llvm-cgdata/merge-funcmap-single.test
 rename llvm/test/tools/llvm-cgdata/{merge-archive.test => merge-hashtree-archive.test} (91%)
 rename llvm/test/tools/llvm-cgdata/{merge-concat.test => merge-hashtree-concat.test} (93%)
 rename llvm/test/tools/llvm-cgdata/{merge-double.test => merge-hashtree-double.test} (90%)
 rename llvm/test/tools/llvm-cgdata/{merge-single.test => merge-hashtree-single.test} (92%)

diff --git a/llvm/include/llvm/CGData/CodeGenData.h b/llvm/include/llvm/CGData/CodeGenData.h
index 72b52e6e9b8fd1..ceaa8790aff8d4 100644
--- a/llvm/include/llvm/CGData/CodeGenData.h
+++ b/llvm/include/llvm/CGData/CodeGenData.h
@@ -18,6 +18,7 @@
 #include "llvm/Bitcode/BitcodeReader.h"
 #include "llvm/CGData/OutlinedHashTree.h"
 #include "llvm/CGData/OutlinedHashTreeRecord.h"
+#include "llvm/CGData/StableFunctionMapRecord.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -39,7 +40,9 @@ enum class CGDataKind {
   Unknown = 0x0,
   // A function outlining info.
   FunctionOutlinedHashTree = 0x1,
-  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/FunctionOutlinedHashTree)
+  // A function merging info.
+  StableFunctionMergingMap = 0x2,
+  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/StableFunctionMergingMap)
 };
 
 const std::error_category &cgdata_category();
@@ -106,6 +109,8 @@ enum CGDataMode {
 class CodeGenData {
   /// Global outlined hash tree that has oulined hash sequences across modules.
   std::unique_ptr<OutlinedHashTree> PublishedHashTree;
+  /// Global stable function map that has stable function info across modules.
+  std::unique_ptr<StableFunctionMap> PublishedStableFunctionMap;
 
   /// This flag is set when -fcodegen-data-generate is passed.
   /// Or, it can be mutated with -fcodegen-data-thinlto-two-rounds.
@@ -129,6 +134,9 @@ class CodeGenData {
   bool hasOutlinedHashTree() {
     return PublishedHashTree && !PublishedHashTree->empty();
   }
+  bool hasStableFunctionMap() {
+    return PublishedStableFunctionMap && !PublishedStableFunctionMap->empty();
+  }
 
   /// Returns the outlined hash tree. This can be globally used in a read-only
   /// manner.
@@ -145,6 +153,12 @@ class CodeGenData {
     // Ensure we disable emitCGData as we do not want to read and write both.
     EmitCGData = false;
   }
+  void
+  publishStableFunctionMap(std::unique_ptr<StableFunctionMap> FunctionMap) {
+    PublishedStableFunctionMap = std::move(FunctionMap);
+    // Ensure we disable emitCGData as we do not want to read and write both.
+    EmitCGData = false;
+  }
 };
 
 namespace cgdata {
@@ -164,6 +178,11 @@ publishOutlinedHashTree(std::unique_ptr<OutlinedHashTree> HashTree) {
   CodeGenData::getInstance().publishOutlinedHashTree(std::move(HashTree));
 }
 
+inline void
+publishStableFunctionMap(std::unique_ptr<StableFunctionMap> FunctionMap) {
+  CodeGenData::getInstance().publishStableFunctionMap(std::move(FunctionMap));
+}
+
 void initializeTwoCodegenRounds();
 
 /// Save \p TheModule before the first codegen round.
@@ -196,6 +215,8 @@ enum CGDataVersion {
   // Version 1 is the first version. This version supports the outlined
   // hash tree.
   Version1 = 1,
+  // Version 2 supports the stable function merging map.
+  Version2 = 2,
   CurrentVersion = CG_DATA_INDEX_VERSION
 };
 const uint64_t Version = CGDataVersion::CurrentVersion;
@@ -205,6 +226,7 @@ struct Header {
   uint32_t Version;
   uint32_t DataKind;
   uint64_t OutlinedHashTreeOffset;
+  uint64_t StableFunctionMapOffset;
 
   // New fields should only be added at the end to ensure that the size
   // computation is correct. The methods below need to be updated to ensure that
diff --git a/llvm/include/llvm/CGData/CodeGenData.inc b/llvm/include/llvm/CGData/CodeGenData.inc
index 08ec14ea051a0c..e0ae7a51024d87 100644
--- a/llvm/include/llvm/CGData/CodeGenData.inc
+++ b/llvm/include/llvm/CGData/CodeGenData.inc
@@ -20,6 +20,8 @@
 #define CG_DATA_DEFINED
 CG_DATA_SECT_ENTRY(CG_outline, CG_DATA_QUOTE(CG_DATA_OUTLINE_COMMON),
                    CG_DATA_OUTLINE_COFF, "__DATA,")
+CG_DATA_SECT_ENTRY(CG_merge, CG_DATA_QUOTE(CG_DATA_MERGE_COMMON),
+                   CG_DATA_MERGE_COFF, "__DATA,")
 
 #undef CG_DATA_SECT_ENTRY
 #endif
@@ -27,20 +29,24 @@ CG_DATA_SECT_ENTRY(CG_outline, CG_DATA_QUOTE(CG_DATA_OUTLINE_COMMON),
 /* section name strings common to all targets other
    than WIN32 */
 #define CG_DATA_OUTLINE_COMMON __llvm_outline
+#define CG_DATA_MERGE_COMMON __llvm_merge
 /* Since cg data sections are not allocated, we don't need to
  * access them at runtime.
  */
 #define CG_DATA_OUTLINE_COFF ".loutline"
+#define CG_DATA_MERGE_COFF ".lmerge"
 
 #ifdef _WIN32
 /* Runtime section names and name strings.  */
-#define CG_DATA_SECT_NAME CG_DATA_OUTLINE_COFF
+#define CG_DATA_OUTLINE_SECT_NAME CG_DATA_OUTLINE_COFF
+#define CG_DATA_MERGE_SECT_NAME CG_DATA_MERGE_COFF
 
 #else
 /* Runtime section names and name strings.  */
-#define CG_DATA_SECT_NAME CG_DATA_QUOTE(CG_DATA_OUTLINE_COMMON)
+#define CG_DATA_OUTLINE_SECT_NAME CG_DATA_QUOTE(CG_DATA_OUTLINE_COMMON)
+#define CG_DATA_MERGE_SECT_NAME CG_DATA_QUOTE(CG_DATA_MERGE_COMMON)
 
 #endif
 
 /* Indexed codegen data format version (start from 1). */
-#define CG_DATA_INDEX_VERSION 1
+#define CG_DATA_INDEX_VERSION 2
diff --git a/llvm/include/llvm/CGData/CodeGenDataReader.h b/llvm/include/llvm/CGData/CodeGenDataReader.h
index 1ee4bfbe480233..2891e611430b02 100644
--- a/llvm/include/llvm/CGData/CodeGenDataReader.h
+++ b/llvm/include/llvm/CGData/CodeGenDataReader.h
@@ -15,6 +15,7 @@
 
 #include "llvm/CGData/CodeGenData.h"
 #include "llvm/CGData/OutlinedHashTreeRecord.h"
+#include "llvm/CGData/StableFunctionMapRecord.h"
 #include "llvm/Support/LineIterator.h"
 #include "llvm/Support/VirtualFileSystem.h"
 
@@ -36,10 +37,15 @@ class CodeGenDataReader {
   virtual CGDataKind getDataKind() const = 0;
   /// Return true if the data has an outlined hash tree.
   virtual bool hasOutlinedHashTree() const = 0;
+  /// Return true if the data has a stable function map.
+  virtual bool hasStableFunctionMap() const = 0;
   /// Return the outlined hash tree that is released from the reader.
   std::unique_ptr<OutlinedHashTree> releaseOutlinedHashTree() {
     return std::move(HashTreeRecord.HashTree);
   }
+  std::unique_ptr<StableFunctionMap> releaseStableFunctionMap() {
+    return std::move(FunctionMapRecord.FunctionMap);
+  }
 
   /// Factory method to create an appropriately typed reader for the given
   /// codegen data file path and file system.
@@ -54,14 +60,20 @@ class CodeGenDataReader {
   /// Extract the cgdata embedded in sections from the given object file and
   /// merge them into the GlobalOutlineRecord. This is a static helper that
   /// is used by `llvm-cgdata --merge` or ThinLTO's two-codegen rounds.
-  static Error mergeFromObjectFile(const object::ObjectFile *Obj,
-                                   OutlinedHashTreeRecord &GlobalOutlineRecord);
+  static Error
+  mergeFromObjectFile(const object::ObjectFile *Obj,
+                      OutlinedHashTreeRecord &GlobalOutlineRecord,
+                      StableFunctionMapRecord &GlobalFunctionMapRecord);
 
 protected:
   /// The outlined hash tree that has been read. When it's released by
   /// releaseOutlinedHashTree(), it's no longer valid.
   OutlinedHashTreeRecord HashTreeRecord;
 
+  /// The stable function map that has been read. When it's released by
+  // releaseStableFunctionMap(), it's no longer valid.
+  StableFunctionMapRecord FunctionMapRecord;
+
   /// Set the current error and return same.
   Error error(cgdata_error Err, const std::string &ErrMsg = "") {
     LastError = Err;
@@ -112,6 +124,11 @@ class IndexedCodeGenDataReader : public CodeGenDataReader {
     return Header.DataKind &
            static_cast<uint32_t>(CGDataKind::FunctionOutlinedHashTree);
   }
+  /// Return true if the header indicates the data has a stable function map.
+  bool hasStableFunctionMap() const override {
+    return Header.DataKind &
+           static_cast<uint32_t>(CGDataKind::StableFunctionMergingMap);
+  }
 };
 
 /// This format is a simple text format that's suitable for test data.
@@ -147,6 +164,12 @@ class TextCodeGenDataReader : public CodeGenDataReader {
     return static_cast<uint32_t>(DataKind) &
            static_cast<uint32_t>(CGDataKind::FunctionOutlinedHashTree);
   }
+  /// Return true if the header indicates the data has a stable function map.
+  /// This does not mean that the data is still available.
+  bool hasStableFunctionMap() const override {
+    return static_cast<uint32_t>(DataKind) &
+           static_cast<uint32_t>(CGDataKind::StableFunctionMergingMap);
+  }
 };
 
 } // end namespace llvm
diff --git a/llvm/include/llvm/CGData/CodeGenDataWriter.h b/llvm/include/llvm/CGData/CodeGenDataWriter.h
index 5cb8377b1d07e5..1c4247608999a7 100644
--- a/llvm/include/llvm/CGData/CodeGenDataWriter.h
+++ b/llvm/include/llvm/CGData/CodeGenDataWriter.h
@@ -15,6 +15,7 @@
 
 #include "llvm/CGData/CodeGenData.h"
 #include "llvm/CGData/OutlinedHashTreeRecord.h"
+#include "llvm/CGData/StableFunctionMapRecord.h"
 #include "llvm/Support/EndianStream.h"
 #include "llvm/Support/Error.h"
 
@@ -57,6 +58,9 @@ class CodeGenDataWriter {
   /// The outlined hash tree to be written.
   OutlinedHashTreeRecord HashTreeRecord;
 
+  /// The stable function map to be written.
+  StableFunctionMapRecord FunctionMapRecord;
+
   /// A bit mask describing the kind of the codegen data.
   CGDataKind DataKind = CGDataKind::Unknown;
 
@@ -64,9 +68,12 @@ class CodeGenDataWriter {
   CodeGenDataWriter() = default;
   ~CodeGenDataWriter() = default;
 
-  /// Add the outlined hash tree record. The input Record is released.
+  /// Add the outlined hash tree record. The input hash tree is released.
   void addRecord(OutlinedHashTreeRecord &Record);
 
+  /// Add the stable function map record. The input function map is released.
+  void addRecord(StableFunctionMapRecord &Record);
+
   /// Write the codegen data to \c OS
   Error write(raw_fd_ostream &OS);
 
@@ -81,11 +88,19 @@ class CodeGenDataWriter {
     return static_cast<uint32_t>(DataKind) &
            static_cast<uint32_t>(CGDataKind::FunctionOutlinedHashTree);
   }
+  /// Return true if the header indicates the data has a stable function map.
+  bool hasStableFunctionMap() const {
+    return static_cast<uint32_t>(DataKind) &
+           static_cast<uint32_t>(CGDataKind::StableFunctionMergingMap);
+  }
 
 private:
   /// The offset of the outlined hash tree in the file.
   uint64_t OutlinedHashTreeOffset;
 
+  /// The offset of the stable function map in the file.
+  uint64_t StableFunctionMapOffset;
+
   /// Write the codegen data header to \c COS
   Error writeHeader(CGDataOStream &COS);
 
diff --git a/llvm/lib/CGData/CodeGenData.cpp b/llvm/lib/CGData/CodeGenData.cpp
index 4e21045a67cba6..b89ad66d6ab2ea 100644
--- a/llvm/lib/CGData/CodeGenData.cpp
+++ b/llvm/lib/CGData/CodeGenData.cpp
@@ -14,6 +14,7 @@
 #include "llvm/Bitcode/BitcodeWriter.h"
 #include "llvm/CGData/CodeGenDataReader.h"
 #include "llvm/CGData/OutlinedHashTreeRecord.h"
+#include "llvm/CGData/StableFunctionMapRecord.h"
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FileSystem.h"
@@ -165,6 +166,8 @@ CodeGenData &CodeGenData::getInstance() {
       auto Reader = ReaderOrErr->get();
       if (Reader->hasOutlinedHashTree())
         Instance->publishOutlinedHashTree(Reader->releaseOutlinedHashTree());
+      if (Reader->hasStableFunctionMap())
+        Instance->publishStableFunctionMap(Reader->releaseStableFunctionMap());
     }
   });
   return *(Instance.get());
@@ -187,18 +190,14 @@ Expected<Header> Header::readFromBuffer(const unsigned char *Curr) {
     return make_error<CGDataError>(cgdata_error::unsupported_version);
   H.DataKind = endian::readNext<uint32_t, endianness::little, unaligned>(Curr);
 
-  switch (H.Version) {
-    // When a new field is added to the header add a case statement here to
-    // compute the size as offset of the new field + size of the new field. This
-    // relies on the field being added to the end of the list.
-    static_assert(IndexedCGData::CGDataVersion::CurrentVersion == Version1,
-                  "Please update the size computation below if a new field has "
-                  "been added to the header, if not add a case statement to "
-                  "fall through to the latest version.");
-  case 1ull:
-    H.OutlinedHashTreeOffset =
+  static_assert(IndexedCGData::CGDataVersion::CurrentVersion == Version2,
+                "Please update the offset computation below if a new field has "
+                "been added to the header.");
+  H.OutlinedHashTreeOffset =
+      endian::readNext<uint64_t, endianness::little, unaligned>(Curr);
+  if (H.Version >= 2)
+    H.StableFunctionMapOffset =
         endian::readNext<uint64_t, endianness::little, unaligned>(Curr);
-  }
 
   return H;
 }
@@ -273,6 +272,7 @@ Error mergeCodeGenData(
     const std::unique_ptr<std::vector<llvm::SmallString<0>>> InputFiles) {
 
   OutlinedHashTreeRecord GlobalOutlineRecord;
+  StableFunctionMapRecord GlobalStableFunctionMapRecord;
   for (auto &InputFile : *(InputFiles)) {
     if (InputFile.empty())
       continue;
@@ -285,13 +285,16 @@ Error mergeCodeGenData(
       return BinOrErr.takeError();
 
     std::unique_ptr<object::ObjectFile> &Obj = BinOrErr.get();
-    if (auto E = CodeGenDataReader::mergeFromObjectFile(Obj.get(),
-                                                        GlobalOutlineRecord))
+    if (auto E = CodeGenDataReader::mergeFromObjectFile(
+            Obj.get(), GlobalOutlineRecord, GlobalStableFunctionMapRecord))
       return E;
   }
 
   if (!GlobalOutlineRecord.empty())
     cgdata::publishOutlinedHashTree(std::move(GlobalOutlineRecord.HashTree));
+  if (!GlobalStableFunctionMapRecord.empty())
+    cgdata::publishStableFunctionMap(
+        std::move(GlobalStableFunctionMapRecord.FunctionMap));
 
   return Error::success();
 }
diff --git a/llvm/lib/CGData/CodeGenDataReader.cpp b/llvm/lib/CGData/CodeGenDataReader.cpp
index f7f3a8f42af7e1..0a893f9a862077 100644
--- a/llvm/lib/CGData/CodeGenDataReader.cpp
+++ b/llvm/lib/CGData/CodeGenDataReader.cpp
@@ -31,12 +31,13 @@ setupMemoryBuffer(const Twine &Filename, vfs::FileSystem &FS) {
 }
 
 Error CodeGenDataReader::mergeFromObjectFile(
-    const object::ObjectFile *Obj,
-    OutlinedHashTreeRecord &GlobalOutlineRecord) {
+    const object::ObjectFile *Obj, OutlinedHashTreeRecord &GlobalOutlineRecord,
+    StableFunctionMapRecord &GlobalFunctionMapRecord) {
   Triple TT = Obj->makeTriple();
   auto CGOutLineName =
       getCodeGenDataSectionName(CG_outline, TT.getObjectFormat(), false);
-
+  auto CGMergeName =
+      getCodeGenDataSectionName(CG_merge, TT.getObjectFormat(), false);
   for (auto &Section : Obj->sections()) {
     Expected<StringRef> NameOrErr = Section.getName();
     if (!NameOrErr)
@@ -47,17 +48,23 @@ Error CodeGenDataReader::mergeFromObjectFile(
     auto *Data = reinterpret_cast<const unsigned char *>(ContentsOrErr->data());
     auto *EndData = Data + ContentsOrErr->size();
 
+    // In case dealing with an executable that has concatenated cgdata,
+    // we want to merge them into a single cgdata.
+    // Although it's not a typical workflow, we support this scenario
+    // by looping over all data in the sections.
     if (*NameOrErr == CGOutLineName) {
-      // In case dealing with an executable that has concatenated cgdata,
-      // we want to merge them into a single cgdata.
-      // Although it's not a typical workflow, we support this scenario.
       while (Data != EndData) {
         OutlinedHashTreeRecord LocalOutlineRecord;
         LocalOutlineRecord.deserialize(Data);
         GlobalOutlineRecord.merge(LocalOutlineRecord);
       }
+    } else if (*NameOrErr == CGMergeName) {
+      while (Data != EndData) {
+        StableFunctionMapRecord LocalFunctionMapRecord;
+        LocalFunctionMapRecord.deserialize(Data);
+        GlobalFunctionMapRecord.merge(LocalFunctionMapRecord);
+      }
     }
-    // TODO: Add support for other cgdata sections.
   }
 
   return Error::success();
@@ -66,7 +73,8 @@ Error CodeGenDataReader::mergeFromObjectFile(
 Error IndexedCodeGenDataReader::read() {
   using namespace support;
 
-  // The smallest header with the version 1 is 24 bytes
+  // The smallest header with the version 1 is 24 bytes.
+  // Do not update this value even with the new version of the header.
   const unsigned MinHeaderSize = 24;
   if (DataBuffer->getBufferSize() < MinHeaderSize)
     return error(cgdata_error::bad_header);
@@ -84,6 +92,12 @@ Error IndexedCodeGenDataReader::read() {
       return error(cgdata_error::eof);
     HashTreeRecord.deserialize(Ptr);
   }
+  if (hasStableFunctionMap()) {
+    const unsigned char *Ptr = Start + Header.StableFunctionMapOffset;
+    if (Ptr >= End)
+      return error(cgdata_error::eof);
+    FunctionMapRecord.deserialize(Ptr);
+  }
 
   return success();
 }
@@ -149,6 +163,8 @@ Error TextCodeGenDataReader::read() {
     StringRef Str = Line->drop_front().rtrim();
     if (Str.equals_insensitive("outlined_hash_tree"))
       DataKind |= CGDataKind::FunctionOutlinedHashTree;
+    else if (Str.equals_insensitive("stable_function_map"))
+      DataKind |= CGDataKind::StableFunctionMergingMap;
     else
       return error(cgdata_error::bad_header);
   }
@@ -167,8 +183,8 @@ Error TextCodeGenDataReader::read() {
   yaml::Input YOS(StringRef(Pos, Size));
   if (hasOutlinedHashTree())
     HashTreeRecord.deserializeYAML(YOS);
-
-  // TODO: Add more yaml cgdata in order
+  if (hasStableFunctionMap())
+    FunctionMapRecord.deserializeYAML(YOS);
 
   return Error::success();
 }
diff --git a/llvm/lib/CGData/CodeGenDataWriter.cpp b/llvm/lib/CGData/CodeGenDataWriter.cpp
index 5f638be0fefe74..dfe7c3e141b2bf 100644
--- a/llvm/lib/CGData/CodeGenDataWriter.cpp
+++ b/llvm/lib/CGData/CodeGenDataWriter.cpp
@@ -52,6 +52,13 @@ void CodeGenDataWriter::addRecord(OutlinedHashTreeRecord &Record) {
   DataKind |= CGDataKind::FunctionOutlinedHashTree;
 }
 
+void CodeGenDataWriter::addRecord(StableFunctionMapRecord &Record) {
+  assert(Record.StableHashTree && "empty function map in the record");
+  FunctionMapRecord.FunctionMap = std::move(Record.FunctionMap);
+
+  DataKind |= CGDataKind::StableFunctionMergingMap;
+}
+
 Error CodeGenDataWriter::write(raw_fd_ostream &OS) {
   CGDataOStream COS(OS);
   return writeImpl(COS);
@@ -68,8 +75,11 @@ Error CodeGenDataWriter::writeHeader(CGDataOStream &COS) {
   if (static_cast<bool>(DataKind & CGDataKind::FunctionOutlinedHashTree))
     Header.DataKind |=
         static_cast<uint32_t>(CGDataKind::FunctionOutlinedHashTree);
-
+  if (static_cast<bool>(DataKind & CGDataKind::StableFunctionMergingMap))
+    Header.DataKind |=
+        static_cast<uint32_t>(CGDataKind::StableFunctionMergingMap);
   Header.OutlinedHashTreeOffset = 0;
+  Header.StableFunctionMapOffset = 0;
 
   // Only write up to the CGDataKind. We need to remember the offset of the
   // remaining fields to allow back-patching later.
@@ -79,10 +89,15 @@ Error CodeGenDataWriter::writeHeader(CGDataOStream &COS) {
 
   // Save the location of Header.OutlinedHashTreeOffset field in \c COS.
   OutlinedHashTreeOffset = COS.tell();
-
   // Reserve the space for OutlinedHashTreeOffset field.
   COS.write(0);
 
+  // Save the location of Header.StableFunctionMapOffset field in \c COS.
+  StableFunctionMapOffset = COS.tell();
+
+  // Reserve the space for StableFunctionMapOffset field.
+  COS.write(0);
+
   return Error::success();
 }
 
@@ -93,10 +108,14 @@ Error CodeGenDataWriter::writeImpl(CGDataOStream &COS) {
   uint64_t OutlinedHashTreeFieldStart = COS.tell();
   if (hasOutlinedHashTree())
     HashTreeRecord.serialize(COS.OS);
+  uint64_t StableFunctionMapFieldStart = COS.tell();
+  if (hasStableFunctionMap())
+    FunctionMapRecord.serialize(COS.OS);
 
   // Back patch the offsets.
   CGDataPatchItem PatchItems[] = {
-      {OutlinedHashTreeOffset, &OutlinedHashTreeFieldStart, 1}};
+      {OutlinedHashTreeOffset, &OutlinedHashTreeFieldStart, 1},
+      {StableFunctionMapOffset, &StableFunctionMapFieldStart, 1}};
   COS.patch(PatchItems);
 
   return Error::success();
@@ -106,7 +125,8 @@ Error CodeGenDataWriter::writeHeaderText(raw_fd_ostream &OS) {
   if (hasOutlinedHashTree())
     OS << "# Outlined stable hash tree\n:outlined_hash_tree\n";
 
-  // TODO: Add more data types in this header
+  if (hasStableFunctionMap())
+    OS << "# Stable function map\n:stable_function_map\n";
 
   return Error::success();
 }
@@ -119,7 +139,8 @@ Error CodeGenDataWriter::writeText(raw_fd_ostream &OS) {
   if (hasOutlinedHashTree())
     HashTreeRecord.serializeYAML(YOS);
 
-  // TODO: Write more yaml cgdata in order
+  if (hasStableFunctionMap())
+    FunctionMapRecord.serializeYAML(YOS);
 
   return Error::success();
 }
diff --git a/llvm/lib/CGData/StableFunctionMapRecord.cpp b/llvm/lib/CGData/StableFunctionMapRecord.cpp
index 90615265f4a73d..805c4b9965fd83 100644
--- a/llvm/lib/CGData/StableFunctionMapRecord.cpp
+++ b/llvm/lib/CGData/StableFunctionMapRecord.cpp
@@ -81,9 +81,16 @@ void StableFunctionMapRecord::serialize(raw_ostream &OS) const {
 
   // Write Names.
   auto &Names = FunctionMap->getNames();
+  uint32_t ByteSize = 4;
   Writer.write<uint32_t>(Names.size());
-  for (auto &Name : Names)
+  for (auto &Name : Names) {
     Writer.OS << Name << '\0';
+    ByteSize += Name.size() + 1;
+  }
+  // Align ByteSize to 4 bytes.
+  uint32_t Padding = offsetToAlignment(ByteSize, Align(4));
+  for (uint32_t I = 0; I < Padding; ++I)
+    Writer.OS << '\0';
 
   // Write StableFunctionEntries whose pointers are sorted.
   auto FuncEntries = getStableFunctionEntries(*FunctionMap);
@@ -108,15 +115,19 @@ void StableFunctionMapRecord::serialize(raw_ostream &OS) const {
 }
 
 void StableFunctionMapRecord::deserialize(const unsigned char *&Ptr) {
-
   // Read Names.
   auto NumNames =
       endian::readNext<uint32_t, endianness::little, unaligned>(Ptr);
+  // Early exit if there is no name.
+  if (NumNames == 0)
+    return;
   for (unsigned I = 0; I < NumNames; ++I) {
     std::string Name(reinterpret_cast<const char *>(Ptr));
     Ptr += Name.size() + 1;
     FunctionMap->getIdOrCreateForName(Name);
   }
+  // Align Ptr to 4 bytes.
+  Ptr = reinterpret_cast<const uint8_t *>(alignAddr(Ptr, Align(4)));
 
   // Read StableFunctionEntries.
   auto NumFuncs =
diff --git a/llvm/test/tools/llvm-cgdata/empty.test b/llvm/test/tools/llvm-cgdata/empty.test
index 70d5ea4b800630..bea78d512a6db7 100644
--- a/llvm/test/tools/llvm-cgdata/empty.test
+++ b/llvm/test/tools/llvm-cgdata/empty.test
@@ -16,7 +16,7 @@ RUN: llvm-cgdata --show %t_emptyheader.cgdata | count 0
 
 # The version number appears when asked, as it's in the header
 RUN: llvm-cgdata --show --cgdata-version %t_emptyheader.cgdata | FileCheck %s --check-prefix=VERSION
-VERSION: Version: 1
+VERSION: Version: 2
 
 # When converting a binary file (w/ the header only) to a text file, it's an empty file as the text format does not have an explicit header.
 RUN: llvm-cgdata --convert %t_emptyheader.cgdata --format text | count 0
@@ -27,9 +27,11 @@ RUN: llvm-cgdata --convert %t_emptyheader.cgdata --format text | count 0
 #   uint32_t Version;
 #   uint32_t DataKind;
 #   uint64_t OutlinedHashTreeOffset;
+#   uint64_t StableFunctionMapOffset;
 # }
 RUN: printf '\xffcgdata\x81' > %t_header.cgdata
-RUN: printf '\x01\x00\x00\x00' >> %t_header.cgdata
+RUN: printf '\x02\x00\x00\x00' >> %t_header.cgdata
 RUN: printf '\x00\x00\x00\x00' >> %t_header.cgdata
-RUN: printf '\x18\x00\x00\x00\x00\x00\x00\x00' >> %t_header.cgdata
+RUN: printf '\x20\x00\x00\x00\x00\x00\x00\x00' >> %t_header.cgdata
+RUN: printf '\x20\x00\x00\x00\x00\x00\x00\x00' >> %t_header.cgdata
 RUN: diff %t_header.cgdata %t_emptyheader.cgdata
diff --git a/llvm/test/tools/llvm-cgdata/error.test b/llvm/test/tools/llvm-cgdata/error.test
index c992174505c1ad..2caa3aef403950 100644
--- a/llvm/test/tools/llvm-cgdata/error.test
+++ b/llvm/test/tools/llvm-cgdata/error.test
@@ -6,6 +6,7 @@
 #   uint32_t Version;
 #   uint32_t DataKind;
 #   uint64_t OutlinedHashTreeOffset;
+#   uint64_t StableFunctionMapOffset;
 # }
 RUN: touch %t_empty.cgdata
 RUN: not llvm-cgdata --show %t_empty.cgdata 2>&1 | FileCheck %s --check-prefix=EMPTY
@@ -21,18 +22,20 @@ RUN: printf '\xffcgdata\x81' > %t_corrupt.cgdata
 RUN: not llvm-cgdata --show %t_corrupt.cgdata 2>&1 | FileCheck %s  --check-prefix=CORRUPT
 CORRUPT: {{.}}cgdata: invalid codegen data (file header is corrupt)
 
-# The current version 1 while the header says 2.
+# The current version 2 while the header says 3.
 RUN: printf '\xffcgdata\x81' > %t_version.cgdata
-RUN: printf '\x02\x00\x00\x00' >> %t_version.cgdata
+RUN: printf '\x03\x00\x00\x00' >> %t_version.cgdata
 RUN: printf '\x00\x00\x00\x00' >> %t_version.cgdata
-RUN: printf '\x18\x00\x00\x00\x00\x00\x00\x00' >> %t_version.cgdata
+RUN: printf '\x20\x00\x00\x00\x00\x00\x00\x00' >> %t_version.cgdata
+RUN: printf '\x20\x00\x00\x00\x00\x00\x00\x00' >> %t_version.cgdata
 RUN: not llvm-cgdata --show %t_version.cgdata 2>&1 | FileCheck %s  --check-prefix=BAD_VERSION
 BAD_VERSION: {{.}}cgdata: unsupported codegen data version
 
 # Header says an outlined hash tree, but the file ends after the header.
 RUN: printf '\xffcgdata\x81' > %t_eof.cgdata
+RUN: printf '\x02\x00\x00\x00' >> %t_eof.cgdata
 RUN: printf '\x01\x00\x00\x00' >> %t_eof.cgdata
-RUN: printf '\x01\x00\x00\x00' >> %t_eof.cgdata
-RUN: printf '\x18\x00\x00\x00\x00\x00\x00\x00' >> %t_eof.cgdata
+RUN: printf '\x20\x00\x00\x00\x00\x00\x00\x00' >> %t_eof.cgdata
+RUN: printf '\x20\x00\x00\x00\x00\x00\x00\x00' >> %t_eof.cgdata
 RUN: not llvm-cgdata --show %t_eof.cgdata 2>&1 | FileCheck %s  --check-prefix=EOF
 EOF: {{.}}cgdata: end of File
diff --git a/llvm/test/tools/llvm-cgdata/merge-combined-funcmap-hashtree.test b/llvm/test/tools/llvm-cgdata/merge-combined-funcmap-hashtree.test
new file mode 100644
index 00000000000000..b9bf067d3771c5
--- /dev/null
+++ b/llvm/test/tools/llvm-cgdata/merge-combined-funcmap-hashtree.test
@@ -0,0 +1,66 @@
+# REQUIRES: shell, aarch64-registered-target
+# UNSUPPORTED: system-windows
+
+# Test merge a single object file having both __llvm_outline and __llvm_merge into a cgdata.
+# Effectively, this test combines merge-hashtree.test and merge-funcmap.test.
+
+RUN: split-file %s %t
+
+# Synthesize raw hashtree bytes without the header (32 byte) from the indexed cgdata.
+RUN: llvm-cgdata --convert --format binary %t/raw-hashtree.cgtext -o %t/raw-hashtree.cgdata
+RUN: od -t x1 -j 32 -An %t/raw-hashtree.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-hashtree-bytes.txt
+
+# Synthesize raw funcmap bytes without the header (32 byte) from the indexed cgdata.
+RUN: llvm-cgdata --convert --format binary %t/raw-funcmap.cgtext -o %t/raw-funcmap.cgdata
+RUN: od -t x1 -j 32 -An %t/raw-funcmap.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-funcmap-bytes.txt
+
+# Synthesize a bitcode file by creating two sections for the hash tree and the function map, respectively.
+RUN: sed "s/<RAW_1_BYTES>/$(cat %t/raw-hashtree-bytes.txt)/g" %t/merge-both-template.ll > %t/merge-both-hashtree-template.ll
+RUN: sed "s/<RAW_2_BYTES>/$(cat %t/raw-funcmap-bytes.txt)/g" %t/merge-both-hashtree-template.ll > %t/merge-both-hashtree-funcmap.ll
+
+RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-both-hashtree-funcmap.ll -o %t/merge-both-hashtree-funcmap.o
+
+# Merge an object file having cgdata (__llvm_outline and __llvm_merge)
+RUN: llvm-cgdata -m %t/merge-both-hashtree-funcmap.o -o %t/merge-both-hashtree-funcmap.cgdata
+RUN: llvm-cgdata -s %t/merge-both-hashtree-funcmap.cgdata | FileCheck %s
+
+CHECK: Outlined hash tree:
+CHECK-NEXT:  Total Node Count: 3
+CHECK-NEXT:  Terminal Node Count: 1
+CHECK-NEXT:  Depth: 2
+CHECK-NEXT: Stable function map:
+CHECK-NEXT:  Unique hash Count: 1
+CHECK-NEXT:  Total function Count: 1
+CHECK-NEXT:  Mergeable function Count: 0
+
+;--- raw-hashtree.cgtext
+:outlined_hash_tree
+0:
+  Hash:            0x0
+  Terminals:       0
+  SuccessorIds:    [ 1 ]
+1:
+  Hash:            0x1
+  Terminals:       0
+  SuccessorIds:    [ 2 ]
+2:
+  Hash:            0x2
+  Terminals:       4
+  SuccessorIds:    [  ]
+...
+
+;--- raw-funcmap.cgtext
+:stable_function_map
+- Hash:            1
+  FunctionName:    Func1
+  ModuleName:      Mod1
+  InstCount:       2
+  IndexOperandHashes:
+    - InstIndex:       0
+      OpndIndex:       1
+      OpndHash:        3
+...
+
+;--- merge-both-template.ll
+ at .data1 = private unnamed_addr constant [72 x i8] c"<RAW_1_BYTES>", section "__DATA,__llvm_outline"
+ at .data2 = private unnamed_addr constant [60 x i8] c"<RAW_2_BYTES>", section "__DATA,__llvm_merge"
diff --git a/llvm/test/tools/llvm-cgdata/merge-funcmap-archive.test b/llvm/test/tools/llvm-cgdata/merge-funcmap-archive.test
new file mode 100644
index 00000000000000..f643c8d92073e3
--- /dev/null
+++ b/llvm/test/tools/llvm-cgdata/merge-funcmap-archive.test
@@ -0,0 +1,83 @@
+# REQUIRES: shell, aarch64-registered-target
+# UNSUPPORTED: system-windows
+
+# Merge an archive that has two object files having cgdata (__llvm_merge)
+
+RUN: split-file %s %t
+
+# Synthesize raw cgdata without the header (32 byte) from the indexed cgdata.
+RUN: llvm-cgdata --convert --format binary %t/raw-1.cgtext -o %t/raw-1.cgdata
+RUN: od -t x1 -j 32 -An %t/raw-1.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-1-bytes.txt
+RUN: sed "s/<RAW_1_BYTES>/$(cat %t/raw-1-bytes.txt)/g" %t/merge-1-template.ll > %t/merge-1.ll
+RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-1.ll -o %t/merge-1.o
+
+# Synthesize raw cgdata without the header (32 byte) from the indexed cgdata.
+RUN: llvm-cgdata --convert --format binary %t/raw-2.cgtext -o %t/raw-2.cgdata
+RUN: od -t x1 -j 32 -An %t/raw-2.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-2-bytes.txt
+RUN: sed "s/<RAW_2_BYTES>/$(cat %t/raw-2-bytes.txt)/g" %t/merge-2-template.ll > %t/merge-2.ll
+RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-2.ll -o %t/merge-2.o
+
+# Make an archive from two object files
+RUN: llvm-ar rcs %t/merge-archive.a %t/merge-1.o %t/merge-2.o
+
+# Merge the archive into the codegen data file.
+RUN: llvm-cgdata --merge %t/merge-archive.a -o %t/merge-archive.cgdata
+RUN: llvm-cgdata --show %t/merge-archive.cgdata | FileCheck %s
+
+RUN: llvm-cgdata --show %t/merge-archive.cgdata| FileCheck %s
+CHECK: Stable function map:
+CHECK-NEXT:  Unique hash Count: 1
+CHECK-NEXT:  Total function Count: 2
+CHECK-NEXT:  Mergeable function Count: 2
+
+RUN: llvm-cgdata --convert %t/merge-archive.cgdata| FileCheck %s --check-prefix=MAP
+MAP: # Stable function map
+MAP-NEXT: :stable_function_map
+MAP-NEXT: ---
+MAP-NEXT: - Hash:            1
+MAP-NEXT:   FunctionName:    Func1
+MAP-NEXT:   ModuleName:      Mod1
+MAP-NEXT:   InstCount:       2
+MAP-NEXT:   IndexOperandHashes:
+MAP-NEXT:     - InstIndex:       0
+MAP-NEXT:       OpndIndex:       1
+MAP-NEXT:       OpndHash:        3
+MAP-NEXT: - Hash:            1
+MAP-NEXT:   FunctionName:    Func2
+MAP-NEXT:   ModuleName:      Mod1
+MAP-NEXT:   InstCount:       2
+MAP-NEXT:   IndexOperandHashes:
+MAP-NEXT:     - InstIndex:       0
+MAP-NEXT:       OpndIndex:       1
+MAP-NEXT:       OpndHash:        4
+MAP-NEXT: ...
+
+;--- raw-1.cgtext
+:stable_function_map
+- Hash:            1
+  FunctionName:    Func2
+  ModuleName:      Mod1
+  InstCount:       2
+  IndexOperandHashes:
+    - InstIndex:       0
+      OpndIndex:       1
+      OpndHash:        4
+...
+
+;--- merge-1-template.ll
+ at .data = private unnamed_addr constant [60 x i8] c"<RAW_1_BYTES>", section "__DATA,__llvm_merge"
+
+;--- raw-2.cgtext
+:stable_function_map
+- Hash:            1
+  FunctionName:    Func1
+  ModuleName:      Mod1
+  InstCount:       2
+  IndexOperandHashes:
+    - InstIndex:       0
+      OpndIndex:       1
+      OpndHash:        3
+...
+
+;--- merge-2-template.ll
+ at .data = private unnamed_addr constant [60 x i8] c"<RAW_2_BYTES>", section "__DATA,__llvm_merge"
diff --git a/llvm/test/tools/llvm-cgdata/merge-funcmap-concat.test b/llvm/test/tools/llvm-cgdata/merge-funcmap-concat.test
new file mode 100644
index 00000000000000..c8acf1f3916e5a
--- /dev/null
+++ b/llvm/test/tools/llvm-cgdata/merge-funcmap-concat.test
@@ -0,0 +1,78 @@
+# REQUIRES: shell, aarch64-registered-target
+# UNSUPPORTED: system-windows
+
+# Merge a binary file (e.g., a linked executable) having concatenated cgdata (__llvm_merge)
+
+RUN: split-file %s %t
+
+# Synthesize two sets of raw cgdata without the header (32 byte) from the indexed cgdata.
+# Concatenate them in merge-concat.ll
+RUN: llvm-cgdata --convert --format binary %t/raw-1.cgtext -o %t/raw-1.cgdata
+RUN: od -t x1 -j 32 -An %t/raw-1.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-1-bytes.txt
+RUN: sed "s/<RAW_1_BYTES>/$(cat %t/raw-1-bytes.txt)/g" %t/merge-concat-template.ll > %t/merge-concat-template-2.ll
+RUN: llvm-cgdata --convert --format binary %t/raw-2.cgtext -o %t/raw-2.cgdata
+RUN: od -t x1 -j 32 -An %t/raw-2.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-2-bytes.txt
+RUN: sed "s/<RAW_2_BYTES>/$(cat %t/raw-2-bytes.txt)/g" %t/merge-concat-template-2.ll > %t/merge-concat.ll
+
+RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-concat.ll -o %t/merge-concat.o
+RUN: llvm-cgdata --merge %t/merge-concat.o -o %t/merge-concat.cgdata
+RUN: llvm-cgdata --show %t/merge-concat.cgdata | FileCheck %s
+
+CHECK: Stable function map:
+CHECK-NEXT:  Unique hash Count: 1
+CHECK-NEXT:  Total function Count: 2
+CHECK-NEXT:  Mergeable function Count: 2
+
+RUN: llvm-cgdata --convert %t/merge-concat.cgdata| FileCheck %s --check-prefix=MAP
+MAP: # Stable function map
+MAP-NEXT: :stable_function_map
+MAP-NEXT: ---
+MAP-NEXT: - Hash:            1
+MAP-NEXT:   FunctionName:    Func1
+MAP-NEXT:   ModuleName:      Mod1
+MAP-NEXT:   InstCount:       2
+MAP-NEXT:   IndexOperandHashes:
+MAP-NEXT:     - InstIndex:       0
+MAP-NEXT:       OpndIndex:       1
+MAP-NEXT:       OpndHash:        3
+MAP-NEXT: - Hash:            1
+MAP-NEXT:   FunctionName:    Func2
+MAP-NEXT:   ModuleName:      Mod1
+MAP-NEXT:   InstCount:       2
+MAP-NEXT:   IndexOperandHashes:
+MAP-NEXT:     - InstIndex:       0
+MAP-NEXT:       OpndIndex:       1
+MAP-NEXT:       OpndHash:        4
+MAP-NEXT: ...
+
+;--- raw-1.cgtext
+:stable_function_map
+- Hash:            1
+  FunctionName:    Func2
+  ModuleName:      Mod1
+  InstCount:       2
+  IndexOperandHashes:
+    - InstIndex:       0
+      OpndIndex:       1
+      OpndHash:        4
+...
+
+;--- raw-2.cgtext
+:stable_function_map
+- Hash:            1
+  FunctionName:    Func1
+  ModuleName:      Mod1
+  InstCount:       2
+  IndexOperandHashes:
+    - InstIndex:       0
+      OpndIndex:       1
+      OpndHash:        3
+...
+
+;--- merge-concat-template.ll
+
+; In an linked executable (as opposed to an object file), cgdata in __llvm_merge might be concatenated.
+; Although this is not a typical workflow, we simply support this case to parse cgdata that is concatenated.
+; In other words, the following two trees are encoded back-to-back in a binary format.
+ at .data1 = private unnamed_addr constant [60 x i8] c"<RAW_1_BYTES>", section "__DATA,__llvm_merge"
+ at .data2 = private unnamed_addr constant [60 x i8] c"<RAW_2_BYTES>", section "__DATA,__llvm_merge"
diff --git a/llvm/test/tools/llvm-cgdata/merge-funcmap-double.test b/llvm/test/tools/llvm-cgdata/merge-funcmap-double.test
new file mode 100644
index 00000000000000..3ae67f062f820f
--- /dev/null
+++ b/llvm/test/tools/llvm-cgdata/merge-funcmap-double.test
@@ -0,0 +1,79 @@
+# REQUIRES: shell, aarch64-registered-target
+# UNSUPPORTED: system-windows
+
+# Merge two object files having cgdata (__llvm_merge)
+
+RUN: split-file %s %t
+
+# Synthesize raw cgdata without the header (32 byte) from the indexed cgdata.
+RUN: llvm-cgdata --convert --format binary %t/raw-1.cgtext -o %t/raw-1.cgdata
+RUN: od -t x1 -j 32 -An %t/raw-1.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-1-bytes.txt
+RUN: sed "s/<RAW_1_BYTES>/$(cat %t/raw-1-bytes.txt)/g" %t/merge-1-template.ll > %t/merge-1.ll
+RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-1.ll -o %t/merge-1.o
+
+# Synthesize raw cgdata without the header (32 byte) from the indexed cgdata.
+RUN: llvm-cgdata --convert --format binary %t/raw-2.cgtext -o %t/raw-2.cgdata
+RUN: od -t x1 -j 32 -An %t/raw-2.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-2-bytes.txt
+RUN: sed "s/<RAW_2_BYTES>/$(cat %t/raw-2-bytes.txt)/g" %t/merge-2-template.ll > %t/merge-2.ll
+RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-2.ll -o %t/merge-2.o
+
+# Merge two object files into the codegen data file.
+RUN: llvm-cgdata --merge %t/merge-1.o %t/merge-2.o -o %t/merge.cgdata
+
+RUN: llvm-cgdata --show %t/merge.cgdata | FileCheck %s
+CHECK: Stable function map:
+CHECK-NEXT:  Unique hash Count: 1
+CHECK-NEXT:  Total function Count: 2
+CHECK-NEXT:  Mergeable function Count: 2
+
+RUN: llvm-cgdata --convert %t/merge.cgdata | FileCheck %s --check-prefix=MAP
+MAP: # Stable function map
+MAP-NEXT: :stable_function_map
+MAP-NEXT: ---
+MAP-NEXT: - Hash:            1
+MAP-NEXT:   FunctionName:    Func1
+MAP-NEXT:   ModuleName:      Mod1
+MAP-NEXT:   InstCount:       2
+MAP-NEXT:   IndexOperandHashes:
+MAP-NEXT:     - InstIndex:       0
+MAP-NEXT:       OpndIndex:       1
+MAP-NEXT:       OpndHash:        3
+MAP-NEXT: - Hash:            1
+MAP-NEXT:   FunctionName:    Func2
+MAP-NEXT:   ModuleName:      Mod1
+MAP-NEXT:   InstCount:       2
+MAP-NEXT:   IndexOperandHashes:
+MAP-NEXT:     - InstIndex:       0
+MAP-NEXT:       OpndIndex:       1
+MAP-NEXT:       OpndHash:        4
+MAP-NEXT: ...
+
+;--- raw-1.cgtext
+:stable_function_map
+- Hash:            1
+  FunctionName:    Func2
+  ModuleName:      Mod1
+  InstCount:       2
+  IndexOperandHashes:
+    - InstIndex:       0
+      OpndIndex:       1
+      OpndHash:        4
+...
+
+;--- merge-1-template.ll
+ at .data = private unnamed_addr constant [60 x i8] c"<RAW_1_BYTES>", section "__DATA,__llvm_merge"
+
+;--- raw-2.cgtext
+:stable_function_map
+- Hash:            1
+  FunctionName:    Func1
+  ModuleName:      Mod1
+  InstCount:       2
+  IndexOperandHashes:
+    - InstIndex:       0
+      OpndIndex:       1
+      OpndHash:        3
+...
+
+;--- merge-2-template.ll
+ at .data = private unnamed_addr constant [60 x i8] c"<RAW_2_BYTES>", section "__DATA,__llvm_merge"
diff --git a/llvm/test/tools/llvm-cgdata/merge-funcmap-single.test b/llvm/test/tools/llvm-cgdata/merge-funcmap-single.test
new file mode 100644
index 00000000000000..6a4e635f638657
--- /dev/null
+++ b/llvm/test/tools/llvm-cgdata/merge-funcmap-single.test
@@ -0,0 +1,36 @@
+# REQUIRES: shell, aarch64-registered-target
+# UNSUPPORTED: system-windows
+
+# Test merge a single object file into a cgdata
+
+RUN: split-file %s %t
+
+# Synthesize raw cgdata without the header (32 byte) from the indexed cgdata.
+RUN: llvm-cgdata --convert --format binary %t/raw-single.cgtext -o %t/raw-single.cgdata
+RUN: od -t x1 -j 32 -An %t/raw-single.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-single-bytes.txt
+
+RUN: sed "s/<RAW_1_BYTES>/$(cat %t/raw-single-bytes.txt)/g" %t/merge-single-template.ll > %t/merge-single.ll
+RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-single.ll -o %t/merge-single.o
+
+# Merge an object file having cgdata (__llvm_merge)
+RUN: llvm-cgdata -m %t/merge-single.o -o %t/merge-single.cgdata
+RUN: llvm-cgdata -s %t/merge-single.cgdata | FileCheck %s
+CHECK: Stable function map:
+CHECK-NEXT:  Unique hash Count: 1
+CHECK-NEXT:  Total function Count: 1
+CHECK-NEXT:  Mergeable function Count: 0
+
+;--- raw-single.cgtext
+:stable_function_map
+- Hash:            1
+  FunctionName:    Func1
+  ModuleName:      Mod1
+  InstCount:       2
+  IndexOperandHashes:
+    - InstIndex:       0
+      OpndIndex:       1
+      OpndHash:        3
+...
+
+;--- merge-single-template.ll
+ at .data = private unnamed_addr constant [60 x i8] c"<RAW_1_BYTES>", section "__DATA,__llvm_merge"
diff --git a/llvm/test/tools/llvm-cgdata/merge-archive.test b/llvm/test/tools/llvm-cgdata/merge-hashtree-archive.test
similarity index 91%
rename from llvm/test/tools/llvm-cgdata/merge-archive.test
rename to llvm/test/tools/llvm-cgdata/merge-hashtree-archive.test
index 03eb9106b54562..ee6345247c5be6 100644
--- a/llvm/test/tools/llvm-cgdata/merge-archive.test
+++ b/llvm/test/tools/llvm-cgdata/merge-hashtree-archive.test
@@ -5,15 +5,15 @@
 
 RUN: split-file %s %t
 
-# Synthesize raw cgdata without the header (24 byte) from the indexed cgdata.
+# Synthesize raw cgdata without the header (32 byte) from the indexed cgdata.
 RUN: llvm-cgdata --convert --format binary %t/raw-1.cgtext -o %t/raw-1.cgdata
-RUN: od -t x1 -j 24 -An %t/raw-1.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-1-bytes.txt
+RUN: od -t x1 -j 32 -An %t/raw-1.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-1-bytes.txt
 RUN: sed "s/<RAW_1_BYTES>/$(cat %t/raw-1-bytes.txt)/g" %t/merge-1-template.ll > %t/merge-1.ll
 RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-1.ll -o %t/merge-1.o
 
-# Synthesize raw cgdata without the header (24 byte) from the indexed cgdata.
+# Synthesize raw cgdata without the header (32 byte) from the indexed cgdata.
 RUN: llvm-cgdata --convert --format binary %t/raw-2.cgtext -o %t/raw-2.cgdata
-RUN: od -t x1 -j 24 -An %t/raw-2.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-2-bytes.txt
+RUN: od -t x1 -j 32 -An %t/raw-2.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-2-bytes.txt
 RUN: sed "s/<RAW_2_BYTES>/$(cat %t/raw-2-bytes.txt)/g" %t/merge-2-template.ll > %t/merge-2.ll
 RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-2.ll -o %t/merge-2.o
 
diff --git a/llvm/test/tools/llvm-cgdata/merge-concat.test b/llvm/test/tools/llvm-cgdata/merge-hashtree-concat.test
similarity index 93%
rename from llvm/test/tools/llvm-cgdata/merge-concat.test
rename to llvm/test/tools/llvm-cgdata/merge-hashtree-concat.test
index ac0e7a6e29e878..5a3ece05a3f990 100644
--- a/llvm/test/tools/llvm-cgdata/merge-concat.test
+++ b/llvm/test/tools/llvm-cgdata/merge-hashtree-concat.test
@@ -5,13 +5,13 @@
 
 RUN: split-file %s %t
 
-# Synthesize two sets of raw cgdata without the header (24 byte) from the indexed cgdata.
+# Synthesize two sets of raw cgdata without the header (32 byte) from the indexed cgdata.
 # Concatenate them in merge-concat.ll
 RUN: llvm-cgdata --convert --format binary %t/raw-1.cgtext -o %t/raw-1.cgdata
-RUN: od -t x1 -j 24 -An %t/raw-1.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-1-bytes.txt
+RUN: od -t x1 -j 32 -An %t/raw-1.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-1-bytes.txt
 RUN: sed "s/<RAW_1_BYTES>/$(cat %t/raw-1-bytes.txt)/g" %t/merge-concat-template.ll > %t/merge-concat-template-2.ll
 RUN: llvm-cgdata --convert --format binary %t/raw-2.cgtext -o %t/raw-2.cgdata
-RUN: od -t x1 -j 24 -An %t/raw-2.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-2-bytes.txt
+RUN: od -t x1 -j 32 -An %t/raw-2.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-2-bytes.txt
 RUN: sed "s/<RAW_2_BYTES>/$(cat %t/raw-2-bytes.txt)/g" %t/merge-concat-template-2.ll > %t/merge-concat.ll
 
 RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-concat.ll -o %t/merge-concat.o
diff --git a/llvm/test/tools/llvm-cgdata/merge-double.test b/llvm/test/tools/llvm-cgdata/merge-hashtree-double.test
similarity index 90%
rename from llvm/test/tools/llvm-cgdata/merge-double.test
rename to llvm/test/tools/llvm-cgdata/merge-hashtree-double.test
index 1ae8064291019e..044a8649cf4adf 100644
--- a/llvm/test/tools/llvm-cgdata/merge-double.test
+++ b/llvm/test/tools/llvm-cgdata/merge-hashtree-double.test
@@ -5,15 +5,15 @@
 
 RUN: split-file %s %t
 
-# Synthesize raw cgdata without the header (24 byte) from the indexed cgdata.
+# Synthesize raw cgdata without the header (32 byte) from the indexed cgdata.
 RUN: llvm-cgdata --convert --format binary %t/raw-1.cgtext -o %t/raw-1.cgdata
-RUN: od -t x1 -j 24 -An %t/raw-1.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-1-bytes.txt
+RUN: od -t x1 -j 32 -An %t/raw-1.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-1-bytes.txt
 RUN: sed "s/<RAW_1_BYTES>/$(cat %t/raw-1-bytes.txt)/g" %t/merge-1-template.ll > %t/merge-1.ll
 RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-1.ll -o %t/merge-1.o
 
-# Synthesize raw cgdata without the header (24 byte) from the indexed cgdata.
+# Synthesize raw cgdata without the header (32 byte) from the indexed cgdata.
 RUN: llvm-cgdata --convert --format binary %t/raw-2.cgtext -o %t/raw-2.cgdata
-RUN: od -t x1 -j 24 -An %t/raw-2.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-2-bytes.txt
+RUN: od -t x1 -j 32 -An %t/raw-2.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-2-bytes.txt
 RUN: sed "s/<RAW_2_BYTES>/$(cat %t/raw-2-bytes.txt)/g" %t/merge-2-template.ll > %t/merge-2.ll
 RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-2.ll -o %t/merge-2.o
 
diff --git a/llvm/test/tools/llvm-cgdata/merge-single.test b/llvm/test/tools/llvm-cgdata/merge-hashtree-single.test
similarity index 92%
rename from llvm/test/tools/llvm-cgdata/merge-single.test
rename to llvm/test/tools/llvm-cgdata/merge-hashtree-single.test
index 47e3cb3f4f50fb..829c63f0f17a2c 100644
--- a/llvm/test/tools/llvm-cgdata/merge-single.test
+++ b/llvm/test/tools/llvm-cgdata/merge-hashtree-single.test
@@ -11,9 +11,9 @@ RUN: llvm-cgdata --merge %t/merge-empty.o --output %t/merge-empty.cgdata
 # No summary appear with the header only cgdata.
 RUN: llvm-cgdata --show %t/merge-empty.cgdata | count 0
 
-# Synthesize raw cgdata without the header (24 byte) from the indexed cgdata.
+# Synthesize raw cgdata without the header (32 byte) from the indexed cgdata.
 RUN: llvm-cgdata --convert --format binary %t/raw-single.cgtext -o %t/raw-single.cgdata
-RUN: od -t x1 -j 24 -An %t/raw-single.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-single-bytes.txt
+RUN: od -t x1 -j 32 -An %t/raw-single.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-single-bytes.txt
 
 RUN: sed "s/<RAW_1_BYTES>/$(cat %t/raw-single-bytes.txt)/g" %t/merge-single-template.ll > %t/merge-single.ll
 RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-single.ll -o %t/merge-single.o
diff --git a/llvm/tools/llvm-cgdata/llvm-cgdata.cpp b/llvm/tools/llvm-cgdata/llvm-cgdata.cpp
index 483f4662631284..18de1f6b14552a 100644
--- a/llvm/tools/llvm-cgdata/llvm-cgdata.cpp
+++ b/llvm/tools/llvm-cgdata/llvm-cgdata.cpp
@@ -80,8 +80,6 @@ static CGDataAction Action;
 static std::optional<CGDataFormat> OutputFormat;
 static std::vector<std::string> InputFilenames;
 
-// TODO: Add a doc, https://llvm.org/docs/CommandGuide/llvm-cgdata.html
-
 static void exitWithError(Twine Message, std::string Whence = "",
                           std::string Hint = "") {
   WithColor::error();
@@ -128,6 +126,10 @@ static int convert_main(int argc, const char *argv[]) {
     OutlinedHashTreeRecord Record(Reader->releaseOutlinedHashTree());
     Writer.addRecord(Record);
   }
+  if (Reader->hasStableFunctionMap()) {
+    StableFunctionMapRecord Record(Reader->releaseStableFunctionMap());
+    Writer.addRecord(Record);
+  }
 
   if (OutputFormat == CGDataFormat::Text) {
     if (Error E = Writer.writeText(OS))
@@ -141,10 +143,12 @@ static int convert_main(int argc, const char *argv[]) {
 }
 
 static bool handleBuffer(StringRef Filename, MemoryBufferRef Buffer,
-                         OutlinedHashTreeRecord &GlobalOutlineRecord);
+                         OutlinedHashTreeRecord &GlobalOutlineRecord,
+                         StableFunctionMapRecord &GlobalFunctionMapRecord);
 
 static bool handleArchive(StringRef Filename, Archive &Arch,
-                          OutlinedHashTreeRecord &GlobalOutlineRecord) {
+                          OutlinedHashTreeRecord &GlobalOutlineRecord,
+                          StableFunctionMapRecord &GlobalFunctionMapRecord) {
   bool Result = true;
   Error Err = Error::success();
   for (const auto &Child : Arch.children(Err)) {
@@ -155,7 +159,8 @@ static bool handleArchive(StringRef Filename, Archive &Arch,
     if (Error E = NameOrErr.takeError())
       exitWithError(std::move(E), Filename);
     std::string Name = (Filename + "(" + NameOrErr.get() + ")").str();
-    Result &= handleBuffer(Name, BuffOrErr.get(), GlobalOutlineRecord);
+    Result &= handleBuffer(Name, BuffOrErr.get(), GlobalOutlineRecord,
+                           GlobalFunctionMapRecord);
   }
   if (Err)
     exitWithError(std::move(Err), Filename);
@@ -163,7 +168,8 @@ static bool handleArchive(StringRef Filename, Archive &Arch,
 }
 
 static bool handleBuffer(StringRef Filename, MemoryBufferRef Buffer,
-                         OutlinedHashTreeRecord &GlobalOutlineRecord) {
+                         OutlinedHashTreeRecord &GlobalOutlineRecord,
+                         StableFunctionMapRecord &GlobalFunctionMapRecord) {
   Expected<std::unique_ptr<object::Binary>> BinOrErr =
       object::createBinary(Buffer);
   if (Error E = BinOrErr.takeError())
@@ -171,11 +177,12 @@ static bool handleBuffer(StringRef Filename, MemoryBufferRef Buffer,
 
   bool Result = true;
   if (auto *Obj = dyn_cast<ObjectFile>(BinOrErr->get())) {
-    if (Error E =
-            CodeGenDataReader::mergeFromObjectFile(Obj, GlobalOutlineRecord))
+    if (Error E = CodeGenDataReader::mergeFromObjectFile(
+            Obj, GlobalOutlineRecord, GlobalFunctionMapRecord))
       exitWithError(std::move(E), Filename);
   } else if (auto *Arch = dyn_cast<Archive>(BinOrErr->get())) {
-    Result &= handleArchive(Filename, *Arch, GlobalOutlineRecord);
+    Result &= handleArchive(Filename, *Arch, GlobalOutlineRecord,
+                            GlobalFunctionMapRecord);
   } else {
     // TODO: Support for the MachO universal binary format.
     errs() << "Error: unsupported binary file: " << Filename << "\n";
@@ -186,19 +193,23 @@ static bool handleBuffer(StringRef Filename, MemoryBufferRef Buffer,
 }
 
 static bool handleFile(StringRef Filename,
-                       OutlinedHashTreeRecord &GlobalOutlineRecord) {
+                       OutlinedHashTreeRecord &GlobalOutlineRecord,
+                       StableFunctionMapRecord &GlobalFunctionMapRecord) {
   ErrorOr<std::unique_ptr<MemoryBuffer>> BuffOrErr =
       MemoryBuffer::getFileOrSTDIN(Filename);
   if (std::error_code EC = BuffOrErr.getError())
     exitWithErrorCode(EC, Filename);
-  return handleBuffer(Filename, *BuffOrErr.get(), GlobalOutlineRecord);
+  return handleBuffer(Filename, *BuffOrErr.get(), GlobalOutlineRecord,
+                      GlobalFunctionMapRecord);
 }
 
 static int merge_main(int argc, const char *argv[]) {
   bool Result = true;
   OutlinedHashTreeRecord GlobalOutlineRecord;
+  StableFunctionMapRecord GlobalFunctionMapRecord;
   for (auto &Filename : InputFilenames)
-    Result &= handleFile(Filename, GlobalOutlineRecord);
+    Result &=
+        handleFile(Filename, GlobalOutlineRecord, GlobalFunctionMapRecord);
 
   if (!Result)
     exitWithError("failed to merge codegen data files.");
@@ -206,6 +217,8 @@ static int merge_main(int argc, const char *argv[]) {
   CodeGenDataWriter Writer;
   if (!GlobalOutlineRecord.empty())
     Writer.addRecord(GlobalOutlineRecord);
+  if (!GlobalFunctionMapRecord.empty())
+    Writer.addRecord(GlobalFunctionMapRecord);
 
   std::error_code EC;
   raw_fd_ostream OS(OutputFilename, EC,
@@ -249,6 +262,15 @@ static int show_main(int argc, const char *argv[]) {
        << "\n";
     OS << "  Depth: " << Tree->depth() << "\n";
   }
+  if (Reader->hasStableFunctionMap()) {
+    auto Map = Reader->releaseStableFunctionMap();
+    OS << "Stable function map:\n";
+    OS << "  Unique hash Count: " << Map->size() << "\n";
+    OS << "  Total function Count: "
+       << Map->size(StableFunctionMap::TotalFunctionCount) << "\n";
+    OS << "  Mergeable function Count: "
+       << Map->size(StableFunctionMap::MergeableFunctionCount) << "\n";
+  }
 
   return 0;
 }

>From 67d26c7ba33c682cc4b443c489f1ec270950ef48 Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Fri, 30 Aug 2024 00:09:09 -0700
Subject: [PATCH 08/10] port global merge func

---
 lld/MachO/CMakeLists.txt                      |   1 +
 lld/MachO/Driver.cpp                          |  19 +-
 lld/MachO/InputSection.h                      |   1 +
 lld/MachO/LTO.cpp                             |   9 +-
 llvm/include/llvm/CGData/CodeGenData.h        |  11 +
 llvm/include/llvm/CGData/StableFunctionMap.h  |  13 +-
 .../llvm/CGData/StableFunctionMapRecord.h     |   3 +
 llvm/include/llvm/IR/StructuralHash.h         |   5 +-
 llvm/include/llvm/InitializePasses.h          |   1 +
 llvm/include/llvm/LinkAllPasses.h             |   1 +
 llvm/include/llvm/Passes/CodeGenPassBuilder.h |   1 +
 llvm/include/llvm/Transforms/IPO.h            |   2 +
 .../Transforms/IPO/GlobalMergeFunctions.h     |  80 ++
 llvm/lib/CGData/CMakeLists.txt                |   2 +
 llvm/lib/CGData/CodeGenData.cpp               |   2 +
 llvm/lib/CGData/CodeGenDataWriter.cpp         |   2 +-
 llvm/lib/CGData/StableFunctionMap.cpp         |  91 +++
 llvm/lib/CGData/StableFunctionMapRecord.cpp   |   6 +-
 llvm/lib/IR/StructuralHash.cpp                |  10 +-
 llvm/lib/LTO/LTO.cpp                          |   1 +
 llvm/lib/LTO/LTOBackend.cpp                   |   1 +
 llvm/lib/Passes/PassRegistry.def              |   1 +
 llvm/lib/Transforms/IPO/CMakeLists.txt        |   2 +
 .../Transforms/IPO/GlobalMergeFunctions.cpp   | 723 ++++++++++++++++++
 llvm/tools/llvm-cgdata/llvm-cgdata.cpp        |   2 +
 25 files changed, 975 insertions(+), 15 deletions(-)
 create mode 100644 llvm/include/llvm/Transforms/IPO/GlobalMergeFunctions.h
 create mode 100644 llvm/lib/Transforms/IPO/GlobalMergeFunctions.cpp

diff --git a/lld/MachO/CMakeLists.txt b/lld/MachO/CMakeLists.txt
index ecf6ce609e59f2..f028ff9e806e4a 100644
--- a/lld/MachO/CMakeLists.txt
+++ b/lld/MachO/CMakeLists.txt
@@ -44,6 +44,7 @@ add_lld_library(lldMachO
   Core
   DebugInfoDWARF
   Demangle
+  IPO
   LTO
   MC
   ObjCARCOpts
diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp
index ab4abb1fa97efc..88117c4d41546b 100644
--- a/lld/MachO/Driver.cpp
+++ b/lld/MachO/Driver.cpp
@@ -1326,7 +1326,8 @@ static void codegenDataGenerate() {
   TimeTraceScope timeScope("Generating codegen data");
 
   OutlinedHashTreeRecord globalOutlineRecord;
-  for (ConcatInputSection *isec : inputSections)
+  StableFunctionMapRecord globalMergeRecord;
+  for (ConcatInputSection *isec : inputSections) {
     if (isec->getSegName() == segment_names::data &&
         isec->getName() == section_names::outlinedHashTree) {
       // Read outlined hash tree from each section.
@@ -1337,11 +1338,25 @@ static void codegenDataGenerate() {
       // Merge it to the global hash tree.
       globalOutlineRecord.merge(localOutlineRecord);
     }
+    if (isec->getSegName() == segment_names::data &&
+        isec->getName() == section_names::functionmap) {
+      // Read stable functions from each section.
+      StableFunctionMapRecord localMergeRecord;
+      auto *data = isec->data.data();
+      localMergeRecord.deserialize(data);
+
+      // Merge it to the global function map.
+      globalMergeRecord.merge(localMergeRecord);
+    }
+  }
+
+  globalMergeRecord.finalize();
 
   CodeGenDataWriter Writer;
   if (!globalOutlineRecord.empty())
     Writer.addRecord(globalOutlineRecord);
-
+  if (!globalMergeRecord.empty())
+    Writer.addRecord(globalMergeRecord);
   std::error_code EC;
   auto fileName = config->codegenDataGeneratePath;
   assert(!fileName.empty());
diff --git a/lld/MachO/InputSection.h b/lld/MachO/InputSection.h
index 7ef0e31066f372..b86520d36cda5b 100644
--- a/lld/MachO/InputSection.h
+++ b/lld/MachO/InputSection.h
@@ -339,6 +339,7 @@ constexpr const char const_[] = "__const";
 constexpr const char lazySymbolPtr[] = "__la_symbol_ptr";
 constexpr const char lazyBinding[] = "__lazy_binding";
 constexpr const char literals[] = "__literals";
+constexpr const char functionmap[] = "__llvm_merge";
 constexpr const char moduleInitFunc[] = "__mod_init_func";
 constexpr const char moduleTermFunc[] = "__mod_term_func";
 constexpr const char nonLazySymbolPtr[] = "__nl_symbol_ptr";
diff --git a/lld/MachO/LTO.cpp b/lld/MachO/LTO.cpp
index 6527cbb68f2498..a3aa1587dee16d 100644
--- a/lld/MachO/LTO.cpp
+++ b/lld/MachO/LTO.cpp
@@ -25,6 +25,7 @@
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/ObjCARC.h"
 
 using namespace lld;
@@ -38,6 +39,8 @@ static std::string getThinLTOOutputFile(StringRef modulePath) {
                                    config->thinLTOPrefixReplaceNew);
 }
 
+extern cl::opt<bool> EnableGlobalMergeFunc;
+
 static lto::Config createConfig() {
   lto::Config c;
   c.Options = initTargetOptionsFromCodeGenFlags();
@@ -48,7 +51,10 @@ static lto::Config createConfig() {
   c.CPU = getCPUStr();
   c.MAttrs = getMAttrs();
   c.DiagHandler = diagnosticHandler;
-
+  c.PreCodeGenPassesHook = [](legacy::PassManager &pm) {
+    if (EnableGlobalMergeFunc)
+      pm.add(createGlobalMergeFuncPass());
+  };
   c.AlwaysEmitRegularLTOObj = !config->ltoObjPath.empty();
 
   c.TimeTraceEnabled = config->timeTraceEnabled;
@@ -97,7 +103,6 @@ BitcodeCompiler::BitcodeCompiler() {
         onIndexWrite, config->thinLTOEmitIndexFiles,
         config->thinLTOEmitImportsFiles);
   }
-
   ltoObj = std::make_unique<lto::LTO>(createConfig(), backend);
 }
 
diff --git a/llvm/include/llvm/CGData/CodeGenData.h b/llvm/include/llvm/CGData/CodeGenData.h
index ceaa8790aff8d4..fcd91ae02d6dc4 100644
--- a/llvm/include/llvm/CGData/CodeGenData.h
+++ b/llvm/include/llvm/CGData/CodeGenData.h
@@ -143,6 +143,9 @@ class CodeGenData {
   const OutlinedHashTree *getOutlinedHashTree() {
     return PublishedHashTree.get();
   }
+  const StableFunctionMap *getStableFunctionMap() {
+    return PublishedStableFunctionMap.get();
+  }
 
   /// Returns true if we should write codegen data.
   bool emitCGData() { return EmitCGData; }
@@ -167,10 +170,18 @@ inline bool hasOutlinedHashTree() {
   return CodeGenData::getInstance().hasOutlinedHashTree();
 }
 
+inline bool hasStableFunctionMap() {
+  return CodeGenData::getInstance().hasStableFunctionMap();
+}
+
 inline const OutlinedHashTree *getOutlinedHashTree() {
   return CodeGenData::getInstance().getOutlinedHashTree();
 }
 
+inline const StableFunctionMap *getStableFunctionMap() {
+  return CodeGenData::getInstance().getStableFunctionMap();
+}
+
 inline bool emitCGData() { return CodeGenData::getInstance().emitCGData(); }
 
 inline void
diff --git a/llvm/include/llvm/CGData/StableFunctionMap.h b/llvm/include/llvm/CGData/StableFunctionMap.h
index 81b48cdcf66f58..9bf7183daa7945 100644
--- a/llvm/include/llvm/CGData/StableFunctionMap.h
+++ b/llvm/include/llvm/CGData/StableFunctionMap.h
@@ -14,6 +14,7 @@
 #define LLVM_CGDATA_STABLEFUNCTIONMAP_H
 
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/StableHashing.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/IR/StructuralHash.h"
@@ -43,8 +44,8 @@ struct StableFunction {
   /// A vector of pairs of IndexPair and operand hash which was skipped.
   IndexOperandHashVecType IndexOperandHashes;
 
-  StableFunction(stable_hash Hash, std::string FunctionName,
-                 std::string ModuleName, unsigned InstCount,
+  StableFunction(stable_hash Hash, const std::string FunctionName,
+                 const std::string ModuleName, unsigned InstCount,
                  IndexOperandHashVecType &&IndexOperandHashes)
       : Hash(Hash), FunctionName(FunctionName), ModuleName(ModuleName),
         InstCount(InstCount),
@@ -84,10 +85,12 @@ class StableFunctionMap {
   SmallVector<std::string> IdToName;
   /// A map from StringRef (name) to an ID.
   StringMap<unsigned> NameToId;
+  /// True if the function map is finalized with minimal content.
+  bool Finalized = false;
 
 public:
   /// Get the HashToFuncs map for serialization.
-  const HashFuncsMapType &getFunctionMap() { return HashToFuncs; }
+  const HashFuncsMapType &getFunctionMap() const { return HashToFuncs; }
 
   /// Get the NameToId vector for serialization.
   const SmallVector<std::string> getNames() { return IdToName; }
@@ -108,6 +111,7 @@ class StableFunctionMap {
   /// method assumes that string names have already been uniqued and the
   /// `StableFunctionEntry` is ready for insertion.
   void insert(std::unique_ptr<StableFunctionEntry> FuncEntry) {
+    assert(!Finalized && "Cannot insert after finalization");
     HashToFuncs[FuncEntry->Hash].emplace_back(std::move(FuncEntry));
   }
 
@@ -127,6 +131,9 @@ class StableFunctionMap {
   /// \returns the size of StableFunctionMap.
   /// \p Type is the type of size to return.
   size_t size(SizeType Type = UniqueHashCount) const;
+
+  /// Finalize the stable function mape by trimming content.
+  bool finalize();
 };
 
 } // namespace llvm
diff --git a/llvm/include/llvm/CGData/StableFunctionMapRecord.h b/llvm/include/llvm/CGData/StableFunctionMapRecord.h
index cbe7e295e461b6..0a38e2df736596 100644
--- a/llvm/include/llvm/CGData/StableFunctionMapRecord.h
+++ b/llvm/include/llvm/CGData/StableFunctionMapRecord.h
@@ -41,6 +41,9 @@ struct StableFunctionMapRecord {
   /// Deserialize the stable function map from a YAML stream.
   void deserializeYAML(yaml::Input &YIS);
 
+  /// Finalize the stable function map by trimming content.
+  void finalize() { FunctionMap->finalize(); }
+
   /// Merge the stable function map into this one.
   void merge(const StableFunctionMapRecord &Other) {
     FunctionMap->merge(*Other.FunctionMap);
diff --git a/llvm/include/llvm/IR/StructuralHash.h b/llvm/include/llvm/IR/StructuralHash.h
index d5fb0017cd3ce0..95baabebb97ac2 100644
--- a/llvm/include/llvm/IR/StructuralHash.h
+++ b/llvm/include/llvm/IR/StructuralHash.h
@@ -26,6 +26,7 @@ class Function;
 class Module;
 
 using IRHash = stable_hash;
+using OpndHash = stable_hash;
 
 /// Returns a hash of the function \p F.
 /// \param F The function to hash.
@@ -46,8 +47,8 @@ using IndexPair = std::pair<unsigned, unsigned>;
 /// A map from an instruction index to an instruction pointer.
 using IndexInstrMap = MapVector<unsigned, Instruction *>;
 
-/// A map from an IndexPair to a stable_hash.
-using IndexOperandHashMapType = DenseMap<IndexPair, stable_hash>;
+/// A map from an IndexPair to an OpndHash.
+using IndexOperandHashMapType = DenseMap<IndexPair, OpndHash>;
 
 /// A function that takes an instruction and an operand index and returns true
 /// if the operand should be ignored in the function hash computation.
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index 4352099d6dbb99..9aa36d5bb7f801 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -123,6 +123,7 @@ void initializeGCEmptyBasicBlocksPass(PassRegistry &);
 void initializeGCMachineCodeAnalysisPass(PassRegistry &);
 void initializeGCModuleInfoPass(PassRegistry &);
 void initializeGVNLegacyPassPass(PassRegistry &);
+void initializeGlobalMergeFuncPass(PassRegistry &);
 void initializeGlobalMergePass(PassRegistry &);
 void initializeGlobalsAAWrapperPassPass(PassRegistry &);
 void initializeHardwareLoopsLegacyPass(PassRegistry &);
diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h
index 92b59a66567c95..ea3609a2b4bc71 100644
--- a/llvm/include/llvm/LinkAllPasses.h
+++ b/llvm/include/llvm/LinkAllPasses.h
@@ -79,6 +79,7 @@ struct ForcePassLinking {
     (void)llvm::createDomOnlyViewerWrapperPassPass();
     (void)llvm::createDomViewerWrapperPassPass();
     (void)llvm::createAlwaysInlinerLegacyPass();
+    (void)llvm::createGlobalMergeFuncPass();
     (void)llvm::createGlobalsAAWrapperPass();
     (void)llvm::createInstSimplifyLegacyPass();
     (void)llvm::createInstructionCombiningPass();
diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
index 13bc4700d87029..96b5b815132bc0 100644
--- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h
+++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
@@ -74,6 +74,7 @@
 #include "llvm/Target/CGPassBuilderOption.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/CFGuard.h"
+#include "llvm/Transforms/IPO/GlobalMergeFunctions.h"
 #include "llvm/Transforms/Scalar/ConstantHoisting.h"
 #include "llvm/Transforms/Scalar/LoopPassManager.h"
 #include "llvm/Transforms/Scalar/LoopStrengthReduce.h"
diff --git a/llvm/include/llvm/Transforms/IPO.h b/llvm/include/llvm/Transforms/IPO.h
index ee0e35aa618325..86a8654f56997c 100644
--- a/llvm/include/llvm/Transforms/IPO.h
+++ b/llvm/include/llvm/Transforms/IPO.h
@@ -55,6 +55,8 @@ enum class PassSummaryAction {
   Export, ///< Export information to summary.
 };
 
+Pass *createGlobalMergeFuncPass();
+
 } // End llvm namespace
 
 #endif
diff --git a/llvm/include/llvm/Transforms/IPO/GlobalMergeFunctions.h b/llvm/include/llvm/Transforms/IPO/GlobalMergeFunctions.h
new file mode 100644
index 00000000000000..0de2d91a647579
--- /dev/null
+++ b/llvm/include/llvm/Transforms/IPO/GlobalMergeFunctions.h
@@ -0,0 +1,80 @@
+//===------ GlobalMergeFunctions.h - Global merge functions -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// This file defines global merge functions pass and related data structure.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef PIKA_TRANSFORMS_UTILS_GLOBALMERGEFUNCTIONS_H
+#define PIKA_TRANSFORMS_UTILS_GLOBALMERGEFUNCTIONS_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StableHashing.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CGData/StableFunctionMap.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include <map>
+#include <mutex>
+
+enum class HashFunctionMode {
+  None,
+  BuildingHashFuncion,
+  UsingHashFunction,
+};
+
+namespace llvm {
+
+// (inst, opnd) indices
+using LocPair = std::pair<unsigned, unsigned>;
+// 64 bit constant hash
+using ConstHash = uint64_t;
+// A map of location pair to constant hash
+using InstOpndIdConstHashMapTy = DenseMap<LocPair, ConstHash>;
+
+// A vector of locations (the pair of (instruction, operand) indices) reachable
+// from a parameter.
+using ParamLocs = SmallVector<LocPair, 4>;
+// A vector of parameters
+using ParamLocsVecTy = SmallVector<ParamLocs, 8>;
+// A map of stable hash to a vector of stable functions
+
+/// GlobalMergeFunc finds functions which only differ by constants in
+/// certain instructions, e.g. resulting from specialized functions of layout
+/// compatible types.
+/// Unlike PikaMergeFunc that directly compares IRs, this uses stable function
+/// hash to find the merge candidate. Similar to the global outliner, we can run
+/// codegen twice to collect function merge candidate in the first round, and
+/// merge functions globally in the second round.
+class GlobalMergeFunc : public ModulePass {
+  HashFunctionMode MergerMode = HashFunctionMode::None;
+
+  std::unique_ptr<StableFunctionMap> LocalFunctionMap;
+  // StableFunctionMap LocalFunctionMap;
+public:
+  static char ID;
+
+  GlobalMergeFunc();
+
+  void initializeMergerMode(const Module &M);
+
+  bool runOnModule(Module &M) override;
+
+  /// Analyze module to create stable function into LocalFunctionMap.
+  void analyze(Module &M);
+
+  /// Emit LocalFunctionMap into __llvm_merge section.
+  void emitFunctionMap(Module &M);
+
+  /// Merge functions in the module using MFI.
+  bool merge(Module &M, const StableFunctionMap *FunctionMap);
+};
+
+} // end namespace llvm
+#endif // PIKA_TRANSFORMS_UTILS_GLOBALMERGEFUNCTIONS_H
diff --git a/llvm/lib/CGData/CMakeLists.txt b/llvm/lib/CGData/CMakeLists.txt
index 18f6b81dc1f11a..003173139f36c5 100644
--- a/llvm/lib/CGData/CMakeLists.txt
+++ b/llvm/lib/CGData/CMakeLists.txt
@@ -14,6 +14,8 @@ add_llvm_component_library(LLVMCGData
   intrinsics_gen
 
   LINK_COMPONENTS
+  BitReader
+  BitWriter
   Core
   Support
   Object
diff --git a/llvm/lib/CGData/CodeGenData.cpp b/llvm/lib/CGData/CodeGenData.cpp
index b89ad66d6ab2ea..60d7ae28f12f87 100644
--- a/llvm/lib/CGData/CodeGenData.cpp
+++ b/llvm/lib/CGData/CodeGenData.cpp
@@ -290,6 +290,8 @@ Error mergeCodeGenData(
       return E;
   }
 
+  GlobalStableFunctionMapRecord.finalize();
+
   if (!GlobalOutlineRecord.empty())
     cgdata::publishOutlinedHashTree(std::move(GlobalOutlineRecord.HashTree));
   if (!GlobalStableFunctionMapRecord.empty())
diff --git a/llvm/lib/CGData/CodeGenDataWriter.cpp b/llvm/lib/CGData/CodeGenDataWriter.cpp
index dfe7c3e141b2bf..b97c9138593f7b 100644
--- a/llvm/lib/CGData/CodeGenDataWriter.cpp
+++ b/llvm/lib/CGData/CodeGenDataWriter.cpp
@@ -53,7 +53,7 @@ void CodeGenDataWriter::addRecord(OutlinedHashTreeRecord &Record) {
 }
 
 void CodeGenDataWriter::addRecord(StableFunctionMapRecord &Record) {
-  assert(Record.StableHashTree && "empty function map in the record");
+  assert(Record.FunctionMap && "empty function map in the record");
   FunctionMapRecord.FunctionMap = std::move(Record.FunctionMap);
 
   DataKind |= CGDataKind::StableFunctionMergingMap;
diff --git a/llvm/lib/CGData/StableFunctionMap.cpp b/llvm/lib/CGData/StableFunctionMap.cpp
index 1215dd48e0921e..b4e188e87b5484 100644
--- a/llvm/lib/CGData/StableFunctionMap.cpp
+++ b/llvm/lib/CGData/StableFunctionMap.cpp
@@ -36,6 +36,7 @@ std::optional<std::string> StableFunctionMap::getNameForId(unsigned Id) const {
 }
 
 void StableFunctionMap::insert(const StableFunction &Func) {
+  assert(!Finalized && "Cannot insert after finalization");
   auto FuncNameId = getIdOrCreateForName(Func.FunctionName);
   auto ModuleNameId = getIdOrCreateForName(Func.ModuleName);
   auto IndexOperandHashMap = std::make_unique<IndexOperandHashMapType>();
@@ -48,6 +49,7 @@ void StableFunctionMap::insert(const StableFunction &Func) {
 }
 
 void StableFunctionMap::merge(const StableFunctionMap &OtherMap) {
+  assert(!Finalized && "Cannot merge after finalization");
   for (auto &[Hash, Funcs] : OtherMap.HashToFuncs) {
     auto &ThisFuncs = HashToFuncs[Hash];
     for (auto &Func : Funcs) {
@@ -84,3 +86,92 @@ size_t StableFunctionMap::size(SizeType Type) const {
   }
   return 0;
 }
+
+using ParamLocs = SmallVector<IndexPair>;
+static void removeIdenticalIndexPair(
+    SmallVector<std::unique_ptr<StableFunctionEntry>> &SFS) {
+  auto &RSF = SFS[0];
+  unsigned StableFunctionCount = SFS.size();
+
+  SmallVector<IndexPair> ToDelete;
+  for (auto &[Pair, Hash] : *(RSF->IndexOperandHashMap)) {
+    bool Identical = true;
+    for (unsigned J = 1; J < StableFunctionCount; ++J) {
+      auto &SF = SFS[J];
+      assert(SF->IndexOperandHashMap->count(Pair));
+      auto SHash = (*SF->IndexOperandHashMap)[Pair];
+      if (Hash != SHash) {
+        Identical = false;
+        break;
+      }
+    }
+
+    // No need to parameterize them if the hashes are identical across stable
+    // functions.
+    if (Identical)
+      ToDelete.emplace_back(Pair);
+  }
+
+  for (auto &Pair : ToDelete)
+    for (auto &SF : SFS)
+      SF->IndexOperandHashMap->erase(Pair);
+}
+
+bool StableFunctionMap::finalize() {
+  bool Changed = false;
+
+  for (auto It = HashToFuncs.begin(); It != HashToFuncs.end(); ++It) {
+    auto &[StableHash, SFS] = *It;
+    // No interest if there is no common stable function globally.
+    if (SFS.size() < 2) {
+      HashToFuncs.erase(It);
+      Changed = true;
+      continue;
+    }
+
+    // Group stable functions by ModuleIdentifier.
+    std::stable_sort(SFS.begin(), SFS.end(),
+                     [&](const std::unique_ptr<StableFunctionEntry> &L,
+                         const std::unique_ptr<StableFunctionEntry> &R) {
+                       return *getNameForId(L->ModuleNameId) <
+                              *getNameForId(R->ModuleNameId);
+                     });
+
+    // Consider the first function as the root function.
+    auto &RSF = SFS[0];
+
+    bool IsValid = true;
+    unsigned StableFunctionCount = SFS.size();
+    for (unsigned I = 1; I < StableFunctionCount; ++I) {
+      auto &SF = SFS[I];
+      assert(RSF->Hash == SF->Hash);
+      if (RSF->InstCount != SF->InstCount) {
+        IsValid = false;
+        break;
+      }
+      if (RSF->IndexOperandHashMap->size() != SF->IndexOperandHashMap->size()) {
+        IsValid = false;
+        break;
+      }
+      for (auto &P : *RSF->IndexOperandHashMap) {
+        auto &InstOpndIndex = P.first;
+        if (!SF->IndexOperandHashMap->count(InstOpndIndex)) {
+          IsValid = false;
+          break;
+        }
+      }
+    }
+    if (!IsValid) {
+      HashToFuncs.erase(It);
+      Changed = true;
+      continue;
+    }
+
+    // Trim the index pair that has the same operand hash across
+    // stable functions.
+    removeIdenticalIndexPair(SFS);
+  }
+  Finalized = true;
+
+  return Changed;
+}
diff --git a/llvm/lib/CGData/StableFunctionMapRecord.cpp b/llvm/lib/CGData/StableFunctionMapRecord.cpp
index 805c4b9965fd83..b04c80f9ee03a0 100644
--- a/llvm/lib/CGData/StableFunctionMapRecord.cpp
+++ b/llvm/lib/CGData/StableFunctionMapRecord.cpp
@@ -137,10 +137,12 @@ void StableFunctionMapRecord::deserialize(const unsigned char *&Ptr) {
         endian::readNext<stable_hash, endianness::little, unaligned>(Ptr);
     auto FunctionNameId =
         endian::readNext<uint32_t, endianness::little, unaligned>(Ptr);
-    assert(FunctionNameId < IdToName.size() && "FunctionNameId out of range");
+    assert(FunctionMap->getNameForId(FunctionNameId) &&
+           "FunctionNameId out of range");
     auto ModuleNameId =
         endian::readNext<uint32_t, endianness::little, unaligned>(Ptr);
-    assert(ModuleNameId < IdToName.size() && "ModuleNameId out of range");
+    assert(FunctionMap->getNameForId(ModuleNameId) &&
+           "ModuleNameId out of range");
     auto InstCount =
         endian::readNext<uint32_t, endianness::little, unaligned>(Ptr);
 
diff --git a/llvm/lib/IR/StructuralHash.cpp b/llvm/lib/IR/StructuralHash.cpp
index db23786036b4b8..c88728c5f35719 100644
--- a/llvm/lib/IR/StructuralHash.cpp
+++ b/llvm/lib/IR/StructuralHash.cpp
@@ -53,8 +53,12 @@ class StructuralHashImpl {
     }
   }
 
-  // hash_value for APInt should be stable
-  stable_hash hashAPInt(const APInt &I) { return hash_value(I); }
+  stable_hash hashAPInt(const APInt &I) {
+    SmallVector<stable_hash> Hashes;
+    for (unsigned J = 0; J < I.getNumWords(); ++J)
+      Hashes.emplace_back((I.getRawData())[J]);
+    return stable_hash_combine(Hashes);
+  }
 
   stable_hash hashAPFloat(const APFloat &F) {
     SmallVector<stable_hash> Hashes;
@@ -229,7 +233,7 @@ class StructuralHashImpl {
       auto *Op = Inst.getOperand(OpndIdx);
       auto OpndHash = hashOperand(Op);
       if (IgnoreOp && IgnoreOp(&Inst, OpndIdx)) {
-        assert(IndexPairOpndHash);
+        assert(IndexOperandHashMap);
         IndexOperandHashMap->insert({{InstIdx, OpndIdx}, OpndHash});
       } else
         Hashes.emplace_back(OpndHash);
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index b51b908fb28760..e2aa9dbc527da2 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -51,6 +51,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/GlobalMergeFunctions.h"
 #include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
 #include "llvm/Transforms/IPO/WholeProgramDevirt.h"
 #include "llvm/Transforms/Utils/FunctionImportUtils.h"
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index cf69f4add53a79..805300e920bfd7 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -426,6 +426,7 @@ static void codegen(const Config &Conf, TargetMachine *TM,
       createImmutableModuleSummaryIndexWrapperPass(&CombinedIndex));
   if (Conf.PreCodeGenPassesHook)
     Conf.PreCodeGenPassesHook(CodeGenPasses);
+
   if (TM->addPassesToEmitFile(CodeGenPasses, *Stream->OS,
                               DwoOut ? &DwoOut->os() : nullptr,
                               Conf.CGFileType))
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 3dc7f185f330c5..01a284be78de46 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -75,6 +75,7 @@ MODULE_PASS("hipstdpar-interpose-alloc", HipStdParAllocationInterpositionPass())
 MODULE_PASS("hipstdpar-select-accelerator-code",
             HipStdParAcceleratorCodeSelectionPass())
 MODULE_PASS("hotcoldsplit", HotColdSplittingPass())
+//MODULE_PASS("global-merge-func", GlobalMergeFuncNewPM())
 MODULE_PASS("inferattrs", InferFunctionAttrsPass())
 MODULE_PASS("inliner-ml-advisor-release",
             ModuleInlinerWrapperPass(getInlineParams(), true, {},
diff --git a/llvm/lib/Transforms/IPO/CMakeLists.txt b/llvm/lib/Transforms/IPO/CMakeLists.txt
index 15cb57399d2460..6a36ed64149f93 100644
--- a/llvm/lib/Transforms/IPO/CMakeLists.txt
+++ b/llvm/lib/Transforms/IPO/CMakeLists.txt
@@ -21,6 +21,7 @@ add_llvm_component_library(LLVMipo
   GlobalDCE.cpp
   GlobalOpt.cpp
   GlobalSplit.cpp
+  GlobalMergeFunctions.cpp
   HotColdSplitting.cpp
   IPO.cpp
   IROutliner.cpp
@@ -60,6 +61,7 @@ add_llvm_component_library(LLVMipo
   Analysis
   BitReader
   BitWriter
+  CGData
   Core
   FrontendOpenMP
   InstCombine
diff --git a/llvm/lib/Transforms/IPO/GlobalMergeFunctions.cpp b/llvm/lib/Transforms/IPO/GlobalMergeFunctions.cpp
new file mode 100644
index 00000000000000..5df380868b23ab
--- /dev/null
+++ b/llvm/lib/Transforms/IPO/GlobalMergeFunctions.cpp
@@ -0,0 +1,723 @@
+//===---- GlobalMergeFunctions.cpp - Global merge functions -------*- C++ -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements a function merge using function hash. Like the pika merge
+// functions, this can merge functions that differ by Constant operands thru
+// parameterizing them. However, instead of directly comparing IR functions,
+// this uses stable function hash to find potential merge candidates.
+// This provides a flexible framework to implement a global function merge with
+// ThinLTO two-codegen rounds. The first codegen round collects stable function
+// hashes, and determines the merge candidates that match the stable function
+// hashes. The set of parameters pointing to different Constants are also
+// computed during the stable function merge. The second codegen round uses this
+// global function info to optimistically create a merged function in each
+// module context to guarantee correct transformation. Similar to the global
+// outliner, the linker's deduplication (ICF) folds the identical merged
+// functions to save the final binary size.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/GlobalMergeFunctions.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/StableHashing.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CGData/CodeGenData.h"
+#include "llvm/CGData/StableFunctionMap.h"
+#include "llvm/CodeGen/MachineStableHash.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/DiagnosticPrinter.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/StructuralHash.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include <functional>
+#include <tuple>
+#include <vector>
+
+#define DEBUG_TYPE "global-merge-func"
+
+using namespace llvm;
+using namespace llvm::support;
+
+cl::opt<bool> EnableGlobalMergeFunc(
+    "enable-global-merge-func", cl::init(false), cl::Hidden,
+    cl::desc("enable global merge functions (default = off)"));
+
+cl::opt<unsigned> GlobalMergeExtraThreshold(
+    "globalmergefunc-extra-threshold",
+    cl::desc("An extra cost threshold for merging. '0' disables the extra cost "
+             "and benefit analysis."),
+    cl::init(0), cl::Hidden);
+
+cl::opt<bool> DisableCrossModuleGlobalMergeFunc(
+    "disable-cross-module-global-merge-func", cl::init(false), cl::Hidden,
+    cl::desc("disable cross-module global merge functions. When this flag is "
+             "true, only local functions are merged by global merge func."));
+
+STATISTIC(NumMismatchedFunctionHashGlobalMergeFunction,
+          "Number of mismatched function hash for global merge function");
+STATISTIC(NumMismatchedInstCountGlobalMergeFunction,
+          "Number of mismatched instruction count for global merge function");
+STATISTIC(NumMismatchedConstHashGlobalMergeFunction,
+          "Number of mismatched const hash for global merge function");
+STATISTIC(NumMismatchedIRGlobalMergeFunction,
+          "Number of mismatched IR for global merge function");
+STATISTIC(NumMismatchedModuleIdGlobalMergeFunction,
+          "Number of mismatched Module Id for global merge function");
+STATISTIC(
+    NumMismatchedGlobalMergeFunctionCandidates,
+    "Number of mismatched global merge function candidates that are skipped");
+STATISTIC(NumGlobalMergeFunctionCandidates,
+          "Number of global merge function candidates");
+STATISTIC(NumCrossModuleGlobalMergeFunctionCandidates,
+          "Number of cross-module global merge function candidates");
+STATISTIC(NumIdenticalGlobalMergeFunctionCandidates,
+          "Number of global merge function candidates that are identical (no "
+          "parameter)");
+STATISTIC(NumGlobalMergeFunctions,
+          "Number of functions that are actually merged using function hash");
+STATISTIC(NumAnalyzedModues, "Number of modules that are analyzed");
+STATISTIC(NumAnalyzedFunctions, "Number of functions that are analyzed");
+STATISTIC(NumEligibleFunctions, "Number of functions that are eligible");
+
+// A singleton context for diagnostic output.
+static LLVMContext Ctx;
+class GlobalMergeFuncDiagnosticInfo : public DiagnosticInfo {
+  const Twine &Msg;
+
+public:
+  GlobalMergeFuncDiagnosticInfo(const Twine &DiagMsg,
+                                DiagnosticSeverity Severity = DS_Error)
+      : DiagnosticInfo(DK_Linker, Severity), Msg(DiagMsg) {}
+  void print(DiagnosticPrinter &DP) const override { DP << Msg; }
+};
+
+/// Returns true if the \opIdx operand of \p CI is the callee operand.
+static bool isCalleeOperand(const CallBase *CI, unsigned opIdx) {
+  return &CI->getCalledOperandUse() == &CI->getOperandUse(opIdx);
+}
+
+static bool canParameterizeCallOperand(const CallBase *CI, unsigned opIdx) {
+  if (CI->isInlineAsm())
+    return false;
+  Function *Callee = CI->getCalledOperand()
+                         ? dyn_cast_or_null<Function>(
+                               CI->getCalledOperand()->stripPointerCasts())
+                         : nullptr;
+  if (Callee) {
+    if (Callee->isIntrinsic())
+      return false;
+    // objc_msgSend stubs must be called, and can't have their address taken.
+    if (Callee->getName().starts_with("objc_msgSend$"))
+      return false;
+  }
+  if (isCalleeOperand(CI, opIdx) &&
+      CI->getOperandBundle(LLVMContext::OB_ptrauth).has_value()) {
+    // The operand is the callee and it has already been signed. Ignore this
+    // because we cannot add another ptrauth bundle to the call instruction.
+    return false;
+  }
+  return true;
+}
+
+bool isEligibleInstrunctionForConstantSharing(const Instruction *I) {
+  switch (I->getOpcode()) {
+  case Instruction::Load:
+  case Instruction::Store:
+  case Instruction::Call:
+  case Instruction::Invoke:
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool isEligibleOperandForConstantSharing(const Instruction *I, unsigned OpIdx) {
+  assert(OpIdx < I->getNumOperands() && "Invalid operand index");
+
+  if (!isEligibleInstrunctionForConstantSharing(I))
+    return false;
+
+  auto Opnd = I->getOperand(OpIdx);
+  if (!isa<Constant>(Opnd))
+    return false;
+
+  if (const auto *CI = dyn_cast<CallBase>(I))
+    return canParameterizeCallOperand(CI, OpIdx);
+
+  return true;
+}
+
+/// Returns true if function \p F is eligible for merging.
+bool isEligibleFunction(Function *F) {
+  if (F->isDeclaration())
+    return false;
+
+  if (F->hasFnAttribute(llvm::Attribute::NoMerge))
+    return false;
+
+  if (F->hasAvailableExternallyLinkage()) {
+    return false;
+  }
+
+  if (F->getFunctionType()->isVarArg()) {
+    return false;
+  }
+
+  if (F->getCallingConv() == CallingConv::SwiftTail)
+    return false;
+
+  // if function contains callsites with musttail, if we merge
+  // it, the merged function will have the musttail callsite, but
+  // the number of parameters can change, thus the parameter count
+  // of the callsite will mismatch with the function itself.
+  // if (IgnoreMusttailFunction) {
+  for (const BasicBlock &BB : *F) {
+    for (const Instruction &I : BB) {
+      const auto *CB = dyn_cast<CallBase>(&I);
+      if (CB && CB->isMustTailCall())
+        return false;
+    }
+  }
+
+  return true;
+}
+
+static bool
+isEligibleInstrunctionForConstantSharingLocal(const Instruction *I) {
+  switch (I->getOpcode()) {
+  case Instruction::Load:
+  case Instruction::Store:
+  case Instruction::Call:
+  case Instruction::Invoke:
+    return true;
+  default:
+    return false;
+  }
+}
+
+static bool ignoreOp(const Instruction *I, unsigned OpIdx) {
+  assert(OpIdx < I->getNumOperands() && "Invalid operand index");
+
+  if (!isEligibleInstrunctionForConstantSharingLocal(I))
+    return false;
+
+  if (!isa<Constant>(I->getOperand(OpIdx)))
+    return false;
+
+  if (const auto *CI = dyn_cast<CallBase>(I))
+    return canParameterizeCallOperand(CI, OpIdx);
+
+  return true;
+}
+
+// copy from merge functions.cpp
+static Value *createCast(IRBuilder<> &Builder, Value *V, Type *DestTy) {
+  Type *SrcTy = V->getType();
+  if (SrcTy->isStructTy()) {
+    assert(DestTy->isStructTy());
+    assert(SrcTy->getStructNumElements() == DestTy->getStructNumElements());
+    Value *Result = PoisonValue::get(DestTy);
+    for (unsigned int I = 0, E = SrcTy->getStructNumElements(); I < E; ++I) {
+      Value *Element =
+          createCast(Builder, Builder.CreateExtractValue(V, ArrayRef(I)),
+                     DestTy->getStructElementType(I));
+
+      Result = Builder.CreateInsertValue(Result, Element, ArrayRef(I));
+    }
+    return Result;
+  }
+  assert(!DestTy->isStructTy());
+  if (auto *SrcAT = dyn_cast<ArrayType>(SrcTy)) {
+    auto *DestAT = dyn_cast<ArrayType>(DestTy);
+    assert(DestAT);
+    assert(SrcAT->getNumElements() == DestAT->getNumElements());
+    Value *Result = UndefValue::get(DestTy);
+    for (unsigned int I = 0, E = SrcAT->getNumElements(); I < E; ++I) {
+      Value *Element =
+          createCast(Builder, Builder.CreateExtractValue(V, ArrayRef(I)),
+                     DestAT->getElementType());
+
+      Result = Builder.CreateInsertValue(Result, Element, ArrayRef(I));
+    }
+    return Result;
+  }
+  assert(!DestTy->isArrayTy());
+  if (SrcTy->isIntegerTy() && DestTy->isPointerTy())
+    return Builder.CreateIntToPtr(V, DestTy);
+  else if (SrcTy->isPointerTy() && DestTy->isIntegerTy())
+    return Builder.CreatePtrToInt(V, DestTy);
+  else
+    return Builder.CreateBitCast(V, DestTy);
+}
+
+void GlobalMergeFunc::analyze(Module &M) {
+  ++NumAnalyzedModues;
+  for (Function &Func : M) {
+    ++NumAnalyzedFunctions;
+    if (isEligibleFunction(&Func)) {
+      ++NumEligibleFunctions;
+
+      auto FI = llvm::StructuralHashWithDifferences(Func, ignoreOp);
+
+      // TODO: move this vector implementation inside
+      // StructuralHashWithDifferences.
+      IndexOperandHashVecType IndexOperandHashes;
+      for (auto &Pair : *FI.IndexOperandHashMap)
+        IndexOperandHashes.emplace_back(Pair);
+
+      StableFunction SF(FI.FunctionHash, get_stable_name(Func.getName()).str(),
+                        M.getModuleIdentifier(), FI.IndexInstruction->size(),
+                        std::move(IndexOperandHashes));
+
+      LocalFunctionMap->insert(SF);
+    }
+  }
+}
+
+/// Tuple to hold function info to process merging.
+struct FuncMergeInfo {
+  StableFunctionEntry *SF;
+  Function *F;
+  std::unique_ptr<IndexInstrMap> IndexInstruction;
+};
+
+// Given the func info, and the parameterized locations, create and return
+// a new merged function by replacing the original constants with the new
+// parameters.
+static Function *createMergedFunction(FuncMergeInfo &FI,
+                                      ArrayRef<Type *> ConstParamTypes,
+                                      const ParamLocsVecTy &ParamLocsVec) {
+  // Synthesize a new merged function name by appending ".Tgm" to the root
+  // function's name.
+  auto *MergedFunc = FI.F;
+  auto NewFunctionName =
+      MergedFunc->getName().str() + ".Tgm"; // TODO MergeFunctionInfo::Suffix;
+  auto *M = MergedFunc->getParent();
+  assert(!M->getFunction(NewFunctionName));
+
+  FunctionType *OrigTy = MergedFunc->getFunctionType();
+  // Get the original params' types.
+  SmallVector<Type *> ParamTypes(OrigTy->param_begin(), OrigTy->param_end());
+  // Append const parameter types that are passed in.
+  ParamTypes.append(ConstParamTypes.begin(), ConstParamTypes.end());
+  FunctionType *FuncType =
+      FunctionType::get(OrigTy->getReturnType(), ParamTypes, false);
+
+  // Declare a new function
+  Function *NewFunction =
+      Function::Create(FuncType, MergedFunc->getLinkage(), NewFunctionName);
+  if (auto *SP = MergedFunc->getSubprogram())
+    NewFunction->setSubprogram(SP);
+  NewFunction->copyAttributesFrom(MergedFunc);
+  NewFunction->setDLLStorageClass(GlobalValue::DefaultStorageClass);
+
+  NewFunction->setLinkage(GlobalValue::InternalLinkage);
+  NewFunction->addFnAttr(Attribute::NoInline);
+
+  // Add the new function before the root function.
+  M->getFunctionList().insert(MergedFunc->getIterator(), NewFunction);
+
+  // Move the body of MergedFunc into the NewFunction.
+  NewFunction->splice(NewFunction->begin(), MergedFunc);
+
+  // Update the original args by the new args.
+  auto NewArgIter = NewFunction->arg_begin();
+  for (Argument &OrigArg : MergedFunc->args()) {
+    Argument &NewArg = *NewArgIter++;
+    OrigArg.replaceAllUsesWith(&NewArg);
+  }
+
+  // Replace the original Constants by the new args.
+  unsigned NumOrigArgs = MergedFunc->arg_size();
+  for (unsigned ParamIdx = 0; ParamIdx < ParamLocsVec.size(); ++ParamIdx) {
+    Argument *NewArg = NewFunction->getArg(NumOrigArgs + ParamIdx);
+    for (auto [InstIndex, OpndIndex] : ParamLocsVec[ParamIdx]) {
+      auto *Inst = FI.IndexInstruction->lookup(InstIndex);
+      auto *OrigC = Inst->getOperand(OpndIndex);
+      if (OrigC->getType() != NewArg->getType()) {
+        IRBuilder<> Builder(Inst->getParent(), Inst->getIterator());
+        Inst->setOperand(OpndIndex,
+                         createCast(Builder, NewArg, OrigC->getType()));
+      } else
+        Inst->setOperand(OpndIndex, NewArg);
+    }
+  }
+
+  return NewFunction;
+}
+
+// Given the original function (Thunk) and the merged function (ToFunc), create
+// a thunk to the merged function.
+
+static void createThunk(FuncMergeInfo &FI, ArrayRef<Constant *> Params,
+                        Function *ToFunc) {
+  auto *Thunk = FI.F;
+
+  assert(Thunk->arg_size() + Params.size() ==
+         ToFunc->getFunctionType()->getNumParams());
+  Thunk->dropAllReferences();
+
+  BasicBlock *BB = BasicBlock::Create(Thunk->getContext(), "", Thunk);
+  IRBuilder<> Builder(BB);
+
+  SmallVector<Value *> Args;
+  unsigned ParamIdx = 0;
+  FunctionType *ToFuncTy = ToFunc->getFunctionType();
+
+  // Add arguments which are passed through Thunk.
+  for (Argument &AI : Thunk->args()) {
+    Args.push_back(createCast(Builder, &AI, ToFuncTy->getParamType(ParamIdx)));
+    ++ParamIdx;
+  }
+
+  // Add new arguments defined by Params.
+  for (auto *Param : Params) {
+    assert(ParamIdx < ToFuncTy->getNumParams());
+    // FIXME: do not support signing
+    Args.push_back(
+        createCast(Builder, Param, ToFuncTy->getParamType(ParamIdx)));
+    ++ParamIdx;
+  }
+
+  CallInst *CI = Builder.CreateCall(ToFunc, Args);
+  bool isSwiftTailCall = ToFunc->getCallingConv() == CallingConv::SwiftTail &&
+                         Thunk->getCallingConv() == CallingConv::SwiftTail;
+  CI->setTailCallKind(isSwiftTailCall ? llvm::CallInst::TCK_MustTail
+                                      : llvm::CallInst::TCK_Tail);
+  CI->setCallingConv(ToFunc->getCallingConv());
+  CI->setAttributes(ToFunc->getAttributes());
+  if (Thunk->getReturnType()->isVoidTy()) {
+    Builder.CreateRetVoid();
+  } else {
+    Builder.CreateRet(createCast(Builder, CI, Thunk->getReturnType()));
+  }
+}
+
+// Check if the old merged/optimized IndexOperandHashMap is compatible with
+// the current IndexOperandHashMap. OpndHash may not be stable across
+// different builds due to varying modules combined. To address this, one
+// solution could be to relax the hash computation for Const in
+// PikaFunctionHash. However, instead of doing so, we relax the hash check
+// condition by comparing Const hash patterns instead of absolute hash values.
+// For example, let's assume we have three Consts located at idx1, idx3, and
+// idx6, where their corresponding hashes are hash1, hash2, and hash1 in the old
+// merged map below:
+//   Old (Merged): [(idx1, hash1), (idx3, hash2), (idx6, hash1)]
+//   Current: [(idx1, hash1'), (idx3, hash2'), (idx6, hash1')]
+// If the current function also has three Consts in the same locations,
+// with hash sequences hash1', hash2', and hash1' where the first and third
+// are the same as the old hash sequences, we consider them matched.
+static bool checkConstHashCompatible(
+    const DenseMap<IndexPair, OpndHash> &OldInstOpndIndexToConstHash,
+    const DenseMap<IndexPair, OpndHash> &CurrInstOpndIndexToConstHash) {
+
+  DenseMap<OpndHash, OpndHash> OldHashToCurrHash;
+  for (const auto &[Index, OldHash] : OldInstOpndIndexToConstHash) {
+    auto It = CurrInstOpndIndexToConstHash.find(Index);
+    if (It == CurrInstOpndIndexToConstHash.end())
+      return false;
+
+    auto CurrHash = It->second;
+    auto J = OldHashToCurrHash.find(OldHash);
+    if (J == OldHashToCurrHash.end())
+      OldHashToCurrHash.insert({OldHash, CurrHash});
+    else if (J->second != CurrHash)
+      return false;
+  }
+
+  return true;
+}
+
+// Validate the locations pointed by a param has the same hash and Constant.
+static bool checkConstLocationCompatible(const StableFunctionEntry &SF,
+                                         const IndexInstrMap &IndexInstruction,
+                                         const ParamLocsVecTy &ParamLocsVec) {
+  for (auto &ParamLocs : ParamLocsVec) {
+    std::optional<OpndHash> OldHash;
+    std::optional<Constant *> OldConst;
+    for (auto &Loc : ParamLocs) {
+      assert(SF.IndexOperandHashMap->count(Loc));
+      auto CurrHash = SF.IndexOperandHashMap.get()->at(Loc);
+      auto [InstIndex, OpndIndex] = Loc;
+      assert(InstIndex < IndexInstruction.size());
+      const auto *Inst = IndexInstruction.lookup(InstIndex);
+      auto *CurrConst = cast<Constant>(Inst->getOperand(OpndIndex));
+      if (!OldHash) {
+        OldHash = CurrHash;
+        OldConst = CurrConst;
+      } else if (CurrConst != *OldConst || CurrHash != *OldHash)
+        return false;
+    }
+  }
+  return true;
+}
+
+static ParamLocsVecTy
+computeParamInfo(const SmallVector<std::unique_ptr<StableFunctionEntry>> &SFS) {
+  std::map<std::vector<OpndHash>, ParamLocs> HashSeqToLocs;
+  auto &RSF = *SFS[0];
+  unsigned StableFunctionCount = SFS.size();
+
+  for (auto &[IndexPair, Hash] : *RSF.IndexOperandHashMap) {
+    // Const hash sequence across stable functions.
+    // We will allocate a parameter per unique hash squence.
+    // can't use SmallVector as key
+    std::vector<OpndHash> ConstHashSeq;
+    ConstHashSeq.push_back(Hash);
+    bool Identical = true;
+    for (unsigned J = 1; J < StableFunctionCount; ++J) {
+      auto &SF = SFS[J];
+      assert(SF->IndexOperandHashMap->count(IndexPair));
+      auto SHash = (*SF->IndexOperandHashMap)[IndexPair];
+      if (Hash != SHash)
+        Identical = false;
+      ConstHashSeq.push_back(SHash);
+    }
+
+    // we've already minimized the Const hash sequence.
+    (void)(Identical);
+    assert(!Identical &&
+           "Function Map has not been finalized or minimized before");
+
+    // For each unique Const hash sequence (parameter), add the locations.
+    HashSeqToLocs[ConstHashSeq].push_back(IndexPair);
+  }
+
+  ParamLocsVecTy ParamLocsVec;
+  for (auto &[HashSeq, Locs] : HashSeqToLocs) {
+    ParamLocsVec.push_back(std::move(Locs));
+    std::sort(
+        ParamLocsVec.begin(), ParamLocsVec.end(),
+        [&](const ParamLocs &L, const ParamLocs &R) { return L[0] < R[0]; });
+  }
+  return ParamLocsVec;
+}
+
+bool GlobalMergeFunc::merge(Module &M, const StableFunctionMap *FunctionMap) {
+  bool Changed = false;
+
+  // Build a map from stable function name to function.
+  StringMap<Function *> StableNameToFuncMap;
+  for (auto &F : M)
+    StableNameToFuncMap[get_stable_name(F.getName())] = &F;
+  // Track merged functions
+  DenseSet<Function *> MergedFunctions;
+
+  auto ModId = M.getModuleIdentifier();
+  for (auto &[Hash, SFS] : FunctionMap->getFunctionMap()) {
+    assert(SFS.size() >= 2);
+    auto ParamLocsVec = computeParamInfo(SFS);
+    LLVM_DEBUG({
+      dbgs() << "[GlobalMergeFunc] Merging hash: " << Hash << " with Params "
+             << ParamLocsVec.size() << "\n";
+    });
+
+    Function *MergedFunc = nullptr;
+    std::string MergedModId;
+    SmallVector<FuncMergeInfo> FuncMergeInfos;
+    for (auto &SF : SFS) {
+      // Get the function from the stable name.
+      auto I = StableNameToFuncMap.find(
+          *FunctionMap->getNameForId(SF->FunctionNameId));
+      if (I == StableNameToFuncMap.end())
+        continue;
+      Function *F = I->second;
+      assert(F);
+      // Skip if the function has been merged before.
+      if (MergedFunctions.count(F))
+        continue;
+      // Consider the function if it is eligible for merging.
+      if (!isEligibleFunction(F))
+        continue;
+
+      auto FI = llvm::StructuralHashWithDifferences(*F, ignoreOp);
+      uint64_t FuncHash = FI.FunctionHash;
+      if (Hash != FuncHash) {
+        ++NumMismatchedFunctionHashGlobalMergeFunction;
+        continue;
+      }
+
+      if (SF->InstCount != FI.IndexInstruction->size()) {
+        ++NumMismatchedInstCountGlobalMergeFunction;
+        continue;
+      }
+      bool HasValidSharedConst = true;
+      for (auto &[Index, Hash] : *SF->IndexOperandHashMap) {
+        auto [InstIndex, OpndIndex] = Index;
+        assert(InstIndex < FI.IndexInstruction->size());
+        auto *Inst = FI.IndexInstruction->lookup(InstIndex);
+        if (!isEligibleOperandForConstantSharing(Inst, OpndIndex)) {
+          HasValidSharedConst = false;
+          break;
+        }
+      }
+      if (!HasValidSharedConst) {
+        ++NumMismatchedConstHashGlobalMergeFunction;
+        continue;
+      }
+      if (!checkConstHashCompatible(*SF->IndexOperandHashMap,
+                                    *FI.IndexOperandHashMap)) {
+        ++NumMismatchedConstHashGlobalMergeFunction;
+        continue;
+      }
+      if (!checkConstLocationCompatible(*SF, *FI.IndexInstruction,
+                                        ParamLocsVec)) {
+        ++NumMismatchedConstHashGlobalMergeFunction;
+        continue;
+      }
+
+      if (MergedFunc) {
+        // Check if the matched functions fall into the same (first) module.
+        // This module check is not strictly necessary as the functions can move
+        // around. We just want to avoid merging functions from different
+        // modules than the first one in the functon map, as they may not end up
+        // with not being ICFed.
+        if (MergedModId != *FunctionMap->getNameForId(SF->ModuleNameId)) {
+          ++NumMismatchedModuleIdGlobalMergeFunction;
+          continue;
+        }
+      } else {
+        MergedFunc = F;
+        MergedModId = *FunctionMap->getNameForId(SF->ModuleNameId);
+      }
+
+      FuncMergeInfos.push_back({SF.get(), F, std::move(FI.IndexInstruction)});
+      MergedFunctions.insert(F);
+    }
+    unsigned FuncMergeInfoSize = FuncMergeInfos.size();
+    if (FuncMergeInfoSize == 0)
+      continue;
+
+    LLVM_DEBUG({
+      dbgs() << "[GlobalMergeFunc] Merging function count " << FuncMergeInfoSize
+             << " in  " << ModId << "\n";
+    });
+    for (auto &FMI : FuncMergeInfos) {
+      Changed = true;
+
+      // We've already validated all locations of constant operands pointed by
+      // the parameters. Just use the first one to bookkeep the original
+      // constants for each parameter
+      SmallVector<Constant *> Params;
+      SmallVector<Type *> ParamTypes;
+      for (auto &ParamLocs : ParamLocsVec) {
+        assert(!ParamLocs.empty());
+        auto &[InstIndex, OpndIndex] = ParamLocs[0];
+        auto *Inst = FMI.IndexInstruction->lookup(InstIndex);
+        auto *Opnd = cast<Constant>(Inst->getOperand(OpndIndex));
+        Params.push_back(Opnd);
+        ParamTypes.push_back(Opnd->getType());
+      }
+
+      // Create a merged function derived from the first function in the current
+      // module context.
+      Function *MergedFunc =
+          createMergedFunction(FMI, ParamTypes, ParamLocsVec);
+
+      LLVM_DEBUG({
+        dbgs() << "[GlobalMergeFunc] Merged function (hash:" << FMI.SF->Hash
+               << ") " << MergedFunc->getName() << " generated from "
+               << FMI.F->getName() << ":\n";
+        MergedFunc->dump();
+      });
+
+      // Create a thunk to the merged function.
+      createThunk(FMI, Params, MergedFunc);
+      LLVM_DEBUG({
+        dbgs() << "[GlobalMergeFunc] Thunk generated: \n";
+        FMI.F->dump();
+      });
+      ++NumGlobalMergeFunctions;
+    }
+  }
+
+  return Changed;
+}
+
+char GlobalMergeFunc::ID = 0;
+INITIALIZE_PASS_BEGIN(GlobalMergeFunc, "global-merge-func",
+                      "Global merge function pass", false, false)
+INITIALIZE_PASS_END(GlobalMergeFunc, "global-merge-func",
+                    "Global merge function pass", false, false)
+
+GlobalMergeFunc::GlobalMergeFunc() : ModulePass(ID) {
+  initializeGlobalMergeFuncPass(*llvm::PassRegistry::getPassRegistry());
+}
+
+namespace llvm {
+Pass *createGlobalMergeFuncPass() { return new GlobalMergeFunc(); }
+} // namespace llvm
+
+void GlobalMergeFunc::initializeMergerMode(const Module &M) {
+  // When codegen data write is enabled, we want to write the local outlined
+  // hash tree to the custom section, `__llvm_outline`.
+  // When the outlined hash tree is available from the previous codegen data,
+  // we want to read it to optimistically create global outlining candidates.
+  LocalFunctionMap = std::make_unique<StableFunctionMap>();
+  if (cgdata::emitCGData())
+    MergerMode = HashFunctionMode::BuildingHashFuncion;
+  else if (cgdata::hasStableFunctionMap())
+    MergerMode = HashFunctionMode::UsingHashFunction;
+}
+
+void GlobalMergeFunc::emitFunctionMap(Module &M) {
+  if (!LocalFunctionMap->empty()) {
+    LLVM_DEBUG({
+      dbgs() << "Emit function map. Size: " << LocalFunctionMap->size() << "\n";
+    });
+    SmallVector<char> Buf;
+    raw_svector_ostream OS(Buf);
+
+    StableFunctionMapRecord SFR(std::move(LocalFunctionMap));
+    SFR.serialize(OS);
+
+    llvm::StringRef Data(Buf.data(), Buf.size());
+    std::unique_ptr<MemoryBuffer> Buffer = MemoryBuffer::getMemBuffer(
+        Data, "in-memory stable function map", false);
+
+    Triple TT(M.getTargetTriple());
+    embedBufferInModule(
+        M, *Buffer.get(),
+        getCodeGenDataSectionName(CG_merge, TT.getObjectFormat()));
+  }
+}
+
+bool GlobalMergeFunc::runOnModule(Module &M) {
+  bool Changed = false;
+  initializeMergerMode(M);
+
+  switch (MergerMode) {
+  case HashFunctionMode::BuildingHashFuncion: {
+    analyze(M);
+    emitFunctionMap(M);
+    break;
+  }
+  case HashFunctionMode::UsingHashFunction: {
+    // Merge with the global function map read from CG data.
+    Changed = merge(M, cgdata::getStableFunctionMap());
+    break;
+  }
+  default: {
+    // Local merge case (within a module) in one-pass without explicitly using
+    // CGData.
+    analyze(M);
+    LocalFunctionMap->finalize();
+    Changed = merge(M, LocalFunctionMap.get());
+  }
+  }
+
+  return Changed;
+}
diff --git a/llvm/tools/llvm-cgdata/llvm-cgdata.cpp b/llvm/tools/llvm-cgdata/llvm-cgdata.cpp
index 18de1f6b14552a..0931cad4bcb7ed 100644
--- a/llvm/tools/llvm-cgdata/llvm-cgdata.cpp
+++ b/llvm/tools/llvm-cgdata/llvm-cgdata.cpp
@@ -214,6 +214,8 @@ static int merge_main(int argc, const char *argv[]) {
   if (!Result)
     exitWithError("failed to merge codegen data files.");
 
+  GlobalFunctionMapRecord.finalize();
+
   CodeGenDataWriter Writer;
   if (!GlobalOutlineRecord.empty())
     Writer.addRecord(GlobalOutlineRecord);

>From e5ace714e5424afd855c678344bce0aca13ba2ac Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Sat, 21 Sep 2024 00:57:45 -0700
Subject: [PATCH 09/10] allow not finish

---
 lld/test/MachO/cgdata-generate.s                 |  6 +++---
 llvm/lib/CGData/StableFunctionMap.cpp            |  3 +++
 llvm/lib/Transforms/IPO/GlobalMergeFunctions.cpp | 11 +++++++++--
 3 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/lld/test/MachO/cgdata-generate.s b/lld/test/MachO/cgdata-generate.s
index 174df39d666c5d..f942ae07f64e0e 100644
--- a/lld/test/MachO/cgdata-generate.s
+++ b/lld/test/MachO/cgdata-generate.s
@@ -3,12 +3,12 @@
 
 # RUN: rm -rf %t; split-file %s %t
 
-# Synthesize raw cgdata without the header (24 byte) from the indexed cgdata.
+# Synthesize raw cgdata without the header (32 byte) from the indexed cgdata.
 # RUN: llvm-cgdata --convert --format binary %t/raw-1.cgtext -o %t/raw-1.cgdata
-# RUN: od -t x1 -j 24 -An %t/raw-1.cgdata | tr -d '\n\r\t' | sed 's/[ ][ ]*/ /g; s/^[ ]*//; s/[ ]*$//; s/[ ]/,0x/g; s/^/0x/' > %t/raw-1-bytes.txt
+# RUN: od -t x1 -j 32 -An %t/raw-1.cgdata | tr -d '\n\r\t' | sed 's/[ ][ ]*/ /g; s/^[ ]*//; s/[ ]*$//; s/[ ]/,0x/g; s/^/0x/' > %t/raw-1-bytes.txt
 # RUN: sed "s/<RAW_BYTES>/$(cat %t/raw-1-bytes.txt)/g" %t/merge-template.s > %t/merge-1.s
 # RUN: llvm-cgdata --convert --format binary %t/raw-2.cgtext -o %t/raw-2.cgdata
-# RUN: od -t x1 -j 24 -An %t/raw-2.cgdata | tr -d '\n\r\t' | sed 's/[ ][ ]*/ /g; s/^[ ]*//; s/[ ]*$//; s/[ ]/,0x/g; s/^/0x/' > %t/raw-2-bytes.txt
+# RUN: od -t x1 -j 32 -An %t/raw-2.cgdata | tr -d '\n\r\t' | sed 's/[ ][ ]*/ /g; s/^[ ]*//; s/[ ]*$//; s/[ ]/,0x/g; s/^/0x/' > %t/raw-2-bytes.txt
 # RUN: sed "s/<RAW_BYTES>/$(cat %t/raw-2-bytes.txt)/g" %t/merge-template.s > %t/merge-2.s
 
 # RUN: llvm-mc -filetype obj -triple arm64-apple-darwin %t/merge-1.s -o %t/merge-1.o
diff --git a/llvm/lib/CGData/StableFunctionMap.cpp b/llvm/lib/CGData/StableFunctionMap.cpp
index b4e188e87b5484..1e072700e179ef 100644
--- a/llvm/lib/CGData/StableFunctionMap.cpp
+++ b/llvm/lib/CGData/StableFunctionMap.cpp
@@ -118,6 +118,9 @@ static void removeIdenticalIndexPair(
 }
 
 bool StableFunctionMap::finalize() {
+  // TODO: Add an option for finalization.
+  return false;
+
   bool Changed = false;
 
   for (auto It = HashToFuncs.begin(); It != HashToFuncs.end(); ++It) {
diff --git a/llvm/lib/Transforms/IPO/GlobalMergeFunctions.cpp b/llvm/lib/Transforms/IPO/GlobalMergeFunctions.cpp
index 5df380868b23ab..7ae78f290f411c 100644
--- a/llvm/lib/Transforms/IPO/GlobalMergeFunctions.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalMergeFunctions.cpp
@@ -487,10 +487,14 @@ computeParamInfo(const SmallVector<std::unique_ptr<StableFunctionEntry>> &SFS) {
       ConstHashSeq.push_back(SHash);
     }
 
-    // we've already minimized the Const hash sequence.
+// we've already minimized the Const hash sequence.
+#if 0
     (void)(Identical);
     assert(!Identical &&
            "Function Map has not been finalized or minimized before");
+#endif
+    if (Identical)
+      continue;
 
     // For each unique Const hash sequence (parameter), add the locations.
     HashSeqToLocs[ConstHashSeq].push_back(IndexPair);
@@ -518,7 +522,10 @@ bool GlobalMergeFunc::merge(Module &M, const StableFunctionMap *FunctionMap) {
 
   auto ModId = M.getModuleIdentifier();
   for (auto &[Hash, SFS] : FunctionMap->getFunctionMap()) {
-    assert(SFS.size() >= 2);
+    // assert(SFS.size() >= 2);
+    // No finalized merge info might have a single function candidate.
+    if (SFS.size() < 2)
+      continue;
     auto ParamLocsVec = computeParamInfo(SFS);
     LLVM_DEBUG({
       dbgs() << "[GlobalMergeFunc] Merging hash: " << Hash << " with Params "

>From a65cbc21cfd7e6153cc948a8a1311f4f5c176029 Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Sat, 21 Sep 2024 07:47:27 -0700
Subject: [PATCH 10/10] fixformat

---
 llvm/include/llvm/CGData/StableFunctionMap.h       | 1 -
 llvm/include/llvm/CGData/StableFunctionMapRecord.h | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/include/llvm/CGData/StableFunctionMap.h b/llvm/include/llvm/CGData/StableFunctionMap.h
index 9bf7183daa7945..87736373149efb 100644
--- a/llvm/include/llvm/CGData/StableFunctionMap.h
+++ b/llvm/include/llvm/CGData/StableFunctionMap.h
@@ -26,7 +26,6 @@
 
 namespace llvm {
 
-
 using IndexPairHash = std::pair<IndexPair, stable_hash>;
 using IndexOperandHashVecType = SmallVector<IndexPairHash>;
 
diff --git a/llvm/include/llvm/CGData/StableFunctionMapRecord.h b/llvm/include/llvm/CGData/StableFunctionMapRecord.h
index 0a38e2df736596..94a6154cad336c 100644
--- a/llvm/include/llvm/CGData/StableFunctionMapRecord.h
+++ b/llvm/include/llvm/CGData/StableFunctionMapRecord.h
@@ -27,7 +27,7 @@ struct StableFunctionMapRecord {
     FunctionMap = std::make_unique<StableFunctionMap>();
   }
   StableFunctionMapRecord(std::unique_ptr<StableFunctionMap> FunctionMap)
-      : FunctionMap(std::move(FunctionMap)){};
+      : FunctionMap(std::move(FunctionMap)) {}
 
   /// Serialize the stable function map to a raw_ostream.
   void serialize(raw_ostream &OS) const;