[clang] [llvm] Thin9 (PR #110483)

Kyungwoo Lee via cfe-commits cfe-commits at lists.llvm.org
Mon Sep 30 03:28:58 PDT 2024


https://github.com/kyulee-com created https://github.com/llvm/llvm-project/pull/110483

None

>From c1a0219457a3c162d7fa6b9d70750ba7a040d9f2 Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Fri, 26 Apr 2024 20:02:52 -0700
Subject: [PATCH 1/7] [ThinLTO][NFC] Prep for two-codegen rounds

---
 clang/lib/CodeGen/BackendUtil.cpp  |  8 ++--
 llvm/include/llvm/LTO/LTOBackend.h |  1 +
 llvm/lib/LTO/LTO.cpp               | 75 ++++++++++++++++--------------
 llvm/lib/LTO/LTOBackend.cpp        |  6 ++-
 4 files changed, 49 insertions(+), 41 deletions(-)

diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index 7fa69420298160..a1909d45b4d944 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -1286,10 +1286,10 @@ static void runThinLTOBackend(
     Conf.CGFileType = getCodeGenFileType(Action);
     break;
   }
-  if (Error E =
-          thinBackend(Conf, -1, AddStream, *M, *CombinedIndex, ImportList,
-                      ModuleToDefinedGVSummaries[M->getModuleIdentifier()],
-                      /* ModuleMap */ nullptr, CGOpts.CmdArgs)) {
+  if (Error E = thinBackend(
+          Conf, -1, AddStream, *M, *CombinedIndex, ImportList,
+          ModuleToDefinedGVSummaries[M->getModuleIdentifier()],
+          /* ModuleMap */ nullptr, Conf.CodeGenOnly, CGOpts.CmdArgs)) {
     handleAllErrors(std::move(E), [&](ErrorInfoBase &EIB) {
       errs() << "Error running ThinLTO backend: " << EIB.message() << '\n';
     });
diff --git a/llvm/include/llvm/LTO/LTOBackend.h b/llvm/include/llvm/LTO/LTOBackend.h
index de89f4bb10dff2..8516398510d4b8 100644
--- a/llvm/include/llvm/LTO/LTOBackend.h
+++ b/llvm/include/llvm/LTO/LTOBackend.h
@@ -56,6 +56,7 @@ Error thinBackend(const Config &C, unsigned Task, AddStreamFn AddStream,
                   const FunctionImporter::ImportMapTy &ImportList,
                   const GVSummaryMapTy &DefinedGlobals,
                   MapVector<StringRef, BitcodeModule> *ModuleMap,
+                  bool CodeGenOnly,
                   const std::vector<uint8_t> &CmdArgs = std::vector<uint8_t>());
 
 Error finalizeOptimizationRemarks(
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index a88124dacfaefd..f4c25f80811a85 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1473,7 +1473,8 @@ class InProcessThinBackend : public ThinBackendProc {
         return MOrErr.takeError();
 
       return thinBackend(Conf, Task, AddStream, **MOrErr, CombinedIndex,
-                         ImportList, DefinedGlobals, &ModuleMap);
+                         ImportList, DefinedGlobals, &ModuleMap,
+                         Conf.CodeGenOnly);
     };
 
     auto ModuleID = BM.getModuleIdentifier();
@@ -1839,45 +1840,49 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache,
 
   TimeTraceScopeExit.release();
 
-  std::unique_ptr<ThinBackendProc> BackendProc =
-      ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries,
-                      AddStream, Cache);
-
   auto &ModuleMap =
       ThinLTO.ModulesToCompile ? *ThinLTO.ModulesToCompile : ThinLTO.ModuleMap;
 
-  auto ProcessOneModule = [&](int I) -> Error {
-    auto &Mod = *(ModuleMap.begin() + I);
-    // Tasks 0 through ParallelCodeGenParallelismLevel-1 are reserved for
-    // combined module and parallel code generation partitions.
-    return BackendProc->start(RegularLTO.ParallelCodeGenParallelismLevel + I,
-                              Mod.second, ImportLists[Mod.first],
-                              ExportLists[Mod.first], ResolvedODR[Mod.first],
-                              ThinLTO.ModuleMap);
+  auto RunBackends = [&](ThinBackendProc *BackendProcess) -> Error {
+    auto ProcessOneModule = [&](int I) -> Error {
+      auto &Mod = *(ModuleMap.begin() + I);
+      // Tasks 0 through ParallelCodeGenParallelismLevel-1 are reserved for
+      // combined module and parallel code generation partitions.
+      return BackendProcess->start(
+          RegularLTO.ParallelCodeGenParallelismLevel + I, Mod.second,
+          ImportLists[Mod.first], ExportLists[Mod.first],
+          ResolvedODR[Mod.first], ThinLTO.ModuleMap);
+    };
+
+    if (BackendProcess->getThreadCount() == 1) {
+      // Process the modules in the order they were provided on the
+      // command-line. It is important for this codepath to be used for
+      // WriteIndexesThinBackend, to ensure the emitted LinkedObjectsFile lists
+      // ThinLTO objects in the same order as the inputs, which otherwise would
+      // affect the final link order.
+      for (int I = 0, E = ModuleMap.size(); I != E; ++I)
+        if (Error E = ProcessOneModule(I))
+          return E;
+    } else {
+      // When executing in parallel, process largest bitsize modules first to
+      // improve parallelism, and avoid starving the thread pool near the end.
+      // This saves about 15 sec on a 36-core machine while linking `clang.exe`
+      // (out of 100 sec).
+      std::vector<BitcodeModule *> ModulesVec;
+      ModulesVec.reserve(ModuleMap.size());
+      for (auto &Mod : ModuleMap)
+        ModulesVec.push_back(&Mod.second);
+      for (int I : generateModulesOrdering(ModulesVec))
+        if (Error E = ProcessOneModule(I))
+          return E;
+    }
+    return BackendProcess->wait();
   };
 
-  if (BackendProc->getThreadCount() == 1) {
-    // Process the modules in the order they were provided on the command-line.
-    // It is important for this codepath to be used for WriteIndexesThinBackend,
-    // to ensure the emitted LinkedObjectsFile lists ThinLTO objects in the same
-    // order as the inputs, which otherwise would affect the final link order.
-    for (int I = 0, E = ModuleMap.size(); I != E; ++I)
-      if (Error E = ProcessOneModule(I))
-        return E;
-  } else {
-    // When executing in parallel, process largest bitsize modules first to
-    // improve parallelism, and avoid starving the thread pool near the end.
-    // This saves about 15 sec on a 36-core machine while link `clang.exe` (out
-    // of 100 sec).
-    std::vector<BitcodeModule *> ModulesVec;
-    ModulesVec.reserve(ModuleMap.size());
-    for (auto &Mod : ModuleMap)
-      ModulesVec.push_back(&Mod.second);
-    for (int I : generateModulesOrdering(ModulesVec))
-      if (Error E = ProcessOneModule(I))
-        return E;
-  }
-  return BackendProc->wait();
+  std::unique_ptr<ThinBackendProc> BackendProc =
+      ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries,
+                      AddStream, Cache);
+  return RunBackends(BackendProc.get());
 }
 
 Expected<std::unique_ptr<ToolOutputFile>> lto::setupLLVMOptimizationRemarks(
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index 4e58cd369c3ac9..880567989baffb 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -565,7 +565,7 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
                        const FunctionImporter::ImportMapTy &ImportList,
                        const GVSummaryMapTy &DefinedGlobals,
                        MapVector<StringRef, BitcodeModule> *ModuleMap,
-                       const std::vector<uint8_t> &CmdArgs) {
+                       bool CodeGenOnly, const std::vector<uint8_t> &CmdArgs) {
   Expected<const Target *> TOrErr = initAndLookupTarget(Conf, Mod);
   if (!TOrErr)
     return TOrErr.takeError();
@@ -586,7 +586,9 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
   Mod.setPartialSampleProfileRatio(CombinedIndex);
 
   LLVM_DEBUG(dbgs() << "Running ThinLTO\n");
-  if (Conf.CodeGenOnly) {
+  if (CodeGenOnly) {
+    // If CodeGenOnly is set, we only perform code generation and skip
+    // optimization.
     codegen(Conf, TM.get(), AddStream, Task, Mod, CombinedIndex);
     return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
   }

>From a566ab01485da1446431f449bee88ab0b8d558f1 Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Fri, 13 Sep 2024 08:51:00 -0700
Subject: [PATCH 2/7] [CGData][ThinLTO] Global Outlining with Two-CodeGen
 Rounds

---
 llvm/include/llvm/CGData/CodeGenData.h        |  16 +++
 llvm/lib/CGData/CodeGenData.cpp               |  81 +++++++++++++-
 llvm/lib/LTO/CMakeLists.txt                   |   1 +
 llvm/lib/LTO/LTO.cpp                          | 103 +++++++++++++++++-
 llvm/lib/LTO/LTOBackend.cpp                   |  11 ++
 .../test/ThinLTO/AArch64/cgdata-two-rounds.ll |  94 ++++++++++++++++
 llvm/test/ThinLTO/AArch64/lit.local.cfg       |   2 +
 7 files changed, 302 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll
 create mode 100644 llvm/test/ThinLTO/AArch64/lit.local.cfg

diff --git a/llvm/include/llvm/CGData/CodeGenData.h b/llvm/include/llvm/CGData/CodeGenData.h
index 84133a433170fe..1e1afe99327650 100644
--- a/llvm/include/llvm/CGData/CodeGenData.h
+++ b/llvm/include/llvm/CGData/CodeGenData.h
@@ -164,6 +164,22 @@ publishOutlinedHashTree(std::unique_ptr<OutlinedHashTree> HashTree) {
   CodeGenData::getInstance().publishOutlinedHashTree(std::move(HashTree));
 }
 
+/// Initialize the two-codegen rounds.
+void initializeTwoCodegenRounds();
+
+/// Save the current module before the first codegen round.
+void saveModuleForTwoRounds(const Module &TheModule, unsigned Task);
+
+/// Load the current module before the second codegen round.
+std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule,
+                                               unsigned Task,
+                                               LLVMContext &Context);
+
+/// Merge the codegen data from the input files in the scratch vector during
+/// ThinLTO two-codegen rounds.
+Error mergeCodeGenData(
+    const std::unique_ptr<std::vector<llvm::SmallString<0>>> InputFiles);
+
 void warn(Error E, StringRef Whence = "");
 void warn(Twine Message, std::string Whence = "", std::string Hint = "");
 
diff --git a/llvm/lib/CGData/CodeGenData.cpp b/llvm/lib/CGData/CodeGenData.cpp
index 55d2504231c744..ff8e5dd7c75790 100644
--- a/llvm/lib/CGData/CodeGenData.cpp
+++ b/llvm/lib/CGData/CodeGenData.cpp
@@ -17,6 +17,7 @@
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Path.h"
 #include "llvm/Support/WithColor.h"
 
 #define DEBUG_TYPE "cg-data"
@@ -30,6 +31,14 @@ cl::opt<bool>
 cl::opt<std::string>
     CodeGenDataUsePath("codegen-data-use-path", cl::init(""), cl::Hidden,
                        cl::desc("File path to where .cgdata file is read"));
+cl::opt<bool> CodeGenDataThinLTOTwoRounds(
+    "codegen-data-thinlto-two-rounds", cl::init(false), cl::Hidden,
+    cl::desc("Enable two-round ThinLTO code generation. The first round "
+             "emits codegen data, while the second round uses the emitted "
+             "codegen data for further optimizations."));
+
+// Path to where the optimized bitcodes are saved and restored for ThinLTO.
+static SmallString<128> CodeGenDataThinLTOTwoRoundsPath;
 
 static std::string getCGDataErrString(cgdata_error Err,
                                       const std::string &ErrMsg = "") {
@@ -139,7 +148,7 @@ CodeGenData &CodeGenData::getInstance() {
   std::call_once(CodeGenData::OnceFlag, []() {
     Instance = std::unique_ptr<CodeGenData>(new CodeGenData());
 
-    if (CodeGenDataGenerate)
+    if (CodeGenDataGenerate || CodeGenDataThinLTOTwoRounds)
       Instance->EmitCGData = true;
     else if (!CodeGenDataUsePath.empty()) {
       // Initialize the global CGData if the input file name is given.
@@ -215,6 +224,76 @@ void warn(Error E, StringRef Whence) {
   }
 }
 
+static std::string getPath(StringRef Dir, unsigned Task) {
+  return (Dir + "/" + llvm::Twine(Task) + ".saved_copy.bc").str();
+}
+
+void initializeTwoCodegenRounds() {
+  assert(CodeGenDataThinLTOTwoRounds);
+  if (auto EC = llvm::sys::fs::createUniqueDirectory(
+          "cgdata", CodeGenDataThinLTOTwoRoundsPath))
+    report_fatal_error(Twine("Failed to create directory: ") + EC.message());
+}
+
+void saveModuleForTwoRounds(const Module &TheModule, unsigned Task) {
+  assert(sys::fs::is_directory(CodeGenDataThinLTOTwoRoundsPath));
+  std::string Path = getPath(CodeGenDataThinLTOTwoRoundsPath, Task);
+  std::error_code EC;
+  raw_fd_ostream OS(Path, EC, sys::fs::OpenFlags::OF_None);
+  if (EC)
+    report_fatal_error(Twine("Failed to open ") + Path +
+                       " to save optimized bitcode: " + EC.message());
+  WriteBitcodeToFile(TheModule, OS, /* ShouldPreserveUseListOrder */ true);
+}
+
+std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule,
+                                               unsigned Task,
+                                               LLVMContext &Context) {
+  assert(sys::fs::is_directory(CodeGenDataThinLTOTwoRoundsPath));
+  std::string Path = getPath(CodeGenDataThinLTOTwoRoundsPath, Task);
+  auto FileOrError = MemoryBuffer::getFile(Path);
+  if (auto EC = FileOrError.getError())
+    report_fatal_error(Twine("Failed to open ") + Path +
+                       " to load optimized bitcode: " + EC.message());
+
+  std::unique_ptr<MemoryBuffer> FileBuffer = std::move(*FileOrError);
+  auto RestoredModule = llvm::parseBitcodeFile(*FileBuffer, Context);
+  if (!RestoredModule)
+    report_fatal_error(Twine("Failed to parse optimized bitcode loaded from ") +
+                       Path + "\n");
+
+  // Restore the original module identifier.
+  (*RestoredModule)->setModuleIdentifier(OrigModule.getModuleIdentifier());
+  return std::move(*RestoredModule);
+}
+
+Error mergeCodeGenData(
+    const std::unique_ptr<std::vector<llvm::SmallString<0>>> InputFiles) {
+
+  OutlinedHashTreeRecord GlobalOutlineRecord;
+  for (auto &InputFile : *(InputFiles)) {
+    if (InputFile.empty())
+      continue;
+    StringRef File = StringRef(InputFile.data(), InputFile.size());
+    std::unique_ptr<MemoryBuffer> Buffer = MemoryBuffer::getMemBuffer(
+        File, "in-memory object file", /*RequiresNullTerminator=*/false);
+    Expected<std::unique_ptr<object::ObjectFile>> BinOrErr =
+        object::ObjectFile::createObjectFile(Buffer->getMemBufferRef());
+    if (!BinOrErr)
+      return BinOrErr.takeError();
+
+    std::unique_ptr<object::ObjectFile> &Obj = BinOrErr.get();
+    if (auto E = CodeGenDataReader::mergeFromObjectFile(Obj.get(),
+                                                        GlobalOutlineRecord))
+      return E;
+  }
+
+  if (!GlobalOutlineRecord.empty())
+    cgdata::publishOutlinedHashTree(std::move(GlobalOutlineRecord.HashTree));
+
+  return Error::success();
+}
+
 } // end namespace cgdata
 
 } // end namespace llvm
diff --git a/llvm/lib/LTO/CMakeLists.txt b/llvm/lib/LTO/CMakeLists.txt
index 69ff08e1f374c4..057d73b6349cf1 100644
--- a/llvm/lib/LTO/CMakeLists.txt
+++ b/llvm/lib/LTO/CMakeLists.txt
@@ -21,6 +21,7 @@ add_llvm_component_library(LLVMLTO
   BinaryFormat
   BitReader
   BitWriter
+  CGData
   CodeGen
   CodeGenTypes
   Core
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index f4c25f80811a85..945f8c859365ea 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -21,6 +21,7 @@
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Bitcode/BitcodeReader.h"
 #include "llvm/Bitcode/BitcodeWriter.h"
+#include "llvm/CGData/CodeGenData.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/Config/llvm-config.h"
 #include "llvm/IR/AutoUpgrade.h"
@@ -70,6 +71,8 @@ static cl::opt<bool>
     DumpThinCGSCCs("dump-thin-cg-sccs", cl::init(false), cl::Hidden,
                    cl::desc("Dump the SCCs in the ThinLTO index's callgraph"));
 
+extern cl::opt<bool> CodeGenDataThinLTOTwoRounds;
+
 namespace llvm {
 /// Enable global value internalization in LTO.
 cl::opt<bool> EnableLTOInternalization(
@@ -1458,7 +1461,7 @@ class InProcessThinBackend : public ThinBackendProc {
           GlobalValue::getGUID(GlobalValue::dropLLVMManglingEscape(Name)));
   }
 
-  Error runThinLTOBackendThread(
+  virtual Error runThinLTOBackendThread(
       AddStreamFn AddStream, FileCache Cache, unsigned Task, BitcodeModule BM,
       ModuleSummaryIndex &CombinedIndex,
       const FunctionImporter::ImportMapTy &ImportList,
@@ -1559,6 +1562,60 @@ class InProcessThinBackend : public ThinBackendProc {
     return BackendThreadPool.getMaxConcurrency();
   }
 };
+
+/// This Backend will run ThinBackend process but throw away all the output from
+/// the codegen. This class facilitates the first codegen round.
+class NoOutputThinBackend : public InProcessThinBackend {
+public:
+  NoOutputThinBackend(
+      const Config &Conf, ModuleSummaryIndex &CombinedIndex,
+      ThreadPoolStrategy ThinLTOParallelism,
+      const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
+      std::unique_ptr<std::vector<llvm::SmallString<0>>> Scratch)
+      : InProcessThinBackend(
+            Conf, CombinedIndex, ThinLTOParallelism, ModuleToDefinedGVSummaries,
+            // Allocate a scratch buffer for each task to write output to.
+            [Allocation = &*Scratch](unsigned Task, const Twine &ModuleName) {
+              return std::make_unique<CachedFileStream>(
+                  std::make_unique<raw_svector_ostream>((*Allocation)[Task]));
+            },
+            FileCache(), nullptr, false, false),
+        Scratch(std::move(Scratch)) {}
+
+  /// Scratch space for writing output during the codegen.
+  std::unique_ptr<std::vector<llvm::SmallString<0>>> Scratch;
+};
+
+/// This Backend performs codegen on bitcode that was previously saved after
+/// going through optimization. This class facilitates the second codegen round.
+class OptimizedBitcodeThinBackend : public InProcessThinBackend {
+public:
+  OptimizedBitcodeThinBackend(
+      const Config &Conf, ModuleSummaryIndex &CombinedIndex,
+      ThreadPoolStrategy ThinLTOParallelism,
+      const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
+      AddStreamFn AddStream)
+      : InProcessThinBackend(Conf, CombinedIndex, ThinLTOParallelism,
+                             ModuleToDefinedGVSummaries, AddStream, FileCache(),
+                             nullptr, false, false) {}
+
+  virtual Error runThinLTOBackendThread(
+      AddStreamFn AddStream, FileCache Cache, unsigned Task, BitcodeModule BM,
+      ModuleSummaryIndex &CombinedIndex,
+      const FunctionImporter::ImportMapTy &ImportList,
+      const FunctionImporter::ExportSetTy &ExportList,
+      const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
+      const GVSummaryMapTy &DefinedGlobals,
+      MapVector<StringRef, BitcodeModule> &ModuleMap) override {
+    LTOLLVMContext BackendContext(Conf);
+    std::unique_ptr<Module> LoadedModule =
+        cgdata::loadModuleForTwoRounds(BM, Task, BackendContext);
+
+    return thinBackend(Conf, Task, AddStream, *LoadedModule, CombinedIndex,
+                       ImportList, DefinedGlobals, &ModuleMap,
+                       /*CodeGenOnly=*/true);
+  }
+};
 } // end anonymous namespace
 
 ThinBackend lto::createInProcessThinBackend(ThreadPoolStrategy Parallelism,
@@ -1879,10 +1936,46 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache,
     return BackendProcess->wait();
   };
 
-  std::unique_ptr<ThinBackendProc> BackendProc =
-      ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries,
-                      AddStream, Cache);
-  return RunBackends(BackendProc.get());
+  if (!CodeGenDataThinLTOTwoRounds) {
+    std::unique_ptr<ThinBackendProc> BackendProc =
+        ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries,
+                        AddStream, Cache);
+    return RunBackends(BackendProc.get());
+  }
+
+  // Perform two rounds of code generation for ThinLTO:
+  // 1. First round: Run optimization and code generation with a scratch output.
+  // 2. Merge codegen data extracted from the scratch output.
+  // 3. Second round: Run code generation again using the merged data.
+  LLVM_DEBUG(dbgs() << "Running ThinLTO two-codegen rounds\n");
+
+  // Initialize a temporary path to store and retrieve optimized IRs for
+  // two-round code generation.
+  cgdata::initializeTwoCodegenRounds();
+
+  // Create a scratch output to hold intermediate results.
+  auto Outputs =
+      std::make_unique<std::vector<llvm::SmallString<0>>>(getMaxTasks());
+  auto FirstRoundLTO = std::make_unique<NoOutputThinBackend>(
+      Conf, ThinLTO.CombinedIndex, llvm::heavyweight_hardware_concurrency(),
+      ModuleToDefinedGVSummaries, std::move(Outputs));
+  // First round: Run optimization and code generation with a scratch output.
+  // Before code generation, serialize modules.
+  if (Error E = RunBackends(FirstRoundLTO.get()))
+    return E;
+
+  // Merge codegen data extracted from the scratch output.
+  if (Error E = cgdata::mergeCodeGenData(std::move(FirstRoundLTO->Scratch)))
+    return E;
+
+  // Second round: Run code generation by reading IRs.
+  std::unique_ptr<ThinBackendProc> SecondRoundLTO =
+      std::make_unique<OptimizedBitcodeThinBackend>(
+          Conf, ThinLTO.CombinedIndex, llvm::heavyweight_hardware_concurrency(),
+          ModuleToDefinedGVSummaries, AddStream);
+  Error E = RunBackends(SecondRoundLTO.get());
+
+  return E;
 }
 
 Expected<std::unique_ptr<ToolOutputFile>> lto::setupLLVMOptimizationRemarks(
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index 880567989baffb..d198e8e5102009 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -20,6 +20,7 @@
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Bitcode/BitcodeReader.h"
 #include "llvm/Bitcode/BitcodeWriter.h"
+#include "llvm/CGData/CodeGenData.h"
 #include "llvm/IR/LLVMRemarkStreamer.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/PassManager.h"
@@ -74,6 +75,8 @@ static cl::opt<bool> ThinLTOAssumeMerged(
     cl::desc("Assume the input has already undergone ThinLTO function "
              "importing and the other pre-optimization pipeline changes."));
 
+extern cl::opt<bool> CodeGenDataThinLTOTwoRounds;
+
 namespace llvm {
 extern cl::opt<bool> NoPGOWarnMismatch;
 }
@@ -599,11 +602,19 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
   auto OptimizeAndCodegen =
       [&](Module &Mod, TargetMachine *TM,
           std::unique_ptr<ToolOutputFile> DiagnosticOutputFile) {
+        // Perform optimization and code generation for ThinLTO.
         if (!opt(Conf, TM, Task, Mod, /*IsThinLTO=*/true,
                  /*ExportSummary=*/nullptr, /*ImportSummary=*/&CombinedIndex,
                  CmdArgs))
           return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
 
+        // Save the current module before the first codegen round.
+        // Note that the second codegen round runs only `codegen()` without
+        // running `opt()`. We're not reaching here as it's bailed out earlier
+        // with CodeGenOnly which has been set in `OptimizedBitcodeThinBackend`.
+        if (CodeGenDataThinLTOTwoRounds)
+          cgdata::saveModuleForTwoRounds(Mod, Task);
+
         codegen(Conf, TM, AddStream, Task, Mod, CombinedIndex);
         return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
       };
diff --git a/llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll b/llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll
new file mode 100644
index 00000000000000..0e082cf4e55e54
--- /dev/null
+++ b/llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll
@@ -0,0 +1,94 @@
+; This test verifies whether we can outline a singleton instance (i.e., an instance that does not repeat)
+; by running two codegen rounds.
+
+; RUN: split-file %s %t
+
+; Verify each outlining instance is singleton with the global outlining for thinlto.
+; They will be identical, which can be folded by the linker with ICF.
+; RUN: opt -module-summary %t/thin-one.ll -o %t/thin-one.bc
+; RUN: opt -module-summary %t/thin-two.ll -o %t/thin-two.bc
+; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc -o %t/thinlto \
+; RUN:  -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \
+; RUN:  -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \
+; RUN:  -codegen-data-thinlto-two-rounds
+
+; thin-one.ll will have one outlining instance (matched in the global outlined hash tree)
+; RUN: llvm-objdump -d %t/thinlto.1 | FileCheck %s --check-prefix=THINLTO-1
+; THINLTO-1: _OUTLINED_FUNCTION{{.*}}>:
+; THINLTO-1-NEXT:  mov
+; THINLTO-1-NEXT:  mov
+; THINLTO-1-NEXT:  b
+
+; thin-two.ll will have two outlining instances (matched in the global outlined hash tree)
+; RUN: llvm-objdump -d %t/thinlto.2 | FileCheck %s --check-prefix=THINLTO-2
+; THINLTO-2: _OUTLINED_FUNCTION{{.*}}>:
+; THINLTO-2-NEXT:  mov
+; THINLTO-2-NEXT:  mov
+; THINLTO-2-NEXT:  b
+; THINLTO-2: _OUTLINED_FUNCTION{{.*}}>:
+; THINLTO-2-NEXT:  mov
+; THINLTO-2-NEXT:  mov
+; THINLTO-2-NEXT:  b
+
+; Now add a lto module to the above thinlto modules.
+; Verify the lto module is optimized independent of the global outlining for thinlto.
+; RUN: opt %t/lto.ll -o %t/lto.bc
+; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc %t/lto.bc -o %t/out \
+; RUN:  -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \
+; RUN:  -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \
+; RUN:  -r %t/lto.bc,_f4,px -r %t/lto.bc,_f5,px -r %t/lto.bc,_f6,px -r %t/lto.bc,_g,x \
+; RUN:  -codegen-data-thinlto-two-rounds
+
+; lto.ll will have one outlining instance within the lto module itself (no global outlining).
+; RUN: llvm-objdump -d %t/out.0 | FileCheck %s --check-prefix=LTO-0
+; LTO-0: _OUTLINED_FUNCTION{{.*}}>:
+; LTO-0-NEXT:  mov
+; LTO-0-NEXT:  b
+
+; thin-one.ll will have one outlining instance (matched in the global outlined hash tree)
+; RUN: llvm-objdump -d %t/out.1 | FileCheck %s --check-prefix=THINLTO-1
+
+; thin-two.ll will have two outlining instances (matched in the global outlined hash tree)
+; RUN: llvm-objdump -d %t/out.2 | FileCheck %s --check-prefix=THINLTO-2
+
+;--- thin-one.ll
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-darwin"
+
+declare i32 @g(i32, i32, i32)
+define i32 @f3() minsize {
+  %1 = call i32 @g(i32 30, i32 1, i32 2);
+ ret i32 %1
+}
+
+;--- thin-two.ll
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-darwin"
+
+declare i32 @g(i32, i32, i32)
+define i32 @f1() minsize {
+  %1 = call i32 @g(i32 10, i32 1, i32 2);
+  ret i32 %1
+}
+define i32 @f2() minsize {
+  %1 = call i32 @g(i32 20, i32 1, i32 2);
+  ret i32 %1
+}
+
+;--- lto.ll
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-darwin"
+
+declare i32 @g(i32, i32, i32)
+define i32 @f4() minsize {
+  %1 = call i32 @g(i32 10, i32 30, i32 2);
+  ret i32 %1
+}
+define i32 @f5() minsize {
+  %1 = call i32 @g(i32 20, i32 40, i32 2);
+  ret i32 %1
+}
+define i32 @f6() minsize {
+  %1 = call i32 @g(i32 50, i32 60, i32 2);
+  ret i32 %1
+}
diff --git a/llvm/test/ThinLTO/AArch64/lit.local.cfg b/llvm/test/ThinLTO/AArch64/lit.local.cfg
new file mode 100644
index 00000000000000..10d4a0e953ed47
--- /dev/null
+++ b/llvm/test/ThinLTO/AArch64/lit.local.cfg
@@ -0,0 +1,2 @@
+if not "AArch64" in config.root.targets:
+    config.unsupported = True

>From 011d4c112bebc1a93fa31e40b2ee5ccb3b785077 Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Tue, 17 Sep 2024 18:07:49 -0700
Subject: [PATCH 3/7] Address comments from ellishg

---
 llvm/include/llvm/CGData/CodeGenData.h | 7 ++++---
 llvm/include/llvm/LTO/LTOBackend.h     | 3 ++-
 llvm/lib/CGData/CodeGenData.cpp        | 4 +++-
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/llvm/include/llvm/CGData/CodeGenData.h b/llvm/include/llvm/CGData/CodeGenData.h
index 1e1afe99327650..72b52e6e9b8fd1 100644
--- a/llvm/include/llvm/CGData/CodeGenData.h
+++ b/llvm/include/llvm/CGData/CodeGenData.h
@@ -164,13 +164,14 @@ publishOutlinedHashTree(std::unique_ptr<OutlinedHashTree> HashTree) {
   CodeGenData::getInstance().publishOutlinedHashTree(std::move(HashTree));
 }
 
-/// Initialize the two-codegen rounds.
 void initializeTwoCodegenRounds();
 
-/// Save the current module before the first codegen round.
+/// Save \p TheModule before the first codegen round.
+/// \p Task represents the partition number in the parallel code generation
+/// process.
 void saveModuleForTwoRounds(const Module &TheModule, unsigned Task);
 
-/// Load the current module before the second codegen round.
+/// Load the optimized module before the second codegen round.
 std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule,
                                                unsigned Task,
                                                LLVMContext &Context);
diff --git a/llvm/include/llvm/LTO/LTOBackend.h b/llvm/include/llvm/LTO/LTOBackend.h
index 8516398510d4b8..098c0491dfe70a 100644
--- a/llvm/include/llvm/LTO/LTOBackend.h
+++ b/llvm/include/llvm/LTO/LTOBackend.h
@@ -50,7 +50,8 @@ Error backend(const Config &C, AddStreamFn AddStream,
 /// already been mapped to memory and the corresponding BitcodeModule objects
 /// are saved in the ModuleMap. If \p ModuleMap is nullptr, module files will
 /// be mapped to memory on demand and at any given time during importing, only
-/// one source module will be kept open at the most.
+/// one source module will be kept open at the most. If \p CodeGenOnly is true,
+/// the backend will skip optimization and only perform code generation.
 Error thinBackend(const Config &C, unsigned Task, AddStreamFn AddStream,
                   Module &M, const ModuleSummaryIndex &CombinedIndex,
                   const FunctionImporter::ImportMapTy &ImportList,
diff --git a/llvm/lib/CGData/CodeGenData.cpp b/llvm/lib/CGData/CodeGenData.cpp
index ff8e5dd7c75790..58b92b7262957a 100644
--- a/llvm/lib/CGData/CodeGenData.cpp
+++ b/llvm/lib/CGData/CodeGenData.cpp
@@ -225,7 +225,9 @@ void warn(Error E, StringRef Whence) {
 }
 
 static std::string getPath(StringRef Dir, unsigned Task) {
-  return (Dir + "/" + llvm::Twine(Task) + ".saved_copy.bc").str();
+  llvm::SmallString<128> Path(Dir);
+  llvm::sys::path::append(Path, llvm::Twine(Task) + ".saved_copy.bc");
+  return std::string(Path);
 }
 
 void initializeTwoCodegenRounds() {

>From e402d60c6206c585495123dd327b2a5ab85982b4 Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Tue, 17 Sep 2024 23:37:51 -0700
Subject: [PATCH 4/7] Address comments from NuriAmari

---
 llvm/lib/CGData/CodeGenData.cpp |  4 ++--
 llvm/lib/LTO/LTO.cpp            | 33 +++++++++++++++++++++------------
 llvm/lib/LTO/LTOBackend.cpp     |  2 +-
 3 files changed, 24 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/CGData/CodeGenData.cpp b/llvm/lib/CGData/CodeGenData.cpp
index 58b92b7262957a..4e21045a67cba6 100644
--- a/llvm/lib/CGData/CodeGenData.cpp
+++ b/llvm/lib/CGData/CodeGenData.cpp
@@ -245,7 +245,7 @@ void saveModuleForTwoRounds(const Module &TheModule, unsigned Task) {
   if (EC)
     report_fatal_error(Twine("Failed to open ") + Path +
                        " to save optimized bitcode: " + EC.message());
-  WriteBitcodeToFile(TheModule, OS, /* ShouldPreserveUseListOrder */ true);
+  WriteBitcodeToFile(TheModule, OS, /*ShouldPreserveUseListOrder=*/true);
 }
 
 std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule,
@@ -259,7 +259,7 @@ std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule,
                        " to load optimized bitcode: " + EC.message());
 
   std::unique_ptr<MemoryBuffer> FileBuffer = std::move(*FileOrError);
-  auto RestoredModule = llvm::parseBitcodeFile(*FileBuffer, Context);
+  auto RestoredModule = parseBitcodeFile(*FileBuffer, Context);
   if (!RestoredModule)
     report_fatal_error(Twine("Failed to parse optimized bitcode loaded from ") +
                        Path + "\n");
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 945f8c859365ea..b51b908fb28760 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1563,11 +1563,14 @@ class InProcessThinBackend : public ThinBackendProc {
   }
 };
 
-/// This Backend will run ThinBackend process but throw away all the output from
-/// the codegen. This class facilitates the first codegen round.
-class NoOutputThinBackend : public InProcessThinBackend {
+/// This backend is utilized in the first round of a two-codegen round process.
+/// It first saves optimized bitcode files to disk before the codegen process
+/// begins. After codegen, it stores the resulting object files in a scratch
+/// buffer. Note the codegen data stored in the scratch buffer will be extracted
+/// and merged in the subsequent step.
+class FirstRoundThinBackend : public InProcessThinBackend {
 public:
-  NoOutputThinBackend(
+  FirstRoundThinBackend(
       const Config &Conf, ModuleSummaryIndex &CombinedIndex,
       ThreadPoolStrategy ThinLTOParallelism,
       const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
@@ -1579,25 +1582,31 @@ class NoOutputThinBackend : public InProcessThinBackend {
               return std::make_unique<CachedFileStream>(
                   std::make_unique<raw_svector_ostream>((*Allocation)[Task]));
             },
-            FileCache(), nullptr, false, false),
+            FileCache(), /*OnWrite=*/nullptr, /*ShouldEmitIndexFiles=*/false,
+            /*ShouldEmitImportsFiles=*/false),
         Scratch(std::move(Scratch)) {}
 
   /// Scratch space for writing output during the codegen.
   std::unique_ptr<std::vector<llvm::SmallString<0>>> Scratch;
 };
 
-/// This Backend performs codegen on bitcode that was previously saved after
-/// going through optimization. This class facilitates the second codegen round.
-class OptimizedBitcodeThinBackend : public InProcessThinBackend {
+/// This backend operates in the second round of a two-codegen round process.
+/// It starts by reading the optimized bitcode files that were saved during the
+/// first round. The backend then executes the codegen only to further optimize
+/// the code, utilizing the codegen data merged from the first round. Finally,
+/// it writes the resulting object files as usual.
+class SecondRoundThinBackend : public InProcessThinBackend {
 public:
-  OptimizedBitcodeThinBackend(
+  SecondRoundThinBackend(
       const Config &Conf, ModuleSummaryIndex &CombinedIndex,
       ThreadPoolStrategy ThinLTOParallelism,
       const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
       AddStreamFn AddStream)
       : InProcessThinBackend(Conf, CombinedIndex, ThinLTOParallelism,
                              ModuleToDefinedGVSummaries, AddStream, FileCache(),
-                             nullptr, false, false) {}
+                             /*OnWrite=*/nullptr,
+                             /*ShouldEmitIndexFiles=*/false,
+                             /*ShouldEmitImportsFiles=*/false) {}
 
   virtual Error runThinLTOBackendThread(
       AddStreamFn AddStream, FileCache Cache, unsigned Task, BitcodeModule BM,
@@ -1956,7 +1965,7 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache,
   // Create a scratch output to hold intermediate results.
   auto Outputs =
       std::make_unique<std::vector<llvm::SmallString<0>>>(getMaxTasks());
-  auto FirstRoundLTO = std::make_unique<NoOutputThinBackend>(
+  auto FirstRoundLTO = std::make_unique<FirstRoundThinBackend>(
       Conf, ThinLTO.CombinedIndex, llvm::heavyweight_hardware_concurrency(),
       ModuleToDefinedGVSummaries, std::move(Outputs));
   // First round: Run optimization and code generation with a scratch output.
@@ -1970,7 +1979,7 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache,
 
   // Second round: Run code generation by reading IRs.
   std::unique_ptr<ThinBackendProc> SecondRoundLTO =
-      std::make_unique<OptimizedBitcodeThinBackend>(
+      std::make_unique<SecondRoundThinBackend>(
           Conf, ThinLTO.CombinedIndex, llvm::heavyweight_hardware_concurrency(),
           ModuleToDefinedGVSummaries, AddStream);
   Error E = RunBackends(SecondRoundLTO.get());
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index d198e8e5102009..cf69f4add53a79 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -611,7 +611,7 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
         // Save the current module before the first codegen round.
         // Note that the second codegen round runs only `codegen()` without
         // running `opt()`. We're not reaching here as it's bailed out earlier
-        // with CodeGenOnly which has been set in `OptimizedBitcodeThinBackend`.
+        // with `CodeGenOnly` which has been set in `SecondRoundThinBackend`.
         if (CodeGenDataThinLTOTwoRounds)
           cgdata::saveModuleForTwoRounds(Mod, Task);
 

>From da550fc19346b076e740cf1447a2c6d43b5dcd5f Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Sun, 29 Sep 2024 18:28:15 -0700
Subject: [PATCH 5/7] [NFC] Refactor ThinBackend

 - Change it to a type from a function.
 - Store the parallelism in the type for future use.
---
 llvm/include/llvm/LTO/LTO.h |  63 +++++++++++++++++++++--
 llvm/lib/LTO/LTO.cpp        | 100 +++++++++++++-----------------------
 2 files changed, 94 insertions(+), 69 deletions(-)

diff --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h
index 214aa4e1c562dc..fde062ddbf7bc8 100644
--- a/llvm/include/llvm/LTO/LTO.h
+++ b/llvm/include/llvm/LTO/LTO.h
@@ -105,7 +105,41 @@ void updateMemProfAttributes(Module &Mod, const ModuleSummaryIndex &Index);
 
 class LTO;
 struct SymbolResolution;
-class ThinBackendProc;
+
+using IndexWriteCallback = std::function<void(const std::string &)>;
+
+/// This class defines the interface to the ThinLTO backend.
+class ThinBackendProc {
+protected:
+  const Config &Conf;
+  ModuleSummaryIndex &CombinedIndex;
+  const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries;
+  lto::IndexWriteCallback OnWrite;
+  bool ShouldEmitImportsFiles;
+
+public:
+  ThinBackendProc(
+      const Config &Conf, ModuleSummaryIndex &CombinedIndex,
+      const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
+      lto::IndexWriteCallback OnWrite, bool ShouldEmitImportsFiles)
+      : Conf(Conf), CombinedIndex(CombinedIndex),
+        ModuleToDefinedGVSummaries(ModuleToDefinedGVSummaries),
+        OnWrite(OnWrite), ShouldEmitImportsFiles(ShouldEmitImportsFiles) {}
+
+  virtual ~ThinBackendProc() = default;
+  virtual Error start(
+      unsigned Task, BitcodeModule BM,
+      const FunctionImporter::ImportMapTy &ImportList,
+      const FunctionImporter::ExportSetTy &ExportList,
+      const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
+      MapVector<StringRef, BitcodeModule> &ModuleMap) = 0;
+  virtual Error wait() = 0;
+  virtual unsigned getThreadCount() = 0;
+
+  // Write sharded indices and (optionally) imports to disk
+  Error emitFiles(const FunctionImporter::ImportMapTy &ImportList,
+                  llvm::StringRef ModulePath, const std::string &NewModulePath);
+};
 
 /// An input file. This is a symbol table wrapper that only exposes the
 /// information that an LTO client should need in order to do symbol resolution.
@@ -197,10 +231,30 @@ class InputFile {
 /// A ThinBackend defines what happens after the thin-link phase during ThinLTO.
 /// The details of this type definition aren't important; clients can only
 /// create a ThinBackend using one of the create*ThinBackend() functions below.
-using ThinBackend = std::function<std::unique_ptr<ThinBackendProc>(
+using ThinBackendFunction = std::function<std::unique_ptr<ThinBackendProc>(
     const Config &C, ModuleSummaryIndex &CombinedIndex,
-    DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
+    const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
     AddStreamFn AddStream, FileCache Cache)>;
+struct ThinBackend {
+  ThinBackend(ThinBackendFunction Func, ThreadPoolStrategy Parallelism = {})
+      : Func(std::move(Func)), Parallelism(std::move(Parallelism)) {}
+  ThinBackend() = default;
+
+  std::unique_ptr<ThinBackendProc> operator()(
+      const Config &Conf, ModuleSummaryIndex &CombinedIndex,
+      const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
+      AddStreamFn AddStream, FileCache Cache) {
+    assert(isValid() && "Invalid backend function");
+    return Func(Conf, CombinedIndex, ModuleToDefinedGVSummaries,
+                std::move(AddStream), std::move(Cache));
+  }
+  ThreadPoolStrategy getParallelism() const { return Parallelism; }
+  bool isValid() const { return static_cast<bool>(Func); }
+
+private:
+  ThinBackendFunction Func = nullptr;
+  ThreadPoolStrategy Parallelism;
+};
 
 /// This ThinBackend runs the individual backend jobs in-process.
 /// The default value means to use one job per hardware core (not hyper-thread).
@@ -210,7 +264,6 @@ using ThinBackend = std::function<std::unique_ptr<ThinBackendProc>(
 /// to the same path as the input module, with suffix ".thinlto.bc"
 /// ShouldEmitImportsFiles is true it also writes a list of imported files to a
 /// similar path with ".imports" appended instead.
-using IndexWriteCallback = std::function<void(const std::string &)>;
 ThinBackend createInProcessThinBackend(ThreadPoolStrategy Parallelism,
                                        IndexWriteCallback OnWrite = nullptr,
                                        bool ShouldEmitIndexFiles = false,
@@ -275,7 +328,7 @@ class LTO {
   /// this constructor.
   /// FIXME: We do currently require the DiagHandler field to be set in Conf.
   /// Until that is fixed, a Config argument is required.
-  LTO(Config Conf, ThinBackend Backend = nullptr,
+  LTO(Config Conf, ThinBackend Backend = {},
       unsigned ParallelCodeGenParallelismLevel = 1,
       LTOKind LTOMode = LTOK_Default);
   ~LTO();
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index b51b908fb28760..8266af5c1d4152 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -581,10 +581,10 @@ LTO::RegularLTOState::RegularLTOState(unsigned ParallelCodeGenParallelismLevel,
   CombinedModule->IsNewDbgInfoFormat = UseNewDbgInfoFormat;
 }
 
-LTO::ThinLTOState::ThinLTOState(ThinBackend Backend)
-    : Backend(Backend), CombinedIndex(/*HaveGVs*/ false) {
-  if (!Backend)
-    this->Backend =
+LTO::ThinLTOState::ThinLTOState(ThinBackend BackendParam)
+    : Backend(std::move(BackendParam)), CombinedIndex(/*HaveGVs*/ false) {
+  if (!Backend.isValid())
+    Backend =
         createInProcessThinBackend(llvm::heavyweight_hardware_concurrency());
 }
 
@@ -1371,64 +1371,6 @@ SmallVector<const char *> LTO::getRuntimeLibcallSymbols(const Triple &TT) {
   return LibcallSymbols;
 }
 
-/// This class defines the interface to the ThinLTO backend.
-class lto::ThinBackendProc {
-protected:
-  const Config &Conf;
-  ModuleSummaryIndex &CombinedIndex;
-  const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries;
-  lto::IndexWriteCallback OnWrite;
-  bool ShouldEmitImportsFiles;
-
-public:
-  ThinBackendProc(
-      const Config &Conf, ModuleSummaryIndex &CombinedIndex,
-      const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
-      lto::IndexWriteCallback OnWrite, bool ShouldEmitImportsFiles)
-      : Conf(Conf), CombinedIndex(CombinedIndex),
-        ModuleToDefinedGVSummaries(ModuleToDefinedGVSummaries),
-        OnWrite(OnWrite), ShouldEmitImportsFiles(ShouldEmitImportsFiles) {}
-
-  virtual ~ThinBackendProc() = default;
-  virtual Error start(
-      unsigned Task, BitcodeModule BM,
-      const FunctionImporter::ImportMapTy &ImportList,
-      const FunctionImporter::ExportSetTy &ExportList,
-      const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
-      MapVector<StringRef, BitcodeModule> &ModuleMap) = 0;
-  virtual Error wait() = 0;
-  virtual unsigned getThreadCount() = 0;
-
-  // Write sharded indices and (optionally) imports to disk
-  Error emitFiles(const FunctionImporter::ImportMapTy &ImportList,
-                  llvm::StringRef ModulePath,
-                  const std::string &NewModulePath) {
-    ModuleToSummariesForIndexTy ModuleToSummariesForIndex;
-    GVSummaryPtrSet DeclarationSummaries;
-
-    std::error_code EC;
-    gatherImportedSummariesForModule(ModulePath, ModuleToDefinedGVSummaries,
-                                     ImportList, ModuleToSummariesForIndex,
-                                     DeclarationSummaries);
-
-    raw_fd_ostream OS(NewModulePath + ".thinlto.bc", EC,
-                      sys::fs::OpenFlags::OF_None);
-    if (EC)
-      return errorCodeToError(EC);
-
-    writeIndexToFile(CombinedIndex, OS, &ModuleToSummariesForIndex,
-                     &DeclarationSummaries);
-
-    if (ShouldEmitImportsFiles) {
-      EC = EmitImportsFiles(ModulePath, NewModulePath + ".imports",
-                            ModuleToSummariesForIndex);
-      if (EC)
-        return errorCodeToError(EC);
-    }
-    return Error::success();
-  }
-};
-
 namespace {
 class InProcessThinBackend : public ThinBackendProc {
   DefaultThreadPool BackendThreadPool;
@@ -1631,7 +1573,7 @@ ThinBackend lto::createInProcessThinBackend(ThreadPoolStrategy Parallelism,
                                             lto::IndexWriteCallback OnWrite,
                                             bool ShouldEmitIndexFiles,
                                             bool ShouldEmitImportsFiles) {
-  return
+  auto Func =
       [=](const Config &Conf, ModuleSummaryIndex &CombinedIndex,
           const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
           AddStreamFn AddStream, FileCache Cache) {
@@ -1640,6 +1582,7 @@ ThinBackend lto::createInProcessThinBackend(ThreadPoolStrategy Parallelism,
             AddStream, Cache, OnWrite, ShouldEmitIndexFiles,
             ShouldEmitImportsFiles);
       };
+  return ThinBackend(Func, Parallelism);
 }
 
 StringLiteral lto::getThinLTODefaultCPU(const Triple &TheTriple) {
@@ -1732,7 +1675,7 @@ ThinBackend lto::createWriteIndexesThinBackend(
     std::string OldPrefix, std::string NewPrefix,
     std::string NativeObjectPrefix, bool ShouldEmitImportsFiles,
     raw_fd_ostream *LinkedObjectsFile, IndexWriteCallback OnWrite) {
-  return
+  auto Func =
       [=](const Config &Conf, ModuleSummaryIndex &CombinedIndex,
           const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
           AddStreamFn AddStream, FileCache Cache) {
@@ -1741,6 +1684,7 @@ ThinBackend lto::createWriteIndexesThinBackend(
             NewPrefix, NativeObjectPrefix, ShouldEmitImportsFiles,
             LinkedObjectsFile, OnWrite);
       };
+  return ThinBackend(Func);
 }
 
 Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache,
@@ -2041,3 +1985,31 @@ std::vector<int> lto::generateModulesOrdering(ArrayRef<BitcodeModule *> R) {
   });
   return ModulesOrdering;
 }
+
+Error ThinBackendProc::emitFiles(
+    const FunctionImporter::ImportMapTy &ImportList, llvm::StringRef ModulePath,
+    const std::string &NewModulePath) {
+  ModuleToSummariesForIndexTy ModuleToSummariesForIndex;
+  GVSummaryPtrSet DeclarationSummaries;
+
+  std::error_code EC;
+  gatherImportedSummariesForModule(ModulePath, ModuleToDefinedGVSummaries,
+                                   ImportList, ModuleToSummariesForIndex,
+                                   DeclarationSummaries);
+
+  raw_fd_ostream OS(NewModulePath + ".thinlto.bc", EC,
+                    sys::fs::OpenFlags::OF_None);
+  if (EC)
+    return errorCodeToError(EC);
+
+  writeIndexToFile(CombinedIndex, OS, &ModuleToSummariesForIndex,
+                   &DeclarationSummaries);
+
+  if (ShouldEmitImportsFiles) {
+    EC = EmitImportsFiles(ModulePath, NewModulePath + ".imports",
+                          ModuleToSummariesForIndex);
+    if (EC)
+      return errorCodeToError(EC);
+  }
+  return Error::success();
+}

>From 67c68820c8d642902970968c1b8e3154190bce90 Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Sun, 29 Sep 2024 22:58:52 -0700
Subject: [PATCH 6/7] [NFC] Refactor FileCache

  - Turn it into a type from a function.
  - Store the cache directory for future use.
---
 llvm/include/llvm/LTO/LTO.h         |  2 +-
 llvm/include/llvm/Support/Caching.h | 22 +++++++++++++++++++++-
 llvm/lib/LTO/LTO.cpp                |  2 +-
 llvm/lib/Support/Caching.cpp        |  5 +++--
 4 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h
index fde062ddbf7bc8..4b8c4f4fc23298 100644
--- a/llvm/include/llvm/LTO/LTO.h
+++ b/llvm/include/llvm/LTO/LTO.h
@@ -351,7 +351,7 @@ class LTO {
   ///
   /// The client will receive at most one callback (via either AddStream or
   /// Cache) for each task identifier.
-  Error run(AddStreamFn AddStream, FileCache Cache = nullptr);
+  Error run(AddStreamFn AddStream, FileCache Cache = {});
 
   /// Static method that returns a list of libcall symbols that can be generated
   /// by LTO but might not be visible from bitcode symbol table.
diff --git a/llvm/include/llvm/Support/Caching.h b/llvm/include/llvm/Support/Caching.h
index 4fa57cc92e51f7..cc86d1583fd6e6 100644
--- a/llvm/include/llvm/Support/Caching.h
+++ b/llvm/include/llvm/Support/Caching.h
@@ -54,9 +54,29 @@ using AddStreamFn = std::function<Expected<std::unique_ptr<CachedFileStream>>(
 ///
 /// if (AddStreamFn AddStream = Cache(Task, Key, ModuleName))
 ///   ProduceContent(AddStream);
-using FileCache = std::function<Expected<AddStreamFn>(
+using FileCacheFunction = std::function<Expected<AddStreamFn>(
     unsigned Task, StringRef Key, const Twine &ModuleName)>;
 
+struct FileCache {
+  FileCache(FileCacheFunction CacheFn, const std::string &DirectoryPath)
+      : CacheFunction(std::move(CacheFn)), CacheDirectoryPath(DirectoryPath) {}
+  FileCache() = default;
+
+  Expected<AddStreamFn> operator()(unsigned Task, StringRef Key,
+                                   const Twine &ModuleName) {
+    assert(isValid() && "Invalid cache function");
+    return CacheFunction(Task, Key, ModuleName);
+  }
+  const std::string &getCacheDirectoryPath() const {
+    return CacheDirectoryPath;
+  }
+  bool isValid() const { return static_cast<bool>(CacheFunction); }
+
+private:
+  FileCacheFunction CacheFunction = nullptr;
+  std::string CacheDirectoryPath;
+};
+
 /// This type defines the callback to add a pre-existing file (e.g. in a cache).
 ///
 /// Buffer callbacks must be thread safe.
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 8266af5c1d4152..c6369ab382eeb0 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1429,7 +1429,7 @@ class InProcessThinBackend : public ThinBackendProc {
         return E;
     }
 
-    if (!Cache || !CombinedIndex.modulePaths().count(ModuleID) ||
+    if (!Cache.isValid() || !CombinedIndex.modulePaths().count(ModuleID) ||
         all_of(CombinedIndex.getModuleHash(ModuleID),
                [](uint32_t V) { return V == 0; }))
       // Cache disabled or no entry for this module in the combined index or
diff --git a/llvm/lib/Support/Caching.cpp b/llvm/lib/Support/Caching.cpp
index 1ef51db218e89c..66e540efaca972 100644
--- a/llvm/lib/Support/Caching.cpp
+++ b/llvm/lib/Support/Caching.cpp
@@ -37,8 +37,8 @@ Expected<FileCache> llvm::localCache(const Twine &CacheNameRef,
   TempFilePrefixRef.toVector(TempFilePrefix);
   CacheDirectoryPathRef.toVector(CacheDirectoryPath);
 
-  return [=](unsigned Task, StringRef Key,
-             const Twine &ModuleName) -> Expected<AddStreamFn> {
+  auto Func = [=](unsigned Task, StringRef Key,
+                  const Twine &ModuleName) -> Expected<AddStreamFn> {
     // This choice of file name allows the cache to be pruned (see pruneCache()
     // in include/llvm/Support/CachePruning.h).
     SmallString<64> EntryPath;
@@ -167,4 +167,5 @@ Expected<FileCache> llvm::localCache(const Twine &CacheNameRef,
           Task);
     };
   };
+  return FileCache(Func, CacheDirectoryPathRef.str());
 }

>From 69db1d83042cc5fa26ec5b4442e0a89a6a2040e9 Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Sun, 29 Sep 2024 10:38:46 -0700
Subject: [PATCH 7/7] Address comments from teresajohnson

---
 clang/lib/CodeGen/BackendUtil.cpp             |   9 +-
 llvm/include/llvm/CGData/CodeGenData.h        |  52 ++++-
 llvm/include/llvm/CGData/CodeGenDataReader.h  |   5 +-
 llvm/include/llvm/LTO/LTO.h                   |   6 +-
 llvm/include/llvm/LTO/LTOBackend.h            |   6 +-
 llvm/lib/CGData/CMakeLists.txt                |   2 +
 llvm/lib/CGData/CodeGenData.cpp               |  99 +++++----
 llvm/lib/CGData/CodeGenDataReader.cpp         |   7 +-
 llvm/lib/LTO/LTO.cpp                          | 188 ++++++++++++++----
 llvm/lib/LTO/LTOBackend.cpp                   |   7 +-
 .../AArch64/cgdata-two-rounds-caching.ll      | 173 ++++++++++++++++
 11 files changed, 452 insertions(+), 102 deletions(-)
 create mode 100644 llvm/test/ThinLTO/AArch64/cgdata-two-rounds-caching.ll

diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index a1909d45b4d944..385ada462666bc 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -1286,10 +1286,11 @@ static void runThinLTOBackend(
     Conf.CGFileType = getCodeGenFileType(Action);
     break;
   }
-  if (Error E = thinBackend(
-          Conf, -1, AddStream, *M, *CombinedIndex, ImportList,
-          ModuleToDefinedGVSummaries[M->getModuleIdentifier()],
-          /* ModuleMap */ nullptr, Conf.CodeGenOnly, CGOpts.CmdArgs)) {
+  if (Error E =
+          thinBackend(Conf, -1, AddStream, *M, *CombinedIndex, ImportList,
+                      ModuleToDefinedGVSummaries[M->getModuleIdentifier()],
+                      /*ModuleMap=*/nullptr, Conf.CodeGenOnly,
+                      /*IRAddStream=*/nullptr, CGOpts.CmdArgs)) {
     handleAllErrors(std::move(E), [&](ErrorInfoBase &EIB) {
       errs() << "Error running ThinLTO backend: " << EIB.message() << '\n';
     });
diff --git a/llvm/include/llvm/CGData/CodeGenData.h b/llvm/include/llvm/CGData/CodeGenData.h
index 72b52e6e9b8fd1..e8e331f0189ac1 100644
--- a/llvm/include/llvm/CGData/CodeGenData.h
+++ b/llvm/include/llvm/CGData/CodeGenData.h
@@ -15,11 +15,13 @@
 #define LLVM_CGDATA_CODEGENDATA_H
 
 #include "llvm/ADT/BitmaskEnum.h"
+#include "llvm/ADT/StableHashing.h"
 #include "llvm/Bitcode/BitcodeReader.h"
 #include "llvm/CGData/OutlinedHashTree.h"
 #include "llvm/CGData/OutlinedHashTreeRecord.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/Caching.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/TargetParser/Triple.h"
 #include <mutex>
@@ -164,22 +166,60 @@ publishOutlinedHashTree(std::unique_ptr<OutlinedHashTree> HashTree) {
   CodeGenData::getInstance().publishOutlinedHashTree(std::move(HashTree));
 }
 
-void initializeTwoCodegenRounds();
+struct StreamCacheData {
+  /// Backing buffer for serialized data stream.
+  SmallVector<SmallString<0>> Outputs;
+  /// Callback function to add serialized data to the stream.
+  AddStreamFn AddStream;
+  /// Backing buffer for cached data.
+  SmallVector<std::unique_ptr<MemoryBuffer>> Files;
+  /// Cache mechanism for storing and retrieving data.
+  FileCache Cache;
+
+  StreamCacheData(unsigned Size) : Outputs(Size), Files(Size) {}
+  StreamCacheData() = delete;
+
+  /// Retrieve results from either the cache or the stream.
+  SmallVector<StringRef> getResult() {
+    unsigned NumOutputs = Outputs.size();
+    SmallVector<StringRef> Result(NumOutputs);
+    for (unsigned I = 0; I < NumOutputs; ++I)
+      if (Files[I])
+        Result[I] = Files[I]->getBuffer();
+      else
+        Result[I] = Outputs[I];
+    return Result;
+  }
+};
+
+/// Establish additional streams and caches for accessing object and IR files.
+/// \p OrigCache refers to the original cache used for accessing the final
+/// object files, which has already been configured and provided by the linker,
+/// if applicable. This cache will be utilized during the second round of the
+/// run. Additionally, we add two more caches at the same location for the first
+/// round of the run.
+void initializeTwoCodegenRounds(StreamCacheData &CG, StreamCacheData &IR,
+                                const FileCache &OrigCache);
 
 /// Save \p TheModule before the first codegen round.
 /// \p Task represents the partition number in the parallel code generation
 /// process.
-void saveModuleForTwoRounds(const Module &TheModule, unsigned Task);
+/// \p AddStream is the callback used to add the serialized module to the
+/// stream.
+void saveModuleForTwoRounds(const Module &TheModule, unsigned Task,
+                            AddStreamFn AddStream);
 
 /// Load the optimized module before the second codegen round.
 std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule,
                                                unsigned Task,
-                                               LLVMContext &Context);
+                                               LLVMContext &Context,
+                                               ArrayRef<StringRef> IRFiles);
 
 /// Merge the codegen data from the input files in scratch vector in ThinLTO
-/// two-codegen rounds.
-Error mergeCodeGenData(
-    const std::unique_ptr<std::vector<llvm::SmallString<0>>> InputFiles);
+/// two-codegen rounds. Optionally, \p CombinedHash can be used to compute
+/// the combined hash of the merged data.
+Error mergeCodeGenData(ArrayRef<StringRef> CGFiles,
+                       stable_hash *CombinedHash = nullptr);
 
 void warn(Error E, StringRef Whence = "");
 void warn(Twine Message, std::string Whence = "", std::string Hint = "");
diff --git a/llvm/include/llvm/CGData/CodeGenDataReader.h b/llvm/include/llvm/CGData/CodeGenDataReader.h
index 1ee4bfbe480233..7e4882df2116e2 100644
--- a/llvm/include/llvm/CGData/CodeGenDataReader.h
+++ b/llvm/include/llvm/CGData/CodeGenDataReader.h
@@ -54,8 +54,11 @@ class CodeGenDataReader {
   /// Extract the cgdata embedded in sections from the given object file and
   /// merge them into the GlobalOutlineRecord. This is a static helper that
   /// is used by `llvm-cgdata --merge` or ThinLTO's two-codegen rounds.
+  /// Optionally, \p CombinedHash can be used to compute the combined hash of
+  /// the merged data.
   static Error mergeFromObjectFile(const object::ObjectFile *Obj,
-                                   OutlinedHashTreeRecord &GlobalOutlineRecord);
+                                   OutlinedHashTreeRecord &GlobalOutlineRecord,
+                                   stable_hash *CombinedHash = nullptr);
 
 protected:
   /// The outlined hash tree that has been read. When it's released by
diff --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h
index 4b8c4f4fc23298..7174118ed81fe1 100644
--- a/llvm/include/llvm/LTO/LTO.h
+++ b/llvm/include/llvm/LTO/LTO.h
@@ -64,7 +64,8 @@ void thinLTOInternalizeAndPromoteInIndex(
         isPrevailing);
 
 /// Computes a unique hash for the Module considering the current list of
-/// export/import and other global analysis results.
+/// export/import and other global analysis results. Optionally, \p ExtraID
+/// can be used to add an extra identifier to the hash.
 std::string computeLTOCacheKey(
     const lto::Config &Conf, const ModuleSummaryIndex &Index,
     StringRef ModuleID, const FunctionImporter::ImportMapTy &ImportList,
@@ -72,7 +73,8 @@ std::string computeLTOCacheKey(
     const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
     const GVSummaryMapTy &DefinedGlobals,
     const DenseSet<GlobalValue::GUID> &CfiFunctionDefs = {},
-    const DenseSet<GlobalValue::GUID> &CfiFunctionDecls = {});
+    const DenseSet<GlobalValue::GUID> &CfiFunctionDecls = {},
+    StringRef ExtraID = {});
 
 namespace lto {
 
diff --git a/llvm/include/llvm/LTO/LTOBackend.h b/llvm/include/llvm/LTO/LTOBackend.h
index 098c0491dfe70a..2769e58f249053 100644
--- a/llvm/include/llvm/LTO/LTOBackend.h
+++ b/llvm/include/llvm/LTO/LTOBackend.h
@@ -51,13 +51,15 @@ Error backend(const Config &C, AddStreamFn AddStream,
 /// are saved in the ModuleMap. If \p ModuleMap is nullptr, module files will
 /// be mapped to memory on demand and at any given time during importing, only
 /// one source module will be kept open at the most. If \p CodeGenOnly is true,
-/// the backend will skip optimization and only perform code generation.
+/// the backend will skip optimization and only perform code generation. If
+/// \p IRAddStream is not nullptr, it will be called just before code generation
+/// to serialize the optimized IR.
 Error thinBackend(const Config &C, unsigned Task, AddStreamFn AddStream,
                   Module &M, const ModuleSummaryIndex &CombinedIndex,
                   const FunctionImporter::ImportMapTy &ImportList,
                   const GVSummaryMapTy &DefinedGlobals,
                   MapVector<StringRef, BitcodeModule> *ModuleMap,
-                  bool CodeGenOnly,
+                  bool CodeGenOnly, AddStreamFn IRAddStream = nullptr,
                   const std::vector<uint8_t> &CmdArgs = std::vector<uint8_t>());
 
 Error finalizeOptimizationRemarks(
diff --git a/llvm/lib/CGData/CMakeLists.txt b/llvm/lib/CGData/CMakeLists.txt
index ff1aab920e7a8c..157b0dfb7f9fcf 100644
--- a/llvm/lib/CGData/CMakeLists.txt
+++ b/llvm/lib/CGData/CMakeLists.txt
@@ -12,6 +12,8 @@ add_llvm_component_library(LLVMCGData
   intrinsics_gen
 
   LINK_COMPONENTS
+  BitReader
+  BitWriter
   Core
   Support
   Object
diff --git a/llvm/lib/CGData/CodeGenData.cpp b/llvm/lib/CGData/CodeGenData.cpp
index 4e21045a67cba6..460f01aa3b1e98 100644
--- a/llvm/lib/CGData/CodeGenData.cpp
+++ b/llvm/lib/CGData/CodeGenData.cpp
@@ -15,6 +15,7 @@
 #include "llvm/CGData/CodeGenDataReader.h"
 #include "llvm/CGData/OutlinedHashTreeRecord.h"
 #include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/Caching.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
@@ -37,9 +38,6 @@ cl::opt<bool> CodeGenDataThinLTOTwoRounds(
              "emits codegen data, while the second round uses the emitted "
              "codegen data for further optimizations."));
 
-// Path to where the optimized bitcodes are saved and restored for ThinLTO.
-static SmallString<128> CodeGenDataThinLTOTwoRoundsPath;
-
 static std::string getCGDataErrString(cgdata_error Err,
                                       const std::string &ErrMsg = "") {
   std::string Msg;
@@ -224,59 +222,78 @@ void warn(Error E, StringRef Whence) {
   }
 }
 
-static std::string getPath(StringRef Dir, unsigned Task) {
-  llvm::SmallString<128> Path(Dir);
-  llvm::sys::path::append(Path, llvm::Twine(Task) + ".saved_copy.bc");
-  return std::string(Path);
-}
-
-void initializeTwoCodegenRounds() {
+void initializeTwoCodegenRounds(StreamCacheData &CG, StreamCacheData &IR,
+                                const FileCache &OrigCache) {
   assert(CodeGenDataThinLTOTwoRounds);
-  if (auto EC = llvm::sys::fs::createUniqueDirectory(
-          "cgdata", CodeGenDataThinLTOTwoRoundsPath))
-    report_fatal_error(Twine("Failed to create directory: ") + EC.message());
+  CG.AddStream = [&](size_t Task, const Twine &ModuleName) {
+    return std::make_unique<CachedFileStream>(
+        std::make_unique<raw_svector_ostream>(CG.Outputs[Task]));
+  };
+  IR.AddStream = [&](size_t Task, const Twine &ModuleName) {
+    return std::make_unique<CachedFileStream>(
+        std::make_unique<raw_svector_ostream>(IR.Outputs[Task]));
+  };
+
+  if (OrigCache.isValid()) {
+    auto CGCacheOrErr =
+        localCache("ThinLTO", "CG", OrigCache.getCacheDirectoryPath(),
+                   [&](size_t Task, const Twine &ModuleName,
+                       std::unique_ptr<MemoryBuffer> MB) {
+                     CG.Files[Task] = std::move(MB);
+                   });
+    if (Error Err = CGCacheOrErr.takeError())
+      report_fatal_error(std::move(Err));
+    CG.Cache = std::move(*CGCacheOrErr);
+    auto IRCacheOrErr =
+        localCache("ThinLTO", "IR", OrigCache.getCacheDirectoryPath(),
+                   [&](size_t Task, const Twine &ModuleName,
+                       std::unique_ptr<MemoryBuffer> MB) {
+                     IR.Files[Task] = std::move(MB);
+                   });
+    if (Error Err = IRCacheOrErr.takeError())
+      report_fatal_error(std::move(Err));
+    IR.Cache = std::move(*IRCacheOrErr);
+  }
 }
 
-void saveModuleForTwoRounds(const Module &TheModule, unsigned Task) {
-  assert(sys::fs::is_directory(CodeGenDataThinLTOTwoRoundsPath));
-  std::string Path = getPath(CodeGenDataThinLTOTwoRoundsPath, Task);
-  std::error_code EC;
-  raw_fd_ostream OS(Path, EC, sys::fs::OpenFlags::OF_None);
-  if (EC)
-    report_fatal_error(Twine("Failed to open ") + Path +
-                       " to save optimized bitcode: " + EC.message());
-  WriteBitcodeToFile(TheModule, OS, /*ShouldPreserveUseListOrder=*/true);
+void saveModuleForTwoRounds(const Module &TheModule, unsigned Task,
+                            AddStreamFn AddStream) {
+  LLVM_DEBUG(dbgs() << "Saving module: " << TheModule.getModuleIdentifier()
+                    << " in Task " << Task << "\n");
+  Expected<std::unique_ptr<CachedFileStream>> StreamOrErr =
+      AddStream(Task, TheModule.getModuleIdentifier());
+  if (Error Err = StreamOrErr.takeError())
+    report_fatal_error(std::move(Err));
+  std::unique_ptr<CachedFileStream> &Stream = *StreamOrErr;
+
+  WriteBitcodeToFile(TheModule, *Stream->OS,
+                     /*ShouldPreserveUseListOrder=*/true);
 }
 
 std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule,
                                                unsigned Task,
-                                               LLVMContext &Context) {
-  assert(sys::fs::is_directory(CodeGenDataThinLTOTwoRoundsPath));
-  std::string Path = getPath(CodeGenDataThinLTOTwoRoundsPath, Task);
-  auto FileOrError = MemoryBuffer::getFile(Path);
-  if (auto EC = FileOrError.getError())
-    report_fatal_error(Twine("Failed to open ") + Path +
-                       " to load optimized bitcode: " + EC.message());
-
-  std::unique_ptr<MemoryBuffer> FileBuffer = std::move(*FileOrError);
+                                               LLVMContext &Context,
+                                               ArrayRef<StringRef> IRFiles) {
+  LLVM_DEBUG(dbgs() << "Loading module: " << OrigModule.getModuleIdentifier()
+                    << " in Task " << Task << "\n");
+  std::unique_ptr<MemoryBuffer> FileBuffer = MemoryBuffer::getMemBuffer(
+      IRFiles[Task], "in-memory IR file", /*RequiresNullTerminator=*/false);
   auto RestoredModule = parseBitcodeFile(*FileBuffer, Context);
   if (!RestoredModule)
-    report_fatal_error(Twine("Failed to parse optimized bitcode loaded from ") +
-                       Path + "\n");
+    report_fatal_error(
+        Twine("Failed to parse optimized bitcode loaded for Task: ") +
+        Twine(Task) + "\n");
 
   // Restore the original module identifier.
   (*RestoredModule)->setModuleIdentifier(OrigModule.getModuleIdentifier());
   return std::move(*RestoredModule);
 }
 
-Error mergeCodeGenData(
-    const std::unique_ptr<std::vector<llvm::SmallString<0>>> InputFiles) {
-
+Error mergeCodeGenData(ArrayRef<StringRef> CGFiles, stable_hash *CombinedHash) {
   OutlinedHashTreeRecord GlobalOutlineRecord;
-  for (auto &InputFile : *(InputFiles)) {
-    if (InputFile.empty())
+  for (auto File : CGFiles) {
+    if (File.empty())
       continue;
-    StringRef File = StringRef(InputFile.data(), InputFile.size());
     std::unique_ptr<MemoryBuffer> Buffer = MemoryBuffer::getMemBuffer(
         File, "in-memory object file", /*RequiresNullTerminator=*/false);
     Expected<std::unique_ptr<object::ObjectFile>> BinOrErr =
@@ -285,8 +302,8 @@ Error mergeCodeGenData(
       return BinOrErr.takeError();
 
     std::unique_ptr<object::ObjectFile> &Obj = BinOrErr.get();
-    if (auto E = CodeGenDataReader::mergeFromObjectFile(Obj.get(),
-                                                        GlobalOutlineRecord))
+    if (auto E = CodeGenDataReader::mergeFromObjectFile(
+            Obj.get(), GlobalOutlineRecord, CombinedHash))
       return E;
   }
 
diff --git a/llvm/lib/CGData/CodeGenDataReader.cpp b/llvm/lib/CGData/CodeGenDataReader.cpp
index f7f3a8f42af7e1..2f2481ea60f822 100644
--- a/llvm/lib/CGData/CodeGenDataReader.cpp
+++ b/llvm/lib/CGData/CodeGenDataReader.cpp
@@ -31,8 +31,8 @@ setupMemoryBuffer(const Twine &Filename, vfs::FileSystem &FS) {
 }
 
 Error CodeGenDataReader::mergeFromObjectFile(
-    const object::ObjectFile *Obj,
-    OutlinedHashTreeRecord &GlobalOutlineRecord) {
+    const object::ObjectFile *Obj, OutlinedHashTreeRecord &GlobalOutlineRecord,
+    stable_hash *CombinedHash) {
   Triple TT = Obj->makeTriple();
   auto CGOutLineName =
       getCodeGenDataSectionName(CG_outline, TT.getObjectFormat(), false);
@@ -48,6 +48,9 @@ Error CodeGenDataReader::mergeFromObjectFile(
     auto *EndData = Data + ContentsOrErr->size();
 
     if (*NameOrErr == CGOutLineName) {
+      if (CombinedHash)
+        *CombinedHash =
+            stable_hash_combine(*CombinedHash, xxh3_64bits(*ContentsOrErr));
       // In case dealing with an executable that has concatenated cgdata,
       // we want to merge them into a single cgdata.
       // Although it's not a typical workflow, we support this scenario.
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index c6369ab382eeb0..493f0be5938658 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -13,6 +13,7 @@
 #include "llvm/LTO/LTO.h"
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/StableHashing.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
@@ -36,6 +37,7 @@
 #include "llvm/Linker/IRMover.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Object/IRObjectFile.h"
+#include "llvm/Support/Caching.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/FileSystem.h"
@@ -101,7 +103,7 @@ std::string llvm::computeLTOCacheKey(
     const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
     const GVSummaryMapTy &DefinedGlobals,
     const DenseSet<GlobalValue::GUID> &CfiFunctionDefs,
-    const DenseSet<GlobalValue::GUID> &CfiFunctionDecls) {
+    const DenseSet<GlobalValue::GUID> &CfiFunctionDecls, StringRef ExtraID) {
   // Compute the unique hash for this entry.
   // This is based on the current compiler version, the module itself, the
   // export list, the hash for every single module in the import list, the
@@ -341,6 +343,9 @@ std::string llvm::computeLTOCacheKey(
     }
   }
 
+  if (!ExtraID.empty())
+    AddString(ExtraID);
+
   return toHex(Hasher.result());
 }
 
@@ -1373,6 +1378,7 @@ SmallVector<const char *> LTO::getRuntimeLibcallSymbols(const Triple &TT) {
 
 namespace {
 class InProcessThinBackend : public ThinBackendProc {
+protected:
   DefaultThreadPool BackendThreadPool;
   AddStreamFn AddStream;
   FileCache Cache;
@@ -1511,25 +1517,89 @@ class InProcessThinBackend : public ThinBackendProc {
 /// buffer. Note the codegen data stored in the scratch buffer will be extracted
 /// and merged in the subsequent step.
 class FirstRoundThinBackend : public InProcessThinBackend {
+  AddStreamFn IRAddStream;
+  FileCache IRCache;
+
 public:
   FirstRoundThinBackend(
       const Config &Conf, ModuleSummaryIndex &CombinedIndex,
       ThreadPoolStrategy ThinLTOParallelism,
       const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
-      std::unique_ptr<std::vector<llvm::SmallString<0>>> Scratch)
-      : InProcessThinBackend(
-            Conf, CombinedIndex, ThinLTOParallelism, ModuleToDefinedGVSummaries,
-            // Allocate a scratch buffer for each task to write output to.
-            [Allocation = &*Scratch](unsigned Task, const Twine &ModuleName) {
-              return std::make_unique<CachedFileStream>(
-                  std::make_unique<raw_svector_ostream>((*Allocation)[Task]));
-            },
-            FileCache(), /*OnWrite=*/nullptr, /*ShouldEmitIndexFiles=*/false,
-            /*ShouldEmitImportsFiles=*/false),
-        Scratch(std::move(Scratch)) {}
-
-  /// Scratch space for writing output during the codegen.
-  std::unique_ptr<std::vector<llvm::SmallString<0>>> Scratch;
+      AddStreamFn CGAddStream, FileCache CGCache, AddStreamFn IRAddStream,
+      FileCache IRCache)
+      : InProcessThinBackend(Conf, CombinedIndex, ThinLTOParallelism,
+                             ModuleToDefinedGVSummaries, std::move(CGAddStream),
+                             std::move(CGCache), /*OnWrite=*/nullptr,
+                             /*ShouldEmitIndexFiles=*/false,
+                             /*ShouldEmitImportsFiles=*/false),
+        IRAddStream(std::move(IRAddStream)), IRCache(std::move(IRCache)) {}
+
+  Error runThinLTOBackendThread(
+      AddStreamFn CGAddStream, FileCache CGCache, unsigned Task,
+      BitcodeModule BM, ModuleSummaryIndex &CombinedIndex,
+      const FunctionImporter::ImportMapTy &ImportList,
+      const FunctionImporter::ExportSetTy &ExportList,
+      const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
+      const GVSummaryMapTy &DefinedGlobals,
+      MapVector<StringRef, BitcodeModule> &ModuleMap) override {
+    auto RunThinBackend = [&](AddStreamFn CGAddStream,
+                              AddStreamFn IRAddStream) {
+      LTOLLVMContext BackendContext(Conf);
+      Expected<std::unique_ptr<Module>> MOrErr = BM.parseModule(BackendContext);
+      if (!MOrErr)
+        return MOrErr.takeError();
+
+      return thinBackend(Conf, Task, CGAddStream, **MOrErr, CombinedIndex,
+                         ImportList, DefinedGlobals, &ModuleMap,
+                         Conf.CodeGenOnly, IRAddStream);
+    };
+
+    auto ModuleID = BM.getModuleIdentifier();
+
+    if (ShouldEmitIndexFiles) {
+      if (auto E = emitFiles(ImportList, ModuleID, ModuleID.str()))
+        return E;
+    }
+
+    assert((CGCache.isValid() == IRCache.isValid()) &&
+           "Both caches for CG and IR should have matching availability");
+    if (!CGCache.isValid() || !CombinedIndex.modulePaths().count(ModuleID) ||
+        all_of(CombinedIndex.getModuleHash(ModuleID),
+               [](uint32_t V) { return V == 0; }))
+      // Cache disabled or no entry for this module in the combined index or
+      // no module hash.
+      return RunThinBackend(CGAddStream, IRAddStream);
+
+    // Get CGKey for caching object in CGCache.
+    std::string CGKey = computeLTOCacheKey(
+        Conf, CombinedIndex, ModuleID, ImportList, ExportList, ResolvedODR,
+        DefinedGlobals, CfiFunctionDefs, CfiFunctionDecls);
+    Expected<AddStreamFn> CacheCGAddStreamOrErr =
+        CGCache(Task, CGKey, ModuleID);
+    if (Error Err = CacheCGAddStreamOrErr.takeError())
+      return Err;
+    AddStreamFn &CacheCGAddStream = *CacheCGAddStreamOrErr;
+
+    // Get IRKey for caching (optimized) IR in IRCache.
+    std::string IRKey = computeLTOCacheKey(
+        Conf, CombinedIndex, ModuleID, ImportList, ExportList, ResolvedODR,
+        DefinedGlobals, CfiFunctionDefs, CfiFunctionDecls, /*ExtraID=*/"IR");
+    Expected<AddStreamFn> CacheIRAddStreamOrErr =
+        IRCache(Task, IRKey, ModuleID);
+    if (Error Err = CacheIRAddStreamOrErr.takeError())
+      return Err;
+    AddStreamFn &CacheIRAddStream = *CacheIRAddStreamOrErr;
+
+    assert((CacheCGAddStream == nullptr) == (CacheIRAddStream == nullptr) &&
+           "Both CG and IR caching should be matched");
+    if (CacheIRAddStream) {
+      LLVM_DEBUG(dbgs() << "[FirstRound] Cache Miss for "
+                        << BM.getModuleIdentifier() << "\n");
+      return RunThinBackend(CacheCGAddStream, CacheIRAddStream);
+    }
+
+    return Error::success();
+  }
 };
 
 /// This backend operates in the second round of a two-codegen round process.
@@ -1538,17 +1608,23 @@ class FirstRoundThinBackend : public InProcessThinBackend {
 /// the code, utilizing the codegen data merged from the first round. Finally,
 /// it writes the resulting object files as usual.
 class SecondRoundThinBackend : public InProcessThinBackend {
+  ArrayRef<StringRef> IRFiles;
+  stable_hash CombinedCGDataHash;
+
 public:
   SecondRoundThinBackend(
       const Config &Conf, ModuleSummaryIndex &CombinedIndex,
       ThreadPoolStrategy ThinLTOParallelism,
       const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
-      AddStreamFn AddStream)
+      AddStreamFn AddStream, FileCache CGCache, ArrayRef<StringRef> IRFiles,
+      stable_hash CombinedCGDataHash)
       : InProcessThinBackend(Conf, CombinedIndex, ThinLTOParallelism,
-                             ModuleToDefinedGVSummaries, AddStream, FileCache(),
+                             ModuleToDefinedGVSummaries, AddStream,
+                             std::move(CGCache),
                              /*OnWrite=*/nullptr,
                              /*ShouldEmitIndexFiles=*/false,
-                             /*ShouldEmitImportsFiles=*/false) {}
+                             /*ShouldEmitImportsFiles=*/false),
+        IRFiles(IRFiles), CombinedCGDataHash(CombinedCGDataHash) {}
 
   virtual Error runThinLTOBackendThread(
       AddStreamFn AddStream, FileCache Cache, unsigned Task, BitcodeModule BM,
@@ -1558,13 +1634,42 @@ class SecondRoundThinBackend : public InProcessThinBackend {
       const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
       const GVSummaryMapTy &DefinedGlobals,
       MapVector<StringRef, BitcodeModule> &ModuleMap) override {
-    LTOLLVMContext BackendContext(Conf);
-    std::unique_ptr<Module> LoadedModule =
-        cgdata::loadModuleForTwoRounds(BM, Task, BackendContext);
+    auto RunThinBackend = [&](AddStreamFn AddStream) {
+      LTOLLVMContext BackendContext(Conf);
+      std::unique_ptr<Module> LoadedModule =
+          cgdata::loadModuleForTwoRounds(BM, Task, BackendContext, IRFiles);
 
-    return thinBackend(Conf, Task, AddStream, *LoadedModule, CombinedIndex,
-                       ImportList, DefinedGlobals, &ModuleMap,
-                       /*CodeGenOnly=*/true);
+      return thinBackend(Conf, Task, AddStream, *LoadedModule, CombinedIndex,
+                         ImportList, DefinedGlobals, &ModuleMap,
+                         /*CodeGenOnly=*/true);
+    };
+
+    auto ModuleID = BM.getModuleIdentifier();
+    if (!Cache.isValid() || !CombinedIndex.modulePaths().count(ModuleID) ||
+        all_of(CombinedIndex.getModuleHash(ModuleID),
+               [](uint32_t V) { return V == 0; }))
+      // Cache disabled or no entry for this module in the combined index or
+      // no module hash.
+      return RunThinBackend(AddStream);
+
+    // Get Key for caching the final object file in Cache with the combined
+    // CGData hash.
+    std::string Key = computeLTOCacheKey(
+        Conf, CombinedIndex, ModuleID, ImportList, ExportList, ResolvedODR,
+        DefinedGlobals, CfiFunctionDefs, CfiFunctionDecls,
+        /*ExtraID=*/std::to_string(CombinedCGDataHash));
+    Expected<AddStreamFn> CacheAddStreamOrErr = Cache(Task, Key, ModuleID);
+    if (Error Err = CacheAddStreamOrErr.takeError())
+      return Err;
+    AddStreamFn &CacheAddStream = *CacheAddStreamOrErr;
+
+    if (CacheAddStream) {
+      LLVM_DEBUG(dbgs() << "[SecondRound] Cache Miss for "
+                        << BM.getModuleIdentifier() << "\n");
+      return RunThinBackend(CacheAddStream);
+    }
+
+    return Error::success();
   }
 };
 } // end anonymous namespace
@@ -1900,32 +2005,33 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache,
   // 1. First round: Run optimization and code generation with a scratch output.
   // 2. Merge codegen data extracted from the scratch output.
   // 3. Second round: Run code generation again using the merged data.
-  LLVM_DEBUG(dbgs() << "Running ThinLTO two-codegen rounds\n");
+  LLVM_DEBUG(dbgs() << "[TwoRounds] Initializing ThinLTO two-codegen rounds\n");
 
-  // Initialize a temporary path to store and retrieve optimized IRs for
-  // two-round code generation.
-  cgdata::initializeTwoCodegenRounds();
+  unsigned MaxTasks = getMaxTasks();
+  auto Parallelism = ThinLTO.Backend.getParallelism();
+  cgdata::StreamCacheData CG(MaxTasks), IR(MaxTasks);
+  cgdata::initializeTwoCodegenRounds(CG, IR, Cache);
 
-  // Create a scratch output to hold intermediate results.
-  auto Outputs =
-      std::make_unique<std::vector<llvm::SmallString<0>>>(getMaxTasks());
-  auto FirstRoundLTO = std::make_unique<FirstRoundThinBackend>(
-      Conf, ThinLTO.CombinedIndex, llvm::heavyweight_hardware_concurrency(),
-      ModuleToDefinedGVSummaries, std::move(Outputs));
   // First round: Run optimization and code generation with a scratch output.
-  // Before code generation, serialize modules.
+  // Before code generation, serialize the optimized IR modules.
+  LLVM_DEBUG(dbgs() << "[TwoRounds] Running the first round of codegen\n");
+  auto FirstRoundLTO = std::make_unique<FirstRoundThinBackend>(
+      Conf, ThinLTO.CombinedIndex, Parallelism, ModuleToDefinedGVSummaries,
+      CG.AddStream, CG.Cache, IR.AddStream, IR.Cache);
   if (Error E = RunBackends(FirstRoundLTO.get()))
     return E;
 
-  // Merge codegen data extracted from the scratch output.
-  if (Error E = cgdata::mergeCodeGenData(std::move(FirstRoundLTO->Scratch)))
+  LLVM_DEBUG(dbgs() << "[TwoRounds] Merging codegen data\n");
+  stable_hash CombinedHash = 0;
+  if (Error E = cgdata::mergeCodeGenData(CG.getResult(), &CombinedHash))
     return E;
+  LLVM_DEBUG(dbgs() << "[TwoRounds] CGData hash: " << CombinedHash << "\n");
 
   // Second round: Run code generation by reading IRs.
-  std::unique_ptr<ThinBackendProc> SecondRoundLTO =
-      std::make_unique<SecondRoundThinBackend>(
-          Conf, ThinLTO.CombinedIndex, llvm::heavyweight_hardware_concurrency(),
-          ModuleToDefinedGVSummaries, AddStream);
+  LLVM_DEBUG(dbgs() << "[TwoRounds] Running the second round of codegen\n");
+  auto SecondRoundLTO = std::make_unique<SecondRoundThinBackend>(
+      Conf, ThinLTO.CombinedIndex, Parallelism, ModuleToDefinedGVSummaries,
+      AddStream, Cache, IR.getResult(), CombinedHash);
   Error E = RunBackends(SecondRoundLTO.get());
 
   return E;
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index cf69f4add53a79..e414411ed64a01 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -568,7 +568,8 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
                        const FunctionImporter::ImportMapTy &ImportList,
                        const GVSummaryMapTy &DefinedGlobals,
                        MapVector<StringRef, BitcodeModule> *ModuleMap,
-                       bool CodeGenOnly, const std::vector<uint8_t> &CmdArgs) {
+                       bool CodeGenOnly, AddStreamFn IRAddStream,
+                       const std::vector<uint8_t> &CmdArgs) {
   Expected<const Target *> TOrErr = initAndLookupTarget(Conf, Mod);
   if (!TOrErr)
     return TOrErr.takeError();
@@ -612,8 +613,8 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
         // Note that the second codegen round runs only `codegen()` without
         // running `opt()`. We're not reaching here as it's bailed out earlier
         // with `CodeGenOnly` which has been set in `SecondRoundThinBackend`.
-        if (CodeGenDataThinLTOTwoRounds)
-          cgdata::saveModuleForTwoRounds(Mod, Task);
+        if (IRAddStream)
+          cgdata::saveModuleForTwoRounds(Mod, Task, IRAddStream);
 
         codegen(Conf, TM, AddStream, Task, Mod, CombinedIndex);
         return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
diff --git a/llvm/test/ThinLTO/AArch64/cgdata-two-rounds-caching.ll b/llvm/test/ThinLTO/AArch64/cgdata-two-rounds-caching.ll
new file mode 100644
index 00000000000000..61131ad6d3887f
--- /dev/null
+++ b/llvm/test/ThinLTO/AArch64/cgdata-two-rounds-caching.ll
@@ -0,0 +1,173 @@
+; This test verifies whether we can outline a singleton instance (i.e., an instance that does not repeat)
+; by running two codegen rounds.
+; This test also verifies that the caches for the two codegen rounds work correctly.
+
+; REQUIRES: asserts
+; RUN: rm -rf %t
+; RUN: split-file %s %t
+
+; 0. Base case without a cache.
+; Verify each outlining instance is singleton with the global outlining for thinlto.
+; They will be identical, which can be folded by the linker with ICF.
+; RUN: opt -module-hash -module-summary %t/thin-one.ll -o %t/thin-one.bc
+; RUN: opt -module-hash -module-summary %t/thin-two.ll -o %t/thin-two.bc
+; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc -o %t/thinlto \
+; RUN:  -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \
+; RUN:  -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \
+; RUN:  -codegen-data-thinlto-two-rounds
+
+; thin-one.ll will have one outlining instance (matched in the global outlined hash tree)
+; RUN: llvm-objdump -d %t/thinlto.1 | FileCheck %s --check-prefix=THINLTO-1
+; THINLTO-1: _OUTLINED_FUNCTION{{.*}}>:
+; THINLTO-1-NEXT:  mov
+; THINLTO-1-NEXT:  mov
+; THINLTO-1-NEXT:  b
+
+; thin-two.ll will have two outlining instances (matched in the global outlined hash tree)
+; RUN: llvm-objdump -d %t/thinlto.2 | FileCheck %s --check-prefix=THINLTO-2
+; THINLTO-2: _OUTLINED_FUNCTION{{.*}}>:
+; THINLTO-2-NEXT:  mov
+; THINLTO-2-NEXT:  mov
+; THINLTO-2-NEXT:  b
+; THINLTO-2: _OUTLINED_FUNCTION{{.*}}>:
+; THINLTO-2-NEXT:  mov
+; THINLTO-2-NEXT:  mov
+; THINLTO-2-NEXT:  b
+
+; 1. Run this with a cache for the first time.
+; RUN: rm -rf %t.cache
+; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc -o %t/thinlto-cold \
+; RUN:  -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \
+; RUN:  -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \
+; RUN:  -codegen-data-thinlto-two-rounds -cache-dir %t.cache -debug-only=lto -thinlto-threads 1 > %t.log-cold.txt 2>&1
+; RUN: cat %t.log-cold.txt | FileCheck %s --check-prefix=COLD
+; RUN: diff %t/thinlto.1 %t/thinlto-cold.1
+; RUN: diff %t/thinlto.2 %t/thinlto-cold.2
+
+; COLD: [FirstRound] Cache Miss for {{.*}}thin-one.bc
+; COLD: [FirstRound] Cache Miss for {{.*}}thin-two.bc
+; COLD: [SecondRound] Cache Miss for {{.*}}thin-one.bc
+; COLD: [SecondRound] Cache Miss for {{.*}}thin-two.bc
+
+; 2. Without any changes, simply re-running it will hit the cache.
+; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc -o %t/thinlto-warm \
+; RUN:  -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \
+; RUN:  -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \
+; RUN:  -codegen-data-thinlto-two-rounds -cache-dir %t.cache -debug-only=lto -thinlto-threads 1 > %t.log-warm.txt 2>&1
+; RUN: cat %t.log-warm.txt | FileCheck %s --check-prefix=WARM
+; RUN: diff %t/thinlto.1 %t/thinlto-warm.1
+; RUN: diff %t/thinlto.2 %t/thinlto-warm.2
+
+; WARM-NOT: Cache Miss
+
+; 3. Assume thin-one.ll is modified to mimic thin-one-modified.ll
+; The merged CG data remains unchanged as this modification does not affect the hash tree built from thin-two.bc.
+; Therefore, both the first and second round runs update only this module.
+; RUN: opt -module-hash -module-summary %t/thin-one-modified.ll -o %t/thin-one.bc
+; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc -o %t/thinlto-warm-modified \
+; RUN:  -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \
+; RUN:  -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \
+; RUN:  -codegen-data-thinlto-two-rounds -cache-dir %t.cache -debug-only=lto -thinlto-threads 1 > %t.log-warm-modified.txt 2>&1
+; RUN: cat %t.log-warm-modified.txt | FileCheck %s --check-prefix=WARM-MODIFIED
+; diff %t/thinlto.1 %t/thinlto-warm-modified.1
+; diff %t/thinlto.2 %t/thinlto-warm-modified.2
+
+; WARM-MODIFIED: [FirstRound] Cache Miss for {{.*}}thin-one.bc
+; WARM-MODIFIED-NOT: [FirstRound] Cache Miss for {{.*}}thin-two.bc
+; WARM-MODIFIED: [SecondRound] Cache Miss for {{.*}}thin-one.bc
+; WARM-MODIFIED-NOT: [SecondRound] Cache Miss for {{.*}}thin-two.bc
+
+; 4. Additionally, thin-two.ll is modified to mimic thin-two-modified.ll.
+; In this case, the merged CG data, which is global, is updated.
+; Although the first round run updates only the thin-two.ll module, the second round run
+; will update all modules, resulting in different binaries.
+; RUN: opt -module-hash -module-summary %t/thin-one-modified.ll -o %t/thin-one.bc
+; RUN: opt -module-hash -module-summary %t/thin-two-modified.ll -o %t/thin-two.bc
+; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc -o %t/thinlto-warm-modified-all \
+; RUN:  -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \
+; RUN:  -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \
+; RUN:  -codegen-data-thinlto-two-rounds -cache-dir %t.cache -debug-only=lto -thinlto-threads 1 > %t.log-warm-modified-all.txt 2>&1
+; RUN: cat %t.log-warm-modified-all.txt | FileCheck %s --check-prefix=WARM-MODIFIED-ALL
+; RUN: not diff %t/thinlto.1 %t/thinlto-warm-modified-all.1
+; RUN: not diff %t/thinlto.2 %t/thinlto-warm-modified-all.2
+
+; WARM-MODIFIED-ALL-NOT: [FirstRound] Cache Miss for {{.*}}thin-one.bc
+; WARM-MODIFIED-ALL: [FirstRound] Cache Miss for {{.*}}thin-two.bc
+; WARM-MODIFIED-ALL: [SecondRound] Cache Miss for {{.*}}thin-one.bc
+; WARM-MODIFIED-ALL: [SecondRound] Cache Miss for {{.*}}thin-two.bc
+
+; thin-one-modified.ll won't be outlined.
+; RUN: llvm-objdump -d %t/thinlto-warm-modified-all.1 | FileCheck %s --check-prefix=THINLTO-1-MODIFIED-ALL
+; THINLTO-1-MODIFIED-ALL-NOT: _OUTLINED_FUNCTION{{.*}}>:
+
+; thin-two-modified.ll will have two (longer) outlining instances (matched in the global outlined hash tree)
+; RUN: llvm-objdump -d %t/thinlto-warm-modified-all.2 | FileCheck %s --check-prefix=THINLTO-2-MODIFIED-ALL
+; THINLTO-2-MODIFIED-ALL: _OUTLINED_FUNCTION{{.*}}>:
+; THINLTO-2-MODIFIED-ALL:  mov
+; THINLTO-2-MODIFIED-ALL:  mov
+; THINLTO-2-MODIFIED-ALL:  mov
+; THINLTO-2-MODIFIED-ALL:  b
+; THINLTO-2-MODIFIED-ALL: _OUTLINED_FUNCTION{{.*}}>:
+; THINLTO-2-MODIFIED-ALL:  mov
+; THINLTO-2-MODIFIED-ALL:  mov
+; THINLTO-2-MODIFIED-ALL:  mov
+; THINLTO-2-MODIFIED-ALL:  b
+
+; 5. Re-running it will hit the cache.
+; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc -o %t/thinlto-warm-again \
+; RUN:  -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \
+; RUN:  -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \
+; RUN:  -codegen-data-thinlto-two-rounds -cache-dir %t.cache -debug-only=lto -thinlto-threads 1 > %t.log-warm-again.txt 2>&1
+; RUN: cat %t.log-warm-again.txt | FileCheck %s --check-prefix=WARM-AGAIN
+; RUN: diff %t/thinlto-warm-modified-all.1 %t/thinlto-warm-again.1
+; RUN: diff %t/thinlto-warm-modified-all.2 %t/thinlto-warm-again.2
+
+; WARM-AGAIN-NOT: Cache Miss
+
+;--- thin-one.ll
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-darwin"
+
+declare i32 @g(i32, i32, i32)
+define i32 @f3() minsize {
+  %1 = call i32 @g(i32 30, i32 1, i32 2);
+ ret i32 %1
+}
+
+;--- thin-one-modified.ll
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-darwin"
+
+declare i32 @g(i32, i32, i32)
+define i32 @f3() minsize {
+  %1 = call i32 @g(i32 31, i32 1, i32 2);
+ ret i32 %1
+}
+
+;--- thin-two.ll
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-darwin"
+
+declare i32 @g(i32, i32, i32)
+define i32 @f1() minsize {
+  %1 = call i32 @g(i32 10, i32 1, i32 2);
+  ret i32 %1
+}
+define i32 @f2() minsize {
+  %1 = call i32 @g(i32 20, i32 1, i32 2);
+  ret i32 %1
+}
+
+;--- thin-two-modified.ll
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-darwin"
+
+declare i32 @g(i32, i32, i32)
+define i32 @f1() minsize {
+  %1 = call i32 @g(i32 10, i32 1, i32 2);
+  ret i32 %1
+}
+define i32 @f2() minsize {
+  %1 = call i32 @g(i32 10, i32 1, i32 2);
+  ret i32 %1
+}



More information about the cfe-commits mailing list