[clang] dc85d52 - [CGData][ThinLTO] Global Outlining with Two-CodeGen Rounds (#90933)

Wed Oct 9 15:37:44 PDT 2024

Author: Kyungwoo Lee
Date: 2024-10-09T15:37:41-07:00
New Revision: dc85d5263ed5e416cb4ddf405611472f4ef12fd3

URL: https://github.com/llvm/llvm-project/commit/dc85d5263ed5e416cb4ddf405611472f4ef12fd3
DIFF: https://github.com/llvm/llvm-project/commit/dc85d5263ed5e416cb4ddf405611472f4ef12fd3.diff

LOG: [CGData][ThinLTO] Global Outlining with Two-CodeGen Rounds (#90933)

This feature is enabled by `-codegen-data-thinlto-two-rounds`, which
effectively runs the `-codegen-data-generate` and `-codegen-data-use` in
two rounds to enable global outlining with ThinLTO.
1. The first round: Run both optimization + codegen with a scratch
output.
Before running codegen, we serialize the optimized bitcode modules to a
temporary path.
 2. From the scratch object files, we merge them into the codegen data.
3. The second round: Read the optimized bitcode modules and start the
codegen only this time.
Using the codegen data, the machine outliner effectively performs the
global outlining.
 
Depends on #90934, #110461 and  #110463.
This is a patch for
https://discourse.llvm.org/t/rfc-enhanced-machine-outliner-part-2-thinlto-nolto/78753.

Added: 
    llvm/test/ThinLTO/AArch64/cgdata-two-rounds-caching.ll
    llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll
    llvm/test/ThinLTO/AArch64/lit.local.cfg

Modified: 
    clang/lib/CodeGen/BackendUtil.cpp
    llvm/include/llvm/CGData/CodeGenData.h
    llvm/include/llvm/CGData/CodeGenDataReader.h
    llvm/include/llvm/LTO/LTO.h
    llvm/include/llvm/LTO/LTOBackend.h
    llvm/lib/CGData/CMakeLists.txt
    llvm/lib/CGData/CodeGenData.cpp
    llvm/lib/CGData/CodeGenDataReader.cpp
    llvm/lib/LTO/CMakeLists.txt
    llvm/lib/LTO/LTO.cpp
    llvm/lib/LTO/LTOBackend.cpp

Removed: 
    


################################################################################
diff  --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index abc936f2c686dd..f018130807519d 100644

--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -1321,10 +1321,11 @@ static void runThinLTOBackend(
     Conf.CGFileType = getCodeGenFileType(Action);
     break;
   }
-  if (Error E = thinBackend(
-          Conf, -1, AddStream, *M, *CombinedIndex, ImportList,
-          ModuleToDefinedGVSummaries[M->getModuleIdentifier()],
-          /* ModuleMap */ nullptr, Conf.CodeGenOnly, CGOpts.CmdArgs)) {
+  if (Error E =
+          thinBackend(Conf, -1, AddStream, *M, *CombinedIndex, ImportList,
+                      ModuleToDefinedGVSummaries[M->getModuleIdentifier()],
+                      /*ModuleMap=*/nullptr, Conf.CodeGenOnly,
+                      /*IRAddStream=*/nullptr, CGOpts.CmdArgs)) {
     handleAllErrors(std::move(E), [&](ErrorInfoBase &EIB) {
       errs() << "Error running ThinLTO backend: " << EIB.message() << '\n';
     });

diff  --git a/llvm/include/llvm/CGData/CodeGenData.h b/llvm/include/llvm/CGData/CodeGenData.h
index 84133a433170fe..53550beeae1f83 100644
--- a/llvm/include/llvm/CGData/CodeGenData.h
+++ b/llvm/include/llvm/CGData/CodeGenData.h
@@ -15,11 +15,13 @@
 #define LLVM_CGDATA_CODEGENDATA_H
 
 #include "llvm/ADT/BitmaskEnum.h"
+#include "llvm/ADT/StableHashing.h"
 #include "llvm/Bitcode/BitcodeReader.h"
 #include "llvm/CGData/OutlinedHashTree.h"
 #include "llvm/CGData/OutlinedHashTreeRecord.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/Caching.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/TargetParser/Triple.h"
 #include <mutex>
@@ -164,6 +166,74 @@ publishOutlinedHashTree(std::unique_ptr<OutlinedHashTree> HashTree) {
   CodeGenData::getInstance().publishOutlinedHashTree(std::move(HashTree));
 }
 
+struct StreamCacheData {
+  /// Backing buffer for serialized data stream.
+  SmallVector<SmallString<0>> Outputs;
+  /// Callback function to add serialized data to the stream.
+  AddStreamFn AddStream;
+  /// Backing buffer for cached data.
+  SmallVector<std::unique_ptr<MemoryBuffer>> Files;
+  /// Cache mechanism for storing data.
+  FileCache Cache;
+
+  StreamCacheData(unsigned Size, const FileCache &OrigCache,
+                  const Twine &CachePrefix)
+      : Outputs(Size), Files(Size) {
+    AddStream = [&](size_t Task, const Twine &ModuleName) {
+      return std::make_unique<CachedFileStream>(
+          std::make_unique<raw_svector_ostream>(Outputs[Task]));
+    };
+
+    if (OrigCache.isValid()) {
+      auto CGCacheOrErr =
+          localCache("ThinLTO", CachePrefix, OrigCache.getCacheDirectoryPath(),
+                     [&](size_t Task, const Twine &ModuleName,
+                         std::unique_ptr<MemoryBuffer> MB) {
+                       Files[Task] = std::move(MB);
+                     });
+      if (Error Err = CGCacheOrErr.takeError())
+        report_fatal_error(std::move(Err));
+      Cache = std::move(*CGCacheOrErr);
+    }
+  }
+  StreamCacheData() = delete;
+
+  /// Retrieve results from either the cache or the stream.
+  std::unique_ptr<SmallVector<StringRef>> getResult() {
+    unsigned NumOutputs = Outputs.size();
+    auto Result = std::make_unique<SmallVector<StringRef>>(NumOutputs);
+    for (unsigned I = 0; I < NumOutputs; ++I)
+      if (Files[I])
+        (*Result)[I] = Files[I]->getBuffer();
+      else
+        (*Result)[I] = Outputs[I];
+    return Result;
+  }
+};
+
+/// Save \p TheModule before the first codegen round.
+/// \p Task represents the partition number in the parallel code generation
+/// process. \p AddStream is the callback used to add the serialized module to
+/// the stream.
+void saveModuleForTwoRounds(const Module &TheModule, unsigned Task,
+                            AddStreamFn AddStream);
+
+/// Load the optimized bitcode module for the second codegen round.
+/// \p OrigModule is the original bitcode module.
+/// \p Task identifies the partition number in the parallel code generation
+/// process. \p Context provides the environment settings for module operations.
+/// \p IRFiles contains optimized bitcode module files needed for loading.
+/// \return A unique_ptr to the loaded Module, or nullptr if loading fails.
+std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule,
+                                               unsigned Task,
+                                               LLVMContext &Context,
+                                               ArrayRef<StringRef> IRFiles);
+
+/// Merge the codegen data from the scratch objects \p ObjectFiles from the
+/// first codegen round.
+/// \return the combined hash of the merged codegen data.
+Expected<stable_hash> mergeCodeGenData(ArrayRef<StringRef> ObjectFiles);
+
 void warn(Error E, StringRef Whence = "");
 void warn(Twine Message, std::string Whence = "", std::string Hint = "");
 

diff  --git a/llvm/include/llvm/CGData/CodeGenDataReader.h b/llvm/include/llvm/CGData/CodeGenDataReader.h
index 1ee4bfbe480233..7e4882df2116e2 100644
--- a/llvm/include/llvm/CGData/CodeGenDataReader.h
+++ b/llvm/include/llvm/CGData/CodeGenDataReader.h
@@ -54,8 +54,11 @@ class CodeGenDataReader {
   /// Extract the cgdata embedded in sections from the given object file and
   /// merge them into the GlobalOutlineRecord. This is a static helper that
   /// is used by `llvm-cgdata --merge` or ThinLTO's two-codegen rounds.
+  /// Optionally, \p CombinedHash can be used to compuate the combined hash of
+  /// the merged data.
   static Error mergeFromObjectFile(const object::ObjectFile *Obj,
-                                   OutlinedHashTreeRecord &GlobalOutlineRecord);
+                                   OutlinedHashTreeRecord &GlobalOutlineRecord,
+                                   stable_hash *CombinedHash = nullptr);
 
 protected:
   /// The outlined hash tree that has been read. When it's released by

diff  --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h
index 66d8ca63a206f6..242a05f7d32c02 100644
--- a/llvm/include/llvm/LTO/LTO.h
+++ b/llvm/include/llvm/LTO/LTO.h
@@ -75,6 +75,9 @@ std::string computeLTOCacheKey(
     const DenseSet<GlobalValue::GUID> &CfiFunctionDefs = {},
     const DenseSet<GlobalValue::GUID> &CfiFunctionDecls = {});
 
+/// Recomputes the LTO cache key for a given key with an extra identifier.
+std::string recomputeLTOCacheKey(const std::string &Key, StringRef ExtraID);
+
 namespace lto {
 
 StringLiteral getThinLTODefaultCPU(const Triple &TheTriple);

diff  --git a/llvm/include/llvm/LTO/LTOBackend.h b/llvm/include/llvm/LTO/LTOBackend.h
index 098c0491dfe70a..2769e58f249053 100644
--- a/llvm/include/llvm/LTO/LTOBackend.h
+++ b/llvm/include/llvm/LTO/LTOBackend.h
@@ -51,13 +51,15 @@ Error backend(const Config &C, AddStreamFn AddStream,
 /// are saved in the ModuleMap. If \p ModuleMap is nullptr, module files will
 /// be mapped to memory on demand and at any given time during importing, only
 /// one source module will be kept open at the most. If \p CodeGenOnly is true,
-/// the backend will skip optimization and only perform code generation.
+/// the backend will skip optimization and only perform code generation. If
+/// \p IRAddStream is not nullptr, it will be called just before code generation
+/// to serialize the optimized IR.
 Error thinBackend(const Config &C, unsigned Task, AddStreamFn AddStream,
                   Module &M, const ModuleSummaryIndex &CombinedIndex,
                   const FunctionImporter::ImportMapTy &ImportList,
                   const GVSummaryMapTy &DefinedGlobals,
                   MapVector<StringRef, BitcodeModule> *ModuleMap,
-                  bool CodeGenOnly,
+                  bool CodeGenOnly, AddStreamFn IRAddStream = nullptr,
                   const std::vector<uint8_t> &CmdArgs = std::vector<uint8_t>());
 
 Error finalizeOptimizationRemarks(

diff  --git a/llvm/lib/CGData/CMakeLists.txt b/llvm/lib/CGData/CMakeLists.txt
index ff1aab920e7a8c..157b0dfb7f9fcf 100644
--- a/llvm/lib/CGData/CMakeLists.txt
+++ b/llvm/lib/CGData/CMakeLists.txt
@@ -12,6 +12,8 @@ add_llvm_component_library(LLVMCGData
   intrinsics_gen
 
   LINK_COMPONENTS
+  BitReader
+  BitWriter
   Core
   Support
   Object

diff  --git a/llvm/lib/CGData/CodeGenData.cpp b/llvm/lib/CGData/CodeGenData.cpp
index 55d2504231c744..c56a8b77a52319 100644
--- a/llvm/lib/CGData/CodeGenData.cpp
+++ b/llvm/lib/CGData/CodeGenData.cpp
@@ -15,8 +15,10 @@
 #include "llvm/CGData/CodeGenDataReader.h"
 #include "llvm/CGData/OutlinedHashTreeRecord.h"
 #include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/Caching.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Path.h"
 #include "llvm/Support/WithColor.h"
 
 #define DEBUG_TYPE "cg-data"
@@ -30,6 +32,11 @@ cl::opt<bool>
 cl::opt<std::string>
     CodeGenDataUsePath("codegen-data-use-path", cl::init(""), cl::Hidden,
                        cl::desc("File path to where .cgdata file is read"));
+cl::opt<bool> CodeGenDataThinLTOTwoRounds(
+    "codegen-data-thinlto-two-rounds", cl::init(false), cl::Hidden,
+    cl::desc("Enable two-round ThinLTO code generation. The first round "
+             "emits codegen data, while the second round uses the emitted "
+             "codegen data for further optimizations."));
 
 static std::string getCGDataErrString(cgdata_error Err,
                                       const std::string &ErrMsg = "") {
@@ -139,7 +146,7 @@ CodeGenData &CodeGenData::getInstance() {
   std::call_once(CodeGenData::OnceFlag, []() {
     Instance = std::unique_ptr<CodeGenData>(new CodeGenData());
 
-    if (CodeGenDataGenerate)
+    if (CodeGenDataGenerate || CodeGenDataThinLTOTwoRounds)
       Instance->EmitCGData = true;
     else if (!CodeGenDataUsePath.empty()) {
       // Initialize the global CGData if the input file name is given.
@@ -215,6 +222,64 @@ void warn(Error E, StringRef Whence) {
   }
 }
 
+void saveModuleForTwoRounds(const Module &TheModule, unsigned Task,
+                            AddStreamFn AddStream) {
+  LLVM_DEBUG(dbgs() << "Saving module: " << TheModule.getModuleIdentifier()
+                    << " in Task " << Task << "\n");
+  Expected<std::unique_ptr<CachedFileStream>> StreamOrErr =
+      AddStream(Task, TheModule.getModuleIdentifier());
+  if (Error Err = StreamOrErr.takeError())
+    report_fatal_error(std::move(Err));
+  std::unique_ptr<CachedFileStream> &Stream = *StreamOrErr;
+
+  WriteBitcodeToFile(TheModule, *Stream->OS,
+                     /*ShouldPreserveUseListOrder=*/true);
+}
+
+std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule,
+                                               unsigned Task,
+                                               LLVMContext &Context,
+                                               ArrayRef<StringRef> IRFiles) {
+  LLVM_DEBUG(dbgs() << "Loading module: " << OrigModule.getModuleIdentifier()
+                    << " in Task " << Task << "\n");
+  auto FileBuffer = MemoryBuffer::getMemBuffer(
+      IRFiles[Task], "in-memory IR file", /*RequiresNullTerminator=*/false);
+  auto RestoredModule = parseBitcodeFile(*FileBuffer, Context);
+  if (!RestoredModule)
+    report_fatal_error(
+        Twine("Failed to parse optimized bitcode loaded for Task: ") +
+        Twine(Task) + "\n");
+
+  // Restore the original module identifier.
+  (*RestoredModule)->setModuleIdentifier(OrigModule.getModuleIdentifier());
+  return std::move(*RestoredModule);
+}
+
+Expected<stable_hash> mergeCodeGenData(ArrayRef<StringRef> ObjFiles) {
+  OutlinedHashTreeRecord GlobalOutlineRecord;
+  stable_hash CombinedHash = 0;
+  for (auto File : ObjFiles) {
+    if (File.empty())
+      continue;
+    std::unique_ptr<MemoryBuffer> Buffer = MemoryBuffer::getMemBuffer(
+        File, "in-memory object file", /*RequiresNullTerminator=*/false);
+    Expected<std::unique_ptr<object::ObjectFile>> BinOrErr =
+        object::ObjectFile::createObjectFile(Buffer->getMemBufferRef());
+    if (!BinOrErr)
+      return BinOrErr.takeError();
+
+    std::unique_ptr<object::ObjectFile> &Obj = BinOrErr.get();
+    if (auto E = CodeGenDataReader::mergeFromObjectFile(
+            Obj.get(), GlobalOutlineRecord, &CombinedHash))
+      return E;
+  }
+
+  if (!GlobalOutlineRecord.empty())
+    cgdata::publishOutlinedHashTree(std::move(GlobalOutlineRecord.HashTree));
+
+  return CombinedHash;
+}
+
 } // end namespace cgdata
 
 } // end namespace llvm

diff  --git a/llvm/lib/CGData/CodeGenDataReader.cpp b/llvm/lib/CGData/CodeGenDataReader.cpp
index f7f3a8f42af7e1..2f2481ea60f822 100644
--- a/llvm/lib/CGData/CodeGenDataReader.cpp
+++ b/llvm/lib/CGData/CodeGenDataReader.cpp
@@ -31,8 +31,8 @@ setupMemoryBuffer(const Twine &Filename, vfs::FileSystem &FS) {
 }
 
 Error CodeGenDataReader::mergeFromObjectFile(
-    const object::ObjectFile *Obj,
-    OutlinedHashTreeRecord &GlobalOutlineRecord) {
+    const object::ObjectFile *Obj, OutlinedHashTreeRecord &GlobalOutlineRecord,
+    stable_hash *CombinedHash) {
   Triple TT = Obj->makeTriple();
   auto CGOutLineName =
       getCodeGenDataSectionName(CG_outline, TT.getObjectFormat(), false);
@@ -48,6 +48,9 @@ Error CodeGenDataReader::mergeFromObjectFile(
     auto *EndData = Data + ContentsOrErr->size();
 
     if (*NameOrErr == CGOutLineName) {
+      if (CombinedHash)
+        *CombinedHash =
+            stable_hash_combine(*CombinedHash, xxh3_64bits(*ContentsOrErr));
       // In case dealing with an executable that has concatenated cgdata,
       // we want to merge them into a single cgdata.
       // Although it's not a typical workflow, we support this scenario.

diff  --git a/llvm/lib/LTO/CMakeLists.txt b/llvm/lib/LTO/CMakeLists.txt
index 69ff08e1f374c4..057d73b6349cf1 100644
--- a/llvm/lib/LTO/CMakeLists.txt
+++ b/llvm/lib/LTO/CMakeLists.txt
@@ -21,6 +21,7 @@ add_llvm_component_library(LLVMLTO
   BinaryFormat
   BitReader
   BitWriter
+  CGData
   CodeGen
   CodeGenTypes
   Core

diff  --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index e1714b29399298..8e7675f442567a 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -13,6 +13,7 @@
 #include "llvm/LTO/LTO.h"
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/StableHashing.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
@@ -21,6 +22,7 @@
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Bitcode/BitcodeReader.h"
 #include "llvm/Bitcode/BitcodeWriter.h"
+#include "llvm/CGData/CodeGenData.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/Config/llvm-config.h"
 #include "llvm/IR/AutoUpgrade.h"
@@ -35,6 +37,7 @@
 #include "llvm/Linker/IRMover.h"
 #include "llvm/MC/TargetRegistry.h"
 #include "llvm/Object/IRObjectFile.h"
+#include "llvm/Support/Caching.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/FileSystem.h"
@@ -70,6 +73,8 @@ static cl::opt<bool>
     DumpThinCGSCCs("dump-thin-cg-sccs", cl::init(false), cl::Hidden,
                    cl::desc("Dump the SCCs in the ThinLTO index's callgraph"));
 
+extern cl::opt<bool> CodeGenDataThinLTOTwoRounds;
+
 namespace llvm {
 /// Enable global value internalization in LTO.
 cl::opt<bool> EnableLTOInternalization(
@@ -341,6 +346,20 @@ std::string llvm::computeLTOCacheKey(
   return toHex(Hasher.result());
 }
 
+std::string llvm::recomputeLTOCacheKey(const std::string &Key,
+                                       StringRef ExtraID) {
+  SHA1 Hasher;
+
+  auto AddString = [&](StringRef Str) {
+    Hasher.update(Str);
+    Hasher.update(ArrayRef<uint8_t>{0});
+  };
+  AddString(Key);
+  AddString(ExtraID);
+
+  return toHex(Hasher.result());
+}
+
 static void thinLTOResolvePrevailingGUID(
     const Config &C, ValueInfo VI,
     DenseSet<GlobalValueSummary *> &GlobalInvolvedWithAlias,
@@ -1398,6 +1417,7 @@ Error ThinBackendProc::emitFiles(
 
 namespace {
 class InProcessThinBackend : public ThinBackendProc {
+protected:
   AddStreamFn AddStream;
   FileCache Cache;
   DenseSet<GlobalValue::GUID> CfiFunctionDefs;
@@ -1424,7 +1444,7 @@ class InProcessThinBackend : public ThinBackendProc {
           GlobalValue::getGUID(GlobalValue::dropLLVMManglingEscape(Name)));
   }
 
-  Error runThinLTOBackendThread(
+  virtual Error runThinLTOBackendThread(
       AddStreamFn AddStream, FileCache Cache, unsigned Task, BitcodeModule BM,
       ModuleSummaryIndex &CombinedIndex,
       const FunctionImporter::ImportMapTy &ImportList,
@@ -1513,6 +1533,173 @@ class InProcessThinBackend : public ThinBackendProc {
     return Error::success();
   }
 };
+
+/// This backend is utilized in the first round of a two-codegen round process.
+/// It first saves optimized bitcode files to disk before the codegen process
+/// begins. After codegen, it stores the resulting object files in a scratch
+/// buffer. Note the codegen data stored in the scratch buffer will be extracted
+/// and merged in the subsequent step.
+class FirstRoundThinBackend : public InProcessThinBackend {
+  AddStreamFn IRAddStream;
+  FileCache IRCache;
+
+public:
+  FirstRoundThinBackend(
+      const Config &Conf, ModuleSummaryIndex &CombinedIndex,
+      ThreadPoolStrategy ThinLTOParallelism,
+      const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
+      AddStreamFn CGAddStream, FileCache CGCache, AddStreamFn IRAddStream,
+      FileCache IRCache)
+      : InProcessThinBackend(Conf, CombinedIndex, ThinLTOParallelism,
+                             ModuleToDefinedGVSummaries, std::move(CGAddStream),
+                             std::move(CGCache), /*OnWrite=*/nullptr,
+                             /*ShouldEmitIndexFiles=*/false,
+                             /*ShouldEmitImportsFiles=*/false),
+        IRAddStream(std::move(IRAddStream)), IRCache(std::move(IRCache)) {}
+
+  Error runThinLTOBackendThread(
+      AddStreamFn CGAddStream, FileCache CGCache, unsigned Task,
+      BitcodeModule BM, ModuleSummaryIndex &CombinedIndex,
+      const FunctionImporter::ImportMapTy &ImportList,
+      const FunctionImporter::ExportSetTy &ExportList,
+      const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
+      const GVSummaryMapTy &DefinedGlobals,
+      MapVector<StringRef, BitcodeModule> &ModuleMap) override {
+    auto RunThinBackend = [&](AddStreamFn CGAddStream,
+                              AddStreamFn IRAddStream) {
+      LTOLLVMContext BackendContext(Conf);
+      Expected<std::unique_ptr<Module>> MOrErr = BM.parseModule(BackendContext);
+      if (!MOrErr)
+        return MOrErr.takeError();
+
+      return thinBackend(Conf, Task, CGAddStream, **MOrErr, CombinedIndex,
+                         ImportList, DefinedGlobals, &ModuleMap,
+                         Conf.CodeGenOnly, IRAddStream);
+    };
+
+    auto ModuleID = BM.getModuleIdentifier();
+    // Like InProcessThinBackend, we produce index files as needed for
+    // FirstRoundThinBackend. However, these files are not generated for
+    // SecondRoundThinBackend.
+    if (ShouldEmitIndexFiles) {
+      if (auto E = emitFiles(ImportList, ModuleID, ModuleID.str()))
+        return E;
+    }
+
+    assert((CGCache.isValid() == IRCache.isValid()) &&
+           "Both caches for CG and IR should have matching availability");
+    if (!CGCache.isValid() || !CombinedIndex.modulePaths().count(ModuleID) ||
+        all_of(CombinedIndex.getModuleHash(ModuleID),
+               [](uint32_t V) { return V == 0; }))
+      // Cache disabled or no entry for this module in the combined index or
+      // no module hash.
+      return RunThinBackend(CGAddStream, IRAddStream);
+
+    // Get CGKey for caching object in CGCache.
+    std::string CGKey = computeLTOCacheKey(
+        Conf, CombinedIndex, ModuleID, ImportList, ExportList, ResolvedODR,
+        DefinedGlobals, CfiFunctionDefs, CfiFunctionDecls);
+    Expected<AddStreamFn> CacheCGAddStreamOrErr =
+        CGCache(Task, CGKey, ModuleID);
+    if (Error Err = CacheCGAddStreamOrErr.takeError())
+      return Err;
+    AddStreamFn &CacheCGAddStream = *CacheCGAddStreamOrErr;
+
+    // Get IRKey for caching (optimized) IR in IRCache with an extra ID.
+    std::string IRKey = recomputeLTOCacheKey(CGKey, /*ExtraID=*/"IR");
+    Expected<AddStreamFn> CacheIRAddStreamOrErr =
+        IRCache(Task, IRKey, ModuleID);
+    if (Error Err = CacheIRAddStreamOrErr.takeError())
+      return Err;
+    AddStreamFn &CacheIRAddStream = *CacheIRAddStreamOrErr;
+
+    // Ideally, both CG and IR caching should be synchronized. However, in
+    // practice, their availability may 
diff er due to 
diff erent expiration
+    // times. Therefore, if either cache is missing, the backend process is
+    // triggered.
+    if (CacheCGAddStream || CacheIRAddStream) {
+      LLVM_DEBUG(dbgs() << "[FirstRound] Cache Miss for "
+                        << BM.getModuleIdentifier() << "\n");
+      return RunThinBackend(CacheCGAddStream ? CacheCGAddStream : CGAddStream,
+                            CacheIRAddStream ? CacheIRAddStream : IRAddStream);
+    }
+
+    return Error::success();
+  }
+};
+
+/// This backend operates in the second round of a two-codegen round process.
+/// It starts by reading the optimized bitcode files that were saved during the
+/// first round. The backend then executes the codegen only to further optimize
+/// the code, utilizing the codegen data merged from the first round. Finally,
+/// it writes the resulting object files as usual.
+class SecondRoundThinBackend : public InProcessThinBackend {
+  std::unique_ptr<SmallVector<StringRef>> IRFiles;
+  stable_hash CombinedCGDataHash;
+
+public:
+  SecondRoundThinBackend(
+      const Config &Conf, ModuleSummaryIndex &CombinedIndex,
+      ThreadPoolStrategy ThinLTOParallelism,
+      const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
+      AddStreamFn AddStream, FileCache Cache,
+      std::unique_ptr<SmallVector<StringRef>> IRFiles,
+      stable_hash CombinedCGDataHash)
+      : InProcessThinBackend(Conf, CombinedIndex, ThinLTOParallelism,
+                             ModuleToDefinedGVSummaries, std::move(AddStream),
+                             std::move(Cache),
+                             /*OnWrite=*/nullptr,
+                             /*ShouldEmitIndexFiles=*/false,
+                             /*ShouldEmitImportsFiles=*/false),
+        IRFiles(std::move(IRFiles)), CombinedCGDataHash(CombinedCGDataHash) {}
+
+  virtual Error runThinLTOBackendThread(
+      AddStreamFn AddStream, FileCache Cache, unsigned Task, BitcodeModule BM,
+      ModuleSummaryIndex &CombinedIndex,
+      const FunctionImporter::ImportMapTy &ImportList,
+      const FunctionImporter::ExportSetTy &ExportList,
+      const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
+      const GVSummaryMapTy &DefinedGlobals,
+      MapVector<StringRef, BitcodeModule> &ModuleMap) override {
+    auto RunThinBackend = [&](AddStreamFn AddStream) {
+      LTOLLVMContext BackendContext(Conf);
+      std::unique_ptr<Module> LoadedModule =
+          cgdata::loadModuleForTwoRounds(BM, Task, BackendContext, *IRFiles);
+
+      return thinBackend(Conf, Task, AddStream, *LoadedModule, CombinedIndex,
+                         ImportList, DefinedGlobals, &ModuleMap,
+                         /*CodeGenOnly=*/true);
+    };
+
+    auto ModuleID = BM.getModuleIdentifier();
+    if (!Cache.isValid() || !CombinedIndex.modulePaths().count(ModuleID) ||
+        all_of(CombinedIndex.getModuleHash(ModuleID),
+               [](uint32_t V) { return V == 0; }))
+      // Cache disabled or no entry for this module in the combined index or
+      // no module hash.
+      return RunThinBackend(AddStream);
+
+    // Get Key for caching the final object file in Cache with the combined
+    // CGData hash.
+    std::string Key = computeLTOCacheKey(
+        Conf, CombinedIndex, ModuleID, ImportList, ExportList, ResolvedODR,
+        DefinedGlobals, CfiFunctionDefs, CfiFunctionDecls);
+    Key = recomputeLTOCacheKey(Key,
+                               /*ExtraID=*/std::to_string(CombinedCGDataHash));
+    Expected<AddStreamFn> CacheAddStreamOrErr = Cache(Task, Key, ModuleID);
+    if (Error Err = CacheAddStreamOrErr.takeError())
+      return Err;
+    AddStreamFn &CacheAddStream = *CacheAddStreamOrErr;
+
+    if (CacheAddStream) {
+      LLVM_DEBUG(dbgs() << "[SecondRound] Cache Miss for "
+                        << BM.getModuleIdentifier() << "\n");
+      return RunThinBackend(CacheAddStream);
+    }
+
+    return Error::success();
+  }
+};
 } // end anonymous namespace
 
 ThinBackend lto::createInProcessThinBackend(ThreadPoolStrategy Parallelism,
@@ -1855,10 +2042,50 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache,
     return BackendProcess->wait();
   };
 
-  std::unique_ptr<ThinBackendProc> BackendProc =
-      ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries,
-                      AddStream, Cache);
-  return RunBackends(BackendProc.get());
+  if (!CodeGenDataThinLTOTwoRounds) {
+    std::unique_ptr<ThinBackendProc> BackendProc =
+        ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries,
+                        AddStream, Cache);
+    return RunBackends(BackendProc.get());
+  }
+
+  // Perform two rounds of code generation for ThinLTO:
+  // 1. First round: Perform optimization and code generation, outputting to
+  // temporary scratch objects.
+  // 2. Merge code generation data extracted from the temporary scratch objects.
+  // 3. Second round: Execute code generation again using the merged data.
+  LLVM_DEBUG(dbgs() << "[TwoRounds] Initializing ThinLTO two-codegen rounds\n");
+
+  unsigned MaxTasks = getMaxTasks();
+  auto Parallelism = ThinLTO.Backend.getParallelism();
+  // Set up two additional streams and caches for storing temporary scratch
+  // objects and optimized IRs, using the same cache directory as the original.
+  cgdata::StreamCacheData CG(MaxTasks, Cache, "CG"), IR(MaxTasks, Cache, "IR");
+
+  // First round: Execute optimization and code generation, outputting to
+  // temporary scratch objects. Serialize the optimized IRs before initiating
+  // code generation.
+  LLVM_DEBUG(dbgs() << "[TwoRounds] Running the first round of codegen\n");
+  auto FirstRoundLTO = std::make_unique<FirstRoundThinBackend>(
+      Conf, ThinLTO.CombinedIndex, Parallelism, ModuleToDefinedGVSummaries,
+      CG.AddStream, CG.Cache, IR.AddStream, IR.Cache);
+  if (Error E = RunBackends(FirstRoundLTO.get()))
+    return E;
+
+  LLVM_DEBUG(dbgs() << "[TwoRounds] Merging codegen data\n");
+  auto CombinedHashOrErr = cgdata::mergeCodeGenData(*CG.getResult());
+  if (Error E = CombinedHashOrErr.takeError())
+    return E;
+  auto CombinedHash = *CombinedHashOrErr;
+  LLVM_DEBUG(dbgs() << "[TwoRounds] CGData hash: " << CombinedHash << "\n");
+
+  // Second round: Read the optimized IRs and execute code generation using the
+  // merged data.
+  LLVM_DEBUG(dbgs() << "[TwoRounds] Running the second round of codegen\n");
+  auto SecondRoundLTO = std::make_unique<SecondRoundThinBackend>(
+      Conf, ThinLTO.CombinedIndex, Parallelism, ModuleToDefinedGVSummaries,
+      AddStream, Cache, std::move(IR.getResult()), CombinedHash);
+  return RunBackends(SecondRoundLTO.get());
 }
 
 Expected<std::unique_ptr<ToolOutputFile>> lto::setupLLVMOptimizationRemarks(

diff  --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index 06eeed3e1bc41f..ad332d25d9c082 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -20,6 +20,7 @@
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Bitcode/BitcodeReader.h"
 #include "llvm/Bitcode/BitcodeWriter.h"
+#include "llvm/CGData/CodeGenData.h"
 #include "llvm/IR/LLVMRemarkStreamer.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/PassManager.h"
@@ -565,7 +566,8 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
                        const FunctionImporter::ImportMapTy &ImportList,
                        const GVSummaryMapTy &DefinedGlobals,
                        MapVector<StringRef, BitcodeModule> *ModuleMap,
-                       bool CodeGenOnly, const std::vector<uint8_t> &CmdArgs) {
+                       bool CodeGenOnly, AddStreamFn IRAddStream,
+                       const std::vector<uint8_t> &CmdArgs) {
   Expected<const Target *> TOrErr = initAndLookupTarget(Conf, Mod);
   if (!TOrErr)
     return TOrErr.takeError();
@@ -599,11 +601,19 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
   auto OptimizeAndCodegen =
       [&](Module &Mod, TargetMachine *TM,
           std::unique_ptr<ToolOutputFile> DiagnosticOutputFile) {
+        // Perform optimization and code generation for ThinLTO.
         if (!opt(Conf, TM, Task, Mod, /*IsThinLTO=*/true,
                  /*ExportSummary=*/nullptr, /*ImportSummary=*/&CombinedIndex,
                  CmdArgs))
           return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
 
+        // Save the current module before the first codegen round.
+        // Note that the second codegen round runs only `codegen()` without
+        // running `opt()`. We're not reaching here as it's bailed out earlier
+        // with `CodeGenOnly` which has been set in `SecondRoundThinBackend`.
+        if (IRAddStream)
+          cgdata::saveModuleForTwoRounds(Mod, Task, IRAddStream);
+
         codegen(Conf, TM, AddStream, Task, Mod, CombinedIndex);
         return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
       };

diff  --git a/llvm/test/ThinLTO/AArch64/cgdata-two-rounds-caching.ll b/llvm/test/ThinLTO/AArch64/cgdata-two-rounds-caching.ll
new file mode 100644
index 00000000000000..e11903bf0f3bf8
--- /dev/null
+++ b/llvm/test/ThinLTO/AArch64/cgdata-two-rounds-caching.ll
@@ -0,0 +1,179 @@
+; This test verifies whether we can outline a singleton instance (i.e., an instance that does not repeat)
+; by running two codegen rounds.
+; This test also verifies if caches for the two-round codegens are correctly working.
+
+; REQUIRES: asserts
+; RUN: rm -rf %t
+; RUN: split-file %s %t
+
+; 0. Base case without a cache.
+; Verify each outlining instance is singleton with the global outlining for thinlto.
+; They will be identical, which can be folded by the linker with ICF.
+; RUN: opt -module-hash -module-summary %t/thin-one.ll -o %t/thin-one.bc
+; RUN: opt -module-hash -module-summary %t/thin-two.ll -o %t/thin-two.bc
+; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc -o %t/thinlto \
+; RUN:  -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \
+; RUN:  -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \
+; RUN:  -codegen-data-thinlto-two-rounds
+
+; thin-one.ll will have one outlining instance (matched in the global outlined hash tree)
+; RUN: llvm-objdump -d %t/thinlto.1 | FileCheck %s --check-prefix=THINLTO-1
+; THINLTO-1: _OUTLINED_FUNCTION{{.*}}>:
+; THINLTO-1-NEXT:  mov
+; THINLTO-1-NEXT:  mov
+; THINLTO-1-NEXT:  b
+
+; thin-two.ll will have two outlining instances (matched in the global outlined hash tree)
+; RUN: llvm-objdump -d %t/thinlto.2 | FileCheck %s --check-prefix=THINLTO-2
+; THINLTO-2: _OUTLINED_FUNCTION{{.*}}>:
+; THINLTO-2-NEXT:  mov
+; THINLTO-2-NEXT:  mov
+; THINLTO-2-NEXT:  b
+; THINLTO-2: _OUTLINED_FUNCTION{{.*}}>:
+; THINLTO-2-NEXT:  mov
+; THINLTO-2-NEXT:  mov
+; THINLTO-2-NEXT:  b
+
+; 1. Run this with a cache for the first time.
+; RUN: rm -rf %t.cache
+; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc -o %t/thinlto-cold \
+; RUN:  -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \
+; RUN:  -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \
+; RUN:  -codegen-data-thinlto-two-rounds -cache-dir %t.cache -debug-only=lto -thinlto-threads 1 > %t.log-cold.txt 2>&1
+; RUN: cat %t.log-cold.txt | FileCheck %s --check-prefix=COLD
+; 
diff  %t/thinlto.1 %t/thinlto-cold.1
+; 
diff  %t/thinlto.2 %t/thinlto-cold.2
+
+; COLD: [FirstRound] Cache Miss for {{.*}}thin-one.bc
+; COLD: [FirstRound] Cache Miss for {{.*}}thin-two.bc
+; COLD: [SecondRound] Cache Miss for {{.*}}thin-one.bc
+; COLD: [SecondRound] Cache Miss for {{.*}}thin-two.bc
+
+; There are two input bitcode files and each one is operated with 3 caches:
+; CG/IR caches for the first round and the second round CG cache.
+; So the total number of files are 2 * 3 = 6.
+; RUN: ls %t.cache | count 6
+
+; 2. Without any changes, simply re-running it will hit the cache.
+; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc -o %t/thinlto-warm \
+; RUN:  -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \
+; RUN:  -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \
+; RUN:  -codegen-data-thinlto-two-rounds -cache-dir %t.cache -debug-only=lto -thinlto-threads 1 > %t.log-warm.txt 2>&1
+; RUN: cat %t.log-warm.txt | FileCheck %s --check-prefix=WARM
+; 
diff  %t/thinlto.1 %t/thinlto-warm.1
+; 
diff  %t/thinlto.2 %t/thinlto-warm.2
+
+; WARM-NOT: Cache Miss
+
+; 3. Assume thin-one.ll has been modified to thin-one-modified.ll.
+; The merged CG data remains unchanged as this modification does not affect the hash tree built from thin-two.bc.
+; Therefore, both the first and second round runs update only this module.
+; RUN: opt -module-hash -module-summary %t/thin-one-modified.ll -o %t/thin-one.bc
+; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc -o %t/thinlto-warm-modified \
+; RUN:  -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \
+; RUN:  -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \
+; RUN:  -codegen-data-thinlto-two-rounds -cache-dir %t.cache -debug-only=lto -thinlto-threads 1 > %t.log-warm-modified.txt 2>&1
+; RUN: cat %t.log-warm-modified.txt | FileCheck %s --check-prefix=WARM-MODIFIED
+; 
diff  %t/thinlto.1 %t/thinlto-warm-modified.1
+; 
diff  %t/thinlto.2 %t/thinlto-warm-modified.2
+
+; WARM-MODIFIED: [FirstRound] Cache Miss for {{.*}}thin-one.bc
+; WARM-MODIFIED-NOT: [FirstRound] Cache Miss for {{.*}}thin-two.bc
+; WARM-MODIFIED: [SecondRound] Cache Miss for {{.*}}thin-one.bc
+; WARM-MODIFIED-NOT: [SecondRound] Cache Miss for {{.*}}thin-two.bc
+
+; 4. Additionally, thin-two.ll has been modified to thin-two-modified.ll.
+; In this case, the merged CG data, which is global, is updated.
+; Although the first round run updates only the thin-two.bc module,
+; as the module thin-one.bc remains the same as in step 3 above,
+; the second round run will update all modules, resulting in 
diff erent binaries.
+; RUN: opt -module-hash -module-summary %t/thin-one-modified.ll -o %t/thin-one.bc
+; RUN: opt -module-hash -module-summary %t/thin-two-modified.ll -o %t/thin-two.bc
+; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc -o %t/thinlto-warm-modified-all \
+; RUN:  -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \
+; RUN:  -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \
+; RUN:  -codegen-data-thinlto-two-rounds -cache-dir %t.cache -debug-only=lto -thinlto-threads 1 > %t.log-warm-modified-all.txt 2>&1
+; RUN: cat %t.log-warm-modified-all.txt | FileCheck %s --check-prefix=WARM-MODIFIED-ALL
+; RUN: not 
diff  %t/thinlto.1 %t/thinlto-warm-modified-all.1
+; RUN: not 
diff  %t/thinlto.2 %t/thinlto-warm-modified-all.2
+
+; WARM-MODIFIED-ALL-NOT: [FirstRound] Cache Miss for {{.*}}thin-one.bc
+; WARM-MODIFIED-ALL: [FirstRound] Cache Miss for {{.*}}thin-two.bc
+; WARM-MODIFIED-ALL: [SecondRound] Cache Miss for {{.*}}thin-one.bc
+; WARM-MODIFIED-ALL: [SecondRound] Cache Miss for {{.*}}thin-two.bc
+
+; thin-one-modified.ll won't be outlined.
+; RUN: llvm-objdump -d %t/thinlto-warm-modified-all.1 | FileCheck %s --check-prefix=THINLTO-1-MODIFIED-ALL
+; THINLTO-1-MODIFIED-ALL-NOT: _OUTLINED_FUNCTION{{.*}}>:
+
+; thin-two-modified.ll will have two (longer) outlining instances (matched in the global outlined hash tree)
+; RUN: llvm-objdump -d %t/thinlto-warm-modified-all.2| FileCheck %s --check-prefix=THINLTO-2-MODIFIED-ALL
+; THINLTO-2-MODIFIED-ALL: _OUTLINED_FUNCTION{{.*}}>:
+; THINLTO-2-MODIFIED-ALL:  mov
+; THINLTO-2-MODIFIED-ALL:  mov
+; THINLTO-2-MODIFIED-ALL:  mov
+; THINLTO-2-MODIFIED-ALL:  b
+; THINLTO-2-MODIFIED-ALL: _OUTLINED_FUNCTION{{.*}}>:
+; THINLTO-2-MODIFIED-ALL:  mov
+; THINLTO-2-MODIFIED-ALL:  mov
+; THINLTO-2-MODIFIED-ALL:  mov
+; THINLTO-2-MODIFIED-ALL:  b
+
+; 5. Re-running it will hit the cache.
+; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc -o %t/thinlto-warm-again \
+; RUN:  -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \
+; RUN:  -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \
+; RUN:  -codegen-data-thinlto-two-rounds -cache-dir %t.cache -debug-only=lto -thinlto-threads 1 > %t.log-warm-again.txt 2>&1
+; RUN: cat %t.log-warm-again.txt | FileCheck %s --check-prefix=WARM-AGAIN
+; RUN: 
diff  %t/thinlto-warm-modified-all.1 %t/thinlto-warm-again.1
+; RUN: 
diff  %t/thinlto-warm-modified-all.2 %t/thinlto-warm-again.2
+
+; WARM-AGAIN-NOT: Cache Miss
+
+;--- thin-one.ll
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-darwin"
+
+declare i32 @g(i32, i32, i32)
+define i32 @f3() minsize {
+  %1 = call i32 @g(i32 30, i32 1, i32 2);
+ ret i32 %1
+}
+
+;--- thin-one-modified.ll
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-darwin"
+
+declare i32 @g(i32, i32, i32)
+define i32 @f3() minsize {
+  %1 = call i32 @g(i32 31, i32 1, i32 2);
+ ret i32 %1
+}
+
+;--- thin-two.ll
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-darwin"
+
+declare i32 @g(i32, i32, i32)
+define i32 @f1() minsize {
+  %1 = call i32 @g(i32 10, i32 1, i32 2);
+  ret i32 %1
+}
+define i32 @f2() minsize {
+  %1 = call i32 @g(i32 20, i32 1, i32 2);
+  ret i32 %1
+}
+
+;--- thin-two-modified.ll
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-darwin"
+
+declare i32 @g(i32, i32, i32)
+define i32 @f1() minsize {
+  %1 = call i32 @g(i32 10, i32 1, i32 2);
+  ret i32 %1
+}
+define i32 @f2() minsize {
+  %1 = call i32 @g(i32 10, i32 1, i32 2);
+  ret i32 %1
+}

diff  --git a/llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll b/llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll
new file mode 100644
index 00000000000000..ada1c6bb9a8421
--- /dev/null
+++ b/llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll
@@ -0,0 +1,105 @@
+; This test checks if we can outline a singleton instance (i.e., an instance that
+; does not repeat) through two codegen rounds. The first round identifies a local
+; outlining instance within thin-two.ll, which is then encoded in the resulting
+; object file and merged into the codegen data summary.
+; The second round utilizes the merged codegen data to optimistically outline a
+; singleton instance in thin-one.ll.
+; Note that this global outlining creates a unique instance for each sequence
+; without directly sharing identical functions for correctness.
+; Actual code size reductions occur at link time through identical code folding.
+; When both thinlto and lto modules are compiled, the lto module is processed
+; independently, without relying on the merged codegen data. In this case,
+; the identical code sequences are directly replaced by a common outlined function.
+
+; RUN: split-file %s %t
+
+; Verify each outlining instance is singleton with the global outlining for thinlto.
+; They will be identical, which can be folded by the linker with ICF.
+; RUN: opt -module-summary %t/thin-one.ll -o %t/thin-one.bc
+; RUN: opt -module-summary %t/thin-two.ll -o %t/thin-two.bc
+; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc -o %t/thinlto \
+; RUN:  -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \
+; RUN:  -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \
+; RUN:  -codegen-data-thinlto-two-rounds
+
+; thin-one.ll will have one outlining instance itself (matched in the global outlined hash tree)
+; RUN: llvm-objdump -d %t/thinlto.1 | FileCheck %s --check-prefix=THINLTO-1
+; THINLTO-1: _OUTLINED_FUNCTION{{.*}}>:
+; THINLTO-1-NEXT:  mov
+; THINLTO-1-NEXT:  mov
+; THINLTO-1-NEXT:  b
+
+; thin-two.ll will have two respective outlining instances (matched in the global outlined hash tree)
+; RUN: llvm-objdump -d %t/thinlto.2 | FileCheck %s --check-prefix=THINLTO-2
+; THINLTO-2: _OUTLINED_FUNCTION{{.*}}>:
+; THINLTO-2-NEXT:  mov
+; THINLTO-2-NEXT:  mov
+; THINLTO-2-NEXT:  b
+; THINLTO-2: _OUTLINED_FUNCTION{{.*}}>:
+; THINLTO-2-NEXT:  mov
+; THINLTO-2-NEXT:  mov
+; THINLTO-2-NEXT:  b
+
+; Now add a lto module to the above thinlto modules.
+; Verify the lto module is optimized independent of the global outlining for thinlto.
+; RUN: opt %t/lto.ll -o %t/lto.bc
+; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc %t/lto.bc -o %t/out \
+; RUN:  -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \
+; RUN:  -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \
+; RUN:  -r %t/lto.bc,_f4,px -r %t/lto.bc,_f5,px -r %t/lto.bc,_f6,px -r %t/lto.bc,_g,x \
+; RUN:  -codegen-data-thinlto-two-rounds
+
+; lto.ll will have one shared outlining instance within the lto module itself (no global outlining).
+; RUN: llvm-objdump -d %t/out.0 | FileCheck %s --check-prefix=LTO-0
+; LTO-0: _OUTLINED_FUNCTION{{.*}}>:
+; LTO-0-NEXT:  mov
+; LTO-0-NEXT:  b
+; LTO-0-NOT: _OUTLINED_FUNCTION{{.*}}>:
+
+; thin-one.ll will have one outlining instance (matched in the global outlined hash tree)
+; RUN: llvm-objdump -d %t/out.1 | FileCheck %s --check-prefix=THINLTO-1
+
+; thin-two.ll will have two outlining instances (matched in the global outlined hash tree)
+; RUN: llvm-objdump -d %t/out.2 | FileCheck %s --check-prefix=THINLTO-2
+
+;--- thin-one.ll
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-darwin"
+
+declare i32 @g(i32, i32, i32)
+define i32 @f3() minsize {
+  %1 = call i32 @g(i32 30, i32 1, i32 2);
+ ret i32 %1
+}
+
+;--- thin-two.ll
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-darwin"
+
+declare i32 @g(i32, i32, i32)
+define i32 @f1() minsize {
+  %1 = call i32 @g(i32 10, i32 1, i32 2);
+  ret i32 %1
+}
+define i32 @f2() minsize {
+  %1 = call i32 @g(i32 20, i32 1, i32 2);
+  ret i32 %1
+}
+
+;--- lto.ll
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-darwin"
+
+declare i32 @g(i32, i32, i32)
+define i32 @f4() minsize {
+  %1 = call i32 @g(i32 10, i32 30, i32 2);
+  ret i32 %1
+}
+define i32 @f5() minsize {
+  %1 = call i32 @g(i32 20, i32 40, i32 2);
+  ret i32 %1
+}
+define i32 @f6() minsize {
+  %1 = call i32 @g(i32 50, i32 60, i32 2);
+  ret i32 %1
+}

diff  --git a/llvm/test/ThinLTO/AArch64/lit.local.cfg b/llvm/test/ThinLTO/AArch64/lit.local.cfg
new file mode 100644
index 00000000000000..10d4a0e953ed47
--- /dev/null
+++ b/llvm/test/ThinLTO/AArch64/lit.local.cfg
@@ -0,0 +1,2 @@
+if not "AArch64" in config.root.targets:
+    config.unsupported = True