[clang] [lld] [llvm] Thin2 (PR #106602)

Thu Aug 29 13:24:31 PDT 2024

https://github.com/kyulee-com updated https://github.com/llvm/llvm-project/pull/106602

>From 561eb1810f04f373410ba2f37f846eafe46515dc Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Wed, 24 Apr 2024 11:26:23 -0700
Subject: [PATCH 1/6] [CGData][MachineOutliner] Global Outlining2

This commit introduces support for outlining functions across modules using codegen data generated from previous codegen. The codegen data currently manages the outlined hash tree, which records outlining instances that occurred locally in the past.

The machine outliner now operates in one of three modes:
1. CGDataMode::None: This is the default outliner mode that uses the suffix tree to identify (local) outlining candidates within a module. This mode is also used by (full)LTO to maintain optimal behavior with the combined module.
2. CGDataMode::Write (`codegen-data-generate`): This mode is identical to the default mode, but it also publishes the stable hash sequences of instructions in the outlined functions into a local outlined hash tree. It then encodes this into the `__llvm_outline` section, which will be dead-stripped at link time.
3. CGDataMode::Read (`codegen-data-use-path={.cgdata}`): This mode reads a codegen data file (.cgdata) and initializes a global outlined hash tree. This tree is used to generate global outlining candidates. Note that the codegen data file has been post-processed with the raw `__llvm_outline` sections from all native objects using the `llvm-cgdata` tool (or a linker, `LLD`, or a new ThinLTO pipeline later).
---
 llvm/include/llvm/ADT/StableHashing.h         |   6 +
 llvm/include/llvm/CodeGen/MachineOutliner.h   |  40 ++-
 llvm/lib/CGData/CodeGenData.cpp               |  26 +-
 llvm/lib/CodeGen/CMakeLists.txt               |   1 +
 llvm/lib/CodeGen/MachineOutliner.cpp          | 261 +++++++++++++++++-
 llvm/test/CodeGen/AArch64/O3-pipeline.ll      |   1 +
 .../CodeGen/AArch64/cgdata-global-hash.ll     |  40 +++
 .../CodeGen/AArch64/cgdata-outlined-name.ll   |  41 +++
 .../AArch64/cgdata-read-double-outline.ll     |  57 ++++
 .../AArch64/cgdata-read-lto-outline.ll        |  96 +++++++
 .../CodeGen/AArch64/cgdata-read-priority.ll   |  68 +++++
 .../cgdata-read-single-outline-suffix.ll      | 100 +++++++
 .../AArch64/cgdata-read-single-outline.ll     |  42 +++
 .../CodeGen/AArch64/cgdata-write-outline.ll   |  51 ++++
 llvm/test/CodeGen/RISCV/O3-pipeline.ll        |   1 +
 llvm/unittests/MIR/MachineStableHashTest.cpp  |  70 +++++
 16 files changed, 897 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/cgdata-global-hash.ll
 create mode 100644 llvm/test/CodeGen/AArch64/cgdata-outlined-name.ll
 create mode 100644 llvm/test/CodeGen/AArch64/cgdata-read-double-outline.ll
 create mode 100644 llvm/test/CodeGen/AArch64/cgdata-read-lto-outline.ll
 create mode 100644 llvm/test/CodeGen/AArch64/cgdata-read-priority.ll
 create mode 100644 llvm/test/CodeGen/AArch64/cgdata-read-single-outline-suffix.ll
 create mode 100644 llvm/test/CodeGen/AArch64/cgdata-read-single-outline.ll
 create mode 100644 llvm/test/CodeGen/AArch64/cgdata-write-outline.ll

diff --git a/llvm/include/llvm/ADT/StableHashing.h b/llvm/include/llvm/ADT/StableHashing.h
index 7852199f8b0a00..b220a0ed1f9131 100644
--- a/llvm/include/llvm/ADT/StableHashing.h
+++ b/llvm/include/llvm/ADT/StableHashing.h
@@ -53,6 +53,12 @@ inline stable_hash stable_hash_combine(stable_hash A, stable_hash B,
 // Removes suffixes introduced by LLVM from the name to enhance stability and
 // maintain closeness to the original name across different builds.
 inline StringRef get_stable_name(StringRef Name) {
+  // Return the part after ".content." that represents contents.
+  auto [P0, S0] = Name.rsplit(".content.");
+  if (!S0.empty())
+    return S0;
+
+  // Ignore these suffixes.
   auto [P1, S1] = Name.rsplit(".llvm.");
   auto [P2, S2] = P1.rsplit(".__uniq.");
   return P2;
diff --git a/llvm/include/llvm/CodeGen/MachineOutliner.h b/llvm/include/llvm/CodeGen/MachineOutliner.h
index eaba6c9b18f2bb..fbb958ccf6488e 100644
--- a/llvm/include/llvm/CodeGen/MachineOutliner.h
+++ b/llvm/include/llvm/CodeGen/MachineOutliner.h
@@ -18,6 +18,7 @@
 #include "llvm/CodeGen/LiveRegUnits.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineStableHash.h"
 #include <initializer_list>
 
 namespace llvm {
@@ -234,11 +235,11 @@ struct OutlinedFunction {
   unsigned FrameConstructionID = 0;
 
   /// Return the number of candidates for this \p OutlinedFunction.
-  unsigned getOccurrenceCount() const { return Candidates.size(); }
+  virtual unsigned getOccurrenceCount() const { return Candidates.size(); }
 
   /// Return the number of bytes it would take to outline this
   /// function.
-  unsigned getOutliningCost() const {
+  virtual unsigned getOutliningCost() const {
     unsigned CallOverhead = 0;
     for (const Candidate &C : Candidates)
       CallOverhead += C.getCallOverhead();
@@ -272,7 +273,42 @@ struct OutlinedFunction {
   }
 
   OutlinedFunction() = delete;
+  virtual ~OutlinedFunction() = default;
 };
+
+/// The information necessary to create an outlined function that is matched
+/// globally.
+struct GlobalOutlinedFunction : public OutlinedFunction {
+  explicit GlobalOutlinedFunction(std::unique_ptr<OutlinedFunction> OF,
+                                  unsigned GlobalOccurrenceCount)
+      : OutlinedFunction(*OF), GlobalOccurrenceCount(GlobalOccurrenceCount) {}
+
+  unsigned GlobalOccurrenceCount;
+
+  /// Return the global occurrence count of this outlined sequence.
+  /// A global outlining candidate is created once per match, but it may have
+  /// been erased when it overlapped a previous outlining instance; in that
+  /// case the count is 0.
+  unsigned getOccurrenceCount() const override {
+    assert(Candidates.size() <= 1);
+    return Candidates.empty() ? 0 : GlobalOccurrenceCount;
+  }
+
+  /// Return the outlining cost, charging the call overhead of the (unique)
+  /// candidate once per global occurrence.
+  unsigned getOutliningCost() const override {
+    assert(Candidates.size() <= 1);
+    unsigned CallOverhead =
+        Candidates.empty()
+            ? 0
+            : Candidates[0].getCallOverhead() * getOccurrenceCount();
+    return CallOverhead + SequenceSize + FrameOverhead;
+  }
+
+  GlobalOutlinedFunction() = delete;
+  ~GlobalOutlinedFunction() = default;
+};
+
 } // namespace outliner
 } // namespace llvm
 
diff --git a/llvm/lib/CGData/CodeGenData.cpp b/llvm/lib/CGData/CodeGenData.cpp
index 9dd4b1674e094a..55d2504231c744 100644
--- a/llvm/lib/CGData/CodeGenData.cpp
+++ b/llvm/lib/CGData/CodeGenData.cpp
@@ -24,6 +24,13 @@
 using namespace llvm;
 using namespace cgdata;
 
+cl::opt<bool>
+    CodeGenDataGenerate("codegen-data-generate", cl::init(false), cl::Hidden,
+                        cl::desc("Emit CodeGen Data into custom sections"));
+cl::opt<std::string>
+    CodeGenDataUsePath("codegen-data-use-path", cl::init(""), cl::Hidden,
+                       cl::desc("File path to where .cgdata file is read"));
+
 static std::string getCGDataErrString(cgdata_error Err,
                                       const std::string &ErrMsg = "") {
   std::string Msg;
@@ -132,7 +139,24 @@ CodeGenData &CodeGenData::getInstance() {
   std::call_once(CodeGenData::OnceFlag, []() {
     Instance = std::unique_ptr<CodeGenData>(new CodeGenData());
 
-    // TODO: Initialize writer or reader mode for the client optimization.
+    if (CodeGenDataGenerate)
+      Instance->EmitCGData = true;
+    else if (!CodeGenDataUsePath.empty()) {
+      // Initialize the global CGData if the input file name is given.
+      // We do not error-out when failing to parse the input file.
+      // Instead, just emit a warning message and fall back as if no CGData
+      // were available.
+      auto FS = vfs::getRealFileSystem();
+      auto ReaderOrErr = CodeGenDataReader::create(CodeGenDataUsePath, *FS);
+      if (Error E = ReaderOrErr.takeError()) {
+        warn(std::move(E), CodeGenDataUsePath);
+        return;
+      }
+      // Publish each CGData based on the data type in the header.
+      auto Reader = ReaderOrErr->get();
+      if (Reader->hasOutlinedHashTree())
+        Instance->publishOutlinedHashTree(Reader->releaseOutlinedHashTree());
+    }
   });
   return *(Instance.get());
 }
diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt
index ae12ce1170f703..5a17944db0ae03 100644
--- a/llvm/lib/CodeGen/CMakeLists.txt
+++ b/llvm/lib/CodeGen/CMakeLists.txt
@@ -268,6 +268,7 @@ add_llvm_component_library(LLVMCodeGen
   Analysis
   BitReader
   BitWriter
+  CGData
   CodeGenTypes
   Core
   MC
diff --git a/llvm/lib/CodeGen/MachineOutliner.cpp b/llvm/lib/CodeGen/MachineOutliner.cpp
index 42f410c277179b..7736df2def77bc 100644
--- a/llvm/lib/CodeGen/MachineOutliner.cpp
+++ b/llvm/lib/CodeGen/MachineOutliner.cpp
@@ -59,7 +59,9 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/ModuleSummaryAnalysis.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/CGData/CodeGenDataReader.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
@@ -75,6 +77,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/SuffixTree.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
 #include <functional>
 #include <tuple>
 #include <vector>
@@ -98,6 +101,10 @@ STATISTIC(NumInvisible,
           "Invisible instructions skipped during mapping");
 STATISTIC(UnsignedVecSize,
           "Total number of instructions mapped and saved to mapping vector");
+STATISTIC(StableHashAttempts,
+          "Count of hashing attempts made for outlined functions");
+STATISTIC(StableHashDropped,
+          "Count of unsuccessful hashing attempts for outlined functions");
 
 // Set to true if the user wants the outliner to run on linkonceodr linkage
 // functions. This is false by default because the linker can dedupe linkonceodr
@@ -128,6 +135,19 @@ static cl::opt<bool> OutlinerLeafDescendants(
              "tree as candidates for outlining (if false, only leaf children "
              "are considered)"));
 
+static cl::opt<bool>
+    DisableGlobalOutlining("disable-global-outlining", cl::Hidden,
+                           cl::desc("Disable global outlining only by ignoring "
+                                    "the codegen data generation or use"),
+                           cl::init(false));
+
+static cl::opt<bool> AppendContentHashToOutlinedName(
+    "append-content-hash-outlined-name", cl::Hidden,
+    cl::desc("This appends the content hash to the globally outlined function "
+             "name. It's beneficial for enhancing the precision of the stable "
+             "hash and for ordering the outlined functions."),
+    cl::init(true));
+
 namespace {
 
 /// Maps \p MachineInstrs to unsigned integers and stores the mappings.
@@ -421,11 +441,29 @@ struct MachineOutliner : public ModulePass {
   /// Set when the pass is constructed in TargetPassConfig.
   bool RunOnAllFunctions = true;
 
+  /// This is a compact representation of hash sequences of outlined functions.
+  /// It is used when OutlinerMode = CGDataMode::Write.
+  /// The resulting hash tree will be emitted into the __llvm_outline section,
+  /// which will be dead-stripped and will not go into the final binary.
+  /// A post-process using llvm-cgdata, lld, or ThinLTO can merge them into
+  /// a global outlined hash tree for the subsequent codegen.
+  std::unique_ptr<OutlinedHashTree> LocalHashTree;
+
+  /// The mode of the outliner.
+  /// When it's CGDataMode::None, candidates are populated with the suffix tree
+  /// within a module and outlined.
+  /// When it's CGDataMode::Write, in addition to CGDataMode::None, the hash
+  /// sequences of outlined functions are published into LocalHashTree.
+  /// When it's CGDataMode::Read, candidates are populated with the global
+  /// outlined hash tree that has been built by the previous codegen.
+  CGDataMode OutlinerMode = CGDataMode::None;
+
   StringRef getPassName() const override { return "Machine Outliner"; }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<MachineModuleInfoWrapperPass>();
     AU.addPreserved<MachineModuleInfoWrapperPass>();
+    AU.addRequired<ImmutableModuleSummaryIndexWrapperPass>();
     AU.setPreservesAll();
     ModulePass::getAnalysisUsage(AU);
   }
@@ -460,6 +498,16 @@ struct MachineOutliner : public ModulePass {
   findCandidates(InstructionMapper &Mapper,
                  std::vector<std::unique_ptr<OutlinedFunction>> &FunctionList);
 
+  /// Find all repeated substrings that match in the global outlined hash
+  /// tree built from the previous codegen.
+  ///
+  /// \param Mapper Contains outlining mapping information.
+  /// \param[out] FunctionList Filled with a list of \p OutlinedFunctions,
+  /// one for each type of candidate.
+  void findGlobalCandidates(
+      InstructionMapper &Mapper,
+      std::vector<std::unique_ptr<OutlinedFunction>> &FunctionList);
+
   /// Replace the sequences of instructions represented by \p OutlinedFunctions
   /// with calls to functions.
   ///
@@ -476,6 +524,17 @@ struct MachineOutliner : public ModulePass {
                                           InstructionMapper &Mapper,
                                           unsigned Name);
 
+  /// Compute and publish the stable hash sequence of instructions in the
+  /// outlined function, \p MF. The parameter \p CandSize represents the number
+  /// of candidates that have identical instruction sequences to \p MF.
+  void computeAndPublishHashSequence(MachineFunction &MF, unsigned CandSize);
+
+  /// Initialize the outliner mode.
+  void initializeOutlinerMode(const Module &M);
+
+  /// Emit the outlined hash tree into __llvm_outline section.
+  void emitOutlinedHashTree(Module &M);
+
   /// Calls 'doOutline()' 1 + OutlinerReruns times.
   bool runOnModule(Module &M) override;
 
@@ -585,6 +644,111 @@ void MachineOutliner::emitOutlinedFunctionRemark(OutlinedFunction &OF) {
   MORE.emit(R);
 }
 
+struct MatchedEntry {
+  size_t StartIdx; // Index into the mapper vectors where the match starts.
+  size_t Length; // Number of mapped positions the match spans (inclusive).
+  size_t Count; // Terminal count stored in the matched hash tree node.
+};
+
+static const HashNode *followHashNode(stable_hash StableHash,
+                                      const HashNode *Current) {
+  auto I = Current->Successors.find(StableHash); // Child keyed by the hash.
+  return (I == Current->Successors.end()) ? nullptr : I->second.get();
+}
+
+// Find all matches in the global outlined hash tree.
+// This is quadratic in theory, but nearly linear in practice because
+// outlined sequences within a block are short.
+static std::vector<MatchedEntry> getMatchedEntries(InstructionMapper &Mapper) {
+  auto &InstrList = Mapper.InstrList;
+  auto &UnsignedVec = Mapper.UnsignedVec;
+
+  std::vector<MatchedEntry> MatchedEntries;
+  std::vector<stable_hash> Sequence; // NOTE(review): written but never read.
+  auto Size = UnsignedVec.size();
+
+  // Get the global outlined hash tree built from the previous run.
+  assert(cgdata::hasOutlinedHashTree());
+  const auto *RootNode = cgdata::getOutlinedHashTree()->getRoot();
+  for (size_t I = 0; I < Size; ++I) {
+    // Skip the invalid mapping that represents a large negative value.
+    if (UnsignedVec[I] >= Size)
+      continue;
+    const MachineInstr &MI = *InstrList[I];
+    // Skip debug instructions as we did for the outlined function.
+    if (MI.isDebugInstr())
+      continue;
+    // Skip the empty hash value.
+    stable_hash StableHashI = stableHashValue(MI);
+    if (!StableHashI)
+      continue;
+    Sequence.clear();
+    Sequence.push_back(StableHashI);
+
+    const HashNode *LastNode = followHashNode(StableHashI, RootNode);
+    if (!LastNode)
+      continue;
+
+    size_t J = I + 1;
+    for (; J < Size; ++J) {
+      // Break on the invalid mapping that represents a large negative value.
+      if (UnsignedVec[J] >= Size)
+        break;
+      // Ignore debug instructions as we did for the outlined function.
+      const MachineInstr &MJ = *InstrList[J];
+      if (MJ.isDebugInstr())
+        continue;
+      // Break on the empty hash value.
+      stable_hash StableHashJ = stableHashValue(MJ);
+      if (!StableHashJ)
+        break;
+      LastNode = followHashNode(StableHashJ, LastNode);
+      if (!LastNode)
+        break;
+
+      // Even after a match that ends at a terminal, we keep extending the
+      // sequence so that all candidates are populated.
+      Sequence.push_back(StableHashJ);
+      auto Count = LastNode->Terminals;
+      if (Count)
+        MatchedEntries.push_back({I, J - I + 1, *Count});
+    }
+  }
+
+  return MatchedEntries;
+}
+
+void MachineOutliner::findGlobalCandidates(
+    InstructionMapper &Mapper,
+    std::vector<std::unique_ptr<OutlinedFunction>> &FunctionList) {
+  FunctionList.clear();
+  auto &InstrList = Mapper.InstrList;
+  auto &MBBFlagsMap = Mapper.MBBFlagsMap;
+
+  std::vector<Candidate> CandidatesForRepeatedSeq;
+  for (auto &ME : getMatchedEntries(Mapper)) {
+    CandidatesForRepeatedSeq.clear();
+    MachineBasicBlock::iterator StartIt = InstrList[ME.StartIdx];
+    MachineBasicBlock::iterator EndIt = InstrList[ME.StartIdx + ME.Length - 1];
+    MachineBasicBlock *MBB = StartIt->getParent();
+    CandidatesForRepeatedSeq.emplace_back(ME.StartIdx, ME.Length, StartIt,
+                                          EndIt, MBB, FunctionList.size(),
+                                          MBBFlagsMap[MBB]);
+    const TargetInstrInfo *TII =
+        MBB->getParent()->getSubtarget().getInstrInfo();
+    unsigned MinRepeats = 1; // Each match is its own single candidate.
+    std::optional<std::unique_ptr<OutlinedFunction>> OF =
+        TII->getOutliningCandidateInfo(*MMI, CandidatesForRepeatedSeq,
+                                       MinRepeats);
+    if (!OF.has_value() || OF.value()->Candidates.empty())
+      continue;
+    // We create one global candidate for each match.
+    assert(OF.value()->Candidates.size() == MinRepeats);
+    FunctionList.emplace_back(std::make_unique<GlobalOutlinedFunction>(
+        std::move(OF.value()), ME.Count));
+  }
+}
+
 void MachineOutliner::findCandidates(
     InstructionMapper &Mapper,
     std::vector<std::unique_ptr<OutlinedFunction>> &FunctionList) {
@@ -695,6 +859,39 @@ void MachineOutliner::findCandidates(
   }
 }
 
+void MachineOutliner::computeAndPublishHashSequence(MachineFunction &MF,
+                                                    unsigned CandSize) {
+  // Hash every instruction; a zero (empty) hash discards the sequence so far.
+  SmallVector<stable_hash> OutlinedHashSequence;
+  for (auto &MBB : MF) {
+    for (auto &NewMI : MBB) {
+      stable_hash Hash = stableHashValue(NewMI);
+      if (!Hash) {
+        OutlinedHashSequence.clear();
+        break; // Exits the inner (per-block) loop only.
+      }
+      OutlinedHashSequence.push_back(Hash);
+    }
+  }
+
+  // Append a content-hash suffix to the name when the sequence is non-empty.
+  if (AppendContentHashToOutlinedName && !OutlinedHashSequence.empty()) {
+    auto CombinedHash = stable_hash_combine(OutlinedHashSequence);
+    auto NewName =
+        MF.getName().str() + ".content." + std::to_string(CombinedHash);
+    MF.getFunction().setName(NewName);
+  }
+
+  // In Write mode, publish the non-empty hash sequence to the local hash tree.
+  if (OutlinerMode == CGDataMode::Write) {
+    StableHashAttempts++;
+    if (!OutlinedHashSequence.empty())
+      LocalHashTree->insert({OutlinedHashSequence, CandSize});
+    else
+      StableHashDropped++;
+  }
+}
+
 MachineFunction *MachineOutliner::createOutlinedFunction(
     Module &M, OutlinedFunction &OF, InstructionMapper &Mapper, unsigned Name) {
 
@@ -770,6 +967,9 @@ MachineFunction *MachineOutliner::createOutlinedFunction(
     }
   }
 
+  if (OutlinerMode != CGDataMode::None)
+    computeAndPublishHashSequence(MF, OF.Candidates.size());
+
   // Set normal properties for a late MachineFunction.
   MF.getProperties().reset(MachineFunctionProperties::Property::IsSSA);
   MF.getProperties().set(MachineFunctionProperties::Property::NoPHIs);
@@ -1134,12 +1334,65 @@ void MachineOutliner::emitInstrCountChangedRemark(
   }
 }
 
+void MachineOutliner::initializeOutlinerMode(const Module &M) {
+  if (DisableGlobalOutlining)
+    return; // OutlinerMode keeps its CGDataMode::None default.
+
+  if (auto *IndexWrapperPass =
+          getAnalysisIfAvailable<ImmutableModuleSummaryIndexWrapperPass>()) {
+    auto *TheIndex = IndexWrapperPass->getIndex();
+    // A (full)LTO module does not have functions added to the index.
+    // In this case, we run the outliner as usual, without using codegen data.
+    if (TheIndex && !TheIndex->hasExportedFunctions(M))
+      return;
+  }
+
+  // When codegen data write is enabled, we want to write the local outlined
+  // hash tree to the custom section, `__llvm_outline`.
+  // When the outlined hash tree is available from the previous codegen data,
+  // we want to read it to optimistically create global outlining candidates.
+  if (cgdata::emitCGData()) {
+    OutlinerMode = CGDataMode::Write;
+    // Create a local outlined hash tree to be published.
+    LocalHashTree.reset(new OutlinedHashTree());
+    // We don't need to read the outlined hash tree from the previous codegen.
+  } else if (cgdata::hasOutlinedHashTree())
+    OutlinerMode = CGDataMode::Read;
+}
+
+void MachineOutliner::emitOutlinedHashTree(Module &M) {
+  assert(LocalHashTree); // Created when the mode is set to CGDataMode::Write.
+  if (!LocalHashTree->empty()) { // An empty tree emits nothing.
+    LLVM_DEBUG({
+      dbgs() << "Emit outlined hash tree. Size: " << LocalHashTree->size()
+             << "\n";
+    });
+    SmallVector<char> Buf;
+    raw_svector_ostream OS(Buf);
+
+    OutlinedHashTreeRecord HTR(std::move(LocalHashTree));
+    HTR.serialize(OS); // Serialized bytes are written into Buf.
+
+    llvm::StringRef Data(Buf.data(), Buf.size());
+    std::unique_ptr<MemoryBuffer> Buffer =
+        MemoryBuffer::getMemBuffer(Data, "in-memory outlined hash tree", false);
+
+    Triple TT(M.getTargetTriple());
+    embedBufferInModule(
+        M, *Buffer.get(),
+        getCodeGenDataSectionName(CG_outline, TT.getObjectFormat()));
+  }
+}
+
 bool MachineOutliner::runOnModule(Module &M) {
   // Check if there's anything in the module. If it's empty, then there's
   // nothing to outline.
   if (M.empty())
     return false;
 
+  // Initialize the outliner mode.
+  initializeOutlinerMode(M);
+
   MMI = &getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
 
   // Number to append to the current outlined function.
@@ -1161,6 +1414,9 @@ bool MachineOutliner::runOnModule(Module &M) {
     }
   }
 
+  if (OutlinerMode == CGDataMode::Write)
+    emitOutlinedHashTree(M);
+
   return true;
 }
 
@@ -1189,7 +1445,10 @@ bool MachineOutliner::doOutline(Module &M, unsigned &OutlinedFunctionNum) {
   std::vector<std::unique_ptr<OutlinedFunction>> FunctionList;
 
   // Find all of the outlining candidates.
-  findCandidates(Mapper, FunctionList);
+  if (OutlinerMode == CGDataMode::Read)
+    findGlobalCandidates(Mapper, FunctionList);
+  else
+    findCandidates(Mapper, FunctionList);
 
   // If we've requested size remarks, then collect the MI counts of every
   // function before outlining, and the MI counts after outlining.
diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
index ffbe3dd377109f..f002ef327d44c5 100644
--- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -16,6 +16,7 @@
 ; CHECK-NEXT: Machine Branch Probability Analysis
 ; CHECK-NEXT: Default Regalloc Eviction Advisor
 ; CHECK-NEXT: Default Regalloc Priority Advisor
+; CHECK-NEXT: Module summary info
 ; CHECK-NEXT:   ModulePass Manager
 ; CHECK-NEXT:     Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT:     FunctionPass Manager
diff --git a/llvm/test/CodeGen/AArch64/cgdata-global-hash.ll b/llvm/test/CodeGen/AArch64/cgdata-global-hash.ll
new file mode 100644
index 00000000000000..c425eda56f5d5b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/cgdata-global-hash.ll
@@ -0,0 +1,40 @@
+; This test verifies the stable hash values for different global variables
+; that have distinct names.
+; We generate two different cgdata files from nearly identical outline instances,
+; with the only difference being the last call target globals, @g vs @h.
+
+; RUN: split-file %s %t
+
+; RUN: llc -mtriple=arm64-apple-darwin -enable-machine-outliner -codegen-data-generate=true -filetype=obj %t/local-g.ll -o %t/local-g.o
+; RUN: llvm-cgdata --merge %t/local-g.o -o %t/local-g.cgdata
+; RUN: llvm-cgdata --convert %t/local-g.cgdata -o %t/local-g.cgtext
+; RUN: llc -mtriple=arm64-apple-darwin -enable-machine-outliner -codegen-data-generate=true -filetype=obj %t/local-h.ll -o %t/local-h.o
+; RUN: llvm-cgdata --merge %t/local-h.o -o %t/local-h.cgdata
+; RUN: llvm-cgdata --convert %t/local-h.cgdata -o %t/local-h.cgtext
+
+; We compare the trees which are only different at the terminal node's hash value.
+; Here we simply count the different lines that have `Hash` string.
+; RUN: not diff %t/local-g.cgtext %t/local-h.cgtext 2>&1 | grep Hash | wc -l | FileCheck %s
+; CHECK: 2
+
+;--- local-g.ll
+declare i32 @g(i32, i32, i32)
+define i32 @f1() minsize {
+  %1 = call i32 @g(i32 10, i32 1, i32 2);
+  ret i32 %1
+}
+define i32 @f2() minsize {
+  %1 = call i32 @g(i32 20, i32 1, i32 2);
+  ret i32 %1
+}
+
+;--- local-h.ll
+declare i32 @h(i32, i32, i32)
+define i32 @f1() minsize {
+  %1 = call i32 @h(i32 10, i32 1, i32 2);
+  ret i32 %1
+}
+define i32 @f2() minsize {
+  %1 = call i32 @h(i32 20, i32 1, i32 2);
+  ret i32 %1
+}
diff --git a/llvm/test/CodeGen/AArch64/cgdata-outlined-name.ll b/llvm/test/CodeGen/AArch64/cgdata-outlined-name.ll
new file mode 100644
index 00000000000000..69f1ecd6515e7e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/cgdata-outlined-name.ll
@@ -0,0 +1,41 @@
+; This test verifies the globally outlined function name has the content hash.
+
+; RUN: split-file %s %t
+
+; Check if the outlined function name has the content hash depending on the flag.
+; RUN: llc -mtriple=arm64-apple-darwin -enable-machine-outliner -codegen-data-generate=true -append-content-hash-outlined-name=false -filetype=obj %t/local-two.ll -o %t_write_base
+; RUN: llvm-objdump -d %t_write_base | FileCheck %s --check-prefix=BASE
+; RUN: llc -mtriple=arm64-apple-darwin -enable-machine-outliner -codegen-data-generate=true -append-content-hash-outlined-name=true -filetype=obj %t/local-two.ll -o %t_write_suffix
+; RUN: llvm-objdump -d %t_write_suffix | FileCheck %s --check-prefix=SUFFIX
+; BASE-NOT: _OUTLINED_FUNCTION_{{.*}}.content.{{[0-9]+}}
+; SUFFIX: _OUTLINED_FUNCTION_{{.*}}.content.{{[0-9]+}}
+
+; Generate the cgdata file from each case and show they are identical.
+; RUN: llvm-cgdata --merge %t_write_base -o %t_cgdata_base
+; RUN: llvm-cgdata --merge %t_write_suffix -o %t_cgdata_suffix
+; RUN: diff %t_cgdata_base %t_cgdata_suffix
+
+; Read the cgdata in the machine outliner for optimistically outlining in local-one.ll.
+; Check if the outlined function has the content hash depending on the flag.
+; RUN: llc -mtriple=arm64-apple-darwin -enable-machine-outliner -codegen-data-use-path=%t_cgdata_base -append-content-hash-outlined-name=false -filetype=obj %t/local-one.ll -o %t_read_base
+; RUN: llvm-objdump -d %t_read_base | FileCheck %s --check-prefix=BASE
+; RUN: llc -mtriple=arm64-apple-darwin -enable-machine-outliner -codegen-data-use-path=%t_cgdata_suffix -append-content-hash-outlined-name=true -filetype=obj %t/local-one.ll -o %t_read_suffix
+; RUN: llvm-objdump -d %t_read_suffix | FileCheck %s --check-prefix=SUFFIX
+
+;--- local-two.ll
+declare i32 @g(i32, i32, i32)
+define i32 @f1() minsize {
+  %1 = call i32 @g(i32 10, i32 1, i32 2);
+  ret i32 %1
+}
+define i32 @f2() minsize {
+  %1 = call i32 @g(i32 20, i32 1, i32 2);
+  ret i32 %1
+}
+
+;--- local-one.ll
+declare i32 @g(i32, i32, i32)
+define i32 @f3() minsize {
+  %1 = call i32 @g(i32 30, i32 1, i32 2);
+ ret i32 %1
+}
diff --git a/llvm/test/CodeGen/AArch64/cgdata-read-double-outline.ll b/llvm/test/CodeGen/AArch64/cgdata-read-double-outline.ll
new file mode 100644
index 00000000000000..6e027308c17068
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/cgdata-read-double-outline.ll
@@ -0,0 +1,57 @@
+; This test demonstrates how identical instruction sequences are handled during global outlining.
+; Currently, we do not attempt to share an outlined function for identical sequences.
+; Instead, each instruction sequence that matches against the global outlined hash tree
+; is outlined into its own unique function.
+
+; RUN: split-file %s %t
+
+; First, we generate the cgdata file from a local outline instance present in local-two.ll.
+; RUN: llc -mtriple=arm64-apple-darwin -enable-machine-outliner -codegen-data-generate=true -filetype=obj %t/local-two.ll -o %t_write
+; RUN: llvm-cgdata --merge %t_write -o %t_cgdata
+; RUN: llvm-cgdata --show %t_cgdata | FileCheck %s --check-prefix=SHOW
+
+; SHOW: Outlined hash tree:
+; SHOW-NEXT:  Total Node Count: 4
+; SHOW-NEXT:  Terminal Node Count: 1
+; SHOW-NEXT:  Depth: 3
+
+; Now, we read the cgdata for local-two-another.ll and proceed to optimistically outline
+; each instruction sequence that matches against the global outlined hash tree.
+; Since each matching sequence is considered a candidate, we expect to generate two
+; unique outlined functions. These functions, although unique, will be identical in code,
+; and thus, will be folded by the linker.
+
+; RUN: llc -mtriple=arm64-apple-darwin -enable-machine-outliner -codegen-data-use-path=%t_cgdata -filetype=obj %t/local-two-another.ll -o %t_read
+; RUN: llvm-objdump -d %t_read | FileCheck %s
+
+; CHECK: _OUTLINED_FUNCTION_{{.*}}:
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  b
+
+; CHECK: _OUTLINED_FUNCTION_{{.*}}:
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  b
+
+;--- local-two.ll
+declare i32 @g(i32, i32, i32)
+define i32 @f1() minsize {
+  %1 = call i32 @g(i32 10, i32 1, i32 2);
+  ret i32 %1
+}
+define i32 @f2() minsize {
+  %1 = call i32 @g(i32 20, i32 1, i32 2);
+  ret i32 %1
+}
+
+;--- local-two-another.ll
+declare i32 @g(i32, i32, i32)
+define i32 @f3() minsize {
+  %1 = call i32 @g(i32 30, i32 1, i32 2);
+  ret i32 %1
+}
+define i32 @f4() minsize {
+  %1 = call i32 @g(i32 40, i32 1, i32 2);
+  ret i32 %1
+}
diff --git a/llvm/test/CodeGen/AArch64/cgdata-read-lto-outline.ll b/llvm/test/CodeGen/AArch64/cgdata-read-lto-outline.ll
new file mode 100644
index 00000000000000..f1a5d1a0ccc7f0
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/cgdata-read-lto-outline.ll
@@ -0,0 +1,96 @@
+; This test is similar to cgdata-read-double-outline.ll, but it is executed with LTO (Link Time Optimization).
+; It demonstrates how identical instruction sequences are handled during global outlining.
+; Currently, we do not attempt to reuse an outlined function for identical sequences.
+; Instead, each instruction sequence that appears in the global outlined hash tree
+; is outlined into its own unique function.
+
+; RUN: split-file %s %t
+
+; We first create the cgdata file from a local outline instance in local-two.ll
+; RUN: opt -module-summary %t/local-two.ll -o %t/write.bc
+; RUN: llvm-lto2 run %t/write.bc -o %t/write \
+; RUN:  -r %t/write.bc,_f1,px -r %t/write.bc,_f2,px -r %t/write.bc,_g,p \
+; RUN:  -codegen-data-generate=true
+; RUN: llvm-cgdata --merge %t/write.1 -o %t_cgdata
+; RUN: llvm-cgdata --show %t_cgdata | FileCheck %s --check-prefix=SHOW
+
+; SHOW: Outlined hash tree:
+; SHOW-NEXT:  Total Node Count: 4
+; SHOW-NEXT:  Terminal Node Count: 1
+; SHOW-NEXT:  Depth: 3
+
+; Now, we execute either ThinLTO or LTO by reading the cgdata for local-two-another.ll.
+; With ThinLTO, similar to the no-LTO scenario shown in cgdata-read-double-outline.ll,
+; it optimistically outlines each instruction sequence that matches against
+; the global outlined hash tree. Since each matching sequence is considered a candidate,
+; we expect to generate two unique outlined functions that will be folded
+; by the linker at a later stage.
+; However, with LTO, we do not utilize the cgdata, but instead fall back to the default
+; outliner mode. This results in a single outlined function that is
+; shared across two call-sites.
+
+; Run ThinLTO
+; RUN: opt -module-summary %t/local-two-another.ll -o %t/thinlto.bc
+; RUN: llvm-lto2 run %t/thinlto.bc -o %t/thinlto \
+; RUN:  -r %t/thinlto.bc,_f3,px -r %t/thinlto.bc,_f4,px -r %t/thinlto.bc,_g,p \
+; RUN:  -codegen-data-use-path=%t_cgdata
+; RUN: llvm-objdump -d %t/thinlto.1 | FileCheck %s
+
+; CHECK: _OUTLINED_FUNCTION_{{.*}}:
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  b
+; CHECK: _OUTLINED_FUNCTION_{{.*}}:
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  b
+
+; Run ThinLTO while disabling the global outliner.
+; We have a single outlined case with the default outliner.
+; RUN: llvm-lto2 run %t/thinlto.bc -o %t/thinlto-disable \
+; RUN:  -r %t/thinlto.bc,_f3,px -r %t/thinlto.bc,_f4,px -r %t/thinlto.bc,_g,p \
+; RUN:  -enable-machine-outliner \
+; RUN:  -codegen-data-use-path=%t_cgdata \
+; RUN:  -disable-global-outlining
+; RUN: llvm-objdump -d %t/thinlto-disable.1 | FileCheck %s --check-prefix=DISABLE
+
+; DISABLE: _OUTLINED_FUNCTION_{{.*}}:
+; DISABLE-NEXT:  mov
+; DISABLE-NEXT:  mov
+; DISABLE-NEXT:  b
+; DISABLE-NOT: _OUTLINED_FUNCTION_{{.*}}:
+
+; Run LTO, which effectively disables the global outliner.
+; RUN: opt %t/local-two-another.ll -o %t/lto.bc
+; RUN: llvm-lto2 run %t/lto.bc -o %t/lto \
+; RUN:  -r %t/lto.bc,_f3,px -r %t/lto.bc,_f4,px -r %t/lto.bc,_g,p \
+; RUN:  -enable-machine-outliner \
+; RUN:  -codegen-data-use-path=%t_cgdata
+; RUN: llvm-objdump -d %t/lto.0 | FileCheck %s --check-prefix=DISABLE
+
+;--- local-two.ll
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-darwin"
+declare i32 @g(i32, i32, i32)
+define i32 @f1() minsize {
+  %1 = call i32 @g(i32 10, i32 1, i32 2);
+  ret i32 %1
+}
+define i32 @f2() minsize {
+  %1 = call i32 @g(i32 20, i32 1, i32 2);
+  ret i32 %1
+}
+
+;--- local-two-another.ll
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-darwin"
+
+declare i32 @g(i32, i32, i32)
+define i32 @f3() minsize {
+  %1 = call i32 @g(i32 30, i32 1, i32 2);
+  ret i32 %1
+}
+define i32 @f4() minsize {
+  %1 = call i32 @g(i32 40, i32 1, i32 2);
+  ret i32 %1
+}
diff --git a/llvm/test/CodeGen/AArch64/cgdata-read-priority.ll b/llvm/test/CodeGen/AArch64/cgdata-read-priority.ll
new file mode 100644
index 00000000000000..affeea8c71acd3
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/cgdata-read-priority.ll
@@ -0,0 +1,68 @@
+; This test verifies whether we can outline a singleton instance (i.e., an instance that does not repeat)
+; using codegen data that has been read from a previous codegen run.
+; When multiple matches occur, we prioritize the candidates using the global frequency.
+
+; RUN: split-file %s %t
+
+; First, we generate the cgdata file from local outline instances present in write1.ll and write2.ll
+; RUN: llc -mtriple=arm64-apple-darwin -enable-machine-outliner -codegen-data-generate=true -filetype=obj %t/write1.ll -o %t_write1
+; RUN: llc -mtriple=arm64-apple-darwin -enable-machine-outliner -codegen-data-generate=true -filetype=obj %t/write2.ll -o %t_write2
+; RUN: llvm-cgdata --merge %t_write1 %t_write2 -o %t_cgdata
+; RUN: llvm-cgdata --show %t_cgdata | FileCheck %s --check-prefix=SHOW
+
+; SHOW: Outlined hash tree:
+; SHOW-NEXT:  Total Node Count: 8
+; SHOW-NEXT:  Terminal Node Count: 2
+; SHOW-NEXT:  Depth: 4
+
+; Now, we read the cgdata in the machine outliner, enabling us to optimistically
+; outline a singleton instance in read.ll that matches against the cgdata.
+; There are two matches -- (1) (mov #1, mov #2, mov #3, b) and (2) (mov #2, mov #3, b).
+; Even though sequence (1) is longer than sequence (2), the latter is outlined because it occurs more frequently in the outlined hash tree.
+
+; RUN: llc -mtriple=arm64-apple-darwin -enable-machine-outliner -codegen-data-use-path=%t_cgdata -filetype=obj %t/read.ll -o %t_read
+; RUN: llvm-objdump -d %t_read | FileCheck %s
+
+; CHECK: _OUTLINED_FUNCTION
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  b
+
+;--- write1.ll
+; The sequence (mov #2, mov #3, b) is repeated 4 times.
+declare i32 @g(i32, i32, i32)
+define i32 @f1() minsize {
+  %1 = call i32 @g(i32 10, i32 50, i32 2, i32 3);
+  ret i32 %1
+}
+define i32 @f2() minsize {
+  %1 = call i32 @g(i32 20, i32 60, i32 2, i32 3);
+  ret i32 %1
+}
+define i32 @f3() minsize {
+  %1 = call i32 @g(i32 30, i32 70, i32 2, i32 3);
+  ret i32 %1
+}
+define i32 @f4() minsize {
+  %1 = call i32 @g(i32 40, i32 80, i32 2, i32 3);
+  ret i32 %1
+}
+
+;--- write2.ll
+; The sequence (mov #1, mov #2, mov #3, b) is repeated 2 times.
+declare i32 @g(i32, i32, i32)
+define i32 @f6() minsize {
+  %1 = call i32 @g(i32 10, i32 1, i32 2, i32 3);
+  ret i32 %1
+}
+define i32 @f7() minsize {
+  %1 = call i32 @g(i32 20, i32 1, i32 2, i32 3);
+  ret i32 %1
+}
+
+;--- read.ll
+declare i32 @g(i32, i32, i32)
+define i32 @f3() minsize {
+  %1 = call i32 @g(i32 30, i32 1, i32 2, i32 3);
+  ret i32 %1
+}
diff --git a/llvm/test/CodeGen/AArch64/cgdata-read-single-outline-suffix.ll b/llvm/test/CodeGen/AArch64/cgdata-read-single-outline-suffix.ll
new file mode 100644
index 00000000000000..11bee02c0be5ca
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/cgdata-read-single-outline-suffix.ll
@@ -0,0 +1,100 @@
+; This test checks if a singleton instance (an instance that appears only once) can be outlined
+; using codegen data from a previous codegen run.
+; Unlike cgdata-read-single-outline.ll, this test also examines various suffixes that LLVM appends to names.
+; Specifically, we aim to disregard the suffixes `.llvm.{number}` and `.__uniq.{number}` during the matching of call targets in hash computations.
+; This approach helps in accurately identifying the original call target, especially when an LTO build may append additional suffixes for uniqueness.
+; Conversely, for the suffix `.content.{number}`, we consider only the number from the suffix and disregard the rest of the name.
+; This matching strategy is crucial for recursively finding outlining candidates when multiple outliner runs are enabled.
+
+; RUN: split-file %s %t
+
+; First, we generate the cgdata file from a local outline instance present in local-two.ll.
+; RUN: llc -mtriple=arm64-apple-darwin -enable-machine-outliner -codegen-data-generate=true -filetype=obj %t/local-two.ll -o %t_write
+; RUN: llvm-cgdata --merge %t_write -o %t_cgdata
+; RUN: llvm-cgdata --show %t_cgdata | FileCheck %s --check-prefix=SHOW
+
+; SHOW: Outlined hash tree:
+; SHOW-NEXT:  Total Node Count: 4
+; SHOW-NEXT:  Terminal Node Count: 1
+; SHOW-NEXT:  Depth: 3
+
+; Now, we read the cgdata in the machine outliner, enabling us to optimistically
+; outline a singleton instance in local-one-ignore-suffix-{1,2}.ll that matches against the cgdata.
+; We outline instances while disregarding the suffixes `.llvm.{number}` or `.__uniq.{number}` in names.
+; RUN: llc -mtriple=arm64-apple-darwin -enable-machine-outliner -codegen-data-use-path=%t_cgdata -filetype=obj %t/local-one-ignore-suffix-1.ll -o %t_read_ignore_1
+; RUN: llvm-objdump -d %t_read_ignore_1 | FileCheck %s
+; RUN: llc -mtriple=arm64-apple-darwin -enable-machine-outliner -codegen-data-use-path=%t_cgdata -filetype=obj %t/local-one-ignore-suffix-2.ll -o %t_read_ignore_2
+; RUN: llvm-objdump -d %t_read_ignore_2 | FileCheck %s
+
+; CHECK: _OUTLINED_FUNCTION
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  b
+
+; We don't ignore `.invalid.{number}`. So no outlining occurs.
+; RUN: llc -mtriple=arm64-apple-darwin -enable-machine-outliner -codegen-data-use-path=%t_cgdata -filetype=obj %t/local-one-no-ignore-suffix.ll -o %t_read_no_ignore
+; RUN: llvm-objdump -d %t_read_no_ignore | FileCheck %s --check-prefix=NOOUTLINE
+
+; NOOUTLINE-NOT: _OUTLINED_FUNCTION
+
+;--- local-two.ll
+declare i32 @g(i32, i32, i32)
+define i32 @f1() minsize {
+  %1 = call i32 @g(i32 10, i32 1, i32 2);
+  ret i32 %1
+}
+define i32 @f2() minsize {
+  %1 = call i32 @g(i32 20, i32 1, i32 2);
+  ret i32 %1
+}
+
+;--- local-one-ignore-suffix-1.ll
+declare i32 @g.llvm.123(i32, i32, i32)
+define i32 @f3() minsize {
+  %1 = call i32 @g.llvm.123(i32 30, i32 1, i32 2);
+ ret i32 %1
+}
+
+;--- local-one-ignore-suffix-2.ll
+declare i32 @g.__uniq.456(i32, i32, i32)
+define i32 @f4() minsize {
+  %1 = call i32 @g.__uniq.456(i32 30, i32 1, i32 2);
+ ret i32 %1
+}
+
+;--- local-one-no-ignore-suffix.ll
+declare i32 @g.invalid.789(i32, i32, i32)
+define i32 @f5() minsize {
+  %1 = call i32 @g.invalid.789(i32 30, i32 1, i32 2);
+ ret i32 %1
+}
+
+; Similarly, we outline functions that have already been processed in previous outliner runs.
+; Assuming `-machine-outliner-reruns` is locally enabled, we might already have `OUTLINED_FUNCTION*` instances.
+; First, we generate the cgdata file from a local outline instance found in local-two-content.ll.
+; RUN: llc -mtriple=arm64-apple-darwin -enable-machine-outliner -codegen-data-generate=true -filetype=obj %t/local-two-content.ll -o %t_write_content
+; RUN: llvm-cgdata --merge %t_write_content -o %t_cgdata_content
+; RUN: llvm-cgdata --show %t_cgdata_content | FileCheck %s --check-prefix=SHOW
+
+; Despite the target function names being different -- `OUTLINED_FUNCTION_0.content.123` vs. `OUTLINED_FUNCTION_1.content.123`,
+; we compute the same hash based on the suffix `.content.{number}`, and optimistically outline them.
+; RUN: llc -mtriple=arm64-apple-darwin -enable-machine-outliner -codegen-data-use-path=%t_cgdata_content -filetype=obj %t/local-one-content.ll -o %t_read_content
+; RUN: llvm-objdump -d %t_read_content | FileCheck %s
+
+;--- local-two-content.ll
+declare i32 @OUTLINED_FUNCTION_0.content.123(i32, i32, i32)
+define i32 @f6() minsize {
+  %1 = call i32 @OUTLINED_FUNCTION_0.content.123(i32 10, i32 1, i32 2);
+  ret i32 %1
+}
+define i32 @f7() minsize {
+  %1 = call i32 @OUTLINED_FUNCTION_0.content.123(i32 20, i32 1, i32 2);
+  ret i32 %1
+}
+
+;--- local-one-content.ll
+declare i32 @OUTLINED_FUNCTION_1.content.123(i32, i32, i32)
+define i32 @f8() minsize {
+  %1 = call i32 @OUTLINED_FUNCTION_1.content.123(i32 30, i32 1, i32 2);
+ ret i32 %1
+}
diff --git a/llvm/test/CodeGen/AArch64/cgdata-read-single-outline.ll b/llvm/test/CodeGen/AArch64/cgdata-read-single-outline.ll
new file mode 100644
index 00000000000000..7725648a6bc3d5
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/cgdata-read-single-outline.ll
@@ -0,0 +1,42 @@
+; This test verifies whether we can outline a singleton instance (i.e., an instance that does not repeat)
+; using codegen data that has been read from a previous codegen run.
+
+; RUN: split-file %s %t
+
+; First, we generate the cgdata file from a local outline instance present in local-two.ll.
+; RUN: llc -mtriple=arm64-apple-darwin -enable-machine-outliner -codegen-data-generate=true -filetype=obj %t/local-two.ll -o %t_write
+; RUN: llvm-cgdata --merge %t_write -o %t_cgdata
+; RUN: llvm-cgdata --show %t_cgdata | FileCheck %s --check-prefix=SHOW
+
+; SHOW: Outlined hash tree:
+; SHOW-NEXT:  Total Node Count: 4
+; SHOW-NEXT:  Terminal Node Count: 1
+; SHOW-NEXT:  Depth: 3
+
+; Now, we read the cgdata in the machine outliner, enabling us to optimistically
+; outline a singleton instance in local-one.ll that matches against the cgdata.
+; RUN: llc -mtriple=arm64-apple-darwin -enable-machine-outliner -codegen-data-use-path=%t_cgdata -filetype=obj %t/local-one.ll -o %t_read
+; RUN: llvm-objdump -d %t_read | FileCheck %s
+
+; CHECK: _OUTLINED_FUNCTION
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  b
+
+;--- local-two.ll
+declare i32 @g(i32, i32, i32)
+define i32 @f1() minsize {
+  %1 = call i32 @g(i32 10, i32 1, i32 2);
+  ret i32 %1
+}
+define i32 @f2() minsize {
+  %1 = call i32 @g(i32 20, i32 1, i32 2);
+  ret i32 %1
+}
+
+;--- local-one.ll
+declare i32 @g(i32, i32, i32)
+define i32 @f3() minsize {
+  %1 = call i32 @g(i32 30, i32 1, i32 2);
+ ret i32 %1
+}
diff --git a/llvm/test/CodeGen/AArch64/cgdata-write-outline.ll b/llvm/test/CodeGen/AArch64/cgdata-write-outline.ll
new file mode 100644
index 00000000000000..09ad499190ee37
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/cgdata-write-outline.ll
@@ -0,0 +1,51 @@
+; This test verifies whether an outlined function is encoded into the __llvm_outline section
+; when the -codegen-data-generate flag is used.
+
+; Verify that an outlined function is always created, but is only encoded into the section when the flag is used.
+; RUN: llc -mtriple=arm64-apple-darwin -enable-machine-outliner -codegen-data-generate=true -filetype=obj %s -o %t_save
+; RUN: llvm-objdump -d %t_save | FileCheck %s
+; RUN: llvm-objdump -h %t_save | FileCheck %s --check-prefix=SECTNAME
+; RUN: llc -mtriple=arm64-apple-darwin -enable-machine-outliner -codegen-data-generate=false -filetype=obj %s -o %t_nosave
+; RUN: llvm-objdump -d  %t_nosave | FileCheck %s
+; RUN: llvm-objdump -h %t_nosave | FileCheck %s --check-prefix=NOSECTNAME
+
+; CHECK: _OUTLINED_FUNCTION
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  b
+; SECTNAME: __llvm_outline
+; NOSECTNAME-NOT: __llvm_outline
+
+; Verify the content of cgdata after it has been processed with llvm-cgdata.
+; RUN: llvm-cgdata --merge %t_save -o %t_cgdata
+; RUN: llvm-cgdata --convert %t_cgdata | FileCheck %s --check-prefix=TREE
+
+; TREE: :outlined_hash_tree
+; TREE: ---
+; TREE-NEXT: 0:
+; TREE-NEXT:   Hash:            0x0
+; TREE-NEXT:   Terminals:       0
+; TREE-NEXT:   SuccessorIds:    [ 1 ]
+; TREE-NEXT: 1:
+; TREE-NEXT:   Hash:            {{.}}
+; TREE-NEXT:   Terminals:       0
+; TREE-NEXT:   SuccessorIds:    [ 2 ]
+; TREE-NEXT: 2:
+; TREE-NEXT:   Hash:            {{.}}
+; TREE-NEXT:   Terminals:       0
+; TREE-NEXT:   SuccessorIds:    [ 3 ]
+; TREE-NEXT: 3:
+; TREE-NEXT:   Hash:            {{.}}
+; TREE-NEXT:   Terminals:       2
+; TREE-NEXT:   SuccessorIds:    [  ]
+; TREE-NEXT: ...
+
+declare i32 @g(i32, i32, i32)
+define i32 @f1() minsize {
+  %1 = call i32 @g(i32 10, i32 1, i32 2);
+  ret i32 %1
+}
+define i32 @f2() minsize {
+  %1 = call i32 @g(i32 20, i32 1, i32 2);
+  ret i32 %1
+}
diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
index 5d14d14d216244..dcbcab8ef78ff6 100644
--- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -20,6 +20,7 @@
 ; CHECK-NEXT: Machine Branch Probability Analysis
 ; CHECK-NEXT: Default Regalloc Eviction Advisor
 ; CHECK-NEXT: Default Regalloc Priority Advisor
+; CHECK-NEXT: Module summary info
 ; CHECK-NEXT:   ModulePass Manager
 ; CHECK-NEXT:     Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT:     FunctionPass Manager
diff --git a/llvm/unittests/MIR/MachineStableHashTest.cpp b/llvm/unittests/MIR/MachineStableHashTest.cpp
index c6b99123d4bd2a..1d888c2ec3e72b 100644
--- a/llvm/unittests/MIR/MachineStableHashTest.cpp
+++ b/llvm/unittests/MIR/MachineStableHashTest.cpp
@@ -141,3 +141,73 @@ body:             |
   // Do not ignore `.invalid.{number}`.
   EXPECT_NE(stableHashValue(*MF1), stableHashValue(*MF4));
 }
+
+TEST_F(MachineStableHashTest, ContentName) {
+  auto TM = createTargetMachine(("aarch64--"), "", "");
+  if (!TM)
+    GTEST_SKIP();
+  StringRef MIRString = R"MIR(
+--- |
+  define void @f1() { ret void }
+  define void @f2() { ret void }
+  define void @f3() { ret void }
+  define void @f4() { ret void }
+  declare void @goo()
+  declare void @goo.content.123()
+  declare void @zoo.content.123()
+  declare void @goo.content.456()
+...
+---
+name:            f1
+alignment:       16
+tracksRegLiveness: true
+frameInfo:
+  maxAlignment:    16
+machineFunctionInfo: {}
+body:             |
+  bb.0:
+  liveins: $lr
+    BL @goo
+  RET undef $lr
+...
+---
+name:            f2
+body:             |
+  bb.0:
+  liveins: $lr
+    BL @goo.content.123
+  RET undef $lr
+...
+---
+name:            f3
+body:             |
+  bb.0:
+  liveins: $lr
+    BL @zoo.content.123
+  RET undef $lr
+...
+---
+name:            f4
+body:             |
+  bb.0:
+  liveins: $lr
+    BL @goo.content.456
+  RET undef $lr
+...
+)MIR";
+  MachineModuleInfo MMI(TM.get());
+  M = parseMIR(*TM, MIRString, MMI);
+  ASSERT_TRUE(M);
+  auto *MF1 = MMI.getMachineFunction(*M->getFunction("f1"));
+  auto *MF2 = MMI.getMachineFunction(*M->getFunction("f2"));
+  auto *MF3 = MMI.getMachineFunction(*M->getFunction("f3"));
+  auto *MF4 = MMI.getMachineFunction(*M->getFunction("f4"));
+
+  // Do not ignore `.content.{number}`.
+  EXPECT_NE(stableHashValue(*MF1), stableHashValue(*MF2));
+  EXPECT_EQ(stableHashValue(*MF2), stableHashValue(*MF3))
+      << "Expect the same hash for the same suffix, `.content.{number}`";
+  // Different suffixes should result in different hashes.
+  EXPECT_NE(stableHashValue(*MF2), stableHashValue(*MF4));
+  EXPECT_NE(stableHashValue(*MF3), stableHashValue(*MF4));
+}

>From 896d07e38b7fc3a9633d2628c0f1b35b070001fb Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Wed, 28 Aug 2024 10:19:05 -0700
Subject: [PATCH 2/6] Address comments from aemerson

- Capitalize the comments
- Use Mapper.LegalInstrNumber
- Remove dead code (Sequence)
---
 llvm/lib/CodeGen/MachineOutliner.cpp | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineOutliner.cpp b/llvm/lib/CodeGen/MachineOutliner.cpp
index 7736df2def77bc..cfabc6c6edc509 100644
--- a/llvm/lib/CodeGen/MachineOutliner.cpp
+++ b/llvm/lib/CodeGen/MachineOutliner.cpp
@@ -664,26 +664,23 @@ static std::vector<MatchedEntry> getMatchedEntries(InstructionMapper &Mapper) {
   auto &UnsignedVec = Mapper.UnsignedVec;
 
   std::vector<MatchedEntry> MatchedEntries;
-  std::vector<stable_hash> Sequence;
   auto Size = UnsignedVec.size();
 
   // Get the global outlined hash tree built from the previous run.
   assert(cgdata::hasOutlinedHashTree());
   const auto *RootNode = cgdata::getOutlinedHashTree()->getRoot();
   for (size_t I = 0; I < Size; ++I) {
-    // skip the invalid mapping that represents a large negative value.
-    if (UnsignedVec[I] >= Size)
+    // Skip the invalid mapping.
+    if (UnsignedVec[I] >= Mapper.LegalInstrNumber)
       continue;
     const MachineInstr &MI = *InstrList[I];
-    // skip debug instructions as we did for the outlined function.
+    // Skip debug instructions as we did for the outlined function.
     if (MI.isDebugInstr())
       continue;
-    // skip the empty hash value.
+    // Skip the empty hash value.
     stable_hash StableHashI = stableHashValue(MI);
     if (!StableHashI)
       continue;
-    Sequence.clear();
-    Sequence.push_back(StableHashI);
 
     const HashNode *LastNode = followHashNode(StableHashI, RootNode);
     if (!LastNode)
@@ -691,14 +688,14 @@ static std::vector<MatchedEntry> getMatchedEntries(InstructionMapper &Mapper) {
 
     size_t J = I + 1;
     for (; J < Size; ++J) {
-      // break on the invalid mapping that represents a large negative value.
-      if (UnsignedVec[J] >= Size)
+      // Break on the invalid mapping.
+      if (UnsignedVec[J] >= Mapper.LegalInstrNumber)
         break;
-      // ignore debug instructions as we did for the outlined function.
+      // Skip debug instructions as we did for the outlined function.
       const MachineInstr &MJ = *InstrList[J];
       if (MJ.isDebugInstr())
         continue;
-      // break on the empty hash value.
+      // Break on the empty hash value.
       stable_hash StableHashJ = stableHashValue(MJ);
       if (!StableHashJ)
         break;
@@ -708,7 +705,6 @@ static std::vector<MatchedEntry> getMatchedEntries(InstructionMapper &Mapper) {
 
       // Even with a match ending with a terminal, we continue finding
       // matches to populate all candidates.
-      Sequence.push_back(StableHashJ);
       auto Count = LastNode->Terminals;
       if (Count)
         MatchedEntries.push_back({I, J - I + 1, *Count});

>From 86c98a16810f821fac6f43e257d2933aba112a67 Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Thu, 25 Apr 2024 22:20:48 -0700
Subject: [PATCH 3/6] [CGData] LLD for MachO

---
 lld/MachO/Config.h               |  1 +
 lld/MachO/Driver.cpp             | 39 ++++++++++++++
 lld/MachO/InputSection.h         |  1 +
 lld/MachO/Options.td             |  2 +
 lld/test/MachO/cgdata-generate.s | 92 ++++++++++++++++++++++++++++++++
 5 files changed, 135 insertions(+)
 create mode 100644 lld/test/MachO/cgdata-generate.s

diff --git a/lld/MachO/Config.h b/lld/MachO/Config.h
index 5beb0662ba7274..d87a93d861b59c 100644
--- a/lld/MachO/Config.h
+++ b/lld/MachO/Config.h
@@ -209,6 +209,7 @@ struct Configuration {
   std::vector<SectionAlign> sectionAlignments;
   std::vector<SegmentProtection> segmentProtections;
   bool ltoDebugPassManager = false;
+  llvm::StringRef codegenDataGeneratePath;
   bool csProfileGenerate = false;
   llvm::StringRef csProfilePath;
   bool pgoWarnMismatch;
diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp
index 6a1ff96ed65697..3c2a121e49531e 100644
--- a/lld/MachO/Driver.cpp
+++ b/lld/MachO/Driver.cpp
@@ -36,6 +36,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/BinaryFormat/MachO.h"
 #include "llvm/BinaryFormat/Magic.h"
+#include "llvm/CGData/CodeGenDataWriter.h"
 #include "llvm/Config/llvm-config.h"
 #include "llvm/LTO/LTO.h"
 #include "llvm/Object/Archive.h"
@@ -1316,6 +1317,38 @@ static void gatherInputSections() {
   }
 }
 
+static void codegenDataGenerate() {
+  TimeTraceScope timeScope("Generating codegen data");
+
+  OutlinedHashTreeRecord globalOutlineRecord;
+  for (ConcatInputSection *isec : inputSections) {
+    if (isec->getSegName() == segment_names::data &&
+        isec->getName() == section_names::outlinedHashTree) {
+      // Read outlined hash tree from each section
+      OutlinedHashTreeRecord localOutlineRecord;
+      auto *data = isec->data.data();
+      localOutlineRecord.deserialize(data);
+
+      // Merge it to the global hash tree.
+      globalOutlineRecord.merge(localOutlineRecord);
+    }
+  }
+
+  CodeGenDataWriter Writer;
+  if (!globalOutlineRecord.empty())
+    Writer.addRecord(globalOutlineRecord);
+
+  std::error_code EC;
+  auto fileName = config->codegenDataGeneratePath;
+  assert(!fileName.empty());
+  raw_fd_ostream Output(fileName, EC, sys::fs::OF_None);
+  if (EC)
+    error("fail to create raw_fd_ostream");
+
+  if (auto E = Writer.write(Output))
+    error("fail to write CGData");
+}
+
 static void foldIdenticalLiterals() {
   TimeTraceScope timeScope("Fold identical literals");
   // We always create a cStringSection, regardless of whether dedupLiterals is
@@ -1753,6 +1786,8 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
     config->ignoreAutoLinkOptions.insert(arg->getValue());
   config->strictAutoLink = args.hasArg(OPT_strict_auto_link);
   config->ltoDebugPassManager = args.hasArg(OPT_lto_debug_pass_manager);
+  config->codegenDataGeneratePath =
+      args.getLastArgValue(OPT_codegen_data_generate_path);
   config->csProfileGenerate = args.hasArg(OPT_cs_profile_generate);
   config->csProfilePath = args.getLastArgValue(OPT_cs_profile_path);
   config->pgoWarnMismatch =
@@ -2078,6 +2113,10 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
     }
 
     gatherInputSections();
+
+    if (!config->codegenDataGeneratePath.empty())
+      codegenDataGenerate();
+
     if (config->callGraphProfileSort)
       priorityBuilder.extractCallGraphProfile();
 
diff --git a/lld/MachO/InputSection.h b/lld/MachO/InputSection.h
index 0f389e50425a32..bb41cc9119aab4 100644
--- a/lld/MachO/InputSection.h
+++ b/lld/MachO/InputSection.h
@@ -353,6 +353,7 @@ constexpr const char objcMethname[] = "__objc_methname";
 constexpr const char objcNonLazyCatList[] = "__objc_nlcatlist";
 constexpr const char objcNonLazyClassList[] = "__objc_nlclslist";
 constexpr const char objcProtoList[] = "__objc_protolist";
+constexpr const char outlinedHashTree[] = "__llvm_outline";
 constexpr const char pageZero[] = "__pagezero";
 constexpr const char pointers[] = "__pointers";
 constexpr const char rebase[] = "__rebase";
diff --git a/lld/MachO/Options.td b/lld/MachO/Options.td
index 9c9570cdbeb05c..1a2406f1567546 100644
--- a/lld/MachO/Options.td
+++ b/lld/MachO/Options.td
@@ -157,6 +157,8 @@ def no_objc_category_merging : Flag<["-"], "no_objc_category_merging">,
     Group<grp_lld>;
 def lto_debug_pass_manager: Flag<["--"], "lto-debug-pass-manager">,
     HelpText<"Debug new pass manager">, Group<grp_lld>;
+def codegen_data_generate_path : Joined<["--"], "codegen-data-generate-path=">,
+    HelpText<"Codegen data file path">, Group<grp_lld>;
 def cs_profile_generate: Flag<["--"], "cs-profile-generate">,
     HelpText<"Perform context sensitive PGO instrumentation">, Group<grp_lld>;
 def cs_profile_path: Joined<["--"], "cs-profile-path=">,
diff --git a/lld/test/MachO/cgdata-generate.s b/lld/test/MachO/cgdata-generate.s
new file mode 100644
index 00000000000000..3726efa42e98bd
--- /dev/null
+++ b/lld/test/MachO/cgdata-generate.s
@@ -0,0 +1,92 @@
+# REQUIRES: aarch64
+
+# RUN: rm -rf %t; split-file %s %t
+
+# RUN: llvm-cgdata --convert --format binary %t/raw-1.cgtext -o %t/raw-1.cgdata
+# RUN: od -t x1 -j 24 -An %t/raw-1.cgdata | tr -d '\n\r\t' | sed 's/ \+/ /g; s/^ *//; s/ *$//; s/ /,0x/g; s/^/0x/' > %t/raw-1-bytes.txt
+# RUN: sed "s/<RAW_1_BYTES>/$(cat %t/raw-1-bytes.txt)/g" %t/merge-1-template.s > %t/merge-1.s
+# RUN: llvm-cgdata --convert --format binary %t/raw-2.cgtext -o %t/raw-2.cgdata
+# RUN: od -t x1 -j 24 -An %t/raw-2.cgdata | tr -d '\n\r\t' | sed 's/ \+/ /g; s/^ *//; s/ *$//; s/ /,0x/g; s/^/0x/' > %t/raw-2-bytes.txt
+# RUN: sed "s/<RAW_2_BYTES>/$(cat %t/raw-2-bytes.txt)/g" %t/merge-2-template.s > %t/merge-2.s
+
+# RUN: llvm-mc -filetype obj -triple arm64-apple-darwin %t/merge-1.s -o %t/merge-1.o
+# RUN: llvm-mc -filetype obj -triple arm64-apple-darwin %t/merge-2.s -o %t/merge-2.o
+# RUN: llvm-mc -filetype obj -triple arm64-apple-darwin %t/main.s -o %t/main.o
+
+# This checks if the codegen data from the linker is identical to the merged codegen data
+# from each object file, which is obtained using the llvm-cgdata tool.
+# RUN: %no-arg-lld -dylib -arch arm64 -platform_version ios 14.0 15.0 -o %t/out \
+# RUN: %t/merge-1.o %t/merge-2.o %t/main.o --codegen-data-generate-path=%t/out-cgdata
+# RUN: llvm-cgdata --merge %t/merge-1.o %t/merge-2.o %t/main.o -o %t/merge-cgdata
+# RUN: diff %t/out-cgdata %t/merge-cgdata
+
+# Merge order doesn't matter. `main.o` is dropped due to missing __llvm_outline.
+# RUN: llvm-cgdata --merge %t/merge-2.o %t/merge-1.o -o %t/merge-cgdata-shuffle
+# RUN: diff %t/out-cgdata %t/merge-cgdata-shuffle
+
+# We can also generate the merged codegen data from the executable that is not dead-stripped.
+# RUN: llvm-objdump -h %t/out| FileCheck %s
+CHECK: __llvm_outline
+# RUN: llvm-cgdata --merge %t/out -o %t/merge-cgdata-exe
+# RUN: diff %t/merge-cgdata-exe %t/merge-cgdata
+
+# Dead-strip will remove __llvm_outline sections from the final executable.
+# But the codegen data is still correctly produced by the linker.
+# RUN: %no-arg-lld -dylib -arch arm64 -platform_version ios 14.0 15.0 -o %t/out-strip \
+# RUN: %t/merge-1.o %t/merge-2.o %t/main.o -dead_strip --codegen-data-generate-path=%t/out-cgdata-strip
+# RUN: llvm-cgdata --merge %t/merge-1.o %t/merge-2.o %t/main.o -o %t/merge-cgdata-strip
+# RUN: diff %t/out-cgdata-strip %t/merge-cgdata-strip
+# RUN: diff %t/out-cgdata-strip %t/merge-cgdata
+
+# Ensure no __llvm_outline section remains in the executable.
+# RUN: llvm-objdump -h %t/out-strip | FileCheck %s --check-prefix=STRIP
+STRIP-NOT: __llvm_outline
+
+#--- raw-1.cgtext
+:outlined_hash_tree
+0:
+  Hash:            0x0
+  Terminals:       0
+  SuccessorIds:    [ 1 ]
+1:
+  Hash:            0x1
+  Terminals:       0
+  SuccessorIds:    [ 2 ]
+2:
+  Hash:            0x2
+  Terminals:       4
+  SuccessorIds:    [  ]
+...
+
+#--- merge-1-template.s
+.section __DATA,__llvm_outline
+_data:
+.byte <RAW_1_BYTES>
+
+#--- raw-2.cgtext
+:outlined_hash_tree
+0:
+  Hash:            0x0
+  Terminals:       0
+  SuccessorIds:    [ 1 ]
+1:
+  Hash:            0x1
+  Terminals:       0
+  SuccessorIds:    [ 2 ]
+2:
+  Hash:            0x3
+  Terminals:       5
+  SuccessorIds:    [  ]
+...
+
+#--- merge-2-template.s
+.section __DATA,__llvm_outline
+_data:
+.byte <RAW_2_BYTES>
+
+#--- main.s
+.globl _main
+
+.text
+_main:
+  ret

>From beb010d7c17d294bf87e52e1a5ac5ff561d8d8da Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Fri, 26 Apr 2024 12:58:54 -0700
Subject: [PATCH 4/6] [CGData] Clang Options

---
 clang/include/clang/Driver/Options.td      | 12 ++++++
 clang/lib/Driver/ToolChains/CommonArgs.cpp | 27 +++++++++++++
 clang/lib/Driver/ToolChains/Darwin.cpp     | 46 ++++++++++++++++++++++
 clang/test/Driver/codegen-data.c           | 42 ++++++++++++++++++++
 4 files changed, 127 insertions(+)
 create mode 100644 clang/test/Driver/codegen-data.c

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 83cf753e824845..836a9949bf51c3 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1893,6 +1893,18 @@ def fprofile_selected_function_group :
   Visibility<[ClangOption, CC1Option]>, MetaVarName<"<i>">,
   HelpText<"Partition functions into N groups using -fprofile-function-groups and select only functions in group i to be instrumented. The valid range is 0 to N-1 inclusive">,
   MarshallingInfoInt<CodeGenOpts<"ProfileSelectedFunctionGroup">>;
+def fcodegen_data_generate : Joined<["-"], "fcodegen-data-generate">,
+    Group<f_Group>, Visibility<[ClangOption, CC1Option]>,
+    HelpText<"Emit codegen data into object file. LLD for MachO (for now) merges them into default.cgdata">;
+def fcodegen_data_generate_EQ : Joined<["-"], "fcodegen-data-generate=">,
+    Group<f_Group>, Visibility<[ClangOption, CC1Option]>, MetaVarName<"<directory>">,
+    HelpText<"Emit codegen data into object file. LLD for MachO (for now) merges them into <directory>/default.cgdata">;
+def fcodegen_data_use : Joined<["-"], "fcodegen-data-use">,
+    Group<f_Group>, Visibility<[ClangOption, CC1Option]>,
+    HelpText<"Use codegen data read from default.cgdata to optimize the binary">;
+def fcodegen_data_use_EQ : Joined<["-"], "fcodegen-data-use=">,
+    Group<f_Group>, Visibility<[ClangOption, CC1Option]>, MetaVarName<"<directory>">,
+    HelpText<"Use codegen data read from <directory>/default.cgdata to optimize the binary">;
 def fswift_async_fp_EQ : Joined<["-"], "fswift-async-fp=">,
     Group<f_Group>,
     Visibility<[ClangOption, CC1Option, CC1AsOption, CLOption]>,
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 0601016c3b14b8..41ff6ec1390e7a 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -2753,6 +2753,33 @@ void tools::addMachineOutlinerArgs(const Driver &D,
       addArg(Twine("-enable-machine-outliner=never"));
     }
   }
+
+  auto *CodeGenDataGenArg =
+      Args.getLastArg(options::OPT_fcodegen_data_generate,
+                      options::OPT_fcodegen_data_generate_EQ);
+  auto *CodeGenDataUseArg = Args.getLastArg(options::OPT_fcodegen_data_use,
+                                            options::OPT_fcodegen_data_use_EQ);
+
+  // We only allow one of them to be specified.
+  if (CodeGenDataGenArg && CodeGenDataUseArg)
+    D.Diag(diag::err_drv_argument_not_allowed_with)
+        << CodeGenDataGenArg->getAsString(Args)
+        << CodeGenDataUseArg->getAsString(Args);
+
+  // For codegen data gen, the output file is passed to the linker
+  // while a boolean flag is passed to the LLVM backend.
+  if (CodeGenDataGenArg)
+    addArg(Twine("-codegen-data-generate"));
+
+  // For codegen data use, the input file is passed to the LLVM backend.
+  if (CodeGenDataUseArg) {
+    SmallString<128> Path(CodeGenDataUseArg->getNumValues() == 0
+                              ? ""
+                              : CodeGenDataUseArg->getValue());
+    if (Path.empty() || llvm::sys::fs::is_directory(Path))
+      llvm::sys::path::append(Path, "default.cgdata");
+    addArg(Twine("-codegen-data-use-path=" + Path.str()));
+  }
 }
 
 void tools::addOpenMPDeviceRTL(const Driver &D,
diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp
index 5e7f9290e2009d..9e72e280109640 100644
--- a/clang/lib/Driver/ToolChains/Darwin.cpp
+++ b/clang/lib/Driver/ToolChains/Darwin.cpp
@@ -476,6 +476,19 @@ void darwin::Linker::AddLinkArgs(Compilation &C, const ArgList &Args,
         llvm::sys::path::append(Path, "default.profdata");
       CmdArgs.push_back(Args.MakeArgString(Twine("--cs-profile-path=") + Path));
     }
+
+    auto *CodeGenDataGenArg =
+        Args.getLastArg(options::OPT_fcodegen_data_generate,
+                        options::OPT_fcodegen_data_generate_EQ);
+    if (CodeGenDataGenArg) {
+      SmallString<128> Path(CodeGenDataGenArg->getNumValues() == 0
+                                ? ""
+                                : CodeGenDataGenArg->getValue());
+      if (Path.empty() || llvm::sys::fs::is_directory(Path))
+        llvm::sys::path::append(Path, "default.cgdata");
+      CmdArgs.push_back(
+          Args.MakeArgString(Twine("--codegen-data-generate-path=") + Path));
+    }
   }
 }
 
@@ -633,6 +646,39 @@ void darwin::Linker::ConstructJob(Compilation &C, const JobAction &JA,
   CmdArgs.push_back("-mllvm");
   CmdArgs.push_back("-enable-linkonceodr-outlining");
 
+  // Propagate codegen data flags to the linker for the LLVM backend.
+  auto *CodeGenDataGenArg =
+      Args.getLastArg(options::OPT_fcodegen_data_generate,
+                      options::OPT_fcodegen_data_generate_EQ);
+  auto *CodeGenDataUseArg = Args.getLastArg(options::OPT_fcodegen_data_use,
+                                            options::OPT_fcodegen_data_use_EQ);
+
+  // We only allow one of them to be specified.
+  const Driver &D = getToolChain().getDriver();
+  if (CodeGenDataGenArg && CodeGenDataUseArg)
+    D.Diag(diag::err_drv_argument_not_allowed_with)
+        << CodeGenDataGenArg->getAsString(Args)
+        << CodeGenDataUseArg->getAsString(Args);
+
+  // For codegen data gen, the output file is passed to the linker
+  // while a boolean flag is passed to the LLVM backend.
+  if (CodeGenDataGenArg) {
+    CmdArgs.push_back("-mllvm");
+    CmdArgs.push_back("-codegen-data-generate");
+  }
+
+  // For codegen data use, the input file is passed to the LLVM backend.
+  if (CodeGenDataUseArg) {
+    SmallString<128> Path(CodeGenDataUseArg->getNumValues() == 0
+                              ? ""
+                              : CodeGenDataUseArg->getValue());
+    if (Path.empty() || llvm::sys::fs::is_directory(Path))
+      llvm::sys::path::append(Path, "default.cgdata");
+    CmdArgs.push_back("-mllvm");
+    CmdArgs.push_back(
+        Args.MakeArgString("-codegen-data-use-path=" + Path.str()));
+  }
+
   // Setup statistics file output.
   SmallString<128> StatsFile =
       getStatsFileName(Args, Output, Inputs[0], getToolChain().getDriver());
diff --git a/clang/test/Driver/codegen-data.c b/clang/test/Driver/codegen-data.c
new file mode 100644
index 00000000000000..a72850afc59736
--- /dev/null
+++ b/clang/test/Driver/codegen-data.c
@@ -0,0 +1,42 @@
+// Verify that only one of the codegen-data flags is passed.
+// RUN: not %clang -### -S --target=aarch64-linux-gnu -fcodegen-data-generate -fcodegen-data-use %s 2>&1 | FileCheck %s --check-prefix=CONFLICT
+// RUN: not %clang -### -S --target=arm64-apple-darwin  -fcodegen-data-generate -fcodegen-data-use %s 2>&1 | FileCheck %s --check-prefix=CONFLICT
+// CONFLICT: error: invalid argument '-fcodegen-data-generate' not allowed with '-fcodegen-data-use'
+
+// Verify the codegen-data-generate (boolean) flag is passed to LLVM
+// RUN: %clang -### -S --target=aarch64-linux-gnu -fcodegen-data-generate %s  2>&1| FileCheck %s --check-prefix=GENERATE
+// RUN: %clang -### -S --target=arm64-apple-darwin -fcodegen-data-generate %s 2>&1| FileCheck %s --check-prefix=GENERATE
+// GENERATE: "-mllvm" "-codegen-data-generate"
+
+// Verify the codegen-data-use-path flag (with a default value) is passed to LLVM.
+// RUN: %clang -### -S --target=aarch64-linux-gnu -fcodegen-data-use %s 2>&1| FileCheck %s --check-prefix=USE
+// RUN: %clang -### -S --target=arm64-apple-darwin -fcodegen-data-use %s 2>&1| FileCheck %s --check-prefix=USE
+// RUN: mkdir -p %t.d/some/dir
+// RUN: %clang -### -S --target=aarch64-linux-gnu -fcodegen-data-use=%t.d/some/dir %s 2>&1 | FileCheck %s --check-prefix=USE-DIR
+// RUN: %clang -### -S --target=arm64-apple-darwin -fcodegen-data-use=%t.d/some/dir %s 2>&1 | FileCheck %s --check-prefix=USE-DIR
+// RUN: %clang -### -S --target=aarch64-linux-gnu -fcodegen-data-use=file %s 2>&1 | FileCheck %s --check-prefix=USE-FILE
+// RUN: %clang -### -S --target=arm64-apple-darwin -fcodegen-data-use=file %s 2>&1 | FileCheck %s --check-prefix=USE-FILE
+// USE: "-mllvm" "-codegen-data-use-path=default.cgdata"
+// USE-DIR: "-mllvm" "-codegen-data-use-path={{.*}}.d/some/dir{{/|\\\\}}default.cgdata"
+// USE-FILE: "-mllvm" "-codegen-data-use-path=file"
+
+// Verify the codegen-data-generate (boolean) flag with LTO.
+// RUN: %clang -### -flto --target=aarch64-linux-gnu -fcodegen-data-generate %s 2>&1 | FileCheck %s --check-prefix=GENERATE-LTO
+// GENERATE-LTO: {{ld(.exe)?"}}
+// GENERATE-LTO-SAME: "-plugin-opt=-codegen-data-generate"
+// RUN: %clang -### -flto --target=arm64-apple-darwin -fcodegen-data-generate %s 2>&1 | FileCheck %s --check-prefix=GENERATE-LTO-DARWIN
+// GENERATE-LTO-DARWIN: {{ld(.exe)?"}}
+// GENERATE-LTO-DARWIN-SAME: "-mllvm" "-codegen-data-generate"
+
+// Verify the codegen-data-use-path flag with LTO is passed to LLVM.
+// RUN: %clang -### -flto=thin --target=aarch64-linux-gnu -fcodegen-data-use %s 2>&1 | FileCheck %s --check-prefix=USE-LTO
+// USE-LTO: {{ld(.exe)?"}}
+// USE-LTO-SAME: "-plugin-opt=-codegen-data-use-path=default.cgdata"
+// RUN: %clang -### -flto=thin --target=arm64-apple-darwin -fcodegen-data-use %s 2>&1 | FileCheck %s --check-prefix=USE-LTO-DARWIN
+// USE-LTO-DARWIN: {{ld(.exe)?"}}
+// USE-LTO-DARWIN-SAME: "-mllvm" "-codegen-data-use-path=default.cgdata"
+
+// For now, LLD MachO supports generating the codegen data at link time.
+// RUN: %clang -### -fuse-ld=lld -B%S/Inputs/lld --target=arm64-apple-darwin -fcodegen-data-generate %s 2>&1 | FileCheck %s --check-prefix=GENERATE-LLD-DARWIN
+// GENERATE-LLD-DARWIN: {{ld(.exe)?"}}
+// GENERATE-LLD-DARWIN-SAME: "--codegen-data-generate-path=default.cgdata"

>From 3e0b91d7e5cc851b258bda130d267937a627e377 Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Fri, 26 Apr 2024 20:02:52 -0700
Subject: [PATCH 5/6] [ThinLTO][NFC] Prep for two-codegen rounds

---
 clang/lib/CodeGen/BackendUtil.cpp  |  8 ++--
 llvm/include/llvm/LTO/LTOBackend.h |  1 +
 llvm/lib/LTO/LTO.cpp               | 77 ++++++++++++++++--------------
 llvm/lib/LTO/LTOBackend.cpp        |  4 +-
 4 files changed, 48 insertions(+), 42 deletions(-)

diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index 026f16484c0949..5f37e019d0fd7e 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -1283,10 +1283,10 @@ static void runThinLTOBackend(
     Conf.CGFileType = getCodeGenFileType(Action);
     break;
   }
-  if (Error E =
-          thinBackend(Conf, -1, AddStream, *M, *CombinedIndex, ImportList,
-                      ModuleToDefinedGVSummaries[M->getModuleIdentifier()],
-                      /* ModuleMap */ nullptr, CGOpts.CmdArgs)) {
+  if (Error E = thinBackend(
+          Conf, -1, AddStream, *M, *CombinedIndex, ImportList,
+          ModuleToDefinedGVSummaries[M->getModuleIdentifier()],
+          /* ModuleMap */ nullptr, Conf.CodeGenOnly, CGOpts.CmdArgs)) {
     handleAllErrors(std::move(E), [&](ErrorInfoBase &EIB) {
       errs() << "Error running ThinLTO backend: " << EIB.message() << '\n';
     });
diff --git a/llvm/include/llvm/LTO/LTOBackend.h b/llvm/include/llvm/LTO/LTOBackend.h
index de89f4bb10dff2..8516398510d4b8 100644
--- a/llvm/include/llvm/LTO/LTOBackend.h
+++ b/llvm/include/llvm/LTO/LTOBackend.h
@@ -56,6 +56,7 @@ Error thinBackend(const Config &C, unsigned Task, AddStreamFn AddStream,
                   const FunctionImporter::ImportMapTy &ImportList,
                   const GVSummaryMapTy &DefinedGlobals,
                   MapVector<StringRef, BitcodeModule> *ModuleMap,
+                  bool CodeGenOnly,
                   const std::vector<uint8_t> &CmdArgs = std::vector<uint8_t>());
 
 Error finalizeOptimizationRemarks(
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 09dfec03cb0c34..ab369fb87aa3ba 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1457,7 +1457,7 @@ class InProcessThinBackend : public ThinBackendProc {
           GlobalValue::getGUID(GlobalValue::dropLLVMManglingEscape(Name)));
   }
 
-  Error runThinLTOBackendThread(
+  virtual Error runThinLTOBackendThread(
       AddStreamFn AddStream, FileCache Cache, unsigned Task, BitcodeModule BM,
       ModuleSummaryIndex &CombinedIndex,
       const FunctionImporter::ImportMapTy &ImportList,
@@ -1472,7 +1472,8 @@ class InProcessThinBackend : public ThinBackendProc {
         return MOrErr.takeError();
 
       return thinBackend(Conf, Task, AddStream, **MOrErr, CombinedIndex,
-                         ImportList, DefinedGlobals, &ModuleMap);
+                         ImportList, DefinedGlobals, &ModuleMap,
+                         Conf.CodeGenOnly);
     };
 
     auto ModuleID = BM.getModuleIdentifier();
@@ -1841,45 +1842,49 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache,
 
   TimeTraceScopeExit.release();
 
-  std::unique_ptr<ThinBackendProc> BackendProc =
-      ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries,
-                      AddStream, Cache);
-
   auto &ModuleMap =
       ThinLTO.ModulesToCompile ? *ThinLTO.ModulesToCompile : ThinLTO.ModuleMap;
 
-  auto ProcessOneModule = [&](int I) -> Error {
-    auto &Mod = *(ModuleMap.begin() + I);
-    // Tasks 0 through ParallelCodeGenParallelismLevel-1 are reserved for
-    // combined module and parallel code generation partitions.
-    return BackendProc->start(RegularLTO.ParallelCodeGenParallelismLevel + I,
-                              Mod.second, ImportLists[Mod.first],
-                              ExportLists[Mod.first], ResolvedODR[Mod.first],
-                              ThinLTO.ModuleMap);
+  auto RunBackends = [&](ThinBackendProc *BackendProcess) -> Error {
+    auto ProcessOneModule = [&](int I) -> Error {
+      auto &Mod = *(ModuleMap.begin() + I);
+      // Tasks 0 through ParallelCodeGenParallelismLevel-1 are reserved for
+      // combined module and parallel code generation partitions.
+      return BackendProcess->start(
+          RegularLTO.ParallelCodeGenParallelismLevel + I, Mod.second,
+          ImportLists[Mod.first], ExportLists[Mod.first],
+          ResolvedODR[Mod.first], ThinLTO.ModuleMap);
+    };
+
+    if (BackendProcess->getThreadCount() == 1) {
+      // Process the modules in the order they were provided on the
+      // command-line. It is important for this codepath to be used for
+      // WriteIndexesThinBackend, to ensure the emitted LinkedObjectsFile lists
+      // ThinLTO objects in the same order as the inputs, which otherwise would
+      // affect the final link order.
+      for (int I = 0, E = ModuleMap.size(); I != E; ++I)
+        if (Error E = ProcessOneModule(I))
+          return E;
+    } else {
+      // When executing in parallel, process largest bitsize modules first to
+      // improve parallelism, and avoid starving the thread pool near the end.
+      // This saves about 15 sec on a 36-core machine while linking `clang.exe`
+      // (out of 100 sec).
+      std::vector<BitcodeModule *> ModulesVec;
+      ModulesVec.reserve(ModuleMap.size());
+      for (auto &Mod : ModuleMap)
+        ModulesVec.push_back(&Mod.second);
+      for (int I : generateModulesOrdering(ModulesVec))
+        if (Error E = ProcessOneModule(I))
+          return E;
+    }
+    return BackendProcess->wait();
   };
 
-  if (BackendProc->getThreadCount() == 1) {
-    // Process the modules in the order they were provided on the command-line.
-    // It is important for this codepath to be used for WriteIndexesThinBackend,
-    // to ensure the emitted LinkedObjectsFile lists ThinLTO objects in the same
-    // order as the inputs, which otherwise would affect the final link order.
-    for (int I = 0, E = ModuleMap.size(); I != E; ++I)
-      if (Error E = ProcessOneModule(I))
-        return E;
-  } else {
-    // When executing in parallel, process largest bitsize modules first to
-    // improve parallelism, and avoid starving the thread pool near the end.
-    // This saves about 15 sec on a 36-core machine while link `clang.exe` (out
-    // of 100 sec).
-    std::vector<BitcodeModule *> ModulesVec;
-    ModulesVec.reserve(ModuleMap.size());
-    for (auto &Mod : ModuleMap)
-      ModulesVec.push_back(&Mod.second);
-    for (int I : generateModulesOrdering(ModulesVec))
-      if (Error E = ProcessOneModule(I))
-        return E;
-  }
-  return BackendProc->wait();
+  std::unique_ptr<ThinBackendProc> BackendProc =
+      ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries,
+                      AddStream, Cache);
+  return RunBackends(BackendProc.get());
 }
 
 Expected<std::unique_ptr<ToolOutputFile>> lto::setupLLVMOptimizationRemarks(
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index 4e58cd369c3ac9..5e7c1b6f684fbf 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -565,7 +565,7 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
                        const FunctionImporter::ImportMapTy &ImportList,
                        const GVSummaryMapTy &DefinedGlobals,
                        MapVector<StringRef, BitcodeModule> *ModuleMap,
-                       const std::vector<uint8_t> &CmdArgs) {
+                       bool CodeGenOnly, const std::vector<uint8_t> &CmdArgs) {
   Expected<const Target *> TOrErr = initAndLookupTarget(Conf, Mod);
   if (!TOrErr)
     return TOrErr.takeError();
@@ -586,7 +586,7 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
   Mod.setPartialSampleProfileRatio(CombinedIndex);
 
   LLVM_DEBUG(dbgs() << "Running ThinLTO\n");
-  if (Conf.CodeGenOnly) {
+  if (CodeGenOnly) {
     codegen(Conf, TM.get(), AddStream, Task, Mod, CombinedIndex);
     return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
   }

>From 62c6a717d8b935166dd2817dad8238bdcba3da76 Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Sat, 27 Apr 2024 07:52:51 -0700
Subject: [PATCH 6/6] [CGData][ThinLTO] Global Outlining with Two-CodeGen
 Rounds

---
 clang/include/clang/Driver/Options.td         |   6 +
 clang/lib/Driver/ToolChains/CommonArgs.cpp    |  25 ++++
 clang/lib/Driver/ToolChains/Darwin.cpp        |  26 +++-
 clang/test/Driver/codegen-data.c              |  18 +++
 llvm/include/llvm/CGData/CodeGenData.h        |  13 ++
 llvm/lib/CGData/CodeGenData.cpp               |  69 ++++++++++-
 llvm/lib/LTO/LTO.cpp                          | 112 +++++++++++++++++-
 llvm/lib/LTO/LTOBackend.cpp                   |   9 ++
 .../AArch64/cgdata-read-single-outline.ll     |  42 +++++++
 .../test/ThinLTO/AArch64/cgdata-two-rounds.ll |  95 +++++++++++++++
 llvm/test/ThinLTO/AArch64/lit.local.cfg       |   2 +
 11 files changed, 411 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/ThinLTO/AArch64/cgdata-read-single-outline.ll
 create mode 100644 llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll
 create mode 100644 llvm/test/ThinLTO/AArch64/lit.local.cfg

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 836a9949bf51c3..0473adf435c881 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1905,6 +1905,12 @@ def fcodegen_data_use : Joined<["-"], "fcodegen-data-use">,
 def fcodegen_data_use_EQ : Joined<["-"], "fcodegen-data-use=">,
     Group<f_Group>, Visibility<[ClangOption, CC1Option]>, MetaVarName<"<directory>">,
     HelpText<"Use codegen data read from <directory>/default.cgdata to optimize the binary">;
+def fcodegen_data_thinlto_two_rounds : Joined<["-"], "fcodegen-data-thinlto-two-rounds">,
+    Group<f_Group>, Visibility<[ClangOption, CC1Option]>,
+    HelpText<"ThinLTO runs codegen twice by serializing and deserializing IRs to and from a temp directory. Applies to ThinLTO bitcodes only">;
+def fcodegen_data_thinlto_two_rounds_EQ : Joined<["-"], "fcodegen-data-thinlto-two-rounds=">,
+    Group<f_Group>, Visibility<[ClangOption, CC1Option]>, MetaVarName<"<directory>">,
+    HelpText<"ThinLTO runs codegen twice by serializing and deserializing IRs to and from <directory>. Applies to ThinLTO bitcodes only">;
 def fswift_async_fp_EQ : Joined<["-"], "fswift-async-fp=">,
     Group<f_Group>,
     Visibility<[ClangOption, CC1Option, CC1AsOption, CLOption]>,
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 41ff6ec1390e7a..2fdafdf7761aab 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -2759,12 +2759,23 @@ void tools::addMachineOutlinerArgs(const Driver &D,
                       options::OPT_fcodegen_data_generate_EQ);
   auto *CodeGenDataUseArg = Args.getLastArg(options::OPT_fcodegen_data_use,
                                             options::OPT_fcodegen_data_use_EQ);
+  auto *CodeGenDataTwoRoundsArg =
+      Args.getLastArg(options::OPT_fcodegen_data_thinlto_two_rounds,
+                      options::OPT_fcodegen_data_thinlto_two_rounds_EQ);
 
   // We only allow one of them to be specified.
   if (CodeGenDataGenArg && CodeGenDataUseArg)
     D.Diag(diag::err_drv_argument_not_allowed_with)
         << CodeGenDataGenArg->getAsString(Args)
         << CodeGenDataUseArg->getAsString(Args);
+  if (CodeGenDataGenArg && CodeGenDataTwoRoundsArg)
+    D.Diag(diag::err_drv_argument_not_allowed_with)
+        << CodeGenDataGenArg->getAsString(Args)
+        << CodeGenDataTwoRoundsArg->getAsString(Args);
+  if (CodeGenDataUseArg && CodeGenDataTwoRoundsArg)
+    D.Diag(diag::err_drv_argument_not_allowed_with)
+        << CodeGenDataUseArg->getAsString(Args)
+        << CodeGenDataTwoRoundsArg->getAsString(Args);
 
   // For codegen data gen, the output file is passed to the linker
   // while a boolean flag is passed to the LLVM backend.
@@ -2780,6 +2791,20 @@ void tools::addMachineOutlinerArgs(const Driver &D,
       llvm::sys::path::append(Path, "default.cgdata");
     addArg(Twine("-codegen-data-use-path=" + Path.str()));
   }
+
+  // For codegen data thinlto two rounds, the output directory needs to
+  // be passed. A temp directory is created if no directory is given.
+  // In fact, this flag is needed for the thinlto's link flag only.
+  if (CodeGenDataTwoRoundsArg) {
+    SmallString<128> Path(CodeGenDataTwoRoundsArg->getNumValues() == 0
+                              ? ""
+                              : CodeGenDataTwoRoundsArg->getValue());
+    if (!Path.empty() && !llvm::sys::fs::is_directory(Path))
+      D.Diag(diag::err_drv_unable_to_set_working_directory) << Path.str();
+    if (Path.empty())
+      llvm::sys::fs::createUniqueDirectory("cgdata", Path);
+    addArg(Twine("-codegen-data-thinlto-two-rounds-path=" + Path.str()));
+  }
 }
 
 void tools::addOpenMPDeviceRTL(const Driver &D,
diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp
index 9e72e280109640..c590effc543d17 100644
--- a/clang/lib/Driver/ToolChains/Darwin.cpp
+++ b/clang/lib/Driver/ToolChains/Darwin.cpp
@@ -652,6 +652,9 @@ void darwin::Linker::ConstructJob(Compilation &C, const JobAction &JA,
                       options::OPT_fcodegen_data_generate_EQ);
   auto *CodeGenDataUseArg = Args.getLastArg(options::OPT_fcodegen_data_use,
                                             options::OPT_fcodegen_data_use_EQ);
+  auto *CodeGenDataTwoRoundsArg =
+      Args.getLastArg(options::OPT_fcodegen_data_thinlto_two_rounds,
+                      options::OPT_fcodegen_data_thinlto_two_rounds_EQ);
 
   // We only allow one of them to be specified.
   const Driver &D = getToolChain().getDriver();
@@ -659,9 +662,17 @@ void darwin::Linker::ConstructJob(Compilation &C, const JobAction &JA,
     D.Diag(diag::err_drv_argument_not_allowed_with)
         << CodeGenDataGenArg->getAsString(Args)
         << CodeGenDataUseArg->getAsString(Args);
+  if (CodeGenDataGenArg && CodeGenDataTwoRoundsArg)
+    D.Diag(diag::err_drv_argument_not_allowed_with)
+        << CodeGenDataGenArg->getAsString(Args)
+        << CodeGenDataTwoRoundsArg->getAsString(Args);
+  if (CodeGenDataUseArg && CodeGenDataTwoRoundsArg)
+    D.Diag(diag::err_drv_argument_not_allowed_with)
+        << CodeGenDataUseArg->getAsString(Args)
+        << CodeGenDataTwoRoundsArg->getAsString(Args);
 
   // For codegen data gen, the output file is passed to the linker
-  // while a boolean flag is passed to the LLVM backend.
+  // while a boolean flag is passed to the LTO backend.
   if (CodeGenDataGenArg) {
     CmdArgs.push_back("-mllvm");
     CmdArgs.push_back("-codegen-data-generate");
@@ -679,6 +690,19 @@ void darwin::Linker::ConstructJob(Compilation &C, const JobAction &JA,
         Args.MakeArgString("-codegen-data-use-path=" + Path.str()));
   }
 
+  // For codegen data thinlto two rounds, the output directory needs to
+  // be passed. A temp directory is created if no directory is given.
+  if (CodeGenDataTwoRoundsArg) {
+    SmallString<128> Path(CodeGenDataTwoRoundsArg->getNumValues() == 0
+                              ? ""
+                              : CodeGenDataTwoRoundsArg->getValue());
+    if (Path.empty())
+      llvm::sys::fs::createUniqueDirectory("cgdata", Path);
+    CmdArgs.push_back("-mllvm");
+    CmdArgs.push_back(Args.MakeArgString(
+        "-codegen-data-thinlto-two-rounds-path=" + Path.str()));
+  }
+
   // Setup statistics file output.
   SmallString<128> StatsFile =
       getStatsFileName(Args, Output, Inputs[0], getToolChain().getDriver());
diff --git a/clang/test/Driver/codegen-data.c b/clang/test/Driver/codegen-data.c
index a72850afc59736..ebfa19f1dcc927 100644
--- a/clang/test/Driver/codegen-data.c
+++ b/clang/test/Driver/codegen-data.c
@@ -2,6 +2,24 @@
 // RUN: not %clang -### -S --target=aarch64-linux-gnu -fcodegen-data-generate -fcodegen-data-use %s 2>&1 | FileCheck %s --check-prefix=CONFLICT
 // RUN: not %clang -### -S --target=arm64-apple-darwin  -fcodegen-data-generate -fcodegen-data-use %s 2>&1 | FileCheck %s --check-prefix=CONFLICT
 // CONFLICT: error: invalid argument '-fcodegen-data-generate' not allowed with '-fcodegen-data-use'
+// RUN: not %clang -### -S --target=aarch64-linux-gnu -fcodegen-data-generate -fcodegen-data-thinlto-two-rounds %s 2>&1 | FileCheck %s --check-prefix=CONFLICT-2
+// RUN: not %clang -### -S --target=arm64-apple-darwin -fcodegen-data-generate -fcodegen-data-thinlto-two-rounds %s 2>&1 | FileCheck %s --check-prefix=CONFLICT-2
+// CONFLICT-2: error: invalid argument '-fcodegen-data-generate' not allowed with '-fcodegen-data-thinlto-two-rounds'
+// RUN: not %clang -### -S --target=aarch64-linux-gnu -fcodegen-data-use -fcodegen-data-thinlto-two-rounds %s 2>&1 | FileCheck %s --check-prefix=CONFLICT-3
+// RUN: not %clang -### -S --target=arm64-apple-darwin -fcodegen-data-use -fcodegen-data-thinlto-two-rounds %s 2>&1 | FileCheck %s --check-prefix=CONFLICT-3
+// CONFLICT-3: error: invalid argument '-fcodegen-data-use' not allowed with '-fcodegen-data-thinlto-two-rounds'
+
+// Verify the codegen-data-thinlto-two-rounds-path must have a valid directory path passed to LLVM.
+// RUN: not %clang -### -S --target=aarch64-linux-gnu -fcodegen-data-thinlto-two-rounds=file %s 2>&1 | FileCheck %s --check-prefix=ROUND-FILE
+// RUN: not %clang -### -S --target=arm64-apple-darwin -fcodegen-data-thinlto-two-rounds=file %s 2>&1 | FileCheck %s --check-prefix=ROUND-FILE
+// ROUND-FILE: error: unable to set working directory: file
+// RUN: %clang -### -S --target=aarch64-linux-gnu -fcodegen-data-thinlto-two-rounds %s 2>&1 | FileCheck %s --check-prefix=ROUND-DIRTEMP
+// RUN: %clang -### -S --target=arm64-apple-darwin -fcodegen-data-thinlto-two-rounds %s 2>&1 | FileCheck %s --check-prefix=ROUND-DIRTEMP
+// ROUND-DIRTEMP: "-mllvm" "-codegen-data-thinlto-two-rounds-path={{.*}}"
+// RUN: mkdir -p %t.d/some
+// RUN: %clang -### -S --target=aarch64-linux-gnu -fcodegen-data-thinlto-two-rounds=%t.d/some %s 2>&1 | FileCheck %s --check-prefix=ROUND-DIR
+// RUN: %clang -### -S --target=arm64-apple-darwin -fcodegen-data-thinlto-two-rounds=%t.d/some %s 2>&1 | FileCheck %s --check-prefix=ROUND-DIR
+// ROUND-DIR: "-mllvm" "-codegen-data-thinlto-two-rounds-path={{.*}}.d/some"
 
 // Verify the codegen-data-generate (boolean) flag is passed to LLVM
 // RUN: %clang -### -S --target=aarch64-linux-gnu -fcodegen-data-generate %s  2>&1| FileCheck %s --check-prefix=GENERATE
diff --git a/llvm/include/llvm/CGData/CodeGenData.h b/llvm/include/llvm/CGData/CodeGenData.h
index 84133a433170fe..d083d3b3dcd212 100644
--- a/llvm/include/llvm/CGData/CodeGenData.h
+++ b/llvm/include/llvm/CGData/CodeGenData.h
@@ -164,6 +164,19 @@ publishOutlinedHashTree(std::unique_ptr<OutlinedHashTree> HashTree) {
   CodeGenData::getInstance().publishOutlinedHashTree(std::move(HashTree));
 }
 
+/// Save the current module before the first codegen round.
+void saveModuleForTwoRounds(const Module &TheModule, unsigned Task);
+
+/// Load the current module before the second codegen round.
+std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule,
+                                               unsigned Task,
+                                               LLVMContext &Context);
+
+/// Merge the codegen data from the input files in scratch vector in ThinLTO
+/// two-codegen rounds.
+Error mergeCodeGenData(
+    const std::unique_ptr<std::vector<llvm::SmallString<0>>> InputFiles);
+
 void warn(Error E, StringRef Whence = "");
 void warn(Twine Message, std::string Whence = "", std::string Hint = "");
 
diff --git a/llvm/lib/CGData/CodeGenData.cpp b/llvm/lib/CGData/CodeGenData.cpp
index 55d2504231c744..1ddd63860459b5 100644
--- a/llvm/lib/CGData/CodeGenData.cpp
+++ b/llvm/lib/CGData/CodeGenData.cpp
@@ -30,6 +30,10 @@ cl::opt<bool>
 cl::opt<std::string>
     CodeGenDataUsePath("codegen-data-use-path", cl::init(""), cl::Hidden,
                        cl::desc("File path to where .cgdata file is read"));
+cl::opt<std::string> CodeGenDataThinLTOTwoRoundsPath(
+    "codegen-data-thinlto-two-rounds-path", cl::init(""), cl::Hidden,
+    cl::desc("Directory path to where the optimized bitcodes are saved and "
+             "restored."));
 
 static std::string getCGDataErrString(cgdata_error Err,
                                       const std::string &ErrMsg = "") {
@@ -139,7 +143,7 @@ CodeGenData &CodeGenData::getInstance() {
   std::call_once(CodeGenData::OnceFlag, []() {
     Instance = std::unique_ptr<CodeGenData>(new CodeGenData());
 
-    if (CodeGenDataGenerate)
+    if (CodeGenDataGenerate || !CodeGenDataThinLTOTwoRoundsPath.empty())
       Instance->EmitCGData = true;
     else if (!CodeGenDataUsePath.empty()) {
       // Initialize the global CGData if the input file name is given.
@@ -215,6 +219,69 @@ void warn(Error E, StringRef Whence) {
   }
 }
 
+static std::string getPath(const std::string &Dir, unsigned Task) {
+  return (Dir + "/" + llvm::Twine(Task) + ".saved_copy.bc").str();
+}
+
+void saveModuleForTwoRounds(const Module &TheModule, unsigned Task) {
+  assert(sys::fs::is_directory(CodeGenDataThinLTOTwoRoundsPath));
+  std::string Path = getPath(CodeGenDataThinLTOTwoRoundsPath, Task);
+  std::error_code EC;
+  raw_fd_ostream OS(Path, EC, sys::fs::OpenFlags::OF_None);
+  if (EC)
+    report_fatal_error(Twine("Failed to open ") + Path +
+                       " to save optimized bitcode\n");
+  WriteBitcodeToFile(TheModule, OS, /* ShouldPreserveUseListOrder */ true);
+}
+
+std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule,
+                                               unsigned Task,
+                                               LLVMContext &Context) {
+  assert(sys::fs::is_directory(CodeGenDataThinLTOTwoRoundsPath));
+  std::string Path = getPath(CodeGenDataThinLTOTwoRoundsPath, Task);
+  auto FileOrError = MemoryBuffer::getFile(Path);
+  if (!FileOrError)
+    report_fatal_error(Twine("Failed to open ") + Path +
+                       " to load optimized bitcode\n");
+
+  std::unique_ptr<MemoryBuffer> FileBuffer = std::move(*FileOrError);
+  auto RestoredModule = llvm::parseBitcodeFile(*FileBuffer, Context);
+  if (!RestoredModule)
+    report_fatal_error(Twine("Failed to parse optimized bitcode loaded from ") +
+                       Path + "\n");
+
+  // Restore the original module identifier.
+  (*RestoredModule)->setModuleIdentifier(OrigModule.getModuleIdentifier());
+  return std::move(*RestoredModule);
+}
+
+Error mergeCodeGenData(
+    const std::unique_ptr<std::vector<llvm::SmallString<0>>> InputFiles) {
+
+  OutlinedHashTreeRecord GlobalOutlineRecord;
+  for (auto &InputFile : *(InputFiles)) {
+    if (InputFile.empty())
+      continue;
+    StringRef File = StringRef(InputFile.data(), InputFile.size());
+    std::unique_ptr<MemoryBuffer> Buffer = MemoryBuffer::getMemBuffer(
+        File, "in-memory object file", /*RequiresNullTerminator=*/false);
+    Expected<std::unique_ptr<object::ObjectFile>> BinOrErr =
+        object::ObjectFile::createObjectFile(Buffer->getMemBufferRef());
+    if (!BinOrErr)
+      return BinOrErr.takeError();
+
+    std::unique_ptr<object::ObjectFile> &Obj = BinOrErr.get();
+    if (auto E = CodeGenDataReader::mergeFromObjectFile(Obj.get(),
+                                                        GlobalOutlineRecord))
+      return E;
+  }
+
+  if (!GlobalOutlineRecord.empty())
+    cgdata::publishOutlinedHashTree(std::move(GlobalOutlineRecord.HashTree));
+
+  return Error::success();
+}
+
 } // end namespace cgdata
 
 } // end namespace llvm
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index ab369fb87aa3ba..775ee3b1c6823d 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -21,6 +21,7 @@
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Bitcode/BitcodeReader.h"
 #include "llvm/Bitcode/BitcodeWriter.h"
+#include "llvm/CGData/CodeGenData.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/Config/llvm-config.h"
 #include "llvm/IR/AutoUpgrade.h"
@@ -72,6 +73,10 @@ static cl::opt<bool>
     DumpThinCGSCCs("dump-thin-cg-sccs", cl::init(false), cl::Hidden,
                    cl::desc("Dump the SCCs in the ThinLTO index's callgraph"));
 
+/// Path to the directory where optimized bitcode files are saved and restored
+/// for the ThinLTO two-codegen rounds.
+extern cl::opt<std::string> CodeGenDataThinLTOTwoRoundsPath;
+
 namespace llvm {
 /// Enable global value internalization in LTO.
 cl::opt<bool> EnableLTOInternalization(
@@ -1558,6 +1563,66 @@ class InProcessThinBackend : public ThinBackendProc {
     return BackendThreadPool.getMaxConcurrency();
   }
 };
+
+// This backend runs the ThinBackend process but throws away all of the codegen
+// output. This class facilitates the first codegen round.
+class NoOutputThinBackend : public InProcessThinBackend {
+public:
+  NoOutputThinBackend(
+      const Config &Conf, ModuleSummaryIndex &CombinedIndex,
+      ThreadPoolStrategy ThinLTOParallelism,
+      const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
+      std::unique_ptr<std::vector<llvm::SmallString<0>>> Scratch)
+      : InProcessThinBackend(
+            Conf, CombinedIndex, ThinLTOParallelism, ModuleToDefinedGVSummaries,
+            // This lambda is the reason why Scratch is a unique_ptr that is
+            // constructed outside of this class's constructor. The Scratch
+            // space needs to be fully allocated so that its address does not
+            // change after we create this lambda, which depends on its address
+            // remaining the same.
+            // There may be a cleaner way to do this but this way seems to work.
+            [Allocation = &*Scratch](unsigned Task, const Twine &ModuleName) {
+              return std::make_unique<CachedFileStream>(
+                  std::make_unique<raw_svector_ostream>((*Allocation)[Task]));
+            },
+            FileCache(), nullptr, false, false),
+        Scratch(std::move(Scratch)) {}
+
+  /// This vector is just scratch space where the output of the ThinBackend can
+  /// be written and then thrown away during destruction.
+  std::unique_ptr<std::vector<llvm::SmallString<0>>> Scratch;
+};
+
+// This Backend performs codegen on bitcode that was previously saved after
+// going through optimization. This class facilitates the second codegen round.
+class OptimizedBitcodeThinBackend : public InProcessThinBackend {
+public:
+  OptimizedBitcodeThinBackend(
+      const Config &Conf, ModuleSummaryIndex &CombinedIndex,
+      ThreadPoolStrategy ThinLTOParallelism,
+      const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
+      AddStreamFn AddStream)
+      : InProcessThinBackend(Conf, CombinedIndex, ThinLTOParallelism,
+                             ModuleToDefinedGVSummaries, AddStream, FileCache(),
+                             nullptr, false, false) {}
+
+  virtual Error runThinLTOBackendThread(
+      AddStreamFn AddStream, FileCache Cache, unsigned Task, BitcodeModule BM,
+      ModuleSummaryIndex &CombinedIndex,
+      const FunctionImporter::ImportMapTy &ImportList,
+      const FunctionImporter::ExportSetTy &ExportList,
+      const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
+      const GVSummaryMapTy &DefinedGlobals,
+      MapVector<StringRef, BitcodeModule> &ModuleMap) override {
+    LTOLLVMContext BackendContext(Conf);
+    std::unique_ptr<Module> LoadedModule =
+        cgdata::loadModuleForTwoRounds(BM, Task, BackendContext);
+
+    return thinBackend(Conf, Task, AddStream, *LoadedModule, CombinedIndex,
+                       ImportList, DefinedGlobals, &ModuleMap,
+                       /*CodeGenOnly*/ true);
+  }
+};
 } // end anonymous namespace
 
 ThinBackend lto::createInProcessThinBackend(ThreadPoolStrategy Parallelism,
@@ -1881,10 +1946,49 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache,
     return BackendProcess->wait();
   };
 
-  std::unique_ptr<ThinBackendProc> BackendProc =
-      ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries,
-                      AddStream, Cache);
-  return RunBackends(BackendProc.get());
+  if (CodeGenDataThinLTOTwoRoundsPath.empty()) {
+    std::unique_ptr<ThinBackendProc> BackendProc =
+        ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries,
+                        AddStream, Cache);
+    return RunBackends(BackendProc.get());
+  }
+
+  // Two-codegen rounds:
+  // 1. The first round: Run opt + codegen with a scratch output.
+  // 2. Merge codegen data extracted from the scratch output.
+  // 3. The second round: Run codegen again.
+  LLVM_DEBUG(dbgs() << "Running ThinLTO two-codegen rounds\n");
+
+  // Ensure we have a directory to write the bitcode files for two-codegen
+  // rounds.
+  if (auto EC = sys::fs::create_directories(CodeGenDataThinLTOTwoRoundsPath,
+                                            /*IgnoreExisting=*/true))
+    return errorCodeToError(EC);
+
+  // Create a scratch output.
+  auto Outputs = std::make_unique<std::vector<llvm::SmallString<0>>>();
+  Outputs->resize(getMaxTasks());
+  auto FirstRoundLTO = std::make_unique<NoOutputThinBackend>(
+      Conf, ThinLTO.CombinedIndex, llvm::heavyweight_hardware_concurrency(),
+      ModuleToDefinedGVSummaries, std::move(Outputs));
+  // The first round: Run opt + codegen with a scratch output.
+  // Before codegen, we serialize modules to CodeGenDataThinLTOTwoRoundsPath.
+  if (Error E = RunBackends(FirstRoundLTO.get()))
+    return E;
+
+  // Using the scratch output, we merge codegen data.
+  if (Error E = cgdata::mergeCodeGenData(std::move(FirstRoundLTO->Scratch)))
+    return E;
+
+  // The second round: Run codegen by reading IRs from
+  // CodeGenDataThinLTOTwoRoundsPath.
+  std::unique_ptr<ThinBackendProc> SecondRoundLTO =
+      std::make_unique<OptimizedBitcodeThinBackend>(
+          Conf, ThinLTO.CombinedIndex, llvm::heavyweight_hardware_concurrency(),
+          ModuleToDefinedGVSummaries, AddStream);
+  Error E = RunBackends(SecondRoundLTO.get());
+
+  return E;
 }
 
 Expected<std::unique_ptr<ToolOutputFile>> lto::setupLLVMOptimizationRemarks(
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index 5e7c1b6f684fbf..f287e965e2b59e 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -20,6 +20,7 @@
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Bitcode/BitcodeReader.h"
 #include "llvm/Bitcode/BitcodeWriter.h"
+#include "llvm/CGData/CodeGenData.h"
 #include "llvm/IR/LLVMRemarkStreamer.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/PassManager.h"
@@ -74,6 +75,8 @@ static cl::opt<bool> ThinLTOAssumeMerged(
     cl::desc("Assume the input has already undergone ThinLTO function "
              "importing and the other pre-optimization pipeline changes."));
 
+extern cl::opt<std::string> CodeGenDataThinLTOTwoRoundsPath;
+
 namespace llvm {
 extern cl::opt<bool> NoPGOWarnMismatch;
 }
@@ -602,6 +605,12 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
                  CmdArgs))
           return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
 
+        // Save the current module before the first codegen round.
+        // Note that the second codegen round has already bailed out earlier
+        // with CodeGenOnly.
+        if (!CodeGenDataThinLTOTwoRoundsPath.empty())
+          cgdata::saveModuleForTwoRounds(Mod, Task);
+
         codegen(Conf, TM, AddStream, Task, Mod, CombinedIndex);
         return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
       };
diff --git a/llvm/test/ThinLTO/AArch64/cgdata-read-single-outline.ll b/llvm/test/ThinLTO/AArch64/cgdata-read-single-outline.ll
new file mode 100644
index 00000000000000..d5249555e58610
--- /dev/null
+++ b/llvm/test/ThinLTO/AArch64/cgdata-read-single-outline.ll
@@ -0,0 +1,42 @@
+; This test verifies whether we can outline a singleton instance (i.e., an instance that does not repeat)
+; using codegen data that has been read from a previous codegen run.
+
+; RUN: split-file %s %t
+
+; First, we generate the cgdata file from a local outline instance present in local-two.ll.
+; RUN: llc -mtriple=arm64-apple-darwin -codegen-data-generate=true -filetype=obj %t/local-two.ll -o %t_write
+; RUN: llvm-cgdata --merge %t_write -o %t_cgdata
+; RUN: llvm-cgdata --show %t_cgdata | FileCheck %s --check-prefix=SHOW
+
+; SHOW: Outlined hash tree:
+; SHOW-NEXT:  Total Node Count: 4
+; SHOW-NEXT:  Terminal Node Count: 1
+; SHOW-NEXT:  Depth: 3
+
+; Now, we read the cgdata in the machine outliner, enabling us to optimistically
+; outline a singleton instance in local-one.ll that matches against the cgdata.
+; RUN: llc -mtriple=arm64-apple-darwin -codegen-data-use-path=%t_cgdata -filetype=obj %t/local-one.ll -o %t_read
+; RUN: llvm-objdump -d %t_read | FileCheck %s
+
+; CHECK: _OUTLINED_FUNCTION
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  b
+
+;--- local-two.ll
+declare i32 @g(i32, i32, i32)
+define i32 @f1() minsize {
+  %1 = call i32 @g(i32 10, i32 1, i32 2);
+  ret i32 %1
+}
+define i32 @f2() minsize {
+  %1 = call i32 @g(i32 20, i32 1, i32 2);
+  ret i32 %1
+}
+
+;--- local-one.ll
+declare i32 @g(i32, i32, i32)
+define i32 @f3() minsize {
+  %1 = call i32 @g(i32 30, i32 1, i32 2);
+ ret i32 %1
+}
diff --git a/llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll b/llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll
new file mode 100644
index 00000000000000..bb7292477ea250
--- /dev/null
+++ b/llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll
@@ -0,0 +1,95 @@
+; This test verifies whether we can outline a singleton instance (i.e., an instance that does not repeat)
+; by running two codegen rounds.
+
+; RUN: split-file %s %t
+
+; Verify that each outlining instance is a singleton under global outlining for ThinLTO.
+; The outlined functions will be identical, so the linker can fold them with ICF.
+; RUN: opt -module-summary %t/thin-one.ll -o %t/thin-one.bc
+; RUN: opt -module-summary %t/thin-two.ll -o %t/thin-two.bc
+; RUN: mkdir -p %t/two-rounds
+; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc -o %t/thinlto \
+; RUN:  -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \
+; RUN:  -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \
+; RUN:  -codegen-data-thinlto-two-rounds-path=%t/two-rounds
+
+; thin-one.ll will have one outlining instance (matched in the global outlined hash tree)
+; RUN: llvm-objdump -d %t/thinlto.1 | FileCheck %s --check-prefix=THINLTO-1
+; THINLTO-1: _OUTLINED_FUNCTION{{.*}}>:
+; THINLTO-1-NEXT:  mov
+; THINLTO-1-NEXT:  mov
+; THINLTO-1-NEXT:  b
+
+; thin-two.ll will have two outlining instances (matched in the global outlined hash tree)
+; RUN: llvm-objdump -d %t/thinlto.2 | FileCheck %s --check-prefix=THINLTO-2
+; THINLTO-2: _OUTLINED_FUNCTION{{.*}}>:
+; THINLTO-2-NEXT:  mov
+; THINLTO-2-NEXT:  mov
+; THINLTO-2-NEXT:  b
+; THINLTO-2: _OUTLINED_FUNCTION{{.*}}>:
+; THINLTO-2-NEXT:  mov
+; THINLTO-2-NEXT:  mov
+; THINLTO-2-NEXT:  b
+
+; Now add an LTO module to the above ThinLTO modules.
+; Verify that the LTO module is optimized independently of the global outlining for ThinLTO.
+; RUN: opt %t/lto.ll -o %t/lto.bc
+; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc %t/lto.bc -o %t/out \
+; RUN:  -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \
+; RUN:  -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \
+; RUN:  -r %t/lto.bc,_f4,px -r %t/lto.bc,_f5,px -r %t/lto.bc,_f6,px -r %t/lto.bc,_g,x \
+; RUN:  -codegen-data-thinlto-two-rounds-path=%t/two-rounds
+
+; lto.ll will have one outlining instance within the lto module itself (no global outlining).
+; RUN: llvm-objdump -d %t/out.0 | FileCheck %s --check-prefix=LTO-0
+; LTO-0: _OUTLINED_FUNCTION{{.*}}>:
+; LTO-0-NEXT:  mov
+; LTO-0-NEXT:  b
+
+; thin-one.ll will have one outlining instance (matched in the global outlined hash tree)
+; RUN: llvm-objdump -d %t/out.1 | FileCheck %s --check-prefix=THINLTO-1
+
+; thin-two.ll will have two outlining instances (matched in the global outlined hash tree)
+; RUN: llvm-objdump -d %t/out.2 | FileCheck %s --check-prefix=THINLTO-2
+
+;--- thin-one.ll
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-darwin"
+
+declare i32 @g(i32, i32, i32)
+define i32 @f3() minsize {
+  %1 = call i32 @g(i32 30, i32 1, i32 2);
+ ret i32 %1
+}
+
+;--- thin-two.ll
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-darwin"
+
+declare i32 @g(i32, i32, i32)
+define i32 @f1() minsize {
+  %1 = call i32 @g(i32 10, i32 1, i32 2);
+  ret i32 %1
+}
+define i32 @f2() minsize {
+  %1 = call i32 @g(i32 20, i32 1, i32 2);
+  ret i32 %1
+}
+
+;--- lto.ll
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-darwin"
+
+declare i32 @g(i32, i32, i32)
+define i32 @f4() minsize {
+  %1 = call i32 @g(i32 10, i32 30, i32 2);
+  ret i32 %1
+}
+define i32 @f5() minsize {
+  %1 = call i32 @g(i32 20, i32 40, i32 2);
+  ret i32 %1
+}
+define i32 @f6() minsize {
+  %1 = call i32 @g(i32 50, i32 60, i32 2);
+  ret i32 %1
+}
diff --git a/llvm/test/ThinLTO/AArch64/lit.local.cfg b/llvm/test/ThinLTO/AArch64/lit.local.cfg
new file mode 100644
index 00000000000000..10d4a0e953ed47
--- /dev/null
+++ b/llvm/test/ThinLTO/AArch64/lit.local.cfg
@@ -0,0 +1,2 @@
+if not "AArch64" in config.root.targets:
+    config.unsupported = True