[clang] [lld] [llvm] [ThinLTO][CGData] Global Outlining with Two-CodeGen Rounds (PR #90347)

Sat Apr 27 10:55:11 PDT 2024

https://github.com/kyulee-com updated https://github.com/llvm/llvm-project/pull/90347

>From e405e025a84e3d9d04384ace7a2cdab1f16d38d4 Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Mon, 22 Apr 2024 15:29:25 -0700
Subject: [PATCH 1/9] [CGData][OutlinedHashTree] Define OutlinedHashTree

This defines the OutlinedHashTree class.
It contains sequences of stable hash values of instructions that have been outlined.
This OutlinedHashTree can be used to track the outlined instruction sequences across modules.
A trie structure is used in its implementation, allowing for a compact sharing of common prefixes.
---
 .../llvm/CodeGenData/OutlinedHashTree.h       | 131 ++++++++++++++
 .../llvm/CodeGenData/OutlinedHashTreeRecord.h |  67 +++++++
 llvm/lib/CMakeLists.txt                       |   1 +
 llvm/lib/CodeGenData/CMakeLists.txt           |  14 ++
 llvm/lib/CodeGenData/OutlinedHashTree.cpp     | 106 +++++++++++
 .../CodeGenData/OutlinedHashTreeRecord.cpp    | 166 ++++++++++++++++++
 llvm/unittests/CMakeLists.txt                 |   1 +
 llvm/unittests/CodeGenData/CMakeLists.txt     |  14 ++
 .../OutlinedHashTreeRecordTest.cpp            | 118 +++++++++++++
 .../CodeGenData/OutlinedHashTreeTest.cpp      |  81 +++++++++
 10 files changed, 699 insertions(+)
 create mode 100644 llvm/include/llvm/CodeGenData/OutlinedHashTree.h
 create mode 100644 llvm/include/llvm/CodeGenData/OutlinedHashTreeRecord.h
 create mode 100644 llvm/lib/CodeGenData/CMakeLists.txt
 create mode 100644 llvm/lib/CodeGenData/OutlinedHashTree.cpp
 create mode 100644 llvm/lib/CodeGenData/OutlinedHashTreeRecord.cpp
 create mode 100644 llvm/unittests/CodeGenData/CMakeLists.txt
 create mode 100644 llvm/unittests/CodeGenData/OutlinedHashTreeRecordTest.cpp
 create mode 100644 llvm/unittests/CodeGenData/OutlinedHashTreeTest.cpp

diff --git a/llvm/include/llvm/CodeGenData/OutlinedHashTree.h b/llvm/include/llvm/CodeGenData/OutlinedHashTree.h
new file mode 100644
index 00000000000000..84fa8aba827207
--- /dev/null
+++ b/llvm/include/llvm/CodeGenData/OutlinedHashTree.h
@@ -0,0 +1,131 @@
+//===- OutlinedHashTree.h --------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===---------------------------------------------------------------------===//
+//
+// This defines the OutlinedHashTree class. It contains sequences of stable
+// hash values of instructions that have been outlined. This OutlinedHashTree
+// can be used to track the outlined instruction sequences across modules.
+//
+//===---------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGENDATA_OUTLINEDHASHTREE_H
+#define LLVM_CODEGENDATA_OUTLINEDHASHTREE_H
+
+#include "llvm/ADT/StableHashing.h"
+#include "llvm/ObjectYAML/YAML.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <unordered_map>
+#include <vector>
+
+namespace llvm {
+
+/// A HashNode is an entry in an OutlinedHashTree, holding a hash value
+/// and a collection of Successors (other HashNodes). If a HashNode has
+/// a positive terminal value (Terminals > 0), it signifies the end of
+/// a hash sequence with that occurrence count.
+struct HashNode {
+  /// The hash value of the node. Defaults to 0 so that a node never carries
+  /// an indeterminate hash, however it is created.
+  stable_hash Hash = 0;
+  /// The number of terminals in the sequence ending at this node. Defaults
+  /// to 0, meaning no sequence ends here.
+  unsigned Terminals = 0;
+  /// The successors of this node, keyed by the hash value of the successor.
+  std::unordered_map<stable_hash, std::unique_ptr<HashNode>> Successors;
+};
+
+/// HashNodeStable is the serialized, stable, and compact representation
+/// of a HashNode. Successors are referred to by their ids instead of by
+/// pointers.
+struct HashNodeStable {
+  /// The hash value of the node, printed as a hexadecimal number in YAML.
+  /// Initialized so that a partially filled record never holds an
+  /// indeterminate value.
+  llvm::yaml::Hex64 Hash{0};
+  /// The number of terminals in the sequence ending at this node.
+  unsigned Terminals = 0;
+  /// The ids of the successors of this node.
+  std::vector<unsigned> SuccessorIds;
+};
+
+/// An OutlinedHashTree is a trie of stable hash sequences. It always owns a
+/// root node; every other node is reached from the root through Successors.
+/// A node with Terminals > 0 marks the end of an inserted sequence.
+class OutlinedHashTree {
+
+  using EdgeCallbackFn =
+      std::function<void(const HashNode *, const HashNode *)>;
+  using NodeCallbackFn = std::function<void(const HashNode *)>;
+
+  using HashSequence = std::vector<stable_hash>;
+  using HashSequencePair = std::pair<std::vector<stable_hash>, unsigned>;
+
+  /// Walks every edge and node in the OutlinedHashTree. CallbackNode is
+  /// called with each visited node and CallbackEdge is called with the source
+  /// node and the sink node of each edge. These generic callbacks can be used
+  /// to traverse an OutlinedHashTree for the purpose of print debugging or
+  /// serializing it.
+  void walkGraph(EdgeCallbackFn CallbackEdge,
+                 NodeCallbackFn CallbackNode) const;
+
+public:
+  /// Walks the nodes of an OutlinedHashTree using walkGraph.
+  void walkVertices(NodeCallbackFn Callback) const {
+    walkGraph([](const HashNode *, const HashNode *) {}, Callback);
+  }
+
+  /// Release all hash nodes except the root hash node.
+  void clear() {
+    assert(getRoot()->Hash == 0 && getRoot()->Terminals == 0);
+    getRoot()->Successors.clear();
+  }
+
+  /// \returns true if the hash tree has only the root node. The root is
+  /// always present, so it is enough to check that it has no successors;
+  /// this avoids walking the whole tree.
+  bool empty() const { return getRoot()->Successors.empty(); }
+
+  /// \returns the size of an OutlinedHashTree by traversing it. If
+  /// \p GetTerminalCountOnly is true, it only counts the terminal nodes
+  /// (meaning it returns the number of hash sequences in an
+  /// OutlinedHashTree).
+  size_t size(bool GetTerminalCountOnly = false) const {
+    size_t Size = 0;
+    walkVertices([&Size, GetTerminalCountOnly](const HashNode *N) {
+      Size += (N && (!GetTerminalCountOnly || N->Terminals));
+    });
+    return Size;
+  }
+
+  /// \returns the depth of an OutlinedHashTree by traversing it. The root
+  /// alone has depth 0.
+  size_t depth() const {
+    size_t Size = 0;
+    std::unordered_map<const HashNode *, size_t> DepthMap;
+
+    walkGraph(
+        // An edge places its sink one level below its source. The source is
+        // value-initialized to depth 0 the first time it is looked up.
+        [&DepthMap](const HashNode *Src, const HashNode *Dst) {
+          size_t Depth = DepthMap[Src];
+          DepthMap[Dst] = Depth + 1;
+        },
+        [&Size, &DepthMap](const HashNode *N) {
+          Size = std::max(Size, DepthMap[N]);
+        });
+
+    return Size;
+  }
+
+  /// \returns the root hash node of an OutlinedHashTree.
+  const HashNode *getRoot() const { return Root.get(); }
+  HashNode *getRoot() { return Root.get(); }
+
+  /// Inserts the hash sequence of \p SequencePair into this tree. The last
+  /// node in the sequence will increase Terminals by the count of
+  /// \p SequencePair.
+  void insert(const HashSequencePair &SequencePair);
+
+  /// Merge \p OtherTree into this tree.
+  void merge(const OutlinedHashTree *OtherTree);
+
+  /// \returns the matching count if \p Sequence exists in an
+  /// OutlinedHashTree.
+  unsigned find(const HashSequence &Sequence) const;
+
+  /// Creates a tree that contains only the root node.
+  OutlinedHashTree() { Root = std::make_unique<HashNode>(); }
+
+private:
+  std::unique_ptr<HashNode> Root;
+};
+
+} // namespace llvm
+
+#endif
diff --git a/llvm/include/llvm/CodeGenData/OutlinedHashTreeRecord.h b/llvm/include/llvm/CodeGenData/OutlinedHashTreeRecord.h
new file mode 100644
index 00000000000000..ccd2ad26dd0871
--- /dev/null
+++ b/llvm/include/llvm/CodeGenData/OutlinedHashTreeRecord.h
@@ -0,0 +1,67 @@
+//===- OutlinedHashTreeRecord.h --------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===---------------------------------------------------------------------===//
+//
+// This defines the OutlinedHashTreeRecord class. This class holds the outlined
+// hash tree for both serialization and deserialization processes. It utilizes
+// two data formats for serialization: raw binary data and YAML.
+// These two formats can be used interchangeably.
+//
+//===---------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGENDATA_OUTLINEDHASHTREERECORD_H
+#define LLVM_CODEGENDATA_OUTLINEDHASHTREERECORD_H
+
+#include "llvm/CodeGenData/OutlinedHashTree.h"
+
+namespace llvm {
+
+using IdHashNodeStableMapTy = std::map<unsigned, HashNodeStable>;
+using IdHashNodeMapTy = std::map<unsigned, HashNode *>;
+using HashNodeIdMapTy = std::unordered_map<const HashNode *, unsigned>;
+
+struct OutlinedHashTreeRecord {
+  /// The outlined hash tree owned by this record.
+  std::unique_ptr<OutlinedHashTree> HashTree;
+
+  /// Creates a record that owns a newly created hash tree.
+  OutlinedHashTreeRecord() : HashTree(std::make_unique<OutlinedHashTree>()) {}
+  /// Creates a record that takes ownership of \p HashTree.
+  OutlinedHashTreeRecord(std::unique_ptr<OutlinedHashTree> HashTree)
+      : HashTree(std::move(HashTree)) {}
+
+  /// Serialize the outlined hash tree to a raw_ostream.
+  void serialize(raw_ostream &OS) const;
+  /// Deserialize the outlined hash tree from the raw binary data at \p Ptr.
+  /// \p Ptr is taken by reference and is moved forward as data is read.
+  void deserialize(const unsigned char *&Ptr);
+  /// Serialize the outlined hash tree to a YAML stream.
+  void serializeYAML(yaml::Output &YOS) const;
+  /// Deserialize the outlined hash tree from a YAML stream.
+  void deserializeYAML(yaml::Input &YIS);
+
+  /// Merge the other outlined hash tree into this one.
+  void merge(const OutlinedHashTreeRecord &Other) {
+    HashTree->merge(Other.HashTree.get());
+  }
+
+  /// \returns true if the outlined hash tree is empty.
+  bool empty() const { return HashTree->empty(); }
+
+  /// Print the outlined hash tree in a YAML format.
+  void print(raw_ostream &OS = llvm::errs()) const {
+    yaml::Output YOS(OS);
+    serializeYAML(YOS);
+  }
+
+private:
+  /// Convert the outlined hash tree to stable data.
+  void convertToStableData(IdHashNodeStableMapTy &IdNodeStableMap) const;
+
+  /// Convert the stable data back to the outlined hash tree.
+  void convertFromStableData(const IdHashNodeStableMapTy &IdNodeStableMap);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_CODEGENDATA_OUTLINEDHASHTREERECORD_H
diff --git a/llvm/lib/CMakeLists.txt b/llvm/lib/CMakeLists.txt
index 74e2d03c07953d..2ac0b0dc026e16 100644
--- a/llvm/lib/CMakeLists.txt
+++ b/llvm/lib/CMakeLists.txt
@@ -10,6 +10,7 @@ add_subdirectory(InterfaceStub)
 add_subdirectory(IRPrinter)
 add_subdirectory(IRReader)
 add_subdirectory(CodeGen)
+add_subdirectory(CodeGenData)
 add_subdirectory(CodeGenTypes)
 add_subdirectory(BinaryFormat)
 add_subdirectory(Bitcode)
diff --git a/llvm/lib/CodeGenData/CMakeLists.txt b/llvm/lib/CodeGenData/CMakeLists.txt
new file mode 100644
index 00000000000000..3ba90f96cc86d4
--- /dev/null
+++ b/llvm/lib/CodeGenData/CMakeLists.txt
@@ -0,0 +1,14 @@
+add_llvm_component_library(LLVMCodeGenData
+  OutlinedHashTree.cpp
+  OutlinedHashTreeRecord.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/CodeGenData
+
+  DEPENDS
+  intrinsics_gen
+
+  LINK_COMPONENTS
+  Core
+  Support
+  )
diff --git a/llvm/lib/CodeGenData/OutlinedHashTree.cpp b/llvm/lib/CodeGenData/OutlinedHashTree.cpp
new file mode 100644
index 00000000000000..945014550a3886
--- /dev/null
+++ b/llvm/lib/CodeGenData/OutlinedHashTree.cpp
@@ -0,0 +1,106 @@
+//===-- OutlinedHashTree.cpp ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// An OutlinedHashTree is a Trie that contains sequences of stable hash values
+// of instructions that have been outlined. This OutlinedHashTree can be used
+// to understand the outlined instruction sequences collected across modules.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGenData/OutlinedHashTree.h"
+
+#include <stack>
+#include <tuple>
+
+#define DEBUG_TYPE "outlined-hash-tree"
+
+using namespace llvm;
+
+void OutlinedHashTree::walkGraph(EdgeCallbackFn CallbackEdge,
+                                 NodeCallbackFn CallbackNode) const {
+  // Nodes that have been discovered but not visited yet.
+  std::stack<const HashNode *> Worklist;
+  Worklist.push(getRoot());
+
+  while (!Worklist.empty()) {
+    const HashNode *Node = Worklist.top();
+    Worklist.pop();
+    CallbackNode(Node);
+
+    // Order the successors by hash value so that the walk does not depend on
+    // the iteration order of the unordered successor map.
+    std::map<stable_hash, const HashNode *> Ordered;
+    for (const auto &Entry : Node->Successors)
+      Ordered[Entry.first] = Entry.second.get();
+
+    for (const auto &Entry : Ordered) {
+      const HashNode *Successor = Entry.second;
+      CallbackEdge(Node, Successor);
+      Worklist.push(Successor);
+    }
+  }
+}
+
+void OutlinedHashTree::insert(const HashSequencePair &SequencePair) {
+  const auto &[Sequence, Count] = SequencePair;
+
+  // Follow the sequence down from the root, creating the nodes that do not
+  // exist yet.
+  HashNode *Node = getRoot();
+  for (stable_hash StableHash : Sequence) {
+    auto [It, Inserted] = Node->Successors.try_emplace(StableHash);
+    if (Inserted) {
+      It->second = std::make_unique<HashNode>();
+      It->second->Hash = StableHash;
+    }
+    Node = It->second.get();
+  }
+  // The node reached last accumulates the count of the sequence.
+  Node->Terminals += Count;
+}
+
+void OutlinedHashTree::merge(const OutlinedHashTree *Tree) {
+  // Each entry pairs a node of this tree with the node of the other tree
+  // that is reached through the same hashes from the root.
+  std::stack<std::pair<HashNode *, const HashNode *>> Worklist;
+  Worklist.push({getRoot(), Tree->getRoot()});
+
+  while (!Worklist.empty()) {
+    auto [DstNode, SrcNode] = Worklist.top();
+    Worklist.pop();
+
+    if (!SrcNode)
+      continue;
+    DstNode->Terminals += SrcNode->Terminals;
+
+    for (auto &[Hash, NextSrcNode] : SrcNode->Successors) {
+      // Reuse the matching successor in this tree, or create it on demand.
+      auto [It, Inserted] = DstNode->Successors.try_emplace(Hash);
+      if (Inserted) {
+        It->second = std::make_unique<HashNode>();
+        It->second->Hash = Hash;
+      }
+      Worklist.push({It->second.get(), NextSrcNode.get()});
+    }
+  }
+}
+
+unsigned OutlinedHashTree::find(const HashSequence &Sequence) const {
+  // Follow the sequence one hash at a time; a missing successor means the
+  // sequence is not in the tree.
+  const HashNode *Node = getRoot();
+  for (stable_hash StableHash : Sequence) {
+    const auto &Successors = Node->Successors;
+    const auto It = Successors.find(StableHash);
+    if (It == Successors.end())
+      return 0;
+    Node = It->second.get();
+  }
+  return Node->Terminals;
+}
diff --git a/llvm/lib/CodeGenData/OutlinedHashTreeRecord.cpp b/llvm/lib/CodeGenData/OutlinedHashTreeRecord.cpp
new file mode 100644
index 00000000000000..26dcd70cf50667
--- /dev/null
+++ b/llvm/lib/CodeGenData/OutlinedHashTreeRecord.cpp
@@ -0,0 +1,166 @@
+//===-- OutlinedHashTreeRecord.cpp ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This defines the OutlinedHashTreeRecord class. This class holds the outlined
+// hash tree for both serialization and deserialization processes. It utilizes
+// two data formats for serialization: raw binary data and YAML.
+// These two formats can be used interchangeably.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGenData/OutlinedHashTreeRecord.h"
+#include "llvm/CodeGenData/OutlinedHashTree.h"
+#include "llvm/ObjectYAML/YAML.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/EndianStream.h"
+
+#define DEBUG_TYPE "outlined-hash-tree"
+
+using namespace llvm;
+using namespace llvm::support;
+
+namespace llvm {
+namespace yaml {
+
+/// Maps the three fields of a HashNodeStable to required YAML keys.
+template <> struct MappingTraits<HashNodeStable> {
+  static void mapping(IO &YamlIO, HashNodeStable &Node) {
+    YamlIO.mapRequired("Hash", Node.Hash);
+    YamlIO.mapRequired("Terminals", Node.Terminals);
+    YamlIO.mapRequired("SuccessorIds", Node.SuccessorIds);
+  }
+};
+
+/// Maps an id-to-node map to a YAML mapping whose keys are the node ids.
+template <> struct CustomMappingTraits<IdHashNodeStableMapTy> {
+  /// Reads the node stored under \p Key and records it in \p V under the id
+  /// parsed from \p Key. Reports an error if the key is not an integer.
+  static void inputOne(IO &io, StringRef Key, IdHashNodeStableMapTy &V) {
+    HashNodeStable NodeStable;
+    const std::string KeyStr = Key.str();
+    io.mapRequired(KeyStr.c_str(), NodeStable);
+
+    unsigned Id;
+    const bool IsBadKey = Key.getAsInteger(0, Id);
+    if (IsBadKey) {
+      io.setError("Id not an integer");
+      return;
+    }
+    V.insert({Id, NodeStable});
+  }
+
+  /// Writes every node of \p V under its id rendered as a string.
+  static void output(IO &io, IdHashNodeStableMapTy &V) {
+    for (auto &[Id, NodeStable] : V)
+      io.mapRequired(utostr(Id).c_str(), NodeStable);
+  }
+};
+
+} // namespace yaml
+} // namespace llvm
+
+void OutlinedHashTreeRecord::serialize(raw_ostream &OS) const {
+  IdHashNodeStableMapTy IdNodeStableMap;
+  convertToStableData(IdNodeStableMap);
+
+  // Everything is written in little-endian order: the number of nodes first,
+  // then for each node its id, hash, terminal count, number of successors
+  // and the successor ids.
+  support::endian::Writer Writer(OS, endianness::little);
+  Writer.write<uint32_t>(IdNodeStableMap.size());
+  for (const auto &Entry : IdNodeStableMap) {
+    const HashNodeStable &Node = Entry.second;
+    Writer.write<uint32_t>(Entry.first);
+    Writer.write<uint64_t>(Node.Hash);
+    Writer.write<uint32_t>(Node.Terminals);
+
+    const std::vector<unsigned> &Ids = Node.SuccessorIds;
+    Writer.write<uint32_t>(Ids.size());
+    for (unsigned SuccessorId : Ids)
+      Writer.write<uint32_t>(SuccessorId);
+  }
+}
+
+void OutlinedHashTreeRecord::deserialize(const unsigned char *&Ptr) {
+  // Helpers that read one unaligned little-endian value through Ptr.
+  auto ReadU32 = [&Ptr] {
+    return endian::readNext<uint32_t, endianness::little, unaligned>(Ptr);
+  };
+  auto ReadU64 = [&Ptr] {
+    return endian::readNext<uint64_t, endianness::little, unaligned>(Ptr);
+  };
+
+  IdHashNodeStableMapTy IdNodeStableMap;
+
+  // The data starts with the number of nodes. Each node follows as its id,
+  // hash, terminal count, number of successors and the successor ids.
+  const uint32_t NumNodes = ReadU32();
+  for (unsigned I = 0; I < NumNodes; ++I) {
+    const uint32_t Id = ReadU32();
+    HashNodeStable NodeStable;
+    NodeStable.Hash = ReadU64();
+    NodeStable.Terminals = ReadU32();
+    const uint32_t NumSuccessorIds = ReadU32();
+    for (unsigned J = 0; J < NumSuccessorIds; ++J)
+      NodeStable.SuccessorIds.push_back(ReadU32());
+
+    IdNodeStableMap[Id] = std::move(NodeStable);
+  }
+
+  convertFromStableData(IdNodeStableMap);
+}
+
+void OutlinedHashTreeRecord::serializeYAML(yaml::Output &YOS) const {
+  // The tree is first flattened into an id-keyed map, which is what gets
+  // written to the YAML stream.
+  IdHashNodeStableMapTy StableNodes;
+  convertToStableData(StableNodes);
+  YOS << StableNodes;
+}
+
+void OutlinedHashTreeRecord::deserializeYAML(yaml::Input &YIS) {
+  // Read the id-keyed map from the current document and move on to the next
+  // document before rebuilding the tree.
+  IdHashNodeStableMapTy StableNodes;
+  YIS >> StableNodes;
+  YIS.nextDocument();
+
+  convertFromStableData(StableNodes);
+}
+
+void OutlinedHashTreeRecord::convertToStableData(
+    IdHashNodeStableMapTy &IdNodeStableMap) const {
+  // Build NodeIdMap: every node gets the next free id in the order in which
+  // walkVertices visits it.
+  HashNodeIdMapTy NodeIdMap;
+  HashTree->walkVertices([&NodeIdMap](const HashNode *Current) {
+    size_t Index = NodeIdMap.size();
+    NodeIdMap[Current] = Index;
+    // Inserting a node that was not seen before grows the map by exactly
+    // one entry. This must be a comparison: an assignment here would always
+    // evaluate to true and hide duplicates.
+    assert(Index + 1 == NodeIdMap.size() &&
+           "Expected size of NodeMap to increment by 1");
+  });
+
+  // Convert NodeIdMap to NodeStableMap.
+  for (const auto &[Node, Id] : NodeIdMap) {
+    HashNodeStable NodeStable;
+    NodeStable.Hash = Node->Hash;
+    NodeStable.Terminals = Node->Terminals;
+    // Use at() rather than operator[] so that the lookup can never insert
+    // into NodeIdMap while it is being iterated.
+    for (const auto &Successor : Node->Successors)
+      NodeStable.SuccessorIds.push_back(NodeIdMap.at(Successor.second.get()));
+    IdNodeStableMap[Id] = std::move(NodeStable);
+  }
+
+  // Sort the Successors so that they come out in the same order as in the map.
+  for (auto &P : IdNodeStableMap)
+    std::sort(P.second.SuccessorIds.begin(), P.second.SuccessorIds.end());
+}
+
+void OutlinedHashTreeRecord::convertFromStableData(
+    const IdHashNodeStableMapTy &IdNodeStableMap) {
+  // Id 0 refers to the root, which already exists and must have no
+  // successors yet.
+  IdHashNodeMapTy IdNodeMap;
+  IdNodeMap[0] = HashTree->getRoot();
+  assert(IdNodeMap[0]->Successors.empty());
+
+  // Ids are visited in increasing order. Apart from the root, a node exists
+  // only once an earlier node has listed its id as a successor.
+  for (const auto &[Id, NodeStable] : IdNodeStableMap) {
+    assert(IdNodeMap.count(Id));
+    HashNode *Node = IdNodeMap[Id];
+    Node->Hash = NodeStable.Hash;
+    Node->Terminals = NodeStable.Terminals;
+    assert(Node->Successors.empty());
+
+    for (unsigned SuccessorId : NodeStable.SuccessorIds) {
+      auto Successor = std::make_unique<HashNode>();
+      IdNodeMap[SuccessorId] = Successor.get();
+      // The successor is keyed by its own hash, taken from the stable data.
+      const stable_hash Hash = IdNodeStableMap.at(SuccessorId).Hash;
+      Node->Successors[Hash] = std::move(Successor);
+    }
+  }
+}
diff --git a/llvm/unittests/CMakeLists.txt b/llvm/unittests/CMakeLists.txt
index 46f30ff398e10d..cb4b8513e6d02e 100644
--- a/llvm/unittests/CMakeLists.txt
+++ b/llvm/unittests/CMakeLists.txt
@@ -21,6 +21,7 @@ add_subdirectory(BinaryFormat)
 add_subdirectory(Bitcode)
 add_subdirectory(Bitstream)
 add_subdirectory(CodeGen)
+add_subdirectory(CodeGenData)
 add_subdirectory(DebugInfo)
 add_subdirectory(Debuginfod)
 add_subdirectory(Demangle)
diff --git a/llvm/unittests/CodeGenData/CMakeLists.txt b/llvm/unittests/CodeGenData/CMakeLists.txt
new file mode 100644
index 00000000000000..3d821b87e29d8c
--- /dev/null
+++ b/llvm/unittests/CodeGenData/CMakeLists.txt
@@ -0,0 +1,14 @@
+set(LLVM_LINK_COMPONENTS
+  ${LLVM_TARGETS_TO_BUILD}
+  CodeGen
+  CodeGenData
+  Core
+  Support
+  )
+
+add_llvm_unittest(CodeGenDataTests
+  OutlinedHashTreeRecordTest.cpp
+  OutlinedHashTreeTest.cpp
+  )
+
+target_link_libraries(CodeGenDataTests PRIVATE LLVMTestingSupport)
diff --git a/llvm/unittests/CodeGenData/OutlinedHashTreeRecordTest.cpp b/llvm/unittests/CodeGenData/OutlinedHashTreeRecordTest.cpp
new file mode 100644
index 00000000000000..aa7ad4a33754ff
--- /dev/null
+++ b/llvm/unittests/CodeGenData/OutlinedHashTreeRecordTest.cpp
@@ -0,0 +1,118 @@
+//===- OutlinedHashTreeRecordTest.cpp -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGenData/OutlinedHashTreeRecord.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+namespace {
+
+// A default-constructed record reports an empty tree.
+TEST(OutlinedHashTreeRecordTest, Empty) {
+  OutlinedHashTreeRecord Record;
+  ASSERT_TRUE(Record.empty());
+}
+
+// Printing a record produces one YAML entry per node, keyed by the node id.
+TEST(OutlinedHashTreeRecordTest, Print) {
+  OutlinedHashTreeRecord Record;
+  Record.HashTree->insert({{1, 2}, 3});
+
+  // Only the last node of the inserted sequence carries the count.
+  const char *ExpectedTreeStr = R"(---
+0:
+  Hash:            0x0
+  Terminals:       0
+  SuccessorIds:    [ 1 ]
+1:
+  Hash:            0x1
+  Terminals:       0
+  SuccessorIds:    [ 2 ]
+2:
+  Hash:            0x2
+  Terminals:       3
+  SuccessorIds:    [  ]
+...
+)";
+  std::string ActualTreeStr;
+  raw_string_ostream OS(ActualTreeStr);
+  Record.print(OS);
+  EXPECT_EQ(ExpectedTreeStr, ActualTreeStr);
+}
+
+TEST(OutlinedHashTreeRecordTest, Stable) {
+  // Renders a record as text through print().
+  auto DumpTree = [](const OutlinedHashTreeRecord &Record) {
+    std::string Dump;
+    {
+      raw_string_ostream OS(Dump);
+      Record.print(OS);
+    }
+    return Dump;
+  };
+
+  // Both records hold the same two sequences, inserted in opposite order.
+  OutlinedHashTreeRecord HashTreeRecord1;
+  HashTreeRecord1.HashTree->insert({{1, 2}, 4});
+  HashTreeRecord1.HashTree->insert({{1, 3}, 5});
+
+  OutlinedHashTreeRecord HashTreeRecord2;
+  HashTreeRecord2.HashTree->insert({{1, 3}, 5});
+  HashTreeRecord2.HashTree->insert({{1, 2}, 4});
+
+  // Output is stable regardless of insertion order.
+  const std::string TreeDump1 = DumpTree(HashTreeRecord1);
+  const std::string TreeDump2 = DumpTree(HashTreeRecord2);
+  EXPECT_EQ(TreeDump1, TreeDump2);
+}
+
+TEST(OutlinedHashTreeRecordTest, Serialize) {
+  // Renders a record as text through print().
+  auto DumpTree = [](const OutlinedHashTreeRecord &Record) {
+    std::string Dump;
+    {
+      raw_string_ostream OS(Dump);
+      Record.print(OS);
+    }
+    return Dump;
+  };
+
+  OutlinedHashTreeRecord Original;
+  Original.HashTree->insert({{1, 2}, 4});
+  Original.HashTree->insert({{1, 3}, 5});
+
+  // Serialize the tree into a binary buffer.
+  SmallVector<char> Out;
+  raw_svector_ostream OS(Out);
+  Original.serialize(OS);
+
+  // Deserialize the buffer into a fresh record.
+  OutlinedHashTreeRecord RoundTripped;
+  const uint8_t *Data = reinterpret_cast<const uint8_t *>(Out.data());
+  RoundTripped.deserialize(Data);
+
+  // Two trees should be identical.
+  const std::string TreeDump1 = DumpTree(Original);
+  const std::string TreeDump2 = DumpTree(RoundTripped);
+  EXPECT_EQ(TreeDump1, TreeDump2);
+}
+
+TEST(OutlinedHashTreeRecordTest, SerializeYAML) {
+  // Renders a record as text through print().
+  auto DumpTree = [](const OutlinedHashTreeRecord &Record) {
+    std::string Dump;
+    {
+      raw_string_ostream OS(Dump);
+      Record.print(OS);
+    }
+    return Dump;
+  };
+
+  OutlinedHashTreeRecord Original;
+  Original.HashTree->insert({{1, 2}, 4});
+  Original.HashTree->insert({{1, 3}, 5});
+
+  // Serialize the tree in a YAML format.
+  std::string Out;
+  raw_string_ostream OS(Out);
+  yaml::Output YOS(OS);
+  Original.serializeYAML(YOS);
+
+  // Deserialize the YAML text into a fresh record.
+  OutlinedHashTreeRecord RoundTripped;
+  yaml::Input YIS(StringRef(Out.data(), Out.size()));
+  RoundTripped.deserializeYAML(YIS);
+
+  // Two trees should be identical.
+  const std::string TreeDump1 = DumpTree(Original);
+  const std::string TreeDump2 = DumpTree(RoundTripped);
+  EXPECT_EQ(TreeDump1, TreeDump2);
+}
+
+} // end namespace
diff --git a/llvm/unittests/CodeGenData/OutlinedHashTreeTest.cpp b/llvm/unittests/CodeGenData/OutlinedHashTreeTest.cpp
new file mode 100644
index 00000000000000..d11618cf8e4fae
--- /dev/null
+++ b/llvm/unittests/CodeGenData/OutlinedHashTreeTest.cpp
@@ -0,0 +1,81 @@
+//===- OutlinedHashTreeTest.cpp -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGenData/OutlinedHashTree.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+namespace {
+
+// A default-constructed tree holds only the root node.
+TEST(OutlinedHashTreeTest, Empty) {
+  OutlinedHashTree HashTree;
+  ASSERT_TRUE(HashTree.empty());
+  // The header node is always present.
+  // ASSERT_EQ is used so that a failure reports the actual values.
+  ASSERT_EQ(HashTree.size(), 1u);
+  ASSERT_EQ(HashTree.depth(), 0u);
+}
+
+// Inserting sequences creates one node per hash and shares common prefixes.
+// ASSERT_EQ is used so that a failure reports the actual values.
+TEST(OutlinedHashTreeTest, Insert) {
+  OutlinedHashTree HashTree;
+  HashTree.insert({{1, 2, 3}, 1});
+  // The node count is 4 (including the root node).
+  ASSERT_EQ(HashTree.size(), 4u);
+  // The terminal count is 1.
+  ASSERT_EQ(HashTree.size(/*GetTerminalCountOnly=*/true), 1u);
+  // The depth is 3.
+  ASSERT_EQ(HashTree.depth(), 3u);
+
+  HashTree.clear();
+  ASSERT_TRUE(HashTree.empty());
+
+  HashTree.insert({{1, 2, 3}, 1});
+  HashTree.insert({{1, 2, 4}, 2});
+  // The nodes of 1 and 2 are shared with the same prefix.
+  // The nodes are root, 1, 2, 3 and 4, so the node count is 5.
+  ASSERT_EQ(HashTree.size(), 5u);
+}
+
+// find() returns the accumulated count of a sequence, and 0 for a prefix
+// at which no sequence ends.
+// ASSERT_EQ is used so that a failure reports the actual values.
+TEST(OutlinedHashTreeTest, Find) {
+  OutlinedHashTree HashTree;
+  HashTree.insert({{1, 2, 3}, 1});
+  HashTree.insert({{1, 2, 3}, 2});
+
+  // The node count does not change as the same sequences are added.
+  ASSERT_EQ(HashTree.size(), 4u);
+  // The terminal counts are accumulated from two same sequences.
+  ASSERT_EQ(HashTree.find({1, 2, 3}), 3u);
+  ASSERT_EQ(HashTree.find({1, 2}), 0u);
+}
+
+TEST(OutlinedHashTreeTest, Merge) {
+  // The reference tree holds both sequences directly.
+  OutlinedHashTree Expected;
+  Expected.insert({{1, 2}, 20});
+  Expected.insert({{1, 4}, 30});
+
+  // Two separate trees hold one sequence each.
+  OutlinedHashTree Merged;
+  Merged.insert({{1, 2}, 20});
+  OutlinedHashTree Other;
+  Other.insert({{1, 4}, 30});
+
+  // Merging the second tree into the first must reproduce the reference.
+  Merged.merge(&Other);
+
+  EXPECT_EQ(Expected.size(), Merged.size());
+  EXPECT_EQ(Expected.depth(), Merged.depth());
+  EXPECT_EQ(Expected.find({1, 2}), Merged.find({1, 2}));
+  EXPECT_EQ(Expected.find({1, 4}), Merged.find({1, 4}));
+  // A sequence that was never inserted is missing from both trees.
+  EXPECT_EQ(Expected.find({1, 3}), Merged.find({1, 3}));
+}
+
+} // end namespace

>From d22358a368cf8be4939b7274c5776b2f4c325150 Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Tue, 23 Apr 2024 14:22:14 -0700
Subject: [PATCH 2/9] [CGData] llvm-cgdata

The llvm-cgdata tool has been introduced to handle reading and writing of codegen data. This data includes an optimistic codegen summary that can be utilized to enhance subsequent codegen. Currently, the tool supports saving and restoring the outlined hash tree, facilitating machine function outlining across modules. Additional codegen summaries can be incorporated into separate sections as required. This patch primarily establishes basic support for the reader and writer, similar to llvm-profdata.

The high-level operations of llvm-cgdata are as follows:
1. It reads local raw codegen data from a custom section (for example, __llvm_outline) embedded in native binary files.
2. It merges local raw codegen data into indexed codegen data, complete with a suitable header.
3. It handles reading and writing of the indexed codegen data to and from a standalone file.
---
 llvm/include/llvm/CodeGenData/CodeGenData.h   | 202 +++++++++++++
 llvm/include/llvm/CodeGenData/CodeGenData.inc |  46 +++
 .../llvm/CodeGenData/CodeGenDataReader.h      | 154 ++++++++++
 .../llvm/CodeGenData/CodeGenDataWriter.h      |  68 +++++
 llvm/lib/CodeGenData/CMakeLists.txt           |   3 +
 llvm/lib/CodeGenData/CodeGenData.cpp          | 197 +++++++++++++
 llvm/lib/CodeGenData/CodeGenDataReader.cpp    | 174 ++++++++++++
 llvm/lib/CodeGenData/CodeGenDataWriter.cpp    | 162 +++++++++++
 llvm/test/CMakeLists.txt                      |   1 +
 llvm/test/lit.cfg.py                          |   1 +
 llvm/test/tools/llvm-cgdata/dump.test         |  30 ++
 llvm/test/tools/llvm-cgdata/empty.test        |  32 +++
 llvm/test/tools/llvm-cgdata/error.test        |  38 +++
 .../test/tools/llvm-cgdata/merge-archive.test |  75 +++++
 llvm/test/tools/llvm-cgdata/merge-concat.test |  68 +++++
 llvm/test/tools/llvm-cgdata/merge-double.test |  74 +++++
 llvm/test/tools/llvm-cgdata/merge-single.test |  43 +++
 llvm/test/tools/llvm-cgdata/show.test         |  30 ++
 llvm/tools/llvm-cgdata/CMakeLists.txt         |  15 +
 llvm/tools/llvm-cgdata/llvm-cgdata.cpp        | 268 ++++++++++++++++++
 20 files changed, 1681 insertions(+)
 create mode 100644 llvm/include/llvm/CodeGenData/CodeGenData.h
 create mode 100644 llvm/include/llvm/CodeGenData/CodeGenData.inc
 create mode 100644 llvm/include/llvm/CodeGenData/CodeGenDataReader.h
 create mode 100644 llvm/include/llvm/CodeGenData/CodeGenDataWriter.h
 create mode 100644 llvm/lib/CodeGenData/CodeGenData.cpp
 create mode 100644 llvm/lib/CodeGenData/CodeGenDataReader.cpp
 create mode 100644 llvm/lib/CodeGenData/CodeGenDataWriter.cpp
 create mode 100644 llvm/test/tools/llvm-cgdata/dump.test
 create mode 100644 llvm/test/tools/llvm-cgdata/empty.test
 create mode 100644 llvm/test/tools/llvm-cgdata/error.test
 create mode 100644 llvm/test/tools/llvm-cgdata/merge-archive.test
 create mode 100644 llvm/test/tools/llvm-cgdata/merge-concat.test
 create mode 100644 llvm/test/tools/llvm-cgdata/merge-double.test
 create mode 100644 llvm/test/tools/llvm-cgdata/merge-single.test
 create mode 100644 llvm/test/tools/llvm-cgdata/show.test
 create mode 100644 llvm/tools/llvm-cgdata/CMakeLists.txt
 create mode 100644 llvm/tools/llvm-cgdata/llvm-cgdata.cpp

diff --git a/llvm/include/llvm/CodeGenData/CodeGenData.h b/llvm/include/llvm/CodeGenData/CodeGenData.h
new file mode 100644
index 00000000000000..118fb9841d27e8
--- /dev/null
+++ b/llvm/include/llvm/CodeGenData/CodeGenData.h
@@ -0,0 +1,202 @@
+//===- CodeGenData.h --------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for codegen data that has stable summary which
+// can be used to optimize the code in the subsequent codegen.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGENDATA_CODEGENDATA_H
+#define LLVM_CODEGENDATA_CODEGENDATA_H
+
+#include "llvm/ADT/BitmaskEnum.h"
+#include "llvm/Bitcode/BitcodeReader.h"
+#include "llvm/CodeGenData/OutlinedHashTree.h"
+#include "llvm/CodeGenData/OutlinedHashTreeRecord.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/TargetParser/Triple.h"
+#include <mutex>
+
+namespace llvm {
+
+enum CGDataSectKind {
+#define CG_DATA_SECT_ENTRY(Kind, SectNameCommon, SectNameCoff, Prefix) Kind,
+#include "llvm/CodeGenData/CodeGenData.inc"
+};
+
+std::string getCodeGenDataSectionName(CGDataSectKind CGSK,
+                                      Triple::ObjectFormatType OF,
+                                      bool AddSegmentInfo = true);
+
+enum class CGDataKind {
+  Unknown = 0x0,
+  // A function outlining info.
+  FunctionOutlinedHashTree = 0x1,
+  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/FunctionOutlinedHashTree)
+};
+
+const std::error_category &cgdata_category();
+
+enum class cgdata_error {
+  success = 0,
+  eof,
+  bad_magic,
+  bad_header,
+  empty_cgdata,
+  malformed,
+  unsupported_version,
+};
+
+inline std::error_code make_error_code(cgdata_error E) {
+  return std::error_code(static_cast<int>(E), cgdata_category());
+}
+
+class CGDataError : public ErrorInfo<CGDataError> {
+public:
+  CGDataError(cgdata_error Err, const Twine &ErrStr = Twine())
+      : Err(Err), Msg(ErrStr.str()) {
+    assert(Err != cgdata_error::success && "Not an error");
+  }
+
+  std::string message() const override;
+
+  void log(raw_ostream &OS) const override { OS << message(); }
+
+  std::error_code convertToErrorCode() const override {
+    return make_error_code(Err);
+  }
+
+  cgdata_error get() const { return Err; }
+  const std::string &getMessage() const { return Msg; }
+
+  /// Consume an Error and return the raw enum value contained within it, and
+  /// the optional error message. The Error must either be a success value, or
+  /// contain a single CGDataError.
+  static std::pair<cgdata_error, std::string> take(Error E) {
+    auto Err = cgdata_error::success;
+    std::string Msg = "";
+    handleAllErrors(std::move(E), [&Err, &Msg](const CGDataError &IPE) {
+      assert(Err == cgdata_error::success && "Multiple errors encountered");
+      Err = IPE.get();
+      Msg = IPE.getMessage();
+    });
+    return {Err, Msg};
+  }
+
+  static char ID;
+
+private:
+  cgdata_error Err;
+  std::string Msg;
+};
+
+enum CGDataMode {
+  None,
+  Read,
+  Write,
+};
+
+class CodeGenData {
+  /// Global outlined hash tree that has outlined hash sequences across modules.
+  std::unique_ptr<OutlinedHashTree> PublishedHashTree;
+
+  /// This flag is set when -fcgdata-generate is passed.
+  /// Or, it can be mutated with -ftwo-codegen-rounds during two codegen runs.
+  bool EmitCGData;
+
+  /// This is a singleton instance which is thread-safe. Unlike profile data
+  /// which is largely function-based, codegen data describes the whole module.
+  /// Therefore, this can be initialized once, and can be used across modules
+  /// instead of constructing the same one for each codegen backend.
+  static std::unique_ptr<CodeGenData> Instance;
+  static std::once_flag OnceFlag;
+
+  CodeGenData() = default;
+
+public:
+  ~CodeGenData() = default;
+
+  static CodeGenData &getInstance();
+
+  /// Returns true if we have a valid outlined hash tree.
+  bool hasOutlinedHashTree() {
+    return PublishedHashTree && !PublishedHashTree->empty();
+  }
+
+  /// Returns the outlined hash tree. This can be globally used in a read-only
+  /// manner.
+  const OutlinedHashTree *getOutlinedHashTree() {
+    return PublishedHashTree.get();
+  }
+
+  /// Returns true if we should write codegen data.
+  bool emitCGData() { return EmitCGData; }
+
+  /// Publish the (globally) merged or read outlined hash tree.
+  void publishOutlinedHashTree(std::unique_ptr<OutlinedHashTree> HashTree) {
+    PublishedHashTree = std::move(HashTree);
+    // Ensure we disable emitCGData as we do not want to read and write both.
+    EmitCGData = false;
+  }
+};
+
+namespace cgdata {
+
+inline bool hasOutlinedHashTree() {
+  return CodeGenData::getInstance().hasOutlinedHashTree();
+}
+
+inline const OutlinedHashTree *getOutlinedHashTree() {
+  return CodeGenData::getInstance().getOutlinedHashTree();
+}
+
+inline bool emitCGData() { return CodeGenData::getInstance().emitCGData(); }
+
+inline void
+publishOutlinedHashTree(std::unique_ptr<OutlinedHashTree> HashTree) {
+  CodeGenData::getInstance().publishOutlinedHashTree(std::move(HashTree));
+}
+
+void warn(Error E, StringRef Whence = "");
+void warn(Twine Message, std::string Whence = "", std::string Hint = "");
+
+} // end namespace cgdata
+
+namespace IndexedCGData {
+
+const uint64_t Magic = 0x81617461646763ff; // "\xffcgdata\x81"
+
+enum CGDataVersion {
+  // Version 1 is the first version. This version supports the outlined
+  // hash tree.
+  Version1 = 1,
+  CurrentVersion = CG_DATA_INDEX_VERSION
+};
+const uint64_t Version = CGDataVersion::CurrentVersion;
+
+struct Header {
+  uint64_t Magic;
+  uint32_t Version;
+  uint32_t DataKind;
+  uint64_t OutlinedHashTreeOffset;
+
+  // New fields should only be added at the end to ensure that the size
+  // computation is correct. The methods below need to be updated to ensure that
+  // the new field is read correctly.
+
+  // Reads a header struct from the buffer.
+  static Expected<Header> readFromBuffer(const unsigned char *Curr);
+};
+
+} // end namespace IndexedCGData
+
+} // end namespace llvm
+
+#endif // LLVM_CODEGENDATA_CODEGENDATA_H
diff --git a/llvm/include/llvm/CodeGenData/CodeGenData.inc b/llvm/include/llvm/CodeGenData/CodeGenData.inc
new file mode 100644
index 00000000000000..5f6df5c0bf1065
--- /dev/null
+++ b/llvm/include/llvm/CodeGenData/CodeGenData.inc
@@ -0,0 +1,46 @@
+/*===-- CodeGenData.inc ----------------------------------------*- C++ -*-=== *\
+|*
+|* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+|* See https://llvm.org/LICENSE.txt for license information.
+|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+|*
+\*===----------------------------------------------------------------------===*/
+/*
+ * This is the main file that defines all the data structure, signature,
+ * constant literals that are shared across compiler, host tools (reader/writer)
+ * to support codegen data.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifdef CG_DATA_SECT_ENTRY
+#define CG_DATA_DEFINED
+CG_DATA_SECT_ENTRY(CG_outline, CG_DATA_QUOTE(CG_DATA_OUTLINE_COMMON),
+                   CG_DATA_OUTLINE_COFF, "__DATA,")
+
+#undef CG_DATA_SECT_ENTRY
+#endif
+
+/* section name strings common to all targets other
+   than WIN32 */
+#define CG_DATA_OUTLINE_COMMON __llvm_outline
+/* Since cg data sections are not allocated, we don't need to
+ * access them at runtime.
+ */
+#define CG_DATA_OUTLINE_COFF ".loutline"
+
+#ifdef _WIN32
+/* Runtime section names and name strings.  */
+#define CG_DATA_SECT_NAME CG_DATA_OUTLINE_COFF
+
+#else
+/* Runtime section names and name strings.  */
+#define CG_DATA_SECT_NAME INSTR_PROF_QUOTE(CG_DATA_OUTLINE_COMMON)
+
+#endif
+
+/* Indexed codegen data format version (start from 1). */
+#define CG_DATA_INDEX_VERSION 1
+
+/* Helper macros.  */
+#define CG_DATA_SIMPLE_QUOTE(x) #x
+#define CG_DATA_QUOTE(x) CG_DATA_SIMPLE_QUOTE(x)
diff --git a/llvm/include/llvm/CodeGenData/CodeGenDataReader.h b/llvm/include/llvm/CodeGenData/CodeGenDataReader.h
new file mode 100644
index 00000000000000..df4ae3ed24e79a
--- /dev/null
+++ b/llvm/include/llvm/CodeGenData/CodeGenDataReader.h
@@ -0,0 +1,154 @@
+//===- CodeGenDataReader.h --------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for reading codegen data.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGENDATA_CODEGENDATAREADER_H
+#define LLVM_CODEGENDATA_CODEGENDATAREADER_H
+
+#include "llvm/CodeGenData/CodeGenData.h"
+#include "llvm/CodeGenData/OutlinedHashTreeRecord.h"
+#include "llvm/Support/LineIterator.h"
+#include "llvm/Support/VirtualFileSystem.h"
+
+namespace llvm {
+
+class CodeGenDataReader {
+  cgdata_error LastError = cgdata_error::success;
+  std::string LastErrorMsg;
+
+public:
+  CodeGenDataReader() = default;
+  virtual ~CodeGenDataReader() = default;
+
+  /// Read the header.  Required before reading first record.
+  virtual Error read() = 0;
+  /// Return the codegen data version.
+  virtual uint32_t getVersion() const = 0;
+  /// Return the codegen data kind.
+  virtual CGDataKind getDataKind() const = 0;
+  /// Return true if the data has an outlined hash tree.
+  virtual bool hasOutlinedHashTree() const = 0;
+  /// Return the outlined hash tree that is released from the reader.
+  std::unique_ptr<OutlinedHashTree> releaseOutlinedHashTree() {
+    return std::move(HashTreeRecord.HashTree);
+  }
+
+  /// Factory method to create an appropriately typed reader for the given
+  /// codegen data file path and file system.
+  static Expected<std::unique_ptr<CodeGenDataReader>>
+  create(const Twine &Path, vfs::FileSystem &FS);
+
+  /// Factory method to create an appropriately typed reader for the given
+  /// memory buffer.
+  static Expected<std::unique_ptr<CodeGenDataReader>>
+  create(std::unique_ptr<MemoryBuffer> Buffer);
+
+  /// Extract the cgdata embedded in sections from the given object file and
+  /// merge them into the GlobalOutlineRecord. This is a static helper that
+  /// is used by `llvm-cgdata merge` or ThinLTO's two-codegen rounds.
+  static Error mergeFromObjectFile(const object::ObjectFile *Obj,
+                                   OutlinedHashTreeRecord &GlobalOutlineRecord);
+
+protected:
+  /// The outlined hash tree that has been read. When it's released by
+  /// releaseOutlinedHashTree(), it's no longer valid.
+  OutlinedHashTreeRecord HashTreeRecord;
+
+  /// Set the current error and return same.
+  Error error(cgdata_error Err, const std::string &ErrMsg = "") {
+    LastError = Err;
+    LastErrorMsg = ErrMsg;
+    if (Err == cgdata_error::success)
+      return Error::success();
+    return make_error<CGDataError>(Err, ErrMsg);
+  }
+
+  Error error(Error &&E) {
+    handleAllErrors(std::move(E), [&](const CGDataError &IPE) {
+      LastError = IPE.get();
+      LastErrorMsg = IPE.getMessage();
+    });
+    return make_error<CGDataError>(LastError, LastErrorMsg);
+  }
+
+  /// Clear the current error and return a successful one.
+  Error success() { return error(cgdata_error::success); }
+};
+
+class IndexedCodeGenDataReader : public CodeGenDataReader {
+  /// The codegen data file contents.
+  std::unique_ptr<MemoryBuffer> DataBuffer;
+  /// The header
+  IndexedCGData::Header Header;
+
+public:
+  IndexedCodeGenDataReader(std::unique_ptr<MemoryBuffer> DataBuffer)
+      : DataBuffer(std::move(DataBuffer)) {}
+  IndexedCodeGenDataReader(const IndexedCodeGenDataReader &) = delete;
+  IndexedCodeGenDataReader &
+  operator=(const IndexedCodeGenDataReader &) = delete;
+
+  /// Return true if the given buffer is in binary codegen data format.
+  static bool hasFormat(const MemoryBuffer &Buffer);
+  /// Read the contents including the header.
+  Error read() override;
+  /// Return the codegen data version.
+  uint32_t getVersion() const override { return Header.Version; }
+  /// Return the codegen data kind.
+  CGDataKind getDataKind() const override {
+    return static_cast<CGDataKind>(Header.DataKind);
+  }
+  /// Return true if the header indicates the data has an outlined hash tree.
+  /// This does not mean that the data is still available.
+  bool hasOutlinedHashTree() const override {
+    return Header.DataKind &
+           static_cast<uint32_t>(CGDataKind::FunctionOutlinedHashTree);
+  }
+};
+
+/// This format is a simple text format that's suitable for test data.
+/// The header is a custom format starting with `:` per line to indicate which
+/// codegen data is recorded. `#` is used to indicate a comment.
+/// The subsequent data is a YAML format per each codegen data in order.
+/// Currently, it only has a function outlined hash tree.
+class TextCodeGenDataReader : public CodeGenDataReader {
+  /// The codegen data file contents.
+  std::unique_ptr<MemoryBuffer> DataBuffer;
+  /// Iterator over the lines of the codegen data.
+  line_iterator Line;
+  /// Describe the kind of the codegen data.
+  CGDataKind DataKind = CGDataKind::Unknown;
+
+public:
+  TextCodeGenDataReader(std::unique_ptr<MemoryBuffer> DataBuffer_)
+      : DataBuffer(std::move(DataBuffer_)), Line(*DataBuffer, true, '#') {}
+  TextCodeGenDataReader(const TextCodeGenDataReader &) = delete;
+  TextCodeGenDataReader &operator=(const TextCodeGenDataReader &) = delete;
+
+  /// Return true if the given buffer is in text codegen data format.
+  static bool hasFormat(const MemoryBuffer &Buffer);
+  /// Read the contents including the header.
+  Error read() override;
+  /// Text format does not have version, so return 0.
+  uint32_t getVersion() const override { return 0; }
+  /// Return the codegen data kind.
+  CGDataKind getDataKind() const override { return DataKind; }
+  /// Return true if the header indicates the data has an outlined hash tree.
+  /// This does not mean that the data is still available.
+  bool hasOutlinedHashTree() const override {
+    return static_cast<uint32_t>(DataKind) &
+           static_cast<uint32_t>(CGDataKind::FunctionOutlinedHashTree);
+  }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_CODEGENDATA_CODEGENDATAREADER_H
diff --git a/llvm/include/llvm/CodeGenData/CodeGenDataWriter.h b/llvm/include/llvm/CodeGenData/CodeGenDataWriter.h
new file mode 100644
index 00000000000000..e17ffc3482ec91
--- /dev/null
+++ b/llvm/include/llvm/CodeGenData/CodeGenDataWriter.h
@@ -0,0 +1,68 @@
+//===- CodeGenDataWriter.h --------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing codegen data.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGENDATA_CODEGENDATAWRITER_H
+#define LLVM_CODEGENDATA_CODEGENDATAWRITER_H
+
+#include "llvm/CodeGenData/CodeGenData.h"
+#include "llvm/CodeGenData/OutlinedHashTreeRecord.h"
+#include "llvm/Support/Error.h"
+
+namespace llvm {
+
+class CGDataOStream;
+
+class CodeGenDataWriter {
+  /// The outlined hash tree to be written.
+  OutlinedHashTreeRecord HashTreeRecord;
+
+  /// A bit mask describing the kind of the codegen data.
+  CGDataKind DataKind = CGDataKind::Unknown;
+
+public:
+  CodeGenDataWriter() = default;
+  ~CodeGenDataWriter() = default;
+
+  /// Add the outlined hash tree record. The input Record is released.
+  void addRecord(OutlinedHashTreeRecord &Record);
+
+  /// Write the codegen data to \c OS
+  Error write(raw_fd_ostream &OS);
+
+  /// Write the codegen data in text format to \c OS
+  Error writeText(raw_fd_ostream &OS);
+
+  /// Return the attributes of the current CGData.
+  CGDataKind getCGDataKind() const { return DataKind; }
+
+  /// Return true if the header indicates the data has an outlined hash tree.
+  bool hasOutlinedHashTree() const {
+    return static_cast<uint32_t>(DataKind) &
+           static_cast<uint32_t>(CGDataKind::FunctionOutlinedHashTree);
+  }
+
+private:
+  /// The offset of the outlined hash tree in the file.
+  uint64_t OutlinedHashTreeOffset;
+
+  /// Write the codegen data header to \c COS
+  Error writeHeader(CGDataOStream &COS);
+
+  /// Write the codegen data header in text to \c OS
+  Error writeHeaderText(raw_fd_ostream &OS);
+
+  Error writeImpl(CGDataOStream &COS);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_CODEGENDATA_CODEGENDATAWRITER_H
diff --git a/llvm/lib/CodeGenData/CMakeLists.txt b/llvm/lib/CodeGenData/CMakeLists.txt
index 3ba90f96cc86d4..1156d53afb2e0f 100644
--- a/llvm/lib/CodeGenData/CMakeLists.txt
+++ b/llvm/lib/CodeGenData/CMakeLists.txt
@@ -1,4 +1,7 @@
 add_llvm_component_library(LLVMCodeGenData
+  CodeGenData.cpp
+  CodeGenDataReader.cpp
+  CodeGenDataWriter.cpp
   OutlinedHashTree.cpp
   OutlinedHashTreeRecord.cpp
 
diff --git a/llvm/lib/CodeGenData/CodeGenData.cpp b/llvm/lib/CodeGenData/CodeGenData.cpp
new file mode 100644
index 00000000000000..3bd21c97c7de7a
--- /dev/null
+++ b/llvm/lib/CodeGenData/CodeGenData.cpp
@@ -0,0 +1,197 @@
+//===-- CodeGenData.cpp ---------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for codegen data that has stable summary which
+// can be used to optimize the code in the subsequent codegen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Bitcode/BitcodeWriter.h"
+#include "llvm/CodeGenData/CodeGenDataReader.h"
+#include "llvm/CodeGenData/OutlinedHashTreeRecord.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/WithColor.h"
+
+#define DEBUG_TYPE "cg-data"
+
+using namespace llvm;
+using namespace cgdata;
+
+static std::string getCGDataErrString(cgdata_error Err,
+                                      const std::string &ErrMsg = "") {
+  std::string Msg;
+  raw_string_ostream OS(Msg);
+
+  switch (Err) {
+  case cgdata_error::success:
+    OS << "success";
+    break;
+  case cgdata_error::eof:
+    OS << "end of File";
+    break;
+  case cgdata_error::bad_magic:
+    OS << "invalid codegen data (bad magic)";
+    break;
+  case cgdata_error::bad_header:
+    OS << "invalid codegen data (file header is corrupt)";
+    break;
+  case cgdata_error::empty_cgdata:
+    OS << "empty codegen data";
+    break;
+  case cgdata_error::malformed:
+    OS << "malformed codegen data";
+    break;
+  case cgdata_error::unsupported_version:
+    OS << "unsupported codegen data version";
+    break;
+  }
+
+  // If optional error message is not empty, append it to the message.
+  if (!ErrMsg.empty())
+    OS << ": " << ErrMsg;
+
+  return OS.str();
+}
+
+namespace {
+
+// FIXME: This class is only here to support the transition to llvm::Error. It
+// will be removed once this transition is complete. Clients should prefer to
+// deal with the Error value directly, rather than converting to error_code.
+class CGDataErrorCategoryType : public std::error_category {
+  const char *name() const noexcept override { return "llvm.cgdata"; }
+
+  std::string message(int IE) const override {
+    return getCGDataErrString(static_cast<cgdata_error>(IE));
+  }
+};
+
+} // end anonymous namespace
+
+const std::error_category &llvm::cgdata_category() {
+  static CGDataErrorCategoryType ErrorCategory;
+  return ErrorCategory;
+}
+
+std::string CGDataError::message() const {
+  return getCGDataErrString(Err, Msg);
+}
+
+char CGDataError::ID = 0;
+
+namespace {
+
+const char *CodeGenDataSectNameCommon[] = {
+#define CG_DATA_SECT_ENTRY(Kind, SectNameCommon, SectNameCoff, Prefix)         \
+  SectNameCommon,
+#include "llvm/CodeGenData/CodeGenData.inc"
+};
+
+const char *CodeGenDataSectNameCoff[] = {
+#define CG_DATA_SECT_ENTRY(Kind, SectNameCommon, SectNameCoff, Prefix)         \
+  SectNameCoff,
+#include "llvm/CodeGenData/CodeGenData.inc"
+};
+
+const char *CodeGenDataSectNamePrefix[] = {
+#define CG_DATA_SECT_ENTRY(Kind, SectNameCommon, SectNameCoff, Prefix) Prefix,
+#include "llvm/CodeGenData/CodeGenData.inc"
+};
+
+} // namespace
+
+namespace llvm {
+
+std::string getCodeGenDataSectionName(CGDataSectKind CGSK,
+                                      Triple::ObjectFormatType OF,
+                                      bool AddSegmentInfo) {
+  std::string SectName;
+
+  if (OF == Triple::MachO && AddSegmentInfo)
+    SectName = CodeGenDataSectNamePrefix[CGSK];
+
+  if (OF == Triple::COFF)
+    SectName += CodeGenDataSectNameCoff[CGSK];
+  else
+    SectName += CodeGenDataSectNameCommon[CGSK];
+
+  return SectName;
+}
+
+std::unique_ptr<CodeGenData> CodeGenData::Instance = nullptr;
+std::once_flag CodeGenData::OnceFlag;
+
+CodeGenData &CodeGenData::getInstance() {
+  std::call_once(CodeGenData::OnceFlag, []() {
+    auto *CGD = new CodeGenData();
+    Instance.reset(CGD);
+
+    // TODO: Initialize writer or reader mode for the client optimization.
+  });
+  return *(Instance.get());
+}
+
+namespace IndexedCGData {
+
+Expected<Header> Header::readFromBuffer(const unsigned char *Curr) {
+  using namespace support;
+
+  static_assert(std::is_standard_layout_v<llvm::IndexedCGData::Header>,
+                "The header should be standard layout type since we use offset "
+                "of fields to read.");
+  Header H;
+  H.Magic = endian::readNext<uint64_t, endianness::little, unaligned>(Curr);
+  if (H.Magic != IndexedCGData::Magic)
+    return make_error<CGDataError>(cgdata_error::bad_magic);
+  H.Version = endian::readNext<uint32_t, endianness::little, unaligned>(Curr);
+  if (H.Version > IndexedCGData::CGDataVersion::CurrentVersion)
+    return make_error<CGDataError>(cgdata_error::unsupported_version);
+  H.DataKind = endian::readNext<uint32_t, endianness::little, unaligned>(Curr);
+
+  switch (H.Version) {
+    // When a new field is added to the header add a case statement here to
+    // compute the size as offset of the new field + size of the new field. This
+    // relies on the field being added to the end of the list.
+    static_assert(IndexedCGData::CGDataVersion::CurrentVersion == Version1,
+                  "Please update the size computation below if a new field has "
+                  "been added to the header, if not add a case statement to "
+                  "fall through to the latest version.");
+  case 1ull:
+    H.OutlinedHashTreeOffset =
+        endian::readNext<uint64_t, endianness::little, unaligned>(Curr);
+  }
+
+  return H;
+}
+
+} // end namespace IndexedCGData
+
+namespace cgdata {
+
+void warn(Twine Message, std::string Whence, std::string Hint) {
+  WithColor::warning();
+  if (!Whence.empty())
+    errs() << Whence << ": ";
+  errs() << Message << "\n";
+  if (!Hint.empty())
+    WithColor::note() << Hint << "\n";
+}
+
+void warn(Error E, StringRef Whence) {
+  if (E.isA<CGDataError>()) {
+    handleAllErrors(std::move(E), [&](const CGDataError &IPE) {
+      warn(IPE.message(), std::string(Whence), std::string(""));
+    });
+  }
+}
+
+} // end namespace cgdata
+
+} // end namespace llvm
diff --git a/llvm/lib/CodeGenData/CodeGenDataReader.cpp b/llvm/lib/CodeGenData/CodeGenDataReader.cpp
new file mode 100644
index 00000000000000..1b08085dec2f25
--- /dev/null
+++ b/llvm/lib/CodeGenData/CodeGenDataReader.cpp
@@ -0,0 +1,174 @@
+//===- CodeGenDataReader.cpp ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for reading codegen data.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGenData/CodeGenDataReader.h"
+#include "llvm/CodeGenData/OutlinedHashTreeRecord.h"
+#include "llvm/Support/MemoryBuffer.h"
+
+#define DEBUG_TYPE "cg-data-reader"
+
+using namespace llvm;
+
+namespace llvm {
+
+static Expected<std::unique_ptr<MemoryBuffer>>
+setupMemoryBuffer(const Twine &Filename, vfs::FileSystem &FS) {
+  auto BufferOrErr = Filename.str() == "-" ? MemoryBuffer::getSTDIN()
+                                           : FS.getBufferForFile(Filename);
+  if (std::error_code EC = BufferOrErr.getError())
+    return errorCodeToError(EC);
+  return std::move(BufferOrErr.get());
+}
+
+Error CodeGenDataReader::mergeFromObjectFile(
+    const object::ObjectFile *Obj,
+    OutlinedHashTreeRecord &GlobalOutlineRecord) {
+  Triple TT = Obj->makeTriple();
+  auto CGOutLineName =
+      getCodeGenDataSectionName(CG_outline, TT.getObjectFormat(), false);
+
+  for (auto &Section : Obj->sections()) {
+    Expected<StringRef> NameOrErr = Section.getName();
+    if (!NameOrErr)
+      return NameOrErr.takeError();
+    Expected<StringRef> ContentsOrErr = Section.getContents();
+    if (!ContentsOrErr)
+      return ContentsOrErr.takeError();
+    auto *Data = reinterpret_cast<const unsigned char *>(ContentsOrErr->data());
+    auto *EndData = Data + ContentsOrErr->size();
+
+    if (*NameOrErr == CGOutLineName) {
+      // In case dealing with an executable that has concatenated cgdata,
+      // we want to merge them into a single cgdata.
+      // Although it's not a typical workflow, we support this scenario.
+      while (Data != EndData) {
+        OutlinedHashTreeRecord LocalOutlineRecord;
+        LocalOutlineRecord.deserialize(Data);
+        GlobalOutlineRecord.merge(LocalOutlineRecord);
+      }
+    }
+    // TODO: Add support for other cgdata sections.
+  }
+
+  return Error::success();
+}
+
+Error IndexedCodeGenDataReader::read() {
+  using namespace support;
+
+  // The smallest header with the version 1 is 24 bytes
+  const unsigned MinHeaderSize = 24;
+  if (DataBuffer->getBufferSize() < MinHeaderSize)
+    return error(cgdata_error::bad_header);
+
+  auto *Start =
+      reinterpret_cast<const unsigned char *>(DataBuffer->getBufferStart());
+  auto *End =
+      reinterpret_cast<const unsigned char *>(DataBuffer->getBufferEnd());
+  auto HeaderOr = IndexedCGData::Header::readFromBuffer(Start);
+  if (!HeaderOr)
+    return HeaderOr.takeError();
+  Header = HeaderOr.get();
+
+  if (hasOutlinedHashTree()) {
+    const unsigned char *Ptr = Start + Header.OutlinedHashTreeOffset;
+    if (Ptr >= End)
+      return error(cgdata_error::eof);
+    HashTreeRecord.deserialize(Ptr);
+  }
+
+  return success();
+}
+
+Expected<std::unique_ptr<CodeGenDataReader>>
+CodeGenDataReader::create(const Twine &Path, vfs::FileSystem &FS) {
+  // Set up the buffer to read.
+  auto BufferOrError = setupMemoryBuffer(Path, FS);
+  if (Error E = BufferOrError.takeError())
+    return std::move(E);
+  return CodeGenDataReader::create(std::move(BufferOrError.get()));
+}
+
+Expected<std::unique_ptr<CodeGenDataReader>>
+CodeGenDataReader::create(std::unique_ptr<MemoryBuffer> Buffer) {
+  if (Buffer->getBufferSize() == 0)
+    return make_error<CGDataError>(cgdata_error::empty_cgdata);
+
+  std::unique_ptr<CodeGenDataReader> Reader;
+  // Create the reader.
+  if (IndexedCodeGenDataReader::hasFormat(*Buffer))
+    Reader.reset(new IndexedCodeGenDataReader(std::move(Buffer)));
+  else if (TextCodeGenDataReader::hasFormat(*Buffer))
+    Reader.reset(new TextCodeGenDataReader(std::move(Buffer)));
+  else
+    return make_error<CGDataError>(cgdata_error::malformed);
+
+  // Initialize the reader and return the result.
+  if (Error E = Reader->read())
+    return std::move(E);
+
+  return std::move(Reader);
+}
+
+bool IndexedCodeGenDataReader::hasFormat(const MemoryBuffer &DataBuffer) {
+  using namespace support;
+  if (DataBuffer.getBufferSize() < 8)
+    return false;
+
+  uint64_t Magic = endian::read<uint64_t, llvm::endianness::little, aligned>(
+      DataBuffer.getBufferStart());
+  // Verify that it's magical.
+  return Magic == IndexedCGData::Magic;
+}
+
+bool TextCodeGenDataReader::hasFormat(const MemoryBuffer &Buffer) {
+  // Verify that this really looks like plain ASCII text by checking a
+  // 'reasonable' number of characters (up to the codegen data magic size).
+  size_t count = std::min(Buffer.getBufferSize(), sizeof(uint64_t));
+  StringRef buffer = Buffer.getBufferStart();
+  return count == 0 ||
+         std::all_of(buffer.begin(), buffer.begin() + count,
+                     [](char c) { return isPrint(c) || isSpace(c); });
+}
+Error TextCodeGenDataReader::read() {
+  using namespace support;
+
+  // Parse the custom header line by line.
+  while (Line->starts_with(":")) {
+    StringRef Str = Line->substr(1);
+    if (Str.equals_insensitive("outlined_hash_tree"))
+      DataKind |= CGDataKind::FunctionOutlinedHashTree;
+    else
+      return error(cgdata_error::bad_header);
+    ++Line;
+  }
+
+  // We treat an empty header (that has a comment # only) as a valid header.
+  if (Line.is_at_eof()) {
+    if (DataKind != CGDataKind::Unknown)
+      return error(cgdata_error::bad_header);
+    return Error::success();
+  }
+
+  // The YAML docs follow after the header.
+  const char *Pos = (*Line).data();
+  size_t Size = reinterpret_cast<size_t>(DataBuffer->getBufferEnd()) -
+                reinterpret_cast<size_t>(Pos);
+  yaml::Input YOS(StringRef(Pos, Size));
+  if (hasOutlinedHashTree())
+    HashTreeRecord.deserializeYAML(YOS);
+
+  // TODO: Add more yaml cgdata in order
+
+  return Error::success();
+}
+} // end namespace llvm
diff --git a/llvm/lib/CodeGenData/CodeGenDataWriter.cpp b/llvm/lib/CodeGenData/CodeGenDataWriter.cpp
new file mode 100644
index 00000000000000..9aa0d86223f714
--- /dev/null
+++ b/llvm/lib/CodeGenData/CodeGenDataWriter.cpp
@@ -0,0 +1,162 @@
+//===- CodeGenDataWriter.cpp ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing codegen data.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGenData/CodeGenDataWriter.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/EndianStream.h"
+
+#define DEBUG_TYPE "cg-data-writer"
+
+using namespace llvm;
+
+namespace llvm {
+
+/// Describes one back patch: \c N 64-bit values from \c D written at \c Pos.
+struct CGDataPatchItem {
+  uint64_t Pos; // Byte offset in the output where patching starts.
+  uint64_t *D;  // Pointer to the array of values to write.
+  int N;        // Number of uint64_t elements in the \c D array.
+};
+
+// A thin wrapper over the writer stream that emits little-endian values and
+// supports back patching of bytes that were already written.
+class CGDataOStream {
+public:
+  CGDataOStream(raw_fd_ostream &FD)
+      : IsFDOStream(true), OS(FD), LE(FD, llvm::endianness::little) {}
+  CGDataOStream(raw_string_ostream &STR)
+      : IsFDOStream(false), OS(STR), LE(STR, llvm::endianness::little) {}
+
+  uint64_t tell() { return OS.tell(); }
+  void write(uint64_t V) { LE.write<uint64_t>(V); }
+  void write32(uint32_t V) { LE.write<uint32_t>(V); }
+  void write8(uint8_t V) { LE.write<uint8_t>(V); }
+
+  // Overwrites already written data as described by each item in \p P. Call
+  // it only once all data is written and flushed. For a raw_string_ostream the
+  // target string is edited directly, bypassing the stream's internal buffer.
+  void patch(ArrayRef<CGDataPatchItem> P) {
+    using namespace support;
+
+    if (!IsFDOStream) {
+      auto &StrOS = static_cast<raw_string_ostream &>(OS);
+      std::string &Target = StrOS.str(); // with flush
+      for (const CGDataPatchItem &Item : P) {
+        for (int Idx = 0; Idx < Item.N; Idx++) {
+          uint64_t Swapped =
+              endian::byte_swap<uint64_t, llvm::endianness::little>(Item.D[Idx]);
+          Target.replace(Item.Pos + Idx * sizeof(uint64_t), sizeof(uint64_t),
+                         (const char *)&Swapped, sizeof(uint64_t));
+        }
+      }
+    } else {
+      auto &FileOS = static_cast<raw_fd_ostream &>(OS);
+      const uint64_t SavedPos = FileOS.tell();
+      for (const CGDataPatchItem &Item : P) {
+        FileOS.seek(Item.Pos);
+        for (int Idx = 0; Idx < Item.N; Idx++)
+          write(Item.D[Idx]);
+      }
+      // Seek back to where the stream was before patching so that users
+      // don't accidentally overwrite data. This matches the string stream
+      // branch, which replaces the data in place without moving the stream.
+      FileOS.seek(SavedPos);
+    }
+  }
+
+  // True when \c OS was constructed from a \c raw_fd_ostream; otherwise
+  // \c OS is a raw_string_ostream.
+  bool IsFDOStream;
+  raw_ostream &OS;
+  support::endian::Writer LE;
+};
+
+} // end namespace llvm
+
+void CodeGenDataWriter::addRecord(OutlinedHashTreeRecord &Record) {
+  // The writer takes over the tree, leaving \p Record without one.
+  assert(Record.HashTree && "empty hash tree in the record");
+  DataKind |= CGDataKind::FunctionOutlinedHashTree;
+  HashTreeRecord.HashTree = std::move(Record.HashTree);
+}
+
+Error CodeGenDataWriter::write(raw_fd_ostream &OS) {
+  CGDataOStream Wrapped(OS); // Adds little-endian writes and back patching.
+  return writeImpl(Wrapped);
+}
+
+Error CodeGenDataWriter::writeHeader(CGDataOStream &COS) {
+  using namespace support;
+  IndexedCGData::Header Header;
+  Header.Magic = IndexedCGData::Magic;
+  Header.Version = IndexedCGData::Version;
+
+  // Set the header's DataKind bits from the kinds recorded in this writer.
+  Header.DataKind = 0;
+  if (static_cast<bool>(DataKind & CGDataKind::FunctionOutlinedHashTree))
+    Header.DataKind |=
+        static_cast<uint32_t>(CGDataKind::FunctionOutlinedHashTree);
+
+  Header.OutlinedHashTreeOffset = 0;
+
+  // Only write out up to the CGDataKind. We need to remember the offset of the
+  // remaining fields to allow back patching later.
+  COS.write(Header.Magic);
+  COS.write32(Header.Version);
+  COS.write32(Header.DataKind);
+
+  // Save the location of Header.OutlinedHashTreeOffset field in \c COS.
+  OutlinedHashTreeOffset = COS.tell();
+
+  // Reserve 8 bytes for the OutlinedHashTreeOffset field (patched later).
+  COS.write(0);
+
+  return Error::success();
+}
+
+Error CodeGenDataWriter::writeImpl(CGDataOStream &COS) {
+  if (Error Err = writeHeader(COS))
+    return Err;
+
+  // The tree payload, if any, begins right after the header.
+  uint64_t TreeStart = COS.tell();
+  if (hasOutlinedHashTree())
+    HashTreeRecord.serialize(COS.OS);
+
+  // Back patch the header's offset field with where the payload started.
+  CGDataPatchItem Patches[] = {{OutlinedHashTreeOffset, &TreeStart, 1}};
+  COS.patch(Patches);
+
+  return Error::success();
+}
+
+Error CodeGenDataWriter::writeHeaderText(raw_fd_ostream &OS) {
+  if (hasOutlinedHashTree())
+    OS << "# Outlined stable hash tree\n:outlined_hash_tree\n";
+  // Each data kind gets a '#' comment line and a ':'-prefixed marker line.
+  // TODO: Add more data types in this header
+
+  return Error::success();
+}
+
+Error CodeGenDataWriter::writeText(raw_fd_ostream &OS) {
+  // The ':'-prefixed header lines come first, then the YAML documents.
+  if (Error Err = writeHeaderText(OS))
+    return Err;
+  yaml::Output YamlOut(OS);
+  if (hasOutlinedHashTree())
+    HashTreeRecord.serializeYAML(YamlOut);
+
+  // TODO: Write more yaml cgdata in order
+
+  return Error::success();
+}
diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt
index 6127b76db06b7f..be777ce650e874 100644
--- a/llvm/test/CMakeLists.txt
+++ b/llvm/test/CMakeLists.txt
@@ -73,6 +73,7 @@ set(LLVM_TEST_DEPENDS
           llvm-c-test
           llvm-cat
           llvm-cfi-verify
+          llvm-cgdata
           llvm-config
           llvm-cov
           llvm-cvtres
diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index 4c05317036d1a3..cc7e9d535a9c33 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -180,6 +180,7 @@ def get_asan_rtlib():
         "llvm-addr2line",
         "llvm-bcanalyzer",
         "llvm-bitcode-strip",
+        "llvm-cgdata",
         "llvm-config",
         "llvm-cov",
         "llvm-cxxdump",
diff --git a/llvm/test/tools/llvm-cgdata/dump.test b/llvm/test/tools/llvm-cgdata/dump.test
new file mode 100644
index 00000000000000..ce2ad27a5ff81c
--- /dev/null
+++ b/llvm/test/tools/llvm-cgdata/dump.test
@@ -0,0 +1,30 @@
+# Test dump between the binary and text formats.
+
+RUN: split-file %s %t
+
+RUN: llvm-cgdata dump -binary %t/dump.cgtext -o %t/dump.cgdata
+RUN: llvm-cgdata dump -text %t/dump.cgdata -o %t/dump-round.cgtext
+RUN: llvm-cgdata dump -binary %t/dump-round.cgtext -o %t/dump-round.cgdata
+RUN: diff %t/dump.cgdata %t/dump-round.cgdata
+
+;--- dump.cgtext
+# Outlined stable hash tree
+:outlined_hash_tree
+---
+0:
+  Hash:            0x0
+  Terminals:       0
+  SuccessorIds:    [ 1 ]
+1:
+  Hash:            0x1
+  Terminals:       0
+  SuccessorIds:    [ 2, 3 ]
+2:
+  Hash:            0x3
+  Terminals:       5
+  SuccessorIds:    [  ]
+3:
+  Hash:            0x2
+  Terminals:       4
+  SuccessorIds:    [  ]
+...
diff --git a/llvm/test/tools/llvm-cgdata/empty.test b/llvm/test/tools/llvm-cgdata/empty.test
new file mode 100644
index 00000000000000..d5e201b9eec17f
--- /dev/null
+++ b/llvm/test/tools/llvm-cgdata/empty.test
@@ -0,0 +1,32 @@
+# Test for empty cgdata file, which is invalid.
+RUN: touch %t_emptyfile.cgtext
+RUN: not llvm-cgdata dump %t_emptyfile.cgtext -text -o - 2>&1 | FileCheck %s --check-prefix ERROR
+ERROR: {{.}}emptyfile.cgtext: empty codegen data
+
+# Test for empty header in the text format. It can be converted to a valid binary file.
+RUN: printf '#' > %t_emptyheader.cgtext
+RUN: llvm-cgdata dump %t_emptyheader.cgtext -binary -o %t_emptyheader.cgdata
+
+# Without any cgdata other than the header, no data shows by default.
+RUN: llvm-cgdata show %t_emptyheader.cgdata | FileCheck %s --allow-empty --check-prefix EMPTY
+EMPTY-NOT: any
+
+# The version number appears when asked, as it's in the header
+RUN: llvm-cgdata show --cgdata-version %t_emptyheader.cgdata | FileCheck %s --check-prefix VERSION
+VERSION: Version: {{.}}
+
+# When converting a binary file (w/ the header only) to a text file, it's an empty file as the text format does not have an explicit header.
+RUN: llvm-cgdata dump %t_emptyheader.cgdata -text -o - | FileCheck %s --allow-empty --check-prefix EMPTY
+
+# Synthesize a header only cgdata.
+# struct Header {
+#   uint64_t Magic;
+#   uint32_t Version;
+#   uint32_t DataKind;
+#   uint64_t OutlinedHashTreeOffset;
+# }
+RUN: printf '\xffcgdata\x81' > %t_header.cgdata
+RUN: printf '\x01\x00\x00\x00' >> %t_header.cgdata
+RUN: printf '\x00\x00\x00\x00' >> %t_header.cgdata
+RUN: printf '\x18\x00\x00\x00\x00\x00\x00\x00' >> %t_header.cgdata
+RUN: diff %t_header.cgdata %t_emptyheader.cgdata
diff --git a/llvm/test/tools/llvm-cgdata/error.test b/llvm/test/tools/llvm-cgdata/error.test
new file mode 100644
index 00000000000000..5e1b14de5e509d
--- /dev/null
+++ b/llvm/test/tools/llvm-cgdata/error.test
@@ -0,0 +1,38 @@
+# Test various error cases
+
+# Synthesize a header only cgdata.
+# struct Header {
+#   uint64_t Magic;
+#   uint32_t Version;
+#   uint32_t DataKind;
+#   uint64_t OutlinedHashTreeOffset;
+# }
+RUN: touch %t_empty.cgdata
+RUN: not llvm-cgdata show %t_empty.cgdata 2>&1 | FileCheck %s --check-prefix EMPTY
+EMPTY: {{.}}cgdata: empty codegen data
+
+# Not a magic.
+RUN: printf '\xff' > %t_malformed.cgdata
+RUN: not llvm-cgdata show %t_malformed.cgdata 2>&1 | FileCheck %s --check-prefix MALFORMED
+MALFORMED: {{.}}cgdata: malformed codegen data
+
+# The minimum header size is 24.
+RUN: printf '\xffcgdata\x81' > %t_corrupt.cgdata
+RUN: not llvm-cgdata show %t_corrupt.cgdata 2>&1 | FileCheck %s  --check-prefix CORRUPT
+CORRUPT: {{.}}cgdata: invalid codegen data (file header is corrupt)
+
+# The current version is 1 while the header says 2.
+RUN: printf '\xffcgdata\x81' > %t_version.cgdata
+RUN: printf '\x02\x00\x00\x00' >> %t_version.cgdata
+RUN: printf '\x00\x00\x00\x00' >> %t_version.cgdata
+RUN: printf '\x18\x00\x00\x00\x00\x00\x00\x00' >> %t_version.cgdata
+RUN: not llvm-cgdata show %t_version.cgdata 2>&1 | FileCheck %s  --check-prefix BAD_VERSION
+BAD_VERSION: {{.}}cgdata: unsupported codegen data version
+
+# Header says an outlined hash tree, but the file ends after the header.
+RUN: printf '\xffcgdata\x81' > %t_eof.cgdata
+RUN: printf '\x01\x00\x00\x00' >> %t_eof.cgdata
+RUN: printf '\x01\x00\x00\x00' >> %t_eof.cgdata
+RUN: printf '\x18\x00\x00\x00\x00\x00\x00\x00' >> %t_eof.cgdata
+RUN: not llvm-cgdata show %t_eof.cgdata 2>&1 | FileCheck %s  --check-prefix EOF
+EOF: {{.}}cgdata: end of File
diff --git a/llvm/test/tools/llvm-cgdata/merge-archive.test b/llvm/test/tools/llvm-cgdata/merge-archive.test
new file mode 100644
index 00000000000000..a27d6c2a16f4ab
--- /dev/null
+++ b/llvm/test/tools/llvm-cgdata/merge-archive.test
@@ -0,0 +1,75 @@
+# Merge an archive that has two object files having cgdata (__llvm_outline)
+
+RUN: split-file %s %t
+
+RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-1.ll -o %t/merge-1.o
+RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-2.ll -o %t/merge-2.o
+RUN: llvm-ar rcs %t/merge-archive.a %t/merge-1.o %t/merge-2.o
+RUN: llvm-cgdata merge %t/merge-archive.a -o %t/merge-archive.cgdata
+RUN: llvm-cgdata show %t/merge-archive.cgdata | FileCheck %s
+CHECK: Outlined hash tree:
+CHECK-NEXT:  Total Node Count: 4
+CHECK-NEXT:  Terminal Node Count: 2
+CHECK-NEXT:  Depth: 2
+
+RUN: llvm-cgdata dump %t/merge-archive.cgdata | FileCheck %s --check-prefix TREE
+TREE: # Outlined stable hash tree
+TREE-NEXT: :outlined_hash_tree
+TREE-NEXT: ---
+TREE-NEXT: 0:
+TREE-NEXT:   Hash:            0x0
+TREE-NEXT:   Terminals:       0
+TREE-NEXT:   SuccessorIds:    [ 1 ]
+TREE-NEXT: 1:
+TREE-NEXT:   Hash:            0x1
+TREE-NEXT:   Terminals:       0
+TREE-NEXT:   SuccessorIds:    [ 2, 3 ]
+TREE-NEXT: 2:
+TREE-NEXT:   Hash:            0x3
+TREE-NEXT:   Terminals:       5
+TREE-NEXT:   SuccessorIds:    [  ]
+TREE-NEXT: 3:
+TREE-NEXT:   Hash:            0x2
+TREE-NEXT:   Terminals:       4
+TREE-NEXT:   SuccessorIds:    [  ]
+TREE-NEXT: ...
+
+;--- merge-1.ll
+
+; The .data is encoded in a binary form based on the following yaml form. See serialize() in OutlinedHashTreeRecord.cpp
+;---
+;0:
+;  Hash:            0x0
+;  Terminals:       0
+;  SuccessorIds:    [ 1 ]
+;1:
+;  Hash:            0x1
+;  Terminals:       0
+;  SuccessorIds:    [ 2 ]
+;2:
+;  Hash:            0x2
+;  Terminals:       4
+;  SuccessorIds:    [  ]
+;...
+
+ at .data = private unnamed_addr constant [72 x i8] c"\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\02\00\00\00\02\00\00\00\02\00\00\00\00\00\00\00\04\00\00\00\00\00\00\00", section "__DATA,__llvm_outline"
+
+;--- merge-2.ll
+
+; The .data is encoded in a binary form based on the following yaml form. See serialize() in OutlinedHashTreeRecord.cpp
+;---
+;0:
+;  Hash:            0x0
+;  Terminals:       0
+;  SuccessorIds:    [ 1 ]
+;1:
+;  Hash:            0x1
+;  Terminals:       0
+;  SuccessorIds:    [ 2 ]
+;2:
+;  Hash:            0x3
+;  Terminals:       5
+;  SuccessorIds:    [  ]
+;...
+
+ at .data = private unnamed_addr constant [72 x i8] c"\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\02\00\00\00\02\00\00\00\03\00\00\00\00\00\00\00\05\00\00\00\00\00\00\00", section "__DATA,__llvm_outline"
diff --git a/llvm/test/tools/llvm-cgdata/merge-concat.test b/llvm/test/tools/llvm-cgdata/merge-concat.test
new file mode 100644
index 00000000000000..3411133cb7aacb
--- /dev/null
+++ b/llvm/test/tools/llvm-cgdata/merge-concat.test
@@ -0,0 +1,68 @@
+# Merge a binary file (e.g., a linked executable) having concatenated cgdata (__llvm_outline)
+
+RUN: split-file %s %t
+
+RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-concat.ll -o %t/merge-concat.o
+RUN: llvm-cgdata merge %t/merge-concat.o -o %t/merge-concat.cgdata
+RUN: llvm-cgdata show %t/merge-concat.cgdata | FileCheck %s
+CHECK: Outlined hash tree:
+CHECK-NEXT:  Total Node Count: 4
+CHECK-NEXT:  Terminal Node Count: 2
+CHECK-NEXT:  Depth: 2
+
+RUN: llvm-cgdata dump %t/merge-concat.cgdata | FileCheck %s --check-prefix TREE
+TREE: # Outlined stable hash tree
+TREE-NEXT: :outlined_hash_tree
+TREE-NEXT: ---
+TREE-NEXT: 0:
+TREE-NEXT:   Hash:            0x0
+TREE-NEXT:   Terminals:       0
+TREE-NEXT:   SuccessorIds:    [ 1 ]
+TREE-NEXT: 1:
+TREE-NEXT:   Hash:            0x1
+TREE-NEXT:   Terminals:       0
+TREE-NEXT:   SuccessorIds:    [ 2, 3 ]
+TREE-NEXT: 2:
+TREE-NEXT:   Hash:            0x3
+TREE-NEXT:   Terminals:       5
+TREE-NEXT:   SuccessorIds:    [  ]
+TREE-NEXT: 3:
+TREE-NEXT:   Hash:            0x2
+TREE-NEXT:   Terminals:       4
+TREE-NEXT:   SuccessorIds:    [  ]
+TREE-NEXT: ...
+
+;--- merge-concat.ll
+
+; In a linked executable (as opposed to an object file), cgdata in __llvm_outline might be concatenated. Although this is not a typical workflow, we support parsing cgdata that is concatenated. In other words, the following two trees are encoded back-to-back in a binary format.
+;---
+;0:
+;  Hash:            0x0
+;  Terminals:       0
+;  SuccessorIds:    [ 1 ]
+;1:
+;  Hash:            0x1
+;  Terminals:       0
+;  SuccessorIds:    [ 2 ]
+;2:
+;  Hash:            0x2
+;  Terminals:       4
+;  SuccessorIds:    [  ]
+;...
+;---
+;0:
+;  Hash:            0x0
+;  Terminals:       0
+;  SuccessorIds:    [ 1 ]
+;1:
+;  Hash:            0x1
+;  Terminals:       0
+;  SuccessorIds:    [ 2 ]
+;2:
+;  Hash:            0x3
+;  Terminals:       5
+;  SuccessorIds:    [  ]
+;...
+
+ at .data1 = private unnamed_addr constant [72 x i8] c"\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\02\00\00\00\02\00\00\00\02\00\00\00\00\00\00\00\04\00\00\00\00\00\00\00", section "__DATA,__llvm_outline"
+ at .data2 = private unnamed_addr constant [72 x i8] c"\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\02\00\00\00\02\00\00\00\03\00\00\00\00\00\00\00\05\00\00\00\00\00\00\00", section "__DATA,__llvm_outline"
diff --git a/llvm/test/tools/llvm-cgdata/merge-double.test b/llvm/test/tools/llvm-cgdata/merge-double.test
new file mode 100644
index 00000000000000..6ce358cd72325b
--- /dev/null
+++ b/llvm/test/tools/llvm-cgdata/merge-double.test
@@ -0,0 +1,74 @@
+# Merge two object files having cgdata (__llvm_outline)
+
+RUN: split-file %s %t
+
+RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-1.ll -o %t/merge-1.o
+RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-2.ll -o %t/merge-2.o
+RUN: llvm-cgdata merge %t/merge-1.o %t/merge-2.o -o %t/merge.cgdata
+RUN: llvm-cgdata show %t/merge.cgdata | FileCheck %s
+CHECK: Outlined hash tree:
+CHECK-NEXT:  Total Node Count: 4
+CHECK-NEXT:  Terminal Node Count: 2
+CHECK-NEXT:  Depth: 2
+
+RUN: llvm-cgdata dump %t/merge.cgdata | FileCheck %s --check-prefix TREE
+TREE: # Outlined stable hash tree
+TREE-NEXT: :outlined_hash_tree
+TREE-NEXT: ---
+TREE-NEXT: 0:
+TREE-NEXT:   Hash:            0x0
+TREE-NEXT:   Terminals:       0
+TREE-NEXT:   SuccessorIds:    [ 1 ]
+TREE-NEXT: 1:
+TREE-NEXT:   Hash:            0x1
+TREE-NEXT:   Terminals:       0
+TREE-NEXT:   SuccessorIds:    [ 2, 3 ]
+TREE-NEXT: 2:
+TREE-NEXT:   Hash:            0x3
+TREE-NEXT:   Terminals:       5
+TREE-NEXT:   SuccessorIds:    [  ]
+TREE-NEXT: 3:
+TREE-NEXT:   Hash:            0x2
+TREE-NEXT:   Terminals:       4
+TREE-NEXT:   SuccessorIds:    [  ]
+TREE-NEXT: ...
+
+;--- merge-1.ll
+
+; The .data is encoded in a binary form based on the following yaml form. See serialize() in OutlinedHashTreeRecord.cpp
+;---
+;0:
+;  Hash:            0x0
+;  Terminals:       0
+;  SuccessorIds:    [ 1 ]
+;1:
+;  Hash:            0x1
+;  Terminals:       0
+;  SuccessorIds:    [ 2 ]
+;2:
+;  Hash:            0x2
+;  Terminals:       4
+;  SuccessorIds:    [  ]
+;...
+
+ at .data = private unnamed_addr constant [72 x i8] c"\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\02\00\00\00\02\00\00\00\02\00\00\00\00\00\00\00\04\00\00\00\00\00\00\00", section "__DATA,__llvm_outline"
+
+;--- merge-2.ll
+
+; The .data is encoded in a binary form based on the following yaml form. See serialize() in OutlinedHashTreeRecord.cpp
+;---
+;0:
+;  Hash:            0x0
+;  Terminals:       0
+;  SuccessorIds:    [ 1 ]
+;1:
+;  Hash:            0x1
+;  Terminals:       0
+;  SuccessorIds:    [ 2 ]
+;2:
+;  Hash:            0x3
+;  Terminals:       5
+;  SuccessorIds:    [  ]
+;...
+
+ at .data = private unnamed_addr constant [72 x i8] c"\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\02\00\00\00\02\00\00\00\03\00\00\00\00\00\00\00\05\00\00\00\00\00\00\00", section "__DATA,__llvm_outline"
diff --git a/llvm/test/tools/llvm-cgdata/merge-single.test b/llvm/test/tools/llvm-cgdata/merge-single.test
new file mode 100644
index 00000000000000..73bdd9800dbe1d
--- /dev/null
+++ b/llvm/test/tools/llvm-cgdata/merge-single.test
@@ -0,0 +1,43 @@
+# Test merging a single object file into a cgdata file
+
+RUN: split-file %s %t
+
+# Merge an object file that has no cgdata (__llvm_outline). It still produces a header only cgdata.
+RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-empty.ll -o %t/merge-empty.o
+RUN: llvm-cgdata merge %t/merge-empty.o -o %t/merge-empty.cgdata
+RUN: llvm-cgdata show %t/merge-empty.cgdata | FileCheck %s --allow-empty --check-prefix EMPTY
+EMPTY-NOT: any
+
+
+# Merge an object file having cgdata (__llvm_outline)
+RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-single.ll -o %t/merge-single.o
+RUN: llvm-cgdata merge %t/merge-single.o -o %t/merge-single.cgdata
+RUN: llvm-cgdata show %t/merge-single.cgdata | FileCheck %s
+CHECK: Outlined hash tree:
+CHECK-NEXT:  Total Node Count: 3
+CHECK-NEXT:  Terminal Node Count: 1
+CHECK-NEXT:  Depth: 2
+
+;--- merge-empty.ll
+ at .data = private unnamed_addr constant [1 x i8] c"\01"
+
+;--- merge-single.ll
+
+; The .data is encoded in a binary form based on the following yaml form. See serialize() in OutlinedHashTreeRecord.cpp
+;---
+;0:
+;  Hash:            0x0
+;  Terminals:       0
+;  SuccessorIds:    [ 1 ]
+;1:
+;  Hash:            0x1
+;  Terminals:       0
+;  SuccessorIds:    [ 2 ]
+;2:
+;  Hash:            0x2
+;  Terminals:       4
+;  SuccessorIds:    [  ]
+;...
+
+ at .data = private unnamed_addr constant [72 x i8] c"\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\02\00\00\00\02\00\00\00\02\00\00\00\00\00\00\00\04\00\00\00\00\00\00\00", section "__DATA,__llvm_outline"
+
diff --git a/llvm/test/tools/llvm-cgdata/show.test b/llvm/test/tools/llvm-cgdata/show.test
new file mode 100644
index 00000000000000..accb4b77ede246
--- /dev/null
+++ b/llvm/test/tools/llvm-cgdata/show.test
@@ -0,0 +1,30 @@
+# Test show
+
+RUN: split-file %s %t
+RUN: llvm-cgdata show %t/show.cgtext | FileCheck %s
+
+CHECK: Outlined hash tree:
+CHECK-NEXT:   Total Node Count: 3
+CHECK-NEXT:   Terminal Node Count: 1
+CHECK-NEXT:   Depth: 2
+
+# Convert the text file to the binary file
+RUN: llvm-cgdata dump -binary %t/show.cgtext -o %t/show.cgdata
+RUN: llvm-cgdata show %t/show.cgdata | FileCheck %s
+
+;--- show.cgtext
+:outlined_hash_tree
+---
+0:
+  Hash:            0x0
+  Terminals:       0
+  SuccessorIds:    [ 1 ]
+1:
+  Hash:            0x1
+  Terminals:       0
+  SuccessorIds:    [ 2 ]
+2:
+  Hash:            0x2
+  Terminals:       3
+  SuccessorIds:    [  ]
+...
diff --git a/llvm/tools/llvm-cgdata/CMakeLists.txt b/llvm/tools/llvm-cgdata/CMakeLists.txt
new file mode 100644
index 00000000000000..4f1f7ff635bc3c
--- /dev/null
+++ b/llvm/tools/llvm-cgdata/CMakeLists.txt
@@ -0,0 +1,15 @@
+set(LLVM_LINK_COMPONENTS
+  CodeGen
+  CodeGenData
+  Core
+  Object
+  Support
+  )
+
+add_llvm_tool(llvm-cgdata
+  llvm-cgdata.cpp
+
+  DEPENDS
+  intrinsics_gen
+  GENERATE_DRIVER
+  )
diff --git a/llvm/tools/llvm-cgdata/llvm-cgdata.cpp b/llvm/tools/llvm-cgdata/llvm-cgdata.cpp
new file mode 100644
index 00000000000000..195f066fd6b872
--- /dev/null
+++ b/llvm/tools/llvm-cgdata/llvm-cgdata.cpp
@@ -0,0 +1,268 @@
+//===-- llvm-cgdata.cpp - LLVM CodeGen Data Tool --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// llvm-cgdata parses raw codegen data embedded in compiled binary files, and
+// merges them into a single .cgdata file. It can also inspect and manipulate
+// a .cgdata file. This .cgdata can contain various codegen data like outlining
+// information, and it can be used to optimize the code in the subsequent build.
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGenData/CodeGenDataReader.h"
+#include "llvm/CodeGenData/CodeGenDataWriter.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Object/Archive.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/LLVMDriver.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/VirtualFileSystem.h"
+#include "llvm/Support/WithColor.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace llvm::object;
+
+// TODO: https://llvm.org/docs/CommandGuide/llvm-cgdata.html should document
+// each subcommand.
+cl::SubCommand DumpSubcommand(
+    "dump",
+    "Dump the (indexed) codegen data file in either text or binary format.");
+cl::SubCommand MergeSubcommand(
+    "merge", "Takes binary files having raw codegen data in custom sections, "
+             "and merge them into an index codegen data file.");
+cl::SubCommand
+    ShowSubcommand("show", "Show summary of the (indexed) codegen data file.");
+
+enum CGDataFormat {
+  CD_None = 0,
+  CD_Text,   // Selected by -text ("Text encoding"); the default for dump.
+  CD_Binary, // Selected by -binary ("Binary encoding").
+};
+
+cl::opt<std::string> OutputFilename("output", cl::value_desc("output"),
+                                    cl::init("-"), cl::desc("Output file"),
+                                    cl::sub(DumpSubcommand),
+                                    cl::sub(MergeSubcommand));
+cl::alias OutputFilenameA("o", cl::desc("Alias for --output"),
+                          cl::aliasopt(OutputFilename));
+
+cl::opt<std::string> Filename(cl::Positional, cl::desc("<cgdata-file>"),
+                              cl::sub(DumpSubcommand), cl::sub(ShowSubcommand));
+cl::list<std::string> InputFilenames(cl::Positional, cl::sub(MergeSubcommand),
+                                     cl::desc("<binary-files...>"));
+cl::opt<CGDataFormat> OutputFormat(
+    cl::desc("Format of output data"), cl::sub(DumpSubcommand),
+    cl::init(CD_Text),
+    cl::values(clEnumValN(CD_Text, "text", "Text encoding"),
+               clEnumValN(CD_Binary, "binary", "Binary encoding")));
+
+cl::opt<bool> ShowCGDataVersion("cgdata-version", cl::init(false),
+                                cl::desc("Show cgdata version. "),
+                                cl::sub(ShowSubcommand));
+
+static void exitWithError(Twine Message, std::string Whence = "",
+                          std::string Hint = "") {
+  WithColor::error(); // Starts the diagnostic with the "error: " prefix.
+  if (!Whence.empty())
+    errs() << Whence << ": ";
+  errs() << Message << "\n";
+  if (!Hint.empty())
+    WithColor::note() << Hint << "\n";
+  ::exit(1); // Terminates the tool; control does not return to the caller.
+}
+
+static void exitWithError(Error E, StringRef Whence = "") {
+  // A CGDataError reports its own message; anything else is stringified.
+  if (!E.isA<CGDataError>()) {
+    exitWithError(toString(std::move(E)), std::string(Whence));
+  } else {
+    handleAllErrors(std::move(E), [&](const CGDataError &CGErr) {
+      exitWithError(CGErr.message(), std::string(Whence));
+    });
+  }
+}
+
+static void exitWithErrorCode(std::error_code EC, StringRef Whence = "") {
+  exitWithError(EC.message(), Whence.str());
+}
+
+static int dump_main(int argc, const char *argv[]) {
+  if (Filename == OutputFilename) {
+    errs() << sys::path::filename(argv[0]) << " " << argv[1]
+           << ": Input file name cannot be the same as the output file name!\n";
+    return 1;
+  }
+
+  auto FS = vfs::getRealFileSystem();
+  auto ReaderOrErr = CodeGenDataReader::create(Filename, *FS);
+  if (Error E = ReaderOrErr.takeError())
+    exitWithError(std::move(E), Filename);
+
+  CodeGenDataWriter Writer;
+  auto Reader = ReaderOrErr->get();
+  if (Reader->hasOutlinedHashTree()) {
+    OutlinedHashTreeRecord Record(Reader->releaseOutlinedHashTree());
+    Writer.addRecord(Record);
+  }
+  // Open the output only now, so that a bad input does not clobber it.
+  std::error_code EC;
+  raw_fd_ostream OS(OutputFilename, EC,
+                    OutputFormat == CD_Text ? sys::fs::OF_TextWithCRLF
+                                            : sys::fs::OF_None);
+  if (EC)
+    exitWithErrorCode(EC, OutputFilename);
+
+  if (OutputFormat == CD_Text) {
+    if (Error E = Writer.writeText(OS))
+      exitWithError(std::move(E));
+  } else {
+    if (Error E = Writer.write(OS))
+      exitWithError(std::move(E));
+  }
+
+  return 0;
+}
+
+static bool handleBuffer(StringRef Filename, MemoryBufferRef Buffer,
+                         OutlinedHashTreeRecord &GlobalOutlineRecord);
+
+static bool handleArchive(StringRef Filename, Archive &Arch,
+                          OutlinedHashTreeRecord &GlobalOutlineRecord) {
+  bool AllOk = true;
+  Error IterErr = Error::success(); // Checked once the loop is done.
+  for (const auto &Member : Arch.children(IterErr)) {
+    auto MemberBufOrErr = Member.getMemoryBufferRef();
+    if (Error E = MemberBufOrErr.takeError())
+      exitWithError(std::move(E), Filename);
+    auto MemberNameOrErr = Member.getName();
+    if (Error E = MemberNameOrErr.takeError())
+      exitWithError(std::move(E), Filename);
+    std::string Label = (Filename + "(" + *MemberNameOrErr + ")").str();
+    AllOk &= handleBuffer(Label, *MemberBufOrErr, GlobalOutlineRecord);
+  }
+  if (IterErr)
+    exitWithError(std::move(IterErr), Filename);
+  return AllOk;
+}
+
+static bool handleBuffer(StringRef Filename, MemoryBufferRef Buffer,
+                         OutlinedHashTreeRecord &GlobalOutlineRecord) {
+  // Dispatch on the kind of binary: object file, archive, or unsupported.
+  Expected<std::unique_ptr<Binary>> BinOrErr = object::createBinary(Buffer);
+  if (Error E = BinOrErr.takeError())
+    exitWithError(std::move(E), Filename);
+
+  Binary *Bin = BinOrErr->get();
+  if (auto *Obj = dyn_cast<ObjectFile>(Bin)) {
+    if (Error E =
+            CodeGenDataReader::mergeFromObjectFile(Obj, GlobalOutlineRecord))
+      exitWithError(std::move(E), Filename);
+    return true;
+  }
+  if (auto *Arch = dyn_cast<Archive>(Bin))
+    return handleArchive(Filename, *Arch, GlobalOutlineRecord);
+
+  // TODO: Support for the MachO universal binary format.
+  errs() << "Error: unsupported binary file: " << Filename << "\n";
+  return false;
+}
+
+static bool handleFile(StringRef Filename,
+                       OutlinedHashTreeRecord &GlobalOutlineRecord) {
+  // Load the input into memory, then merge whatever binary it holds.
+  auto FileOrErr = MemoryBuffer::getFileOrSTDIN(Filename);
+  if (std::error_code EC = FileOrErr.getError())
+    exitWithErrorCode(EC, Filename);
+  return handleBuffer(Filename, **FileOrErr, GlobalOutlineRecord);
+}
+
+static int merge_main(int argc, const char *argv[]) {
+  // Fold the codegen data found in every input into a single record.
+  OutlinedHashTreeRecord Merged;
+  bool AllMerged = true;
+  for (auto &Input : InputFilenames)
+    AllMerged &= handleFile(Input, Merged);
+  if (!AllMerged) {
+    errs() << "Error: failed to merge codegen data files.\n";
+    return 1;
+  }
+
+  CodeGenDataWriter Writer;
+  if (!Merged.empty())
+    Writer.addRecord(Merged);
+
+  std::error_code OpenEC;
+  raw_fd_ostream Output(OutputFilename, OpenEC, sys::fs::OF_None);
+  if (OpenEC)
+    exitWithErrorCode(OpenEC, OutputFilename);
+
+  if (Error WriteErr = Writer.write(Output))
+    exitWithError(std::move(WriteErr));
+
+  return 0;
+}
+
+static int show_main(int argc, const char *argv[]) {
+  if (Filename == OutputFilename) {
+    errs() << sys::path::filename(argv[0]) << " " << argv[1]
+           << ": Input file name cannot be the same as the output file name!\n";
+    return 1;
+  }
+
+  std::error_code OpenEC;
+  raw_fd_ostream Out(OutputFilename.data(), OpenEC, sys::fs::OF_TextWithCRLF);
+  if (OpenEC)
+    exitWithErrorCode(OpenEC, OutputFilename);
+
+  auto RealFS = vfs::getRealFileSystem();
+  auto CGReaderOrErr = CodeGenDataReader::create(Filename, *RealFS);
+  if (Error E = CGReaderOrErr.takeError())
+    exitWithError(std::move(E), Filename);
+  auto CGReader = CGReaderOrErr->get();
+
+  if (ShowCGDataVersion)
+    Out << "Version: " << CGReader->getVersion() << "\n";
+  // Summarize the outlined hash tree when the data contains one.
+  if (CGReader->hasOutlinedHashTree()) {
+    auto HashTree = CGReader->releaseOutlinedHashTree();
+    Out << "Outlined hash tree:\n"
+        << "  Total Node Count: " << HashTree->size() << "\n"
+        << "  Terminal Node Count: "
+        << HashTree->size(/*GetTerminalCountOnly=*/true) << "\n"
+        << "  Depth: " << HashTree->depth() << "\n";
+  }
+
+  return 0;
+}
+
+int llvm_cgdata_main(int argc, char **argvNonConst, const llvm::ToolContext &) {
+  // The subcommand entry points take a const argv.
+  const char **argv = const_cast<const char **>(argvNonConst);
+
+  StringRef ToolName = sys::path::filename(argv[0]);
+
+  if (argc < 2) {
+    errs() << ToolName
+           << ": No subcommand specified! Run llvm-cgdata --help for usage.\n";
+    return 1;
+  }
+
+  cl::ParseCommandLineOptions(argc, argv, "LLVM codegen data\n");
+
+  // Hand over to whichever subcommand was selected on the command line.
+  if (DumpSubcommand)
+    return dump_main(argc, argv);
+  if (MergeSubcommand)
+    return merge_main(argc, argv);
+  if (ShowSubcommand)
+    return show_main(argc, argv);
+
+  errs() << ToolName
+         << ": Unknown command. Run llvm-cgdata --help for usage.\n";
+  return 1;
+}

>From 70b54f94e8f20b108d09f12ef00fa918991ba06a Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Wed, 24 Apr 2024 09:40:34 -0700
Subject: [PATCH 3/9] [MachineOutliner][NFC] Refactor

---
 llvm/include/llvm/CodeGen/MachineOutliner.h  |  5 +-
 llvm/include/llvm/CodeGen/TargetInstrInfo.h  | 11 +++-
 llvm/lib/CodeGen/MachineOutliner.cpp         | 53 +++++++++++---------
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp |  7 +--
 llvm/lib/Target/AArch64/AArch64InstrInfo.h   |  3 +-
 5 files changed, 46 insertions(+), 33 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachineOutliner.h b/llvm/include/llvm/CodeGen/MachineOutliner.h
index eaba6c9b18f2bb..84937a8b563ac0 100644
--- a/llvm/include/llvm/CodeGen/MachineOutliner.h
+++ b/llvm/include/llvm/CodeGen/MachineOutliner.h
@@ -234,11 +234,11 @@ struct OutlinedFunction {
   unsigned FrameConstructionID = 0;
 
   /// Return the number of candidates for this \p OutlinedFunction.
-  unsigned getOccurrenceCount() const { return Candidates.size(); }
+  virtual unsigned getOccurrenceCount() const { return Candidates.size(); }
 
   /// Return the number of bytes it would take to outline this
   /// function.
-  unsigned getOutliningCost() const {
+  virtual unsigned getOutliningCost() const {
     unsigned CallOverhead = 0;
     for (const Candidate &C : Candidates)
       CallOverhead += C.getCallOverhead();
@@ -272,6 +272,7 @@ struct OutlinedFunction {
   }
 
   OutlinedFunction() = delete;
+  virtual ~OutlinedFunction() = default;
 };
 } // namespace outliner
 } // namespace llvm
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
index d4a83e3753d980..1e7be312851929 100644
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -2053,13 +2053,20 @@ class TargetInstrInfo : public MCInstrInfo {
 
   /// Returns a \p outliner::OutlinedFunction struct containing target-specific
   /// information for a set of outlining candidates. Returns std::nullopt if the
-  /// candidates are not suitable for outlining.
+  /// candidates are not suitable for outlining. \p MinRep is the minimum
+  /// number of times the instruction sequence must be repeated.
   virtual std::optional<outliner::OutlinedFunction> getOutliningCandidateInfo(
-      std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
+      std::vector<outliner::Candidate> &RepeatedSequenceLocs,
+      unsigned MinRep) const {
     llvm_unreachable(
         "Target didn't implement TargetInstrInfo::getOutliningCandidateInfo!");
   }
 
+  virtual std::optional<outliner::OutlinedFunction> getOutliningCandidateInfo(
+      std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
+    return getOutliningCandidateInfo(RepeatedSequenceLocs, /*MinRep=*/2);
+  }
+
   /// Optional target hook to create the LLVM IR attributes for the outlined
   /// function. If overridden, the overriding function must call the default
   /// implementation.
diff --git a/llvm/lib/CodeGen/MachineOutliner.cpp b/llvm/lib/CodeGen/MachineOutliner.cpp
index dc2f5ef15206e8..68a71b80123081 100644
--- a/llvm/lib/CodeGen/MachineOutliner.cpp
+++ b/llvm/lib/CodeGen/MachineOutliner.cpp
@@ -446,8 +446,9 @@ struct MachineOutliner : public ModulePass {
   /// \param Mapper Contains outlining mapping information.
   /// \param[out] FunctionList Filled with a list of \p OutlinedFunctions
   /// each type of candidate.
-  void findCandidates(InstructionMapper &Mapper,
-                      std::vector<OutlinedFunction> &FunctionList);
+  void
+  findCandidates(InstructionMapper &Mapper,
+                 std::vector<std::unique_ptr<OutlinedFunction>> &FunctionList);
 
   /// Replace the sequences of instructions represented by \p OutlinedFunctions
   /// with calls to functions.
@@ -455,7 +456,9 @@ struct MachineOutliner : public ModulePass {
   /// \param M The module we are outlining from.
   /// \param FunctionList A list of functions to be inserted into the module.
   /// \param Mapper Contains the instruction mappings for the module.
-  bool outline(Module &M, std::vector<OutlinedFunction> &FunctionList,
+  /// \param[out] OutlinedFunctionNum The outlined function number.
+  bool outline(Module &M,
+               std::vector<std::unique_ptr<OutlinedFunction>> &FunctionList,
                InstructionMapper &Mapper, unsigned &OutlinedFunctionNum);
 
   /// Creates a function for \p OF and inserts it into the module.
@@ -574,7 +577,8 @@ void MachineOutliner::emitOutlinedFunctionRemark(OutlinedFunction &OF) {
 }
 
 void MachineOutliner::findCandidates(
-    InstructionMapper &Mapper, std::vector<OutlinedFunction> &FunctionList) {
+    InstructionMapper &Mapper,
+    std::vector<std::unique_ptr<OutlinedFunction>> &FunctionList) {
   FunctionList.clear();
   SuffixTree ST(Mapper.UnsignedVec);
 
@@ -674,7 +678,7 @@ void MachineOutliner::findCandidates(
       continue;
     }
 
-    FunctionList.push_back(*OF);
+    FunctionList.push_back(std::make_unique<OutlinedFunction>(*OF));
   }
 }
 
@@ -819,32 +823,31 @@ MachineFunction *MachineOutliner::createOutlinedFunction(
   return &MF;
 }
 
-bool MachineOutliner::outline(Module &M,
-                              std::vector<OutlinedFunction> &FunctionList,
-                              InstructionMapper &Mapper,
-                              unsigned &OutlinedFunctionNum) {
+bool MachineOutliner::outline(
+    Module &M, std::vector<std::unique_ptr<OutlinedFunction>> &FunctionList,
+    InstructionMapper &Mapper, unsigned &OutlinedFunctionNum) {
   LLVM_DEBUG(dbgs() << "*** Outlining ***\n");
   LLVM_DEBUG(dbgs() << "NUMBER OF POTENTIAL FUNCTIONS: " << FunctionList.size()
                     << "\n");
   bool OutlinedSomething = false;
 
   // Sort by benefit. The most beneficial functions should be outlined first.
-  stable_sort(FunctionList,
-              [](const OutlinedFunction &LHS, const OutlinedFunction &RHS) {
-                return LHS.getBenefit() > RHS.getBenefit();
-              });
+  stable_sort(FunctionList, [](const std::unique_ptr<OutlinedFunction> &LHS,
+                               const std::unique_ptr<OutlinedFunction> &RHS) {
+    return LHS->getBenefit() > RHS->getBenefit();
+  });
 
   // Walk over each function, outlining them as we go along. Functions are
   // outlined greedily, based off the sort above.
   auto *UnsignedVecBegin = Mapper.UnsignedVec.begin();
   LLVM_DEBUG(dbgs() << "WALKING FUNCTION LIST\n");
-  for (OutlinedFunction &OF : FunctionList) {
+  for (auto &OF : FunctionList) {
 #ifndef NDEBUG
-    auto NumCandidatesBefore = OF.Candidates.size();
+    auto NumCandidatesBefore = OF->Candidates.size();
 #endif
     // If we outlined something that overlapped with a candidate in a previous
     // step, then we can't outline from it.
-    erase_if(OF.Candidates, [&UnsignedVecBegin](Candidate &C) {
+    erase_if(OF->Candidates, [&UnsignedVecBegin](Candidate &C) {
       return std::any_of(UnsignedVecBegin + C.getStartIdx(),
                          UnsignedVecBegin + C.getEndIdx() + 1, [](unsigned I) {
                            return I == static_cast<unsigned>(-1);
@@ -852,36 +855,36 @@ bool MachineOutliner::outline(Module &M,
     });
 
 #ifndef NDEBUG
-    auto NumCandidatesAfter = OF.Candidates.size();
+    auto NumCandidatesAfter = OF->Candidates.size();
     LLVM_DEBUG(dbgs() << "PRUNED: " << NumCandidatesBefore - NumCandidatesAfter
                       << "/" << NumCandidatesBefore << " candidates\n");
 #endif
 
     // If we made it unbeneficial to outline this function, skip it.
-    if (OF.getBenefit() < OutlinerBenefitThreshold) {
-      LLVM_DEBUG(dbgs() << "SKIP: Expected benefit (" << OF.getBenefit()
+    if (OF->getBenefit() < OutlinerBenefitThreshold) {
+      LLVM_DEBUG(dbgs() << "SKIP: Expected benefit (" << OF->getBenefit()
                         << " B) < threshold (" << OutlinerBenefitThreshold
                         << " B)\n");
       continue;
     }
 
-    LLVM_DEBUG(dbgs() << "OUTLINE: Expected benefit (" << OF.getBenefit()
+    LLVM_DEBUG(dbgs() << "OUTLINE: Expected benefit (" << OF->getBenefit()
                       << " B) > threshold (" << OutlinerBenefitThreshold
                       << " B)\n");
 
     // It's beneficial. Create the function and outline its sequence's
     // occurrences.
-    OF.MF = createOutlinedFunction(M, OF, Mapper, OutlinedFunctionNum);
-    emitOutlinedFunctionRemark(OF);
+    OF->MF = createOutlinedFunction(M, *OF, Mapper, OutlinedFunctionNum);
+    emitOutlinedFunctionRemark(*OF);
     FunctionsCreated++;
     OutlinedFunctionNum++; // Created a function, move to the next name.
-    MachineFunction *MF = OF.MF;
+    MachineFunction *MF = OF->MF;
     const TargetSubtargetInfo &STI = MF->getSubtarget();
     const TargetInstrInfo &TII = *STI.getInstrInfo();
 
     // Replace occurrences of the sequence with calls to the new function.
     LLVM_DEBUG(dbgs() << "CREATE OUTLINED CALLS\n");
-    for (Candidate &C : OF.Candidates) {
+    for (Candidate &C : OF->Candidates) {
       MachineBasicBlock &MBB = *C.getMBB();
       MachineBasicBlock::iterator StartIt = C.begin();
       MachineBasicBlock::iterator EndIt = std::prev(C.end());
@@ -1173,7 +1176,7 @@ bool MachineOutliner::doOutline(Module &M, unsigned &OutlinedFunctionNum) {
 
   // Prepare instruction mappings for the suffix tree.
   populateMapper(Mapper, M, MMI);
-  std::vector<OutlinedFunction> FunctionList;
+  std::vector<std::unique_ptr<OutlinedFunction>> FunctionList;
 
   // Find all of the outlining candidates.
   findCandidates(Mapper, FunctionList);
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 9518d573bccdd1..47783e11099688 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -8240,7 +8240,8 @@ static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a,
 
 std::optional<outliner::OutlinedFunction>
 AArch64InstrInfo::getOutliningCandidateInfo(
-    std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
+    std::vector<outliner::Candidate> &RepeatedSequenceLocs,
+    unsigned MinRep) const {
   outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
 
   unsigned SequenceSize = 0;
@@ -8354,7 +8355,7 @@ AArch64InstrInfo::getOutliningCandidateInfo(
     llvm::erase_if(RepeatedSequenceLocs, hasIllegalSPModification);
 
     // If the sequence doesn't have enough candidates left, then we're done.
-    if (RepeatedSequenceLocs.size() < 2)
+    if (RepeatedSequenceLocs.size() < MinRep)
       return std::nullopt;
   }
 
@@ -8598,7 +8599,7 @@ AArch64InstrInfo::getOutliningCandidateInfo(
     }
 
     // If we dropped all of the candidates, bail out here.
-    if (RepeatedSequenceLocs.size() < 2) {
+    if (RepeatedSequenceLocs.size() < MinRep) {
       RepeatedSequenceLocs.clear();
       return std::nullopt;
     }
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 9a2914891675c5..f0eccd541c225a 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -463,7 +463,8 @@ class AArch64InstrInfo final : public AArch64GenInstrInfo {
   bool isFunctionSafeToOutlineFrom(MachineFunction &MF,
                                    bool OutlineFromLinkOnceODRs) const override;
   std::optional<outliner::OutlinedFunction> getOutliningCandidateInfo(
-      std::vector<outliner::Candidate> &RepeatedSequenceLocs) const override;
+      std::vector<outliner::Candidate> &RepeatedSequenceLocs,
+      unsigned MinRep) const override;
   void mergeOutliningCandidateAttributes(
       Function &F, std::vector<outliner::Candidate> &Candidates) const override;
   outliner::InstrType

>From 80df5a222f1130a6cce2e995cd2c7c7ebf31a87f Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Wed, 24 Apr 2024 11:26:23 -0700
Subject: [PATCH 4/9] [MachineOutliner][CGData] Global Outlining

This commit introduces support for outlining functions across modules using codegen data generated from previous codegen. The codegen data currently manages the outlined hash tree, which records outlining instances that occurred locally in the past.

The machine outliner now operates in one of three modes:
1. CGDataMode::None: This is the default outliner mode that uses the suffix tree to identify (local) outlining candidates within a module. This mode is also used by (full)LTO to maintain optimal behavior with the combined module.
2. CGDataMode::Write (`codegen-data-generate`): This mode is identical to the default mode, but it also publishes the stable hash sequences of instructions in the outlined functions into a local outlined hash tree. It then encodes this into the `__llvm_outline` section, which will be dead-stripped at link time.
3. CGDataMode::Read (`codegen-data-use-path={.cgdata}`): This mode reads a codegen data file (.cgdata) and initializes a global outlined hash tree. This tree is used to generate global outlining candidates. Note that the codegen data file has been post-processed with the raw `__llvm_outline` sections from all native objects using the `llvm-cgdata` tool (or a linker, `LLD`, or a new ThinLTO pipeline later).
---
 llvm/include/llvm/CodeGen/MachineOutliner.h   |  39 +++
 llvm/lib/CodeGen/CMakeLists.txt               |   1 +
 llvm/lib/CodeGen/MachineOutliner.cpp          | 242 +++++++++++++++++-
 llvm/lib/CodeGen/MachineStableHash.cpp        |  14 +-
 llvm/lib/CodeGenData/CodeGenData.cpp          |  26 +-
 llvm/test/CodeGen/AArch64/O3-pipeline.ll      |   1 +
 .../CodeGen/AArch64/cgdata-global-hash.ll     |  40 +++
 .../AArch64/cgdata-read-double-outline.ll     |  57 +++++
 .../AArch64/cgdata-read-lto-outline.ll        |  94 +++++++
 .../CodeGen/AArch64/cgdata-read-priority.ll   |  68 +++++
 .../AArch64/cgdata-read-single-outline.ll     |  42 +++
 .../CodeGen/AArch64/cgdata-write-outline.ll   |  51 ++++
 llvm/test/CodeGen/RISCV/O3-pipeline.ll        |   1 +
 13 files changed, 671 insertions(+), 5 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/cgdata-global-hash.ll
 create mode 100644 llvm/test/CodeGen/AArch64/cgdata-read-double-outline.ll
 create mode 100644 llvm/test/CodeGen/AArch64/cgdata-read-lto-outline.ll
 create mode 100644 llvm/test/CodeGen/AArch64/cgdata-read-priority.ll
 create mode 100644 llvm/test/CodeGen/AArch64/cgdata-read-single-outline.ll
 create mode 100644 llvm/test/CodeGen/AArch64/cgdata-write-outline.ll

diff --git a/llvm/include/llvm/CodeGen/MachineOutliner.h b/llvm/include/llvm/CodeGen/MachineOutliner.h
index 84937a8b563ac0..5a8bae744ed9ab 100644
--- a/llvm/include/llvm/CodeGen/MachineOutliner.h
+++ b/llvm/include/llvm/CodeGen/MachineOutliner.h
@@ -18,6 +18,7 @@
 #include "llvm/CodeGen/LiveRegUnits.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineStableHash.h"
 #include <initializer_list>
 
 namespace llvm {
@@ -233,6 +234,9 @@ struct OutlinedFunction {
   /// Target-defined identifier for constructing a frame for this function.
   unsigned FrameConstructionID = 0;
 
+  /// The sequence of stable_hash'es of instructions.
+  std::vector<stable_hash> OutlinedHashSequence;
+
   /// Return the number of candidates for this \p OutlinedFunction.
   virtual unsigned getOccurrenceCount() const { return Candidates.size(); }
 
@@ -274,6 +278,41 @@ struct OutlinedFunction {
   OutlinedFunction() = delete;
   virtual ~OutlinedFunction() = default;
 };
+
+/// The information necessary to create an outlined function that is matched
+/// globally.
+struct GlobalOutlinedFunction : public OutlinedFunction {
+  GlobalOutlinedFunction(OutlinedFunction &OF, unsigned GlobalOccurrenceCount)
+      : OutlinedFunction(OF.Candidates, OF.SequenceSize, OF.FrameOverhead,
+                         OF.FrameConstructionID),
+        GlobalOccurrenceCount(GlobalOccurrenceCount) {}
+
+  unsigned GlobalOccurrenceCount;
+
+  /// Return the number of times this sequence occurs globally.
+  /// A global outlining candidate is created uniquely for each match, but it
+  /// may have been erased because it overlapped with a previously outlined
+  /// instance, in which case the count is 0.
+  unsigned getOccurrenceCount() const override {
+    assert(Candidates.size() <= 1);
+    return Candidates.empty() ? 0 : GlobalOccurrenceCount;
+  }
+
+  /// Return the outlining cost: the (unique) candidate's call overhead times
+  /// the global occurrence count, plus the sequence size and frame overhead.
+  unsigned getOutliningCost() const override {
+    assert(Candidates.size() <= 1);
+    unsigned CallOverhead =
+        Candidates.empty()
+            ? 0
+            : Candidates[0].getCallOverhead() * getOccurrenceCount();
+    return CallOverhead + SequenceSize + FrameOverhead;
+  }
+
+  GlobalOutlinedFunction() = delete;
+  ~GlobalOutlinedFunction() = default;
+};
+
 } // namespace outliner
 } // namespace llvm
 
diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt
index 2c24de60edd43e..145442f0ce3e2c 100644
--- a/llvm/lib/CodeGen/CMakeLists.txt
+++ b/llvm/lib/CodeGen/CMakeLists.txt
@@ -266,6 +266,7 @@ add_llvm_component_library(LLVMCodeGen
   Analysis
   BitReader
   BitWriter
+  CodeGenData
   CodeGenTypes
   Core
   MC
diff --git a/llvm/lib/CodeGen/MachineOutliner.cpp b/llvm/lib/CodeGen/MachineOutliner.cpp
index 68a71b80123081..0a8e2850ca9f43 100644
--- a/llvm/lib/CodeGen/MachineOutliner.cpp
+++ b/llvm/lib/CodeGen/MachineOutliner.cpp
@@ -59,6 +59,7 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/ModuleSummaryAnalysis.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
@@ -66,6 +67,7 @@
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/CodeGenData/CodeGenDataReader.h"
 #include "llvm/IR/DIBuilder.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Mangler.h"
@@ -74,6 +76,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/SuffixTree.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
 #include <functional>
 #include <tuple>
 #include <vector>
@@ -121,6 +124,12 @@ static cl::opt<unsigned> OutlinerBenefitThreshold(
     cl::desc(
         "The minimum size in bytes before an outlining candidate is accepted"));
 
+static cl::opt<bool>
+    DisableGlobalOutlining("disable-global-outlining", cl::Hidden,
+                           cl::desc("Disable global outlining only by ignoring "
+                                    "the codegen data generation or use"),
+                           cl::init(false));
+
 namespace {
 
 /// Maps \p MachineInstrs to unsigned integers and stores the mappings.
@@ -411,11 +420,32 @@ struct MachineOutliner : public ModulePass {
   /// Set when the pass is constructed in TargetPassConfig.
   bool RunOnAllFunctions = true;
 
+  /// This is a compact representation of hash sequences of outlined functions.
+  /// It is used when OutlinerMode = CGDataMode::Write.
+  /// The resulting hash tree will be emitted into the __llvm_outline section,
+  /// which will be dead-stripped and so will not go into the final binary.
+  /// A post-process using llvm-cgdata, lld, or ThinLTO can merge them into
+  /// a global outlined hash tree for the subsequent codegen.
+  std::unique_ptr<OutlinedHashTree> LocalHashTree;
+
+  /// The combined index to check a LTO mode.
+  const ModuleSummaryIndex *TheIndex = nullptr;
+
+  /// The mode of the outliner.
+  /// When it's CGDataMode::None, candidates are populated with the suffix tree
+  /// within a module and outlined.
+  /// When it's CGDataMode::Write, in addition to CGDataMode::None, the hash
+  /// sequences of outlined functions are published into LocalHashTree.
+  /// When it's CGDataMode::Read, candidates are populated with the global
+  /// outlined hash tree that has been built by the previous codegen.
+  CGDataMode OutlinerMode = CGDataMode::None;
+
   StringRef getPassName() const override { return "Machine Outliner"; }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<MachineModuleInfoWrapperPass>();
     AU.addPreserved<MachineModuleInfoWrapperPass>();
+    AU.addRequired<ImmutableModuleSummaryIndexWrapperPass>();
     AU.setPreservesAll();
     ModulePass::getAnalysisUsage(AU);
   }
@@ -450,6 +480,16 @@ struct MachineOutliner : public ModulePass {
   findCandidates(InstructionMapper &Mapper,
                  std::vector<std::unique_ptr<OutlinedFunction>> &FunctionList);
 
+  /// Find all repeated substrings that match in the global outlined hash
+  /// tree built from the previous codegen.
+  ///
+  /// \param Mapper Contains outlining mapping information.
+  /// \param[out] FunctionList Filled with a list of \p OutlinedFunctions,
+  /// one for each match that the target accepts for outlining.
+  void findGlobalCandidates(
+      InstructionMapper &Mapper,
+      std::vector<std::unique_ptr<OutlinedFunction>> &FunctionList);
+
   /// Replace the sequences of instructions represented by \p OutlinedFunctions
   /// with calls to functions.
   ///
@@ -466,6 +506,12 @@ struct MachineOutliner : public ModulePass {
                                           InstructionMapper &Mapper,
                                           unsigned Name);
 
+  /// Initialize the outliner mode.
+  void initializeOutlinerMode(const Module &M);
+
+  /// Emit the outlined hash tree into __llvm_outline section.
+  void emitOutlinedHashTree(Module &M);
+
   /// Calls 'doOutline()' 1 + OutlinerReruns times.
   bool runOnModule(Module &M) override;
 
@@ -576,6 +622,134 @@ void MachineOutliner::emitOutlinedFunctionRemark(OutlinedFunction &OF) {
   MORE.emit(R);
 }
 
+struct MatchedEntry {
+  size_t StartIdx;
+  size_t Length;
+  size_t Count;
+};
+
+static const HashNode *followHashNode(stable_hash StableHash,
+                                      const HashNode *Current) {
+  auto I = Current->Successors.find(StableHash);
+  return (I == Current->Successors.end()) ? nullptr : I->second.get();
+}
+
+/// Collect every instruction range in \p Mapper whose stable hash sequence
+/// follows a path in the global outlined hash tree down to a node with a
+/// nonzero Terminals count. Each entry records the start index, the length
+/// (always at least two instructions), and that Terminals count.
+/// A range never extends across an entry of UnsignedVec that is >= its size
+/// or across an instruction whose stable hash is zero.
+static std::vector<MatchedEntry> getMatchedEntries(InstructionMapper &Mapper) {
+  auto &InstrList = Mapper.InstrList;
+  auto &UnsignedVec = Mapper.UnsignedVec;
+
+  std::vector<MatchedEntry> MatchedEntries;
+  auto Size = UnsignedVec.size();
+
+  // Get the global outlined hash tree built from the previous run.
+  assert(cgdata::hasOutlinedHashTree());
+  const auto *RootNode = cgdata::getOutlinedHashTree()->getRoot();
+
+  // Find all matches in the global outlined hash tree.
+  // It's quadratic complexity in theory, but it's nearly linear in practice
+  // since the lengths of outlined candidates are small within a block.
+  for (size_t I = 0; I < Size; I++) {
+    if (UnsignedVec[I] >= Size)
+      continue;
+
+    const MachineInstr &MI = *InstrList[I];
+    stable_hash StableHashI = stableHashValue(MI);
+    if (!StableHashI)
+      continue;
+
+    const HashNode *LastNode = followHashNode(StableHashI, RootNode);
+    if (!LastNode)
+      continue;
+
+    size_t J = I + 1;
+    for (; J < Size; J++) {
+      // Break on invalid code
+      if (UnsignedVec[J] >= Size)
+        break;
+
+      const MachineInstr &MJ = *InstrList[J];
+      stable_hash StableHashJ = stableHashValue(MJ);
+      // Break on invalid stable hash
+      if (!StableHashJ)
+        break;
+
+      LastNode = followHashNode(StableHashJ, LastNode);
+      if (!LastNode)
+        break;
+
+      // Even with a match ending with a terminal, we continue finding
+      // matches to populate all candidates.
+      size_t Count = LastNode->Terminals;
+      if (Count)
+        MatchedEntries.push_back({I, J - I + 1, Count});
+    }
+  }
+
+  return MatchedEntries;
+}
+
+static std::vector<stable_hash>
+stableHashMachineInstrs(const MachineBasicBlock::iterator &Begin,
+                        const MachineBasicBlock::iterator &End) {
+  std::vector<stable_hash> Sequence;
+  for (auto I = Begin; I != End; I++) {
+    const MachineInstr &MI = *I;
+    if (MI.isDebugInstr())
+      continue;
+    stable_hash Hash = stableHashValue(MI);
+    // NOTE(review): zero (invalid) hashes are recorded, not skipped, whereas
+    // getMatchedEntries() stops matching at a zero hash -- confirm intent.
+    Sequence.push_back(Hash);
+  }
+  return Sequence;
+}
+
+// Record each function's first-candidate hash sequence for global outlining.
+static void
+saveHashSequence(std::vector<std::unique_ptr<OutlinedFunction>> &FunctionList) {
+  for (auto &OF : FunctionList) {
+    auto &C = OF->Candidates.front();
+    OF->OutlinedHashSequence = stableHashMachineInstrs(C.begin(), C.end());
+  }
+}
+
+void MachineOutliner::findGlobalCandidates(
+    InstructionMapper &Mapper,
+    std::vector<std::unique_ptr<OutlinedFunction>> &FunctionList) {
+  FunctionList.clear();
+  auto &InstrList = Mapper.InstrList;
+  auto &MBBFlagsMap = Mapper.MBBFlagsMap;
+
+  std::vector<Candidate> CandidatesForRepeatedSeq;
+  for (auto &ME : getMatchedEntries(Mapper)) {
+    CandidatesForRepeatedSeq.clear();
+    MachineBasicBlock::iterator StartIt = InstrList[ME.StartIdx];
+    MachineBasicBlock::iterator EndIt = InstrList[ME.StartIdx + ME.Length - 1];
+    MachineBasicBlock *MBB = StartIt->getParent();
+    Candidate C(ME.StartIdx, ME.Length, StartIt, EndIt, MBB,
+                FunctionList.size(), MBBFlagsMap[MBB]);
+    CandidatesForRepeatedSeq.push_back(C);
+    const TargetInstrInfo *TII = C.getMF()->getSubtarget().getInstrInfo();
+    std::optional<OutlinedFunction> OF =
+        TII->getOutliningCandidateInfo(CandidatesForRepeatedSeq, /*MinRep*/ 1);
+    if (!OF || OF->Candidates.empty())
+      continue;
+    // We create exactly one global candidate for each match.
+    assert(OF->Candidates.size() == 1);
+
+    FunctionList.push_back(
+        std::make_unique<GlobalOutlinedFunction>(*OF, ME.Count));
+  }
+  assert(OutlinerMode == CGDataMode::Read);
+  saveHashSequence(FunctionList);
+}
+
 void MachineOutliner::findCandidates(
     InstructionMapper &Mapper,
     std::vector<std::unique_ptr<OutlinedFunction>> &FunctionList) {
@@ -680,6 +854,9 @@ void MachineOutliner::findCandidates(
 
     FunctionList.push_back(std::make_unique<OutlinedFunction>(*OF));
   }
+  assert(OutlinerMode != CGDataMode::Read);
+  if (OutlinerMode == CGDataMode::Write)
+    saveHashSequence(FunctionList);
 }
 
 MachineFunction *MachineOutliner::createOutlinedFunction(
@@ -977,6 +1154,10 @@ bool MachineOutliner::outline(
       // Statistics.
       NumOutlined++;
     }
+    if (OutlinerMode == CGDataMode::Write) {
+      unsigned Count = OF->Candidates.size();
+      LocalHashTree->insert({OF->OutlinedHashSequence, Count});
+    }
   }
 
   LLVM_DEBUG(dbgs() << "OutlinedSomething = " << OutlinedSomething << "\n";);
@@ -1124,12 +1305,65 @@ void MachineOutliner::emitInstrCountChangedRemark(
   }
 }
 
+void MachineOutliner::initializeOutlinerMode(const Module &M) {
+  if (DisableGlobalOutlining)
+    return;
+
+  if (auto *IndexWrapperPass =
+          getAnalysisIfAvailable<ImmutableModuleSummaryIndexWrapperPass>())
+    TheIndex = IndexWrapperPass->getIndex();
+
+  // (Full)LTO module does not have functions added to the index.
+  // In this case, we run the outliner without using codegen data as usual.
+  if (TheIndex && !TheIndex->hasExportedFunctions(M))
+    return;
+
+  // When codegen data write is enabled, we want to write the local outlined
+  // hash tree to the custom section, `__llvm_outline`.
+  // When the outlined hash tree is available from the previous codegen data,
+  // we want to read it to optimistically create global outlining candidates.
+  if (cgdata::emitCGData()) {
+    OutlinerMode = CGDataMode::Write;
+    // Create the local tree to publish; a tree from a previous codegen is
+    // not read in this mode.
+    LocalHashTree = std::make_unique<OutlinedHashTree>();
+  } else if (cgdata::hasOutlinedHashTree()) {
+    OutlinerMode = CGDataMode::Read;
+  }
+}
+
+void MachineOutliner::emitOutlinedHashTree(Module &M) {
+  assert(LocalHashTree);
+  if (!LocalHashTree->empty()) {
+    LLVM_DEBUG({
+      dbgs() << "Emit outlined hash tree. Size: " << LocalHashTree->size()
+             << "\n";
+    });
+    SmallVector<char> Buf;
+    raw_svector_ostream OS(Buf);
+
+    OutlinedHashTreeRecord HTR(std::move(LocalHashTree));
+    HTR.serialize(OS);
+
+    llvm::StringRef Data(Buf.data(), Buf.size());
+    std::unique_ptr<MemoryBuffer> Buffer =
+        MemoryBuffer::getMemBuffer(Data, "in-memory outlined hash tree", false);
+
+    Triple TT(M.getTargetTriple());
+    embedBufferInModule(
+        M, *Buffer.get(),
+        getCodeGenDataSectionName(CG_outline, TT.getObjectFormat()));
+  }
+}
+
 bool MachineOutliner::runOnModule(Module &M) {
   // Check if there's anything in the module. If it's empty, then there's
   // nothing to outline.
   if (M.empty())
     return false;
 
+  // Initialize the outliner mode.
+  initializeOutlinerMode(M);
+
   // Number to append to the current outlined function.
   unsigned OutlinedFunctionNum = 0;
 
@@ -1149,6 +1383,9 @@ bool MachineOutliner::runOnModule(Module &M) {
     }
   }
 
+  if (OutlinerMode == CGDataMode::Write)
+    emitOutlinedHashTree(M);
+
   return true;
 }
 
@@ -1179,7 +1416,10 @@ bool MachineOutliner::doOutline(Module &M, unsigned &OutlinedFunctionNum) {
   std::vector<std::unique_ptr<OutlinedFunction>> FunctionList;
 
   // Find all of the outlining candidates.
-  findCandidates(Mapper, FunctionList);
+  if (OutlinerMode == CGDataMode::Read)
+    findGlobalCandidates(Mapper, FunctionList);
+  else
+    findCandidates(Mapper, FunctionList);
 
   // If we've requested size remarks, then collect the MI counts of every
   // function before outlining, and the MI counts after outlining.
diff --git a/llvm/lib/CodeGen/MachineStableHash.cpp b/llvm/lib/CodeGen/MachineStableHash.cpp
index 5abfbd5981fba8..33906a3374812b 100644
--- a/llvm/lib/CodeGen/MachineStableHash.cpp
+++ b/llvm/lib/CodeGen/MachineStableHash.cpp
@@ -94,9 +94,17 @@ stable_hash llvm::stableHashValue(const MachineOperand &MO) {
   case MachineOperand::MO_Metadata:
     StableHashBailingMetadataUnsupported++;
     return 0;
-  case MachineOperand::MO_GlobalAddress:
-    StableHashBailingGlobalAddress++;
-    return 0;
+  case MachineOperand::MO_GlobalAddress: {
+    const GlobalValue *GV = MO.getGlobal();
+    if (GV->hasPrivateLinkage() || !GV->hasName()) {
+      StableHashBailingGlobalAddress++;
+      return 0;
+    }
+    auto Name = GV->getName();
+    return stable_hash_combine(MO.getType(), MO.getTargetFlags(),
+                               stable_hash_combine_string(Name),
+                               MO.getOffset());
+  }
   case MachineOperand::MO_TargetIndex: {
     if (const char *Name = MO.getTargetIndexName())
       return stable_hash_combine(MO.getType(), MO.getTargetFlags(),
diff --git a/llvm/lib/CodeGenData/CodeGenData.cpp b/llvm/lib/CodeGenData/CodeGenData.cpp
index 3bd21c97c7de7a..841af8a347eeb2 100644
--- a/llvm/lib/CodeGenData/CodeGenData.cpp
+++ b/llvm/lib/CodeGenData/CodeGenData.cpp
@@ -24,6 +24,13 @@
 using namespace llvm;
 using namespace cgdata;
 
+cl::opt<bool>
+    CodeGenDataGenerate("codegen-data-generate", cl::init(false), cl::Hidden,
+                        cl::desc("Emit CodeGen Data into custom sections"));
+cl::opt<std::string>
+    CodeGenDataUsePath("codegen-data-use-path", cl::init(""), cl::Hidden,
+                       cl::desc("File path to where .cgdata file is read"));
+
 static std::string getCGDataErrString(cgdata_error Err,
                                       const std::string &ErrMsg = "") {
   std::string Msg;
@@ -133,7 +140,24 @@ CodeGenData &CodeGenData::getInstance() {
     auto *CGD = new CodeGenData();
     Instance.reset(CGD);
 
-    // TODO: Initialize writer or reader mode for the client optimization.
+    if (CodeGenDataGenerate)
+      CGD->EmitCGData = true;
+    else if (!CodeGenDataUsePath.empty()) {
+      // Initialize the global CGData if the input file name is given.
+      // We do not error out when failing to parse the input file.
+      // Instead, just emit a warning message and fall back as if no CGData
+      // were available.
+      auto FS = vfs::getRealFileSystem();
+      auto ReaderOrErr = CodeGenDataReader::create(CodeGenDataUsePath, *FS);
+      if (Error E = ReaderOrErr.takeError()) {
+        warn(std::move(E), CodeGenDataUsePath);
+        return;
+      }
+      // Publish each CGData based on the data type in the header.
+      auto Reader = ReaderOrErr->get();
+      if (Reader->hasOutlinedHashTree())
+        CGD->publishOutlinedHashTree(Reader->releaseOutlinedHashTree());
+    }
   });
   return *(Instance.get());
 }
diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
index d3c8e3b7e805c1..391d63d2ceeaaf 100644
--- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -16,6 +16,7 @@
 ; CHECK-NEXT: Machine Branch Probability Analysis
 ; CHECK-NEXT: Default Regalloc Eviction Advisor
 ; CHECK-NEXT: Default Regalloc Priority Advisor
+; CHECK-NEXT: Module summary info
 ; CHECK-NEXT:   ModulePass Manager
 ; CHECK-NEXT:     Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT:     FunctionPass Manager
diff --git a/llvm/test/CodeGen/AArch64/cgdata-global-hash.ll b/llvm/test/CodeGen/AArch64/cgdata-global-hash.ll
new file mode 100644
index 00000000000000..09eb639ab40ef5
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/cgdata-global-hash.ll
@@ -0,0 +1,40 @@
+; This test verifies the stable hash values for different global variables
+; that have distinct names.
+; We generate two different cgdata files from nearly identical outline instances,
+; with the only difference being the last call target globals, @g vs @h.
+
+; RUN: split-file %s %t
+
+; RUN: llc -mtriple=arm64-apple-darwin -codegen-data-generate=true -filetype=obj %t/local-g.ll -o %t/local-g.o
+; RUN: llvm-cgdata merge %t/local-g.o -o %t/local-g.cgdata
+; RUN: llvm-cgdata dump %t/local-g.cgdata -o %t/local-g.cgtext
+; RUN: llc -mtriple=arm64-apple-darwin -codegen-data-generate=true -filetype=obj %t/local-h.ll -o %t/local-h.o
+; RUN: llvm-cgdata merge %t/local-h.o -o %t/local-h.cgdata
+; RUN: llvm-cgdata dump %t/local-h.cgdata -o %t/local-h.cgtext
+
+; We compare the trees which are only different at the terminal node's hash value.
+; Here we simply count the different lines that have `Hash` string.
+; RUN: not diff %t/local-g.cgtext %t/local-h.cgtext 2>&1 | grep Hash | wc -l | FileCheck %s
+; CHECK: 2
+
+;--- local-g.ll
+declare i32 @g(i32, i32, i32)
+define i32 @f1() minsize {
+  %1 = call i32 @g(i32 10, i32 1, i32 2);
+  ret i32 %1
+}
+define i32 @f2() minsize {
+  %1 = call i32 @g(i32 20, i32 1, i32 2);
+  ret i32 %1
+}
+
+;--- local-h.ll
+declare i32 @h(i32, i32, i32)
+define i32 @f1() minsize {
+  %1 = call i32 @h(i32 10, i32 1, i32 2);
+  ret i32 %1
+}
+define i32 @f2() minsize {
+  %1 = call i32 @h(i32 20, i32 1, i32 2);
+  ret i32 %1
+}
diff --git a/llvm/test/CodeGen/AArch64/cgdata-read-double-outline.ll b/llvm/test/CodeGen/AArch64/cgdata-read-double-outline.ll
new file mode 100644
index 00000000000000..49f417e9d01294
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/cgdata-read-double-outline.ll
@@ -0,0 +1,57 @@
+; This test demonstrates how identical instruction sequences are handled during global outlining.
+; Currently, we do not attempt to share an outlined function for identical sequences.
+; Instead, each instruction sequence that matches against the global outlined hash tree
+; is outlined into its own unique function.
+
+; RUN: split-file %s %t
+
+; First, we generate the cgdata file from a local outline instance present in local-two.ll.
+; RUN: llc -mtriple=arm64-apple-darwin -codegen-data-generate=true -filetype=obj %t/local-two.ll -o %t_write
+; RUN: llvm-cgdata merge %t_write -o %t_cgdata
+; RUN: llvm-cgdata show %t_cgdata | FileCheck %s --check-prefix=SHOW
+
+; SHOW: Outlined hash tree:
+; SHOW-NEXT:  Total Node Count: 4
+; SHOW-NEXT:  Terminal Node Count: 1
+; SHOW-NEXT:  Depth: 3
+
+; Now, we read the cgdata for local-two-another.ll and proceed to optimistically outline
+; each instruction sequence that matches against the global outlined hash tree.
+; Since each matching sequence is considered a candidate, we expect to generate two
+; unique outlined functions. These functions, although unique, will be identical in code,
+; and thus, will be folded by the linker.
+
+; RUN: llc -mtriple=arm64-apple-darwin -codegen-data-use-path=%t_cgdata -filetype=obj %t/local-two-another.ll -o %t_read
+; RUN: llvm-objdump -d %t_read | FileCheck %s
+
+; CHECK: _OUTLINED_FUNCTION_{{.*}}:
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  b
+
+; CHECK: _OUTLINED_FUNCTION_{{.*}}:
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  b
+
+;--- local-two.ll
+declare i32 @g(i32, i32, i32)
+define i32 @f1() minsize {
+  %1 = call i32 @g(i32 10, i32 1, i32 2);
+  ret i32 %1
+}
+define i32 @f2() minsize {
+  %1 = call i32 @g(i32 20, i32 1, i32 2);
+  ret i32 %1
+}
+
+;--- local-two-another.ll
+declare i32 @g(i32, i32, i32)
+define i32 @f3() minsize {
+  %1 = call i32 @g(i32 30, i32 1, i32 2);
+  ret i32 %1
+}
+define i32 @f4() minsize {
+  %1 = call i32 @g(i32 40, i32 1, i32 2);
+  ret i32 %1
+}
diff --git a/llvm/test/CodeGen/AArch64/cgdata-read-lto-outline.ll b/llvm/test/CodeGen/AArch64/cgdata-read-lto-outline.ll
new file mode 100644
index 00000000000000..e7b49f422f118a
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/cgdata-read-lto-outline.ll
@@ -0,0 +1,94 @@
+; This test is similar to cgdata-read-double-outline.ll, but it is executed with LTO (Link Time Optimization).
+; It demonstrates how identical instruction sequences are handled during global outlining.
+; Currently, we do not attempt to reuse an outlined function for identical sequences.
+; Instead, each instruction sequence that appears in the global outlined hash tree
+; is outlined into its own unique function.
+
+; RUN: split-file %s %t
+
+; We first create the cgdata file from a local outline instance in local-two.ll
+; RUN: opt -module-summary %t/local-two.ll -o %t/write.bc
+; RUN: llvm-lto2 run %t/write.bc -o %t/write \
+; RUN:  -r %t/write.bc,_f1,px -r %t/write.bc,_f2,px -r %t/write.bc,_g,p \
+; RUN:  -codegen-data-generate=true
+; RUN: llvm-cgdata merge %t/write.1 -o %t_cgdata
+; RUN: llvm-cgdata show %t_cgdata | FileCheck %s --check-prefix=SHOW
+
+; SHOW: Outlined hash tree:
+; SHOW-NEXT:  Total Node Count: 4
+; SHOW-NEXT:  Terminal Node Count: 1
+; SHOW-NEXT:  Depth: 3
+
+; Now, we execute either ThinLTO or LTO by reading the cgdata for local-two-another.ll.
+; With ThinLTO, similar to the no-LTO scenario shown in cgdata-read-double-outline.ll,
+; it optimistically outlines each instruction sequence that matches against
+; the global outlined hash tree. Since each matching sequence is considered a candidate,
+; we expect to generate two unique outlined functions that will be folded
+; by the linker at a later stage.
+; However, with LTO, we do not utilize the cgdata, but instead fall back to the default
+; outliner mode. This results in a single outlined function that is
+; shared across two call-sites.
+
+; Run ThinLTO
+; RUN: opt -module-summary %t/local-two-another.ll -o %t/thinlto.bc
+; RUN: llvm-lto2 run %t/thinlto.bc -o %t/thinlto \
+; RUN:  -r %t/thinlto.bc,_f3,px -r %t/thinlto.bc,_f4,px -r %t/thinlto.bc,_g,p \
+; RUN:  -codegen-data-use-path=%t_cgdata
+; RUN: llvm-objdump -d %t/thinlto.1 | FileCheck %s
+
+; CHECK: _OUTLINED_FUNCTION_{{.*}}:
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  b
+; CHECK: _OUTLINED_FUNCTION_{{.*}}:
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  b
+
+; Run ThinLTO while disabling the global outliner.
+; We have a single outlined case with the default outliner.
+; RUN: llvm-lto2 run %t/thinlto.bc -o %t/thinlto-disable \
+; RUN:  -r %t/thinlto.bc,_f3,px -r %t/thinlto.bc,_f4,px -r %t/thinlto.bc,_g,p \
+; RUN:  -codegen-data-use-path=%t_cgdata \
+; RUN:  -disable-global-outlining
+; RUN: llvm-objdump -d %t/thinlto-disable.1 | FileCheck %s --check-prefix=DISABLE
+
+; DISABLE: _OUTLINED_FUNCTION_{{.*}}:
+; DISABLE-NEXT:  mov
+; DISABLE-NEXT:  mov
+; DISABLE-NEXT:  b
+; DISABLE-NOT: _OUTLINED_FUNCTION_{{.*}}:
+
+; Run LTO, which effectively disables the global outliner.
+; RUN: opt %t/local-two-another.ll -o %t/lto.bc
+; RUN: llvm-lto2 run %t/lto.bc -o %t/lto \
+; RUN:  -r %t/lto.bc,_f3,px -r %t/lto.bc,_f4,px -r %t/lto.bc,_g,p \
+; RUN:  -codegen-data-use-path=%t_cgdata
+; RUN: llvm-objdump -d %t/lto.0 | FileCheck %s --check-prefix=DISABLE
+
+;--- local-two.ll
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-darwin"
+declare i32 @g(i32, i32, i32)
+define i32 @f1() minsize {
+  %1 = call i32 @g(i32 10, i32 1, i32 2);
+  ret i32 %1
+}
+define i32 @f2() minsize {
+  %1 = call i32 @g(i32 20, i32 1, i32 2);
+  ret i32 %1
+}
+
+;--- local-two-another.ll
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-darwin"
+
+declare i32 @g(i32, i32, i32)
+define i32 @f3() minsize {
+  %1 = call i32 @g(i32 30, i32 1, i32 2);
+  ret i32 %1
+}
+define i32 @f4() minsize {
+  %1 = call i32 @g(i32 40, i32 1, i32 2);
+  ret i32 %1
+}
diff --git a/llvm/test/CodeGen/AArch64/cgdata-read-priority.ll b/llvm/test/CodeGen/AArch64/cgdata-read-priority.ll
new file mode 100644
index 00000000000000..642beb46915d73
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/cgdata-read-priority.ll
@@ -0,0 +1,68 @@
+; This test verifies whether we can outline a singleton instance (i.e., an instance that does not repeat)
+; using codegen data that has been read from a previous codegen run.
+; When multiple matches occur, we prioritize the candidates using the global frequency.
+
+; RUN: split-file %s %t
+
+; First, we generate the cgdata file from local outline instances present in write1.ll and write2.ll
+; RUN: llc -mtriple=arm64-apple-darwin -codegen-data-generate=true -filetype=obj %t/write1.ll -o %t_write1
+; RUN: llc -mtriple=arm64-apple-darwin -codegen-data-generate=true -filetype=obj %t/write2.ll -o %t_write2
+; RUN: llvm-cgdata merge %t_write1 %t_write2 -o %t_cgdata
+; RUN: llvm-cgdata show %t_cgdata | FileCheck %s --check-prefix=SHOW
+
+; SHOW: Outlined hash tree:
+; SHOW-NEXT:  Total Node Count: 8
+; SHOW-NEXT:  Terminal Node Count: 2
+; SHOW-NEXT:  Depth: 4
+
+; Now, we read the cgdata in the machine outliner, enabling us to optimistically
+; outline a singleton instance in read.ll that matches against the cgdata.
+; There are two matches -- (1) (mov #1, mov #2, mov #3, b) and (2) (mov #2, mov #3, b).
+; Even though sequence (1) is longer than sequence (2), the latter is outlined because it occurs more frequently in the outlined hash tree.
+
+; RUN: llc -mtriple=arm64-apple-darwin -codegen-data-use-path=%t_cgdata -filetype=obj %t/read.ll -o %t_read
+; RUN: llvm-objdump -d %t_read | FileCheck %s
+
+; CHECK: _OUTLINED_FUNCTION
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  b
+
+;--- write1.ll
+; The sequence (mov #2, mov #3, b) is repeated 4 times.
+declare i32 @g(i32, i32, i32, i32)
+define i32 @f1() minsize {
+  %1 = call i32 @g(i32 10, i32 50, i32 2, i32 3);
+  ret i32 %1
+}
+define i32 @f2() minsize {
+  %1 = call i32 @g(i32 20, i32 60, i32 2, i32 3);
+  ret i32 %1
+}
+define i32 @f3() minsize {
+  %1 = call i32 @g(i32 30, i32 70, i32 2, i32 3);
+  ret i32 %1
+}
+define i32 @f4() minsize {
+  %1 = call i32 @g(i32 40, i32 80, i32 2, i32 3);
+  ret i32 %1
+}
+
+;--- write2.ll
+; The sequence (mov #1, mov #2, mov #3, b) is repeated 2 times.
+declare i32 @g(i32, i32, i32, i32)
+define i32 @f6() minsize {
+  %1 = call i32 @g(i32 10, i32 1, i32 2, i32 3);
+  ret i32 %1
+}
+define i32 @f7() minsize {
+  %1 = call i32 @g(i32 20, i32 1, i32 2, i32 3);
+  ret i32 %1
+}
+
+;--- read.ll
+declare i32 @g(i32, i32, i32, i32)
+define i32 @f3() minsize {
+  %1 = call i32 @g(i32 30, i32 1, i32 2, i32 3);
+  ret i32 %1
+}
diff --git a/llvm/test/CodeGen/AArch64/cgdata-read-single-outline.ll b/llvm/test/CodeGen/AArch64/cgdata-read-single-outline.ll
new file mode 100644
index 00000000000000..2c606a5a13007e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/cgdata-read-single-outline.ll
@@ -0,0 +1,42 @@
+; This test verifies whether we can outline a singleton instance (i.e., an instance that does not repeat)
+; using codegen data that has been read from a previous codegen run.
+
+; RUN: split-file %s %t
+
+; First, we generate the cgdata file from a local outline instance present in local-two.ll.
+; RUN: llc -mtriple=arm64-apple-darwin -codegen-data-generate=true -filetype=obj %t/local-two.ll -o %t_write
+; RUN: llvm-cgdata merge %t_write -o %t_cgdata
+; RUN: llvm-cgdata show %t_cgdata | FileCheck %s --check-prefix=SHOW
+
+; SHOW: Outlined hash tree:
+; SHOW-NEXT:  Total Node Count: 4
+; SHOW-NEXT:  Terminal Node Count: 1
+; SHOW-NEXT:  Depth: 3
+
+; Now, we read the cgdata in the machine outliner, enabling us to optimistically
+; outline a singleton instance in local-one.ll that matches against the cgdata.
+; RUN: llc -mtriple=arm64-apple-darwin -codegen-data-use-path=%t_cgdata -filetype=obj %t/local-one.ll -o %t_read
+; RUN: llvm-objdump -d %t_read | FileCheck %s
+
+; CHECK: _OUTLINED_FUNCTION
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  b
+
+;--- local-two.ll
+declare i32 @g(i32, i32, i32)
+define i32 @f1() minsize {
+  %1 = call i32 @g(i32 10, i32 1, i32 2);
+  ret i32 %1
+}
+define i32 @f2() minsize {
+  %1 = call i32 @g(i32 20, i32 1, i32 2);
+  ret i32 %1
+}
+
+;--- local-one.ll
+declare i32 @g(i32, i32, i32)
+define i32 @f3() minsize {
+  %1 = call i32 @g(i32 30, i32 1, i32 2);
+ ret i32 %1
+}
diff --git a/llvm/test/CodeGen/AArch64/cgdata-write-outline.ll b/llvm/test/CodeGen/AArch64/cgdata-write-outline.ll
new file mode 100644
index 00000000000000..0527ec1434ba09
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/cgdata-write-outline.ll
@@ -0,0 +1,51 @@
+; This test verifies whether an outlined function is encoded into the __llvm_outline section
+; when the -codegen-data-generate flag is used.
+
+; Verify whether an outlined function is always created, but only encoded into the section when the flag is used.
+; RUN: llc -mtriple=arm64-apple-darwin -codegen-data-generate=true -filetype=obj %s -o %t_save
+; RUN: llvm-objdump -d %t_save | FileCheck %s
+; RUN: llvm-objdump -h %t_save | FileCheck %s --check-prefix=SECTNAME
+; RUN: llc -mtriple=arm64-apple-darwin -codegen-data-generate=false -filetype=obj %s -o %t_nosave
+; RUN: llvm-objdump -d  %t_nosave | FileCheck %s
+; RUN: llvm-objdump -h %t_nosave | FileCheck %s --check-prefix=NOSECTNAME
+
+; CHECK: _OUTLINED_FUNCTION
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  b
+; SECTNAME: __llvm_outline
+; NOSECTNAME-NOT: __llvm_outline
+
+; Verify the content of cgdata after it has been processed with llvm-cgdata.
+; RUN: llvm-cgdata merge %t_save -o %t_cgdata
+; RUN: llvm-cgdata dump %t_cgdata | FileCheck %s --check-prefix=TREE
+
+; TREE: :outlined_hash_tree
+; TREE: ---
+; TREE-NEXT: 0:
+; TREE-NEXT:   Hash:            0x0
+; TREE-NEXT:   Terminals:       0
+; TREE-NEXT:   SuccessorIds:    [ 1 ]
+; TREE-NEXT: 1:
+; TREE-NEXT:   Hash:            {{.}}
+; TREE-NEXT:   Terminals:       0
+; TREE-NEXT:   SuccessorIds:    [ 2 ]
+; TREE-NEXT: 2:
+; TREE-NEXT:   Hash:            {{.}}
+; TREE-NEXT:   Terminals:       0
+; TREE-NEXT:   SuccessorIds:    [ 3 ]
+; TREE-NEXT: 3:
+; TREE-NEXT:   Hash:            {{.}}
+; TREE-NEXT:   Terminals:       2
+; TREE-NEXT:   SuccessorIds:    [  ]
+; TREE-NEXT: ...
+
+declare i32 @g(i32, i32, i32)
+define i32 @f1() minsize {
+  %1 = call i32 @g(i32 10, i32 1, i32 2);
+  ret i32 %1
+}
+define i32 @f2() minsize {
+  %1 = call i32 @g(i32 20, i32 1, i32 2);
+  ret i32 %1
+}
diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
index 90472f246918f3..f02425d53b3ee9 100644
--- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -20,6 +20,7 @@
 ; CHECK-NEXT: Machine Branch Probability Analysis
 ; CHECK-NEXT: Default Regalloc Eviction Advisor
 ; CHECK-NEXT: Default Regalloc Priority Advisor
+; CHECK-NEXT: Module summary info
 ; CHECK-NEXT:   ModulePass Manager
 ; CHECK-NEXT:     Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT:     FunctionPass Manager

>From 74b8d8a06bfbd7b7077ee5b22e4768552e7a0166 Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Thu, 25 Apr 2024 14:08:39 -0700
Subject: [PATCH 5/9] [MachineOutliner] CFI clean up. Will be folded into the
 prior commit

---
 llvm/include/llvm/CodeGen/MachineOutliner.h |  3 -
 llvm/lib/CodeGen/MachineOutliner.cpp        | 64 ++++++++-------------
 2 files changed, 23 insertions(+), 44 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachineOutliner.h b/llvm/include/llvm/CodeGen/MachineOutliner.h
index 5a8bae744ed9ab..3bda86f399b7b5 100644
--- a/llvm/include/llvm/CodeGen/MachineOutliner.h
+++ b/llvm/include/llvm/CodeGen/MachineOutliner.h
@@ -234,9 +234,6 @@ struct OutlinedFunction {
   /// Target-defined identifier for constructing a frame for this function.
   unsigned FrameConstructionID = 0;
 
-  /// The sequence of stable_hash'es of instructions.
-  std::vector<stable_hash> OutlinedHashSequence;
-
   /// Return the number of candidates for this \p OutlinedFunction.
   virtual unsigned getOccurrenceCount() const { return Candidates.size(); }
 
diff --git a/llvm/lib/CodeGen/MachineOutliner.cpp b/llvm/lib/CodeGen/MachineOutliner.cpp
index 0a8e2850ca9f43..d6238f245c1c80 100644
--- a/llvm/lib/CodeGen/MachineOutliner.cpp
+++ b/llvm/lib/CodeGen/MachineOutliner.cpp
@@ -655,10 +655,12 @@ static std::vector<MatchedEntry> getMatchedEntries(InstructionMapper &Mapper) {
       continue;
 
     const MachineInstr &MI = *InstrList[I];
-    stable_hash StableHashI = stableHashValue(MI);
-    if (!StableHashI)
+    // We optimistically skip Debug instructions and CFI instructions.
+    // Debug instructions will be deleted in the outlined function.
+    // CFI instructions are adapted to the outlined function.
+    if (MI.isDebugInstr() || MI.isCFIInstruction())
       continue;
-
+    stable_hash StableHashI = stableHashValue(MI);
     Sequence.clear();
     Sequence.push_back(StableHashI);
 
@@ -673,11 +675,10 @@ static std::vector<MatchedEntry> getMatchedEntries(InstructionMapper &Mapper) {
         break;
 
       const MachineInstr &MJ = *InstrList[J];
-      stable_hash StableHashJ = stableHashValue(MJ);
-      // Break on invalid stable hash
-      if (!StableHashJ)
-        break;
+      if (MJ.isDebugInstr() || MJ.isCFIInstruction())
+        continue;
 
+      stable_hash StableHashJ = stableHashValue(MJ);
       LastNode = followHashNode(StableHashJ, LastNode);
       if (!LastNode)
         break;
@@ -694,31 +695,6 @@ static std::vector<MatchedEntry> getMatchedEntries(InstructionMapper &Mapper) {
   return MatchedEntries;
 }
 
-static std::vector<stable_hash>
-stableHashMachineInstrs(const MachineBasicBlock::iterator &Begin,
-                        const MachineBasicBlock::iterator &End) {
-  std::vector<stable_hash> Sequence;
-  for (auto I = Begin; I != End; I++) {
-    const MachineInstr &MI = *I;
-    if (MI.isDebugInstr())
-      continue;
-    stable_hash Hash = stableHashValue(MI);
-    // if (!Hash)
-    //  continue;
-    Sequence.push_back(Hash);
-  }
-  return Sequence;
-}
-
-// Save hash sequence of candidates for global function outlining.
-static void
-saveHashSequence(std::vector<std::unique_ptr<OutlinedFunction>> &FunctionList) {
-  for (auto &OF : FunctionList) {
-    auto &C = OF->Candidates.front();
-    OF->OutlinedHashSequence = stableHashMachineInstrs(C.begin(), C.end());
-  }
-}
-
 void MachineOutliner::findGlobalCandidates(
     InstructionMapper &Mapper,
     std::vector<std::unique_ptr<OutlinedFunction>> &FunctionList) {
@@ -746,8 +722,6 @@ void MachineOutliner::findGlobalCandidates(
     FunctionList.push_back(
         std::make_unique<GlobalOutlinedFunction>(*OF, ME.Count));
   }
-  assert(OutlinerMode == CGDataMode::Read);
-  saveHashSequence(FunctionList);
 }
 
 void MachineOutliner::findCandidates(
@@ -854,9 +828,6 @@ void MachineOutliner::findCandidates(
 
     FunctionList.push_back(std::make_unique<OutlinedFunction>(*OF));
   }
-  assert(OutlinerMode != CGDataMode::Read);
-  if (OutlinerMode == CGDataMode::Write)
-    saveHashSequence(FunctionList);
 }
 
 MachineFunction *MachineOutliner::createOutlinedFunction(
@@ -916,6 +887,7 @@ MachineFunction *MachineOutliner::createOutlinedFunction(
   MachineFunction *OriginalMF = FirstCand.front().getMF();
   const std::vector<MCCFIInstruction> &Instrs =
       OriginalMF->getFrameInstructions();
+  std::vector<stable_hash> OutlinedHashSequence;
   for (auto &MI : FirstCand) {
     if (MI.isDebugInstr())
       continue;
@@ -932,9 +904,23 @@ MachineFunction *MachineOutliner::createOutlinedFunction(
       NewMI->dropMemRefs(MF);
       NewMI->setDebugLoc(DL);
       MBB.insert(MBB.end(), NewMI);
+      // For non-debug and non-cfi instructions, compute stable hash sequence.
+      if (OutlinerMode != CGDataMode::None) {
+        stable_hash Hash = stableHashValue(MI);
+        OutlinedHashSequence.push_back(Hash);
+      }
     }
   }
 
+  // TODO: Update function name based on the hash sequence.
+
+  // Publish the hash sequence to the local hash tree.
+  if (OutlinerMode == CGDataMode::Write) {
+    assert(!OutlinedHashSequence.empty());
+    unsigned Count = OF.Candidates.size();
+    LocalHashTree->insert({OutlinedHashSequence, Count});
+  }
+
   // Set normal properties for a late MachineFunction.
   MF.getProperties().reset(MachineFunctionProperties::Property::IsSSA);
   MF.getProperties().set(MachineFunctionProperties::Property::NoPHIs);
@@ -1154,10 +1140,6 @@ bool MachineOutliner::outline(
       // Statistics.
       NumOutlined++;
     }
-    if (OutlinerMode == CGDataMode::Write) {
-      unsigned Count = OF->Candidates.size();
-      LocalHashTree->insert({OF->OutlinedHashSequence, Count});
-    }
   }
 
   LLVM_DEBUG(dbgs() << "OutlinedSomething = " << OutlinedSomething << "\n";);

>From 1a962120092fae0fe32e978f5e4f28fa06cfba5c Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Thu, 25 Apr 2024 22:20:48 -0700
Subject: [PATCH 6/9] [CGData] LLD for MachO

---
 lld/MachO/Config.h               |  1 +
 lld/MachO/Driver.cpp             | 39 +++++++++++++++
 lld/MachO/InputSection.h         |  1 +
 lld/MachO/Options.td             |  2 +
 lld/test/MachO/cgdata-generate.s | 83 ++++++++++++++++++++++++++++++++
 5 files changed, 126 insertions(+)
 create mode 100644 lld/test/MachO/cgdata-generate.s

diff --git a/lld/MachO/Config.h b/lld/MachO/Config.h
index 7b45f7f4c39a1b..87f910c740e6a0 100644
--- a/lld/MachO/Config.h
+++ b/lld/MachO/Config.h
@@ -207,6 +207,7 @@ struct Configuration {
   std::vector<SectionAlign> sectionAlignments;
   std::vector<SegmentProtection> segmentProtections;
   bool ltoDebugPassManager = false;
+  llvm::StringRef codegenDataGeneratePath;
   bool csProfileGenerate = false;
   llvm::StringRef csProfilePath;
   bool pgoWarnMismatch;
diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp
index 65de531db04b75..88fb7a03e10960 100644
--- a/lld/MachO/Driver.cpp
+++ b/lld/MachO/Driver.cpp
@@ -36,6 +36,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/BinaryFormat/MachO.h"
 #include "llvm/BinaryFormat/Magic.h"
+#include "llvm/CodeGenData/CodeGenDataWriter.h"
 #include "llvm/Config/llvm-config.h"
 #include "llvm/LTO/LTO.h"
 #include "llvm/Object/Archive.h"
@@ -1250,6 +1251,38 @@ static void gatherInputSections() {
   }
 }
 
+// Merge the outlined hash tree of every `__llvm_outline` input section in the
+// data segment and write it to config->codegenDataGeneratePath (non-empty).
+static void codegenDataGenerate() {
+  TimeTraceScope timeScope("Generating codegen data");
+
+  OutlinedHashTreeRecord globalOutlineRecord;
+  for (ConcatInputSection *isec : inputSections)
+    if (isec->getSegName() == segment_names::data &&
+        isec->getName() == section_names::outlinedHashTree) {
+      // Read this section's hash tree and merge it into the global one.
+      OutlinedHashTreeRecord localOutlineRecord;
+      auto *data = isec->data.data();
+      localOutlineRecord.deserialize(data);
+      globalOutlineRecord.merge(localOutlineRecord);
+    }
+
+  CodeGenDataWriter Writer;
+  if (!globalOutlineRecord.empty())
+    Writer.addRecord(globalOutlineRecord);
+
+  std::error_code EC;
+  auto fileName = config->codegenDataGeneratePath;
+  assert(!fileName.empty());
+  raw_fd_ostream Output(fileName, EC, sys::fs::OF_None);
+  if (EC) {
+    error("fail to create " + fileName + ": " + EC.message());
+    return; // Do not write through a stream that failed to open.
+  }
+  if (auto E = Writer.write(Output))
+    error("fail to write CGData: " + toString(std::move(E))); // consumes E
+}
+
 static void foldIdenticalLiterals() {
   TimeTraceScope timeScope("Fold identical literals");
   // We always create a cStringSection, regardless of whether dedupLiterals is
@@ -1665,6 +1698,8 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
     config->ignoreAutoLinkOptions.insert(arg->getValue());
   config->strictAutoLink = args.hasArg(OPT_strict_auto_link);
   config->ltoDebugPassManager = args.hasArg(OPT_lto_debug_pass_manager);
+  config->codegenDataGeneratePath =
+      args.getLastArgValue(OPT_codegen_data_generate_path);
   config->csProfileGenerate = args.hasArg(OPT_cs_profile_generate);
   config->csProfilePath = args.getLastArgValue(OPT_cs_profile_path);
   config->pgoWarnMismatch =
@@ -1959,6 +1994,10 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
     }
 
     gatherInputSections();
+
+    if (!config->codegenDataGeneratePath.empty())
+      codegenDataGenerate();
+
     if (config->callGraphProfileSort)
       priorityBuilder.extractCallGraphProfile();
 
diff --git a/lld/MachO/InputSection.h b/lld/MachO/InputSection.h
index 0f389e50425a32..bb41cc9119aab4 100644
--- a/lld/MachO/InputSection.h
+++ b/lld/MachO/InputSection.h
@@ -353,6 +353,7 @@ constexpr const char objcMethname[] = "__objc_methname";
 constexpr const char objcNonLazyCatList[] = "__objc_nlcatlist";
 constexpr const char objcNonLazyClassList[] = "__objc_nlclslist";
 constexpr const char objcProtoList[] = "__objc_protolist";
+constexpr const char outlinedHashTree[] = "__llvm_outline";
 constexpr const char pageZero[] = "__pagezero";
 constexpr const char pointers[] = "__pointers";
 constexpr const char rebase[] = "__rebase";
diff --git a/lld/MachO/Options.td b/lld/MachO/Options.td
index 11458d92b3abe0..8f217aa774aee3 100644
--- a/lld/MachO/Options.td
+++ b/lld/MachO/Options.td
@@ -137,6 +137,8 @@ def no_objc_category_merging : Flag<["-"], "no_objc_category_merging">,
     Group<grp_lld>;
 def lto_debug_pass_manager: Flag<["--"], "lto-debug-pass-manager">,
     HelpText<"Debug new pass manager">, Group<grp_lld>;
+def codegen_data_generate_path : Joined<["--"], "codegen-data-generate-path=">,
+    HelpText<"Codegen data file path">, Group<grp_lld>;
 def cs_profile_generate: Flag<["--"], "cs-profile-generate">,
     HelpText<"Perform context sensitive PGO instrumentation">, Group<grp_lld>;
 def cs_profile_path: Joined<["--"], "cs-profile-path=">,
diff --git a/lld/test/MachO/cgdata-generate.s b/lld/test/MachO/cgdata-generate.s
new file mode 100644
index 00000000000000..36846a35e1fbe6
--- /dev/null
+++ b/lld/test/MachO/cgdata-generate.s
@@ -0,0 +1,83 @@
+# REQUIRES: aarch64
+
+# RUN: rm -rf %t; split-file %s %t
+
+# RUN: llvm-mc -filetype obj -triple arm64-apple-darwin %t/merge-1.s -o %t/merge-1.o
+# RUN: llvm-mc -filetype obj -triple arm64-apple-darwin %t/merge-2.s -o %t/merge-2.o
+# RUN: llvm-mc -filetype obj -triple arm64-apple-darwin %t/main.s -o %t/main.o
+
+# This checks if the codegen data from the linker is identical to the merged codegen data
+# from each object file, which is obtained using the llvm-cgdata tool.
+# RUN: %no-arg-lld -dylib -arch arm64 -platform_version ios 14.0 15.0 -o %t/out \
+# RUN: %t/merge-1.o %t/merge-2.o %t/main.o --codegen-data-generate-path=%t/out-cgdata
+# RUN: llvm-cgdata merge %t/merge-1.o %t/merge-2.o %t/main.o -o %t/merge-cgdata
+# RUN: diff %t/out-cgdata %t/merge-cgdata
+
+# Merge order doesn't matter. `main.o` is dropped due to missing __llvm_outline.
+# RUN: llvm-cgdata merge %t/merge-2.o %t/merge-1.o -o %t/merge-cgdata-shuffle
+# RUN: diff %t/out-cgdata %t/merge-cgdata-shuffle
+
+# We can also generate the merged codegen data from the executable that is not dead-stripped.
+# RUN: llvm-objdump -h %t/out| FileCheck %s
+CHECK: __llvm_outline
+# RUN: llvm-cgdata merge %t/out -o %t/merge-cgdata-exe
+# RUN: diff %t/merge-cgdata-exe %t/merge-cgdata
+
+# Dead-strip will remove __llvm_outline sections from the final executable.
+# But the codegen data is still correctly produced by the linker.
+# RUN: %no-arg-lld -dylib -arch arm64 -platform_version ios 14.0 15.0 -o %t/out-strip \
+# RUN: %t/merge-1.o %t/merge-2.o %t/main.o -dead_strip --codegen-data-generate-path=%t/out-cgdata-strip
+# RUN: llvm-cgdata merge %t/merge-1.o %t/merge-2.o %t/main.o -o %t/merge-cgdata-strip
+# RUN: diff %t/out-cgdata-strip %t/merge-cgdata-strip
+# RUN: diff %t/out-cgdata-strip %t/merge-cgdata
+
+# Ensure no __llvm_outline section remains in the executable.
+# RUN: llvm-objdump -h %t/out-strip | FileCheck %s --check-prefix=STRIP
+STRIP-NOT: __llvm_outline
+
+#--- merge-1.s
+# The .data is encoded in a binary form based on the following yaml form. See serialize() in OutlinedHashTreeRecord.cpp
+#---
+#0:
+#  Hash:            0x0
+#  Terminals:       0
+#  SuccessorIds:    [ 1 ]
+#1:
+#  Hash:            0x1
+#  Terminals:       0
+#  SuccessorIds:    [ 2 ]
+#2:
+#  Hash:            0x2
+#  Terminals:       4
+#  SuccessorIds:    [  ]
+#...
+.section __DATA,__llvm_outline
+_data:
+.byte 0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+
+#--- merge-2.s
+# The .data is encoded in a binary form based on the following yaml form. See serialize() in OutlinedHashTreeRecord.cpp
+#---
+#0:
+#  Hash:            0x0
+#  Terminals:       0
+#  SuccessorIds:    [ 1 ]
+#1:
+#  Hash:            0x1
+#  Terminals:       0
+#  SuccessorIds:    [ 2 ]
+#2:
+#  Hash:            0x3
+#  Terminals:       5
+#  SuccessorIds:    [  ]
+#...
+.section __DATA,__llvm_outline
+_data:
+.byte 0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x05,0x00,0x00,0x00,0x00,0x00,0x00,0x00
+
+#--- main.s
+.globl _main
+
+.text
+_main:
+  ret

>From 501af439d75bcc1b061b50d27915704d976fe1c5 Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Fri, 26 Apr 2024 12:58:54 -0700
Subject: [PATCH 7/9] [Clang][CGData] Flags

---
 clang/include/clang/Driver/Options.td      | 12 ++++++
 clang/lib/Driver/ToolChains/CommonArgs.cpp | 27 +++++++++++++
 clang/lib/Driver/ToolChains/Darwin.cpp     | 46 ++++++++++++++++++++++
 clang/test/Driver/codegen-data.c           | 42 ++++++++++++++++++++
 4 files changed, 127 insertions(+)
 create mode 100644 clang/test/Driver/codegen-data.c

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 52d161703f965e..62e50a807f4f6e 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1834,6 +1834,18 @@ def fprofile_selected_function_group :
   Visibility<[ClangOption, CC1Option]>, MetaVarName<"<i>">,
   HelpText<"Partition functions into N groups using -fprofile-function-groups and select only functions in group i to be instrumented. The valid range is 0 to N-1 inclusive">,
   MarshallingInfoInt<CodeGenOpts<"ProfileSelectedFunctionGroup">>;
+def fcodegen_data_generate : Flag<["-"], "fcodegen-data-generate">,
+    Group<f_Group>, Visibility<[ClangOption, CC1Option]>,
+    HelpText<"Emit codegen data into object file. LLD for MachO (for now) merges them into default.cgdata">;
+def fcodegen_data_generate_EQ : Joined<["-"], "fcodegen-data-generate=">,
+    Group<f_Group>, Visibility<[ClangOption, CC1Option]>, MetaVarName<"<directory>">,
+    HelpText<"Emit codegen data into object file. LLD for MachO (for now) merges them into <directory>/default.cgdata">;
+def fcodegen_data_use : Flag<["-"], "fcodegen-data-use">,
+    Group<f_Group>, Visibility<[ClangOption, CC1Option]>,
+    HelpText<"Use codegen data read from default.cgdata to optimize the binary">;
+def fcodegen_data_use_EQ : Joined<["-"], "fcodegen-data-use=">,
+    Group<f_Group>, Visibility<[ClangOption, CC1Option]>, MetaVarName<"<directory>">,
+    HelpText<"Use codegen data read from <directory>/default.cgdata to optimize the binary">;
 def fswift_async_fp_EQ : Joined<["-"], "fswift-async-fp=">,
     Group<f_Group>,
     Visibility<[ClangOption, CC1Option, CC1AsOption, CLOption]>,
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index b65b96db16bd79..8490955a5ae82c 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -2752,6 +2752,33 @@ void tools::addMachineOutlinerArgs(const Driver &D,
       addArg(Twine("-enable-machine-outliner=never"));
     }
   }
+
+  auto *CodeGenDataGenArg =
+      Args.getLastArg(options::OPT_fcodegen_data_generate,
+                      options::OPT_fcodegen_data_generate_EQ);
+  auto *CodeGenDataUseArg = Args.getLastArg(options::OPT_fcodegen_data_use,
+                                            options::OPT_fcodegen_data_use_EQ);
+
+  // We only allow one of them to be specified.
+  if (CodeGenDataGenArg && CodeGenDataUseArg)
+    D.Diag(diag::err_drv_argument_not_allowed_with)
+        << CodeGenDataGenArg->getAsString(Args)
+        << CodeGenDataUseArg->getAsString(Args);
+
+  // For codegen data gen, the output file is passed to the linker
+  // while a boolean flag is passed to the LLVM backend.
+  if (CodeGenDataGenArg)
+    addArg(Twine("-codegen-data-generate"));
+
+  // For codegen data use, the input file is passed to the LLVM backend.
+  if (CodeGenDataUseArg) {
+    SmallString<128> Path(CodeGenDataUseArg->getNumValues() == 0
+                              ? ""
+                              : CodeGenDataUseArg->getValue());
+    if (Path.empty() || llvm::sys::fs::is_directory(Path))
+      llvm::sys::path::append(Path, "default.cgdata");
+    addArg(Twine("-codegen-data-use-path=" + Path.str()));
+  }
 }
 
 void tools::addOpenMPDeviceRTL(const Driver &D,
diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp
index caf6c4a444fdce..85407ea5c69dfc 100644
--- a/clang/lib/Driver/ToolChains/Darwin.cpp
+++ b/clang/lib/Driver/ToolChains/Darwin.cpp
@@ -476,6 +476,19 @@ void darwin::Linker::AddLinkArgs(Compilation &C, const ArgList &Args,
         llvm::sys::path::append(Path, "default.profdata");
       CmdArgs.push_back(Args.MakeArgString(Twine("--cs-profile-path=") + Path));
     }
+
+    auto *CodeGenDataGenArg =
+        Args.getLastArg(options::OPT_fcodegen_data_generate,
+                        options::OPT_fcodegen_data_generate_EQ);
+    if (CodeGenDataGenArg) {
+      SmallString<128> Path(CodeGenDataGenArg->getNumValues() == 0
+                                ? ""
+                                : CodeGenDataGenArg->getValue());
+      if (Path.empty() || llvm::sys::fs::is_directory(Path))
+        llvm::sys::path::append(Path, "default.cgdata");
+      CmdArgs.push_back(
+          Args.MakeArgString(Twine("--codegen-data-generate-path=") + Path));
+    }
   }
 }
 
@@ -633,6 +646,39 @@ void darwin::Linker::ConstructJob(Compilation &C, const JobAction &JA,
   CmdArgs.push_back("-mllvm");
   CmdArgs.push_back("-enable-linkonceodr-outlining");
 
+  // Propagate codegen data flags to the linker for the LLVM backend.
+  auto *CodeGenDataGenArg =
+      Args.getLastArg(options::OPT_fcodegen_data_generate,
+                      options::OPT_fcodegen_data_generate_EQ);
+  auto *CodeGenDataUseArg = Args.getLastArg(options::OPT_fcodegen_data_use,
+                                            options::OPT_fcodegen_data_use_EQ);
+
+  // We only allow one of them to be specified.
+  const Driver &D = getToolChain().getDriver();
+  if (CodeGenDataGenArg && CodeGenDataUseArg)
+    D.Diag(diag::err_drv_argument_not_allowed_with)
+        << CodeGenDataGenArg->getAsString(Args)
+        << CodeGenDataUseArg->getAsString(Args);
+
+  // For codegen data gen, the output file is passed to the linker
+  // while a boolean flag is passed to the LLVM backend.
+  if (CodeGenDataGenArg) {
+    CmdArgs.push_back("-mllvm");
+    CmdArgs.push_back("-codegen-data-generate");
+  }
+
+  // For codegen data use, the input file is passed to the LLVM backend.
+  if (CodeGenDataUseArg) {
+    SmallString<128> Path(CodeGenDataUseArg->getNumValues() == 0
+                              ? ""
+                              : CodeGenDataUseArg->getValue());
+    if (Path.empty() || llvm::sys::fs::is_directory(Path))
+      llvm::sys::path::append(Path, "default.cgdata");
+    CmdArgs.push_back("-mllvm");
+    CmdArgs.push_back(
+        Args.MakeArgString("-codegen-data-use-path=" + Path.str()));
+  }
+
   // Setup statistics file output.
   SmallString<128> StatsFile =
       getStatsFileName(Args, Output, Inputs[0], getToolChain().getDriver());
diff --git a/clang/test/Driver/codegen-data.c b/clang/test/Driver/codegen-data.c
new file mode 100644
index 00000000000000..a72850afc59736
--- /dev/null
+++ b/clang/test/Driver/codegen-data.c
@@ -0,0 +1,42 @@
+// Verify that only one of the codegen-data flags may be passed at a time.
+// RUN: not %clang -### -S --target=aarch64-linux-gnu -fcodegen-data-generate -fcodegen-data-use %s 2>&1 | FileCheck %s --check-prefix=CONFLICT
+// RUN: not %clang -### -S --target=arm64-apple-darwin  -fcodegen-data-generate -fcodegen-data-use %s 2>&1 | FileCheck %s --check-prefix=CONFLICT
+// CONFLICT: error: invalid argument '-fcodegen-data-generate' not allowed with '-fcodegen-data-use'
+
+// Verify the codegen-data-generate (boolean) flag is passed to LLVM
+// RUN: %clang -### -S --target=aarch64-linux-gnu -fcodegen-data-generate %s  2>&1| FileCheck %s --check-prefix=GENERATE
+// RUN: %clang -### -S --target=arm64-apple-darwin -fcodegen-data-generate %s 2>&1| FileCheck %s --check-prefix=GENERATE
+// GENERATE: "-mllvm" "-codegen-data-generate"
+
+// Verify the codegen-data-use-path flag (with a default value) is passed to LLVM.
+// RUN: %clang -### -S --target=aarch64-linux-gnu -fcodegen-data-use %s 2>&1| FileCheck %s --check-prefix=USE
+// RUN: %clang -### -S --target=arm64-apple-darwin -fcodegen-data-use %s 2>&1| FileCheck %s --check-prefix=USE
+// RUN: mkdir -p %t.d/some/dir
+// RUN: %clang -### -S --target=aarch64-linux-gnu -fcodegen-data-use=%t.d/some/dir %s 2>&1 | FileCheck %s --check-prefix=USE-DIR
+// RUN: %clang -### -S --target=arm64-apple-darwin -fcodegen-data-use=%t.d/some/dir %s 2>&1 | FileCheck %s --check-prefix=USE-DIR
+// RUN: %clang -### -S --target=aarch64-linux-gnu -fcodegen-data-use=file %s 2>&1 | FileCheck %s --check-prefix=USE-FILE
+// RUN: %clang -### -S --target=arm64-apple-darwin -fcodegen-data-use=file %s 2>&1 | FileCheck %s --check-prefix=USE-FILE
+// USE: "-mllvm" "-codegen-data-use-path=default.cgdata"
+// USE-DIR: "-mllvm" "-codegen-data-use-path={{.*}}.d/some/dir{{/|\\\\}}default.cgdata"
+// USE-FILE: "-mllvm" "-codegen-data-use-path=file"
+
+// Verify the codegen-data-generate (boolean) flag is passed to the linker with LTO.
+// RUN: %clang -### -flto --target=aarch64-linux-gnu -fcodegen-data-generate %s 2>&1 | FileCheck %s --check-prefix=GENERATE-LTO
+// GENERATE-LTO: {{ld(.exe)?"}}
+// GENERATE-LTO-SAME: "-plugin-opt=-codegen-data-generate"
+// RUN: %clang -### -flto --target=arm64-apple-darwin -fcodegen-data-generate %s 2>&1 | FileCheck %s --check-prefix=GENERATE-LTO-DARWIN
+// GENERATE-LTO-DARWIN: {{ld(.exe)?"}}
+// GENERATE-LTO-DARWIN-SAME: "-mllvm" "-codegen-data-generate"
+
+// Verify the codegen-data-use-path flag is passed to LLVM with LTO.
+// RUN: %clang -### -flto=thin --target=aarch64-linux-gnu -fcodegen-data-use %s 2>&1 | FileCheck %s --check-prefix=USE-LTO
+// USE-LTO: {{ld(.exe)?"}}
+// USE-LTO-SAME: "-plugin-opt=-codegen-data-use-path=default.cgdata"
+// RUN: %clang -### -flto=thin --target=arm64-apple-darwin -fcodegen-data-use %s 2>&1 | FileCheck %s --check-prefix=USE-LTO-DARWIN
+// USE-LTO-DARWIN: {{ld(.exe)?"}}
+// USE-LTO-DARWIN-SAME: "-mllvm" "-codegen-data-use-path=default.cgdata"
+
+// For now, LLD for MachO supports generating the codegen data at link time.
+// RUN: %clang -### -fuse-ld=lld -B%S/Inputs/lld --target=arm64-apple-darwin -fcodegen-data-generate %s 2>&1 | FileCheck %s --check-prefix=GENERATE-LLD-DARWIN
+// GENERATE-LLD-DARWIN: {{ld(.exe)?"}}
+// GENERATE-LLD-DARWIN-SAME: "--codegen-data-generate-path=default.cgdata"

>From 14055e2a3b33cdfe923797bfb68f0607cddfdc9e Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Fri, 26 Apr 2024 20:02:52 -0700
Subject: [PATCH 8/9] [ThinLTO][NFC] Prep for two-codegen rounds

---
 clang/lib/CodeGen/BackendUtil.cpp  |  8 ++--
 llvm/include/llvm/LTO/LTOBackend.h |  1 +
 llvm/lib/LTO/LTO.cpp               | 77 ++++++++++++++++--------------
 llvm/lib/LTO/LTOBackend.cpp        |  4 +-
 4 files changed, 48 insertions(+), 42 deletions(-)

diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index 22c3f8642ad8eb..6586f9b75c940c 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -1299,10 +1299,10 @@ static void runThinLTOBackend(
     Conf.CGFileType = getCodeGenFileType(Action);
     break;
   }
-  if (Error E =
-          thinBackend(Conf, -1, AddStream, *M, *CombinedIndex, ImportList,
-                      ModuleToDefinedGVSummaries[M->getModuleIdentifier()],
-                      /* ModuleMap */ nullptr, CGOpts.CmdArgs)) {
+  if (Error E = thinBackend(
+          Conf, -1, AddStream, *M, *CombinedIndex, ImportList,
+          ModuleToDefinedGVSummaries[M->getModuleIdentifier()],
+          /* ModuleMap */ nullptr, Conf.CodeGenOnly, CGOpts.CmdArgs)) {
     handleAllErrors(std::move(E), [&](ErrorInfoBase &EIB) {
       errs() << "Error running ThinLTO backend: " << EIB.message() << '\n';
     });
diff --git a/llvm/include/llvm/LTO/LTOBackend.h b/llvm/include/llvm/LTO/LTOBackend.h
index de89f4bb10dff2..8516398510d4b8 100644
--- a/llvm/include/llvm/LTO/LTOBackend.h
+++ b/llvm/include/llvm/LTO/LTOBackend.h
@@ -56,6 +56,7 @@ Error thinBackend(const Config &C, unsigned Task, AddStreamFn AddStream,
                   const FunctionImporter::ImportMapTy &ImportList,
                   const GVSummaryMapTy &DefinedGlobals,
                   MapVector<StringRef, BitcodeModule> *ModuleMap,
+                  bool CodeGenOnly,
                   const std::vector<uint8_t> &CmdArgs = std::vector<uint8_t>());
 
 Error finalizeOptimizationRemarks(
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 53060df7f503e0..f49a9269c1c4da 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1441,7 +1441,7 @@ class InProcessThinBackend : public ThinBackendProc {
           GlobalValue::getGUID(GlobalValue::dropLLVMManglingEscape(Name)));
   }
 
-  Error runThinLTOBackendThread(
+  virtual Error runThinLTOBackendThread(
       AddStreamFn AddStream, FileCache Cache, unsigned Task, BitcodeModule BM,
       ModuleSummaryIndex &CombinedIndex,
       const FunctionImporter::ImportMapTy &ImportList,
@@ -1456,7 +1456,8 @@ class InProcessThinBackend : public ThinBackendProc {
         return MOrErr.takeError();
 
       return thinBackend(Conf, Task, AddStream, **MOrErr, CombinedIndex,
-                         ImportList, DefinedGlobals, &ModuleMap);
+                         ImportList, DefinedGlobals, &ModuleMap,
+                         Conf.CodeGenOnly);
     };
 
     auto ModuleID = BM.getModuleIdentifier();
@@ -1827,45 +1828,49 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache,
 
   TimeTraceScopeExit.release();
 
-  std::unique_ptr<ThinBackendProc> BackendProc =
-      ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries,
-                      AddStream, Cache);
-
   auto &ModuleMap =
       ThinLTO.ModulesToCompile ? *ThinLTO.ModulesToCompile : ThinLTO.ModuleMap;
 
-  auto ProcessOneModule = [&](int I) -> Error {
-    auto &Mod = *(ModuleMap.begin() + I);
-    // Tasks 0 through ParallelCodeGenParallelismLevel-1 are reserved for
-    // combined module and parallel code generation partitions.
-    return BackendProc->start(RegularLTO.ParallelCodeGenParallelismLevel + I,
-                              Mod.second, ImportLists[Mod.first],
-                              ExportLists[Mod.first], ResolvedODR[Mod.first],
-                              ThinLTO.ModuleMap);
+  auto RunBackends = [&](ThinBackendProc *BackendProcess) -> Error {
+    auto ProcessOneModule = [&](int I) -> Error {
+      auto &Mod = *(ModuleMap.begin() + I);
+      // Tasks 0 through ParallelCodeGenParallelismLevel-1 are reserved for
+      // combined module and parallel code generation partitions.
+      return BackendProcess->start(
+          RegularLTO.ParallelCodeGenParallelismLevel + I, Mod.second,
+          ImportLists[Mod.first], ExportLists[Mod.first],
+          ResolvedODR[Mod.first], ThinLTO.ModuleMap);
+    };
+
+    if (BackendProcess->getThreadCount() == 1) {
+      // Process the modules in the order they were provided on the
+      // command-line. It is important for this codepath to be used for
+      // WriteIndexesThinBackend, to ensure the emitted LinkedObjectsFile lists
+      // ThinLTO objects in the same order as the inputs, which otherwise would
+      // affect the final link order.
+      for (int I = 0, E = ModuleMap.size(); I != E; ++I)
+        if (Error E = ProcessOneModule(I))
+          return E;
+    } else {
+      // When executing in parallel, process largest bitsize modules first to
+      // improve parallelism, and avoid starving the thread pool near the end.
+      // This saves about 15 sec on a 36-core machine while link `clang.exe`
+      // (out of 100 sec).
+      std::vector<BitcodeModule *> ModulesVec;
+      ModulesVec.reserve(ModuleMap.size());
+      for (auto &Mod : ModuleMap)
+        ModulesVec.push_back(&Mod.second);
+      for (int I : generateModulesOrdering(ModulesVec))
+        if (Error E = ProcessOneModule(I))
+          return E;
+    }
+    return BackendProcess->wait();
   };
 
-  if (BackendProc->getThreadCount() == 1) {
-    // Process the modules in the order they were provided on the command-line.
-    // It is important for this codepath to be used for WriteIndexesThinBackend,
-    // to ensure the emitted LinkedObjectsFile lists ThinLTO objects in the same
-    // order as the inputs, which otherwise would affect the final link order.
-    for (int I = 0, E = ModuleMap.size(); I != E; ++I)
-      if (Error E = ProcessOneModule(I))
-        return E;
-  } else {
-    // When executing in parallel, process largest bitsize modules first to
-    // improve parallelism, and avoid starving the thread pool near the end.
-    // This saves about 15 sec on a 36-core machine while link `clang.exe` (out
-    // of 100 sec).
-    std::vector<BitcodeModule *> ModulesVec;
-    ModulesVec.reserve(ModuleMap.size());
-    for (auto &Mod : ModuleMap)
-      ModulesVec.push_back(&Mod.second);
-    for (int I : generateModulesOrdering(ModulesVec))
-      if (Error E = ProcessOneModule(I))
-        return E;
-  }
-  return BackendProc->wait();
+  std::unique_ptr<ThinBackendProc> BackendProc =
+      ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries,
+                      AddStream, Cache);
+  return RunBackends(BackendProc.get());
 }
 
 Expected<std::unique_ptr<ToolOutputFile>> lto::setupLLVMOptimizationRemarks(
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index 71e8849dc3cc91..f7dc9d11a2abfe 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -555,7 +555,7 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
                        const FunctionImporter::ImportMapTy &ImportList,
                        const GVSummaryMapTy &DefinedGlobals,
                        MapVector<StringRef, BitcodeModule> *ModuleMap,
-                       const std::vector<uint8_t> &CmdArgs) {
+                       bool CodeGenOnly, const std::vector<uint8_t> &CmdArgs) {
   Expected<const Target *> TOrErr = initAndLookupTarget(Conf, Mod);
   if (!TOrErr)
     return TOrErr.takeError();
@@ -576,7 +576,7 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
   Mod.setPartialSampleProfileRatio(CombinedIndex);
 
   LLVM_DEBUG(dbgs() << "Running ThinLTO\n");
-  if (Conf.CodeGenOnly) {
+  if (CodeGenOnly) {
     codegen(Conf, TM.get(), AddStream, Task, Mod, CombinedIndex);
     return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
   }

>From 4f986ab15f65bce2fb4b07fe06bfe52d9e437091 Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Sat, 27 Apr 2024 07:52:51 -0700
Subject: [PATCH 9/9] [ThinLTO][CGData] Global Outlining with Two-CodeGen
 Rounds

---
 clang/include/clang/Driver/Options.td         |   6 +
 clang/lib/Driver/ToolChains/CommonArgs.cpp    |  25 ++++
 clang/lib/Driver/ToolChains/Darwin.cpp        |  26 +++-
 clang/test/Driver/codegen-data.c              |  18 +++
 llvm/include/llvm/CodeGenData/CodeGenData.h   |  13 ++
 llvm/lib/CodeGenData/CodeGenData.cpp          |  69 ++++++++++-
 llvm/lib/LTO/LTO.cpp                          | 112 +++++++++++++++++-
 llvm/lib/LTO/LTOBackend.cpp                   |   9 ++
 .../AArch64/cgdata-read-single-outline.ll     |  42 +++++++
 .../test/ThinLTO/AArch64/cgdata-two-rounds.ll |  95 +++++++++++++++
 llvm/test/ThinLTO/AArch64/lit.local.cfg       |   2 +
 11 files changed, 411 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/ThinLTO/AArch64/cgdata-read-single-outline.ll
 create mode 100644 llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll
 create mode 100644 llvm/test/ThinLTO/AArch64/lit.local.cfg

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 62e50a807f4f6e..ff905d4dc99e62 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -1846,6 +1846,12 @@ def fcodegen_data_use : Joined<["-"], "fcodegen-data-use">,
 def fcodegen_data_use_EQ : Joined<["-"], "fcodegen-data-use=">,
     Group<f_Group>, Visibility<[ClangOption, CC1Option]>, MetaVarName<"<directory>">,
     HelpText<"Use codegen data read from <directory>/default.cgdata to optimize the binary">;
+def fcodegen_data_thinlto_two_rounds : Joined<["-"], "fcodegen-data-thinlto-two-rounds">,
+    Group<f_Group>, Visibility<[ClangOption, CC1Option]>,
+    HelpText<"ThinLTO runs codegen twice by serializing and deserializing IRs to and from a temp directory. Applies to ThinLTO bitcodes only">;
+def fcodegen_data_thinlto_two_rounds_EQ : Joined<["-"], "fcodegen-data-thinlto-two-rounds=">,
+    Group<f_Group>, Visibility<[ClangOption, CC1Option]>, MetaVarName<"<directory>">,
+    HelpText<"ThinLTO runs codegen twice by serializing and deserializing IRs to and from <directory>. Applies to ThinLTO bitcodes only">;
 def fswift_async_fp_EQ : Joined<["-"], "fswift-async-fp=">,
     Group<f_Group>,
     Visibility<[ClangOption, CC1Option, CC1AsOption, CLOption]>,
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 8490955a5ae82c..88a7f38533f860 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -2758,12 +2758,23 @@ void tools::addMachineOutlinerArgs(const Driver &D,
                       options::OPT_fcodegen_data_generate_EQ);
   auto *CodeGenDataUseArg = Args.getLastArg(options::OPT_fcodegen_data_use,
                                             options::OPT_fcodegen_data_use_EQ);
+  auto *CodeGenDataTwoRoundsArg =
+      Args.getLastArg(options::OPT_fcodegen_data_thinlto_two_rounds,
+                      options::OPT_fcodegen_data_thinlto_two_rounds_EQ);
 
   // We only allow one of them to be specified.
   if (CodeGenDataGenArg && CodeGenDataUseArg)
     D.Diag(diag::err_drv_argument_not_allowed_with)
         << CodeGenDataGenArg->getAsString(Args)
         << CodeGenDataUseArg->getAsString(Args);
+  if (CodeGenDataGenArg && CodeGenDataTwoRoundsArg)
+    D.Diag(diag::err_drv_argument_not_allowed_with)
+        << CodeGenDataGenArg->getAsString(Args)
+        << CodeGenDataTwoRoundsArg->getAsString(Args);
+  if (CodeGenDataUseArg && CodeGenDataTwoRoundsArg)
+    D.Diag(diag::err_drv_argument_not_allowed_with)
+        << CodeGenDataUseArg->getAsString(Args)
+        << CodeGenDataTwoRoundsArg->getAsString(Args);
 
   // For codegen data gen, the output file is passed to the linker
   // while a boolean flag is passed to the LLVM backend.
@@ -2779,6 +2790,20 @@ void tools::addMachineOutlinerArgs(const Driver &D,
       llvm::sys::path::append(Path, "default.cgdata");
     addArg(Twine("-codegen-data-use-path=" + Path.str()));
   }
+
+  // For codegen data thinlto two rounds, the output directory needs to
+  // be passed. A temp directory is created if no directory is given.
+  // In fact, this flag is only needed at the ThinLTO link step.
+  if (CodeGenDataTwoRoundsArg) {
+    SmallString<128> Path(CodeGenDataTwoRoundsArg->getNumValues() == 0
+                              ? ""
+                              : CodeGenDataTwoRoundsArg->getValue());
+    if (!Path.empty() && !llvm::sys::fs::is_directory(Path))
+      D.Diag(diag::err_drv_unable_to_set_working_directory) << Path.str();
+    if (Path.empty())
+      llvm::sys::fs::createUniqueDirectory("cgdata", Path);
+    addArg(Twine("-codegen-data-thinlto-two-rounds-path=" + Path.str()));
+  }
 }
 
 void tools::addOpenMPDeviceRTL(const Driver &D,
diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp
index 85407ea5c69dfc..89b43ed1e281b9 100644
--- a/clang/lib/Driver/ToolChains/Darwin.cpp
+++ b/clang/lib/Driver/ToolChains/Darwin.cpp
@@ -652,6 +652,9 @@ void darwin::Linker::ConstructJob(Compilation &C, const JobAction &JA,
                       options::OPT_fcodegen_data_generate_EQ);
   auto *CodeGenDataUseArg = Args.getLastArg(options::OPT_fcodegen_data_use,
                                             options::OPT_fcodegen_data_use_EQ);
+  auto *CodeGenDataTwoRoundsArg =
+      Args.getLastArg(options::OPT_fcodegen_data_thinlto_two_rounds,
+                      options::OPT_fcodegen_data_thinlto_two_rounds_EQ);
 
   // We only allow one of them to be specified.
   const Driver &D = getToolChain().getDriver();
@@ -659,9 +662,17 @@ void darwin::Linker::ConstructJob(Compilation &C, const JobAction &JA,
     D.Diag(diag::err_drv_argument_not_allowed_with)
         << CodeGenDataGenArg->getAsString(Args)
         << CodeGenDataUseArg->getAsString(Args);
+  if (CodeGenDataGenArg && CodeGenDataTwoRoundsArg)
+    D.Diag(diag::err_drv_argument_not_allowed_with)
+        << CodeGenDataGenArg->getAsString(Args)
+        << CodeGenDataTwoRoundsArg->getAsString(Args);
+  if (CodeGenDataUseArg && CodeGenDataTwoRoundsArg)
+    D.Diag(diag::err_drv_argument_not_allowed_with)
+        << CodeGenDataUseArg->getAsString(Args)
+        << CodeGenDataTwoRoundsArg->getAsString(Args);
 
   // For codegen data gen, the output file is passed to the linker
-  // while a boolean flag is passed to the LLVM backend.
+  // while a boolean flag is passed to the LTO backend.
   if (CodeGenDataGenArg) {
     CmdArgs.push_back("-mllvm");
     CmdArgs.push_back("-codegen-data-generate");
@@ -679,6 +690,19 @@ void darwin::Linker::ConstructJob(Compilation &C, const JobAction &JA,
         Args.MakeArgString("-codegen-data-use-path=" + Path.str()));
   }
 
+  // For codegen data thinlto two rounds, the output directory needs to
+  // be passed. A temp directory is created if no directory is given.
+  if (CodeGenDataTwoRoundsArg) {
+    SmallString<128> Path(CodeGenDataTwoRoundsArg->getNumValues() == 0
+                              ? ""
+                              : CodeGenDataTwoRoundsArg->getValue());
+    if (Path.empty())
+      llvm::sys::fs::createUniqueDirectory("cgdata", Path);
+    CmdArgs.push_back("-mllvm");
+    CmdArgs.push_back(Args.MakeArgString(
+        "-codegen-data-thinlto-two-rounds-path=" + Path.str()));
+  }
+
   // Setup statistics file output.
   SmallString<128> StatsFile =
       getStatsFileName(Args, Output, Inputs[0], getToolChain().getDriver());
diff --git a/clang/test/Driver/codegen-data.c b/clang/test/Driver/codegen-data.c
index a72850afc59736..ebfa19f1dcc927 100644
--- a/clang/test/Driver/codegen-data.c
+++ b/clang/test/Driver/codegen-data.c
@@ -2,6 +2,24 @@
 // RUN: not %clang -### -S --target=aarch64-linux-gnu -fcodegen-data-generate -fcodegen-data-use %s 2>&1 | FileCheck %s --check-prefix=CONFLICT
 // RUN: not %clang -### -S --target=arm64-apple-darwin  -fcodegen-data-generate -fcodegen-data-use %s 2>&1 | FileCheck %s --check-prefix=CONFLICT
 // CONFLICT: error: invalid argument '-fcodegen-data-generate' not allowed with '-fcodegen-data-use'
+// RUN: not %clang -### -S --target=aarch64-linux-gnu -fcodegen-data-generate -fcodegen-data-thinlto-two-rounds %s 2>&1 | FileCheck %s --check-prefix=CONFLICT-2
+// RUN: not %clang -### -S --target=arm64-apple-darwin -fcodegen-data-generate -fcodegen-data-thinlto-two-rounds %s 2>&1 | FileCheck %s --check-prefix=CONFLICT-2
+// CONFLICT-2: error: invalid argument '-fcodegen-data-generate' not allowed with '-fcodegen-data-thinlto-two-rounds'
+// RUN: not %clang -### -S --target=aarch64-linux-gnu -fcodegen-data-use -fcodegen-data-thinlto-two-rounds %s 2>&1 | FileCheck %s --check-prefix=CONFLICT-3
+// RUN: not %clang -### -S --target=arm64-apple-darwin -fcodegen-data-use -fcodegen-data-thinlto-two-rounds %s 2>&1 | FileCheck %s --check-prefix=CONFLICT-3
+// CONFLICT-3: error: invalid argument '-fcodegen-data-use' not allowed with '-fcodegen-data-thinlto-two-rounds'
+
+// Verify codegen-data-thinlto-two-rounds-path is passed to LLVM and requires a valid directory path.
+// RUN: not %clang -### -S --target=aarch64-linux-gnu -fcodegen-data-thinlto-two-rounds=file %s 2>&1 | FileCheck %s --check-prefix=ROUND-FILE
+// RUN: not %clang -### -S --target=arm64-apple-darwin -fcodegen-data-thinlto-two-rounds=file %s 2>&1 | FileCheck %s --check-prefix=ROUND-FILE
+// ROUND-FILE: error: unable to set working directory: file
+// RUN: %clang -### -S --target=aarch64-linux-gnu -fcodegen-data-thinlto-two-rounds %s 2>&1 | FileCheck %s --check-prefix=ROUND-DIRTEMP
+// RUN: %clang -### -S --target=arm64-apple-darwin -fcodegen-data-thinlto-two-rounds %s 2>&1 | FileCheck %s --check-prefix=ROUND-DIRTEMP
+// ROUND-DIRTEMP: "-mllvm" "-codegen-data-thinlto-two-rounds-path={{.*}}"
+// RUN: mkdir -p %t.d/some
+// RUN: %clang -### -S --target=aarch64-linux-gnu -fcodegen-data-thinlto-two-rounds=%t.d/some %s 2>&1 | FileCheck %s --check-prefix=ROUND-DIR
+// RUN: %clang -### -S --target=arm64-apple-darwin -fcodegen-data-thinlto-two-rounds=%t.d/some %s 2>&1 | FileCheck %s --check-prefix=ROUND-DIR
+// ROUND-DIR: "-mllvm" "-codegen-data-thinlto-two-rounds-path={{.*}}.d/some"
 
 // Verify the codegen-data-generate (boolean) flag is passed to LLVM
 // RUN: %clang -### -S --target=aarch64-linux-gnu -fcodegen-data-generate %s  2>&1| FileCheck %s --check-prefix=GENERATE
diff --git a/llvm/include/llvm/CodeGenData/CodeGenData.h b/llvm/include/llvm/CodeGenData/CodeGenData.h
index 118fb9841d27e8..d2650bbd49ed56 100644
--- a/llvm/include/llvm/CodeGenData/CodeGenData.h
+++ b/llvm/include/llvm/CodeGenData/CodeGenData.h
@@ -164,6 +164,19 @@ publishOutlinedHashTree(std::unique_ptr<OutlinedHashTree> HashTree) {
   CodeGenData::getInstance().publishOutlinedHashTree(std::move(HashTree));
 }
 
+/// Save the current module before the first codegen round.
+void saveModuleForTwoRounds(const Module &TheModule, unsigned Task);
+
+/// Load the current module before the second codegen round.
+std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule,
+                                               unsigned Task,
+                                               LLVMContext &Context);
+
+/// Merge the codegen data from the input files held in the scratch vector
+/// during the ThinLTO two-codegen rounds.
+Error mergeCodeGenData(
+    const std::unique_ptr<std::vector<llvm::SmallString<0>>> InputFiles);
+
 void warn(Error E, StringRef Whence = "");
 void warn(Twine Message, std::string Whence = "", std::string Hint = "");
 
diff --git a/llvm/lib/CodeGenData/CodeGenData.cpp b/llvm/lib/CodeGenData/CodeGenData.cpp
index 841af8a347eeb2..f90514899d2923 100644
--- a/llvm/lib/CodeGenData/CodeGenData.cpp
+++ b/llvm/lib/CodeGenData/CodeGenData.cpp
@@ -30,6 +30,10 @@ cl::opt<bool>
 cl::opt<std::string>
     CodeGenDataUsePath("codegen-data-use-path", cl::init(""), cl::Hidden,
                        cl::desc("File path to where .cgdata file is read"));
+cl::opt<std::string> CodeGenDataThinLTOTwoRoundsPath(
+    "codegen-data-thinlto-two-rounds-path", cl::init(""), cl::Hidden,
+    cl::desc("Directory path to where the optimized bitcodes are saved and "
+             "restored."));
 
 static std::string getCGDataErrString(cgdata_error Err,
                                       const std::string &ErrMsg = "") {
@@ -140,7 +144,7 @@ CodeGenData &CodeGenData::getInstance() {
     auto *CGD = new CodeGenData();
     Instance.reset(CGD);
 
-    if (CodeGenDataGenerate)
+    if (CodeGenDataGenerate || !CodeGenDataThinLTOTwoRoundsPath.empty())
       CGD->EmitCGData = true;
     else if (!CodeGenDataUsePath.empty()) {
       // Initialize the global CGData if the input file name is given.
@@ -216,6 +220,69 @@ void warn(Error E, StringRef Whence) {
   }
 }
 
+static std::string getPath(const std::string &Dir, unsigned Task) {
+  return (Dir + "/" + llvm::Twine(Task) + ".saved_copy.bc").str();
+}
+
+void saveModuleForTwoRounds(const Module &TheModule, unsigned Task) {
+  assert(sys::fs::is_directory(CodeGenDataThinLTOTwoRoundsPath));
+  std::string Path = getPath(CodeGenDataThinLTOTwoRoundsPath, Task);
+  std::error_code EC;
+  raw_fd_ostream OS(Path, EC, sys::fs::OpenFlags::OF_None);
+  if (EC)
+    report_fatal_error(Twine("Failed to open ") + Path +
+                       " to save optimized bitcode\n");
+  WriteBitcodeToFile(TheModule, OS, /* ShouldPreserveUseListOrder */ true);
+}
+
+std::unique_ptr<Module> loadModuleForTwoRounds(BitcodeModule &OrigModule,
+                                               unsigned Task,
+                                               LLVMContext &Context) {
+  assert(sys::fs::is_directory(CodeGenDataThinLTOTwoRoundsPath));
+  std::string Path = getPath(CodeGenDataThinLTOTwoRoundsPath, Task);
+  auto FileOrError = MemoryBuffer::getFile(Path);
+  if (!FileOrError)
+    report_fatal_error(Twine("Failed to open ") + Path +
+                       " to load optimized bitcode\n");
+
+  std::unique_ptr<MemoryBuffer> FileBuffer = std::move(*FileOrError);
+  auto RestoredModule = llvm::parseBitcodeFile(*FileBuffer, Context);
+  if (!RestoredModule)
+    report_fatal_error(Twine("Failed to parse optimized bitcode loaded from ") +
+                       Path + "\n");
+
+  // Restore the original module identifier.
+  (*RestoredModule)->setModuleIdentifier(OrigModule.getModuleIdentifier());
+  return std::move(*RestoredModule);
+}
+
+Error mergeCodeGenData(
+    const std::unique_ptr<std::vector<llvm::SmallString<0>>> InputFiles) {
+
+  OutlinedHashTreeRecord GlobalOutlineRecord;
+  for (auto &InputFile : *(InputFiles)) {
+    if (InputFile.empty())
+      continue;
+    StringRef File = StringRef(InputFile.data(), InputFile.size());
+    std::unique_ptr<MemoryBuffer> Buffer = MemoryBuffer::getMemBuffer(
+        File, "in-memory object file", /*RequiresNullTerminator=*/false);
+    Expected<std::unique_ptr<object::ObjectFile>> BinOrErr =
+        object::ObjectFile::createObjectFile(Buffer->getMemBufferRef());
+    if (!BinOrErr)
+      return BinOrErr.takeError();
+
+    std::unique_ptr<object::ObjectFile> &Obj = BinOrErr.get();
+    if (auto E = CodeGenDataReader::mergeFromObjectFile(Obj.get(),
+                                                        GlobalOutlineRecord))
+      return E;
+  }
+
+  if (!GlobalOutlineRecord.empty())
+    cgdata::publishOutlinedHashTree(std::move(GlobalOutlineRecord.HashTree));
+
+  return Error::success();
+}
+
 } // end namespace cgdata
 
 } // end namespace llvm
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index f49a9269c1c4da..09d2ab1e4b4eea 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -22,6 +22,7 @@
 #include "llvm/Bitcode/BitcodeReader.h"
 #include "llvm/Bitcode/BitcodeWriter.h"
 #include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGenData/CodeGenData.h"
 #include "llvm/Config/llvm-config.h"
 #include "llvm/IR/AutoUpgrade.h"
 #include "llvm/IR/DiagnosticPrinter.h"
@@ -69,6 +70,10 @@ static cl::opt<bool>
     DumpThinCGSCCs("dump-thin-cg-sccs", cl::init(false), cl::Hidden,
                    cl::desc("Dump the SCCs in the ThinLTO index's callgraph"));
 
+/// Path to where the optimized bitcodes are saved and restored for ThinLTO
+/// two-codegen rounds.
+extern cl::opt<std::string> CodeGenDataThinLTOTwoRoundsPath;
+
 namespace llvm {
 /// Enable global value internalization in LTO.
 cl::opt<bool> EnableLTOInternalization(
@@ -1543,6 +1548,66 @@ class InProcessThinBackend : public ThinBackendProc {
     return BackendThreadPool.getMaxConcurrency();
   }
 };
+
+// This Backend runs the ThinBackend process but writes all codegen output into
+// in-memory scratch buffers. This class facilitates the first codegen round.
+class NoOutputThinBackend : public InProcessThinBackend {
+public:
+  NoOutputThinBackend(
+      const Config &Conf, ModuleSummaryIndex &CombinedIndex,
+      ThreadPoolStrategy ThinLTOParallelism,
+      const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
+      std::unique_ptr<std::vector<llvm::SmallString<0>>> Scratch)
+      : InProcessThinBackend(
+            Conf, CombinedIndex, ThinLTOParallelism, ModuleToDefinedGVSummaries,
+            // This lambda is the reason why Scratch is a unique_ptr that is
+            // constructed outside of this class's constructor. The lambda
+            // captures the address of the vector, and moving the unique_ptr
+            // into the member below does not move the vector itself, so the
+            // captured pointer stays valid. The vector is indexed by Task, so
+            // it must already hold one entry per task before the backend runs.
+            [Allocation = &*Scratch](unsigned Task, const Twine &ModuleName) {
+              return std::make_unique<CachedFileStream>(
+                  std::make_unique<raw_svector_ostream>((*Allocation)[Task]));
+            },
+            FileCache(), nullptr, false, false),
+        Scratch(std::move(Scratch)) {}
+
+  /// Scratch space, indexed by task, where the ThinBackend output is written.
+  /// It is freed on destruction unless the owner moves it out beforehand.
+  std::unique_ptr<std::vector<llvm::SmallString<0>>> Scratch;
+};
+
+// This Backend performs codegen on bitcode that was previously saved after
+// going through optimization. This class facilitates the second codegen round.
+class OptimizedBitcodeThinBackend : public InProcessThinBackend {
+public:
+  OptimizedBitcodeThinBackend(
+      const Config &Conf, ModuleSummaryIndex &CombinedIndex,
+      ThreadPoolStrategy ThinLTOParallelism,
+      const DenseMap<StringRef, GVSummaryMapTy> &ModuleToDefinedGVSummaries,
+      AddStreamFn AddStream)
+      : InProcessThinBackend(Conf, CombinedIndex, ThinLTOParallelism,
+                             ModuleToDefinedGVSummaries, AddStream, FileCache(),
+                             nullptr, false, false) {}
+  // Reload the optimized module for Task and run thinBackend as CodeGenOnly.
+  Error runThinLTOBackendThread(
+      AddStreamFn AddStream, FileCache Cache, unsigned Task, BitcodeModule BM,
+      ModuleSummaryIndex &CombinedIndex,
+      const FunctionImporter::ImportMapTy &ImportList,
+      const FunctionImporter::ExportSetTy &ExportList,
+      const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
+      const GVSummaryMapTy &DefinedGlobals,
+      MapVector<StringRef, BitcodeModule> &ModuleMap) override {
+    LTOLLVMContext BackendContext(Conf);
+    std::unique_ptr<Module> LoadedModule =
+        cgdata::loadModuleForTwoRounds(BM, Task, BackendContext);
+    // Cache, ExportList and ResolvedODR are not used by this override.
+    return thinBackend(Conf, Task, AddStream, *LoadedModule, CombinedIndex,
+                       ImportList, DefinedGlobals, &ModuleMap,
+                       /*CodeGenOnly=*/true);
+  }
+};
 } // end anonymous namespace
 
 ThinBackend lto::createInProcessThinBackend(ThreadPoolStrategy Parallelism,
@@ -1867,10 +1932,49 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache,
     return BackendProcess->wait();
   };
 
-  std::unique_ptr<ThinBackendProc> BackendProc =
-      ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries,
-                      AddStream, Cache);
-  return RunBackends(BackendProc.get());
+  if (CodeGenDataThinLTOTwoRoundsPath.empty()) {
+    std::unique_ptr<ThinBackendProc> BackendProc =
+        ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries,
+                        AddStream, Cache);
+    return RunBackends(BackendProc.get());
+  }
+
+  // Two-codegen rounds:
+  // 1. The first round: Run opt + codegen with a scratch output.
+  // 2. Merge codegen data extracted from the scratch output.
+  // 3. The second round: Run codegen again.
+  LLVM_DEBUG(dbgs() << "Running ThinLTO two-codegen rounds\n");
+
+  // Ensure we have a directory to write the bitcode files for two-codegen
+  // rounds.
+  if (auto EC = sys::fs::create_directories(CodeGenDataThinLTOTwoRoundsPath,
+                                            /*IgnoreExisting=*/true))
+    return errorCodeToError(EC);
+
+  // Create a scratch output.
+  auto Outputs = std::make_unique<std::vector<llvm::SmallString<0>>>();
+  Outputs->resize(getMaxTasks());
+  auto FirstRoundLTO = std::make_unique<NoOutputThinBackend>(
+      Conf, ThinLTO.CombinedIndex, llvm::heavyweight_hardware_concurrency(),
+      ModuleToDefinedGVSummaries, std::move(Outputs));
+  // The first round: Run opt + codegen with a scratch output.
+  // Before codegen, we serialize modules to CodeGenDataThinLTOTwoRoundsPath.
+  if (Error E = RunBackends(FirstRoundLTO.get()))
+    return E;
+
+  // Using the scratch output, we merge codegen data.
+  if (Error E = cgdata::mergeCodeGenData(std::move(FirstRoundLTO->Scratch)))
+    return E;
+
+  // The second round: Run codegen by reading IRs from
+  // CodeGenDataThinLTOTwoRoundsPath.
+  std::unique_ptr<ThinBackendProc> SecondRoundLTO =
+      std::make_unique<OptimizedBitcodeThinBackend>(
+          Conf, ThinLTO.CombinedIndex, llvm::heavyweight_hardware_concurrency(),
+          ModuleToDefinedGVSummaries, AddStream);
+  Error E = RunBackends(SecondRoundLTO.get());
+
+  return E;
 }
 
 Expected<std::unique_ptr<ToolOutputFile>> lto::setupLLVMOptimizationRemarks(
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index f7dc9d11a2abfe..243669f56ebcf7 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -20,6 +20,7 @@
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Bitcode/BitcodeReader.h"
 #include "llvm/Bitcode/BitcodeWriter.h"
+#include "llvm/CodeGenData/CodeGenData.h"
 #include "llvm/IR/LLVMRemarkStreamer.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/PassManager.h"
@@ -74,6 +75,8 @@ static cl::opt<bool> ThinLTOAssumeMerged(
     cl::desc("Assume the input has already undergone ThinLTO function "
              "importing and the other pre-optimization pipeline changes."));
 
+extern cl::opt<std::string> CodeGenDataThinLTOTwoRoundsPath;
+
 namespace llvm {
 extern cl::opt<bool> NoPGOWarnMismatch;
 }
@@ -592,6 +595,12 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
                  CmdArgs))
           return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
 
+        // Save the current module before the first codegen round.
+        // Note the second codegen round never reaches this point because it
+        // has already returned through the CodeGenOnly path.
+        if (!CodeGenDataThinLTOTwoRoundsPath.empty())
+          cgdata::saveModuleForTwoRounds(Mod, Task);
+
         codegen(Conf, TM, AddStream, Task, Mod, CombinedIndex);
         return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
       };
diff --git a/llvm/test/ThinLTO/AArch64/cgdata-read-single-outline.ll b/llvm/test/ThinLTO/AArch64/cgdata-read-single-outline.ll
new file mode 100644
index 00000000000000..2c606a5a13007e
--- /dev/null
+++ b/llvm/test/ThinLTO/AArch64/cgdata-read-single-outline.ll
@@ -0,0 +1,42 @@
+; This test verifies whether we can outline a singleton instance (i.e., an instance that does not repeat)
+; using codegen data that has been read from a previous codegen run.
+
+; RUN: split-file %s %t
+
+; First, we generate the cgdata file from a local outline instance present in local-two.ll.
+; RUN: llc -mtriple=arm64-apple-darwin -codegen-data-generate=true -filetype=obj %t/local-two.ll -o %t_write
+; RUN: llvm-cgdata merge %t_write -o %t_cgdata
+; RUN: llvm-cgdata show %t_cgdata | FileCheck %s --check-prefix=SHOW
+
+; SHOW: Outlined hash tree:
+; SHOW-NEXT:  Total Node Count: 4
+; SHOW-NEXT:  Terminal Node Count: 1
+; SHOW-NEXT:  Depth: 3
+
+; Now, we read the cgdata in the machine outliner, enabling us to optimistically
+; outline a singleton instance in local-one.ll that matches against the cgdata.
+; RUN: llc -mtriple=arm64-apple-darwin -codegen-data-use-path=%t_cgdata -filetype=obj %t/local-one.ll -o %t_read
+; RUN: llvm-objdump -d %t_read | FileCheck %s
+
+; CHECK: _OUTLINED_FUNCTION
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  mov
+; CHECK-NEXT:  b
+
+;--- local-two.ll
+declare i32 @g(i32, i32, i32)
+define i32 @f1() minsize {
+  %1 = call i32 @g(i32 10, i32 1, i32 2);
+  ret i32 %1
+}
+define i32 @f2() minsize {
+  %1 = call i32 @g(i32 20, i32 1, i32 2);
+  ret i32 %1
+}
+
+;--- local-one.ll
+declare i32 @g(i32, i32, i32)
+define i32 @f3() minsize {
+  %1 = call i32 @g(i32 30, i32 1, i32 2);
+ ret i32 %1
+}
diff --git a/llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll b/llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll
new file mode 100644
index 00000000000000..bb7292477ea250
--- /dev/null
+++ b/llvm/test/ThinLTO/AArch64/cgdata-two-rounds.ll
@@ -0,0 +1,95 @@
+; This test verifies whether we can outline a singleton instance (i.e., an instance that does not repeat)
+; by running two codegen rounds.
+
+; RUN: split-file %s %t
+
+; Verify that each outlining instance is a singleton when global outlining is used for ThinLTO.
+; The outlined functions will be identical, so the linker can fold them with ICF.
+; RUN: opt -module-summary %t/thin-one.ll -o %t/thin-one.bc
+; RUN: opt -module-summary %t/thin-two.ll -o %t/thin-two.bc
+; RUN: mkdir -p %t/two-rounds
+; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc -o %t/thinlto \
+; RUN:  -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \
+; RUN:  -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \
+; RUN:  -codegen-data-thinlto-two-rounds-path=%t/two-rounds
+
+; thin-one.ll will have one outlining instance (matched in the global outlined hash tree)
+; RUN: llvm-objdump -d %t/thinlto.1 | FileCheck %s --check-prefix=THINLTO-1
+; THINLTO-1: _OUTLINED_FUNCTION{{.*}}>:
+; THINLTO-1-NEXT:  mov
+; THINLTO-1-NEXT:  mov
+; THINLTO-1-NEXT:  b
+
+; thin-two.ll will have two outlining instances (matched in the global outlined hash tree)
+; RUN: llvm-objdump -d %t/thinlto.2 | FileCheck %s --check-prefix=THINLTO-2
+; THINLTO-2: _OUTLINED_FUNCTION{{.*}}>:
+; THINLTO-2-NEXT:  mov
+; THINLTO-2-NEXT:  mov
+; THINLTO-2-NEXT:  b
+; THINLTO-2: _OUTLINED_FUNCTION{{.*}}>:
+; THINLTO-2-NEXT:  mov
+; THINLTO-2-NEXT:  mov
+; THINLTO-2-NEXT:  b
+
+; Now add an LTO module to the above ThinLTO modules.
+; Verify the LTO module is optimized independently of the global outlining for ThinLTO.
+; RUN: opt %t/lto.ll -o %t/lto.bc
+; RUN: llvm-lto2 run %t/thin-one.bc %t/thin-two.bc %t/lto.bc -o %t/out \
+; RUN:  -r %t/thin-one.bc,_f3,px -r %t/thin-one.bc,_g,x \
+; RUN:  -r %t/thin-two.bc,_f1,px -r %t/thin-two.bc,_f2,px -r %t/thin-two.bc,_g,x \
+; RUN:  -r %t/lto.bc,_f4,px -r %t/lto.bc,_f5,px -r %t/lto.bc,_f6,px -r %t/lto.bc,_g,x \
+; RUN:  -codegen-data-thinlto-two-rounds-path=%t/two-rounds
+
+; lto.ll will have one outlining instance within the lto module itself (no global outlining).
+; RUN: llvm-objdump -d %t/out.0 | FileCheck %s --check-prefix=LTO-0
+; LTO-0: _OUTLINED_FUNCTION{{.*}}>:
+; LTO-0-NEXT:  mov
+; LTO-0-NEXT:  b
+
+; thin-one.ll will have one outlining instance (matched in the global outlined hash tree)
+; RUN: llvm-objdump -d %t/out.1 | FileCheck %s --check-prefix=THINLTO-1
+
+; thin-two.ll will have two outlining instances (matched in the global outlined hash tree)
+; RUN: llvm-objdump -d %t/out.2 | FileCheck %s --check-prefix=THINLTO-2
+
+;--- thin-one.ll
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-darwin"
+
+declare i32 @g(i32, i32, i32)
+define i32 @f3() minsize {
+  %1 = call i32 @g(i32 30, i32 1, i32 2);
+ ret i32 %1
+}
+
+;--- thin-two.ll
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-darwin"
+
+declare i32 @g(i32, i32, i32)
+define i32 @f1() minsize {
+  %1 = call i32 @g(i32 10, i32 1, i32 2);
+  ret i32 %1
+}
+define i32 @f2() minsize {
+  %1 = call i32 @g(i32 20, i32 1, i32 2);
+  ret i32 %1
+}
+
+;--- lto.ll
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-darwin"
+
+declare i32 @g(i32, i32, i32)
+define i32 @f4() minsize {
+  %1 = call i32 @g(i32 10, i32 30, i32 2);
+  ret i32 %1
+}
+define i32 @f5() minsize {
+  %1 = call i32 @g(i32 20, i32 40, i32 2);
+  ret i32 %1
+}
+define i32 @f6() minsize {
+  %1 = call i32 @g(i32 50, i32 60, i32 2);
+  ret i32 %1
+}
diff --git a/llvm/test/ThinLTO/AArch64/lit.local.cfg b/llvm/test/ThinLTO/AArch64/lit.local.cfg
new file mode 100644
index 00000000000000..10d4a0e953ed47
--- /dev/null
+++ b/llvm/test/ThinLTO/AArch64/lit.local.cfg
@@ -0,0 +1,2 @@
+if not "AArch64" in config.root.targets:
+    config.unsupported = True