[llvm] [CGData] llvm-cgdata (PR #89884)

Thu Apr 25 00:14:18 PDT 2024

https://github.com/kyulee-com updated https://github.com/llvm/llvm-project/pull/89884

>From e405e025a84e3d9d04384ace7a2cdab1f16d38d4 Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Mon, 22 Apr 2024 15:29:25 -0700
Subject: [PATCH 1/2] [CGData][OutlinedHashTree] Define OutlinedHashTree

This defines the OutlinedHashTree class.
It contains sequences of stable hash values of instructions that have been outlined.
This OutlinedHashTree can be used to track the outlined instruction sequences across modules.
A trie structure is used in its implementation, allowing for a compact sharing of common prefixes.
---
 .../llvm/CodeGenData/OutlinedHashTree.h       | 131 ++++++++++++++
 .../llvm/CodeGenData/OutlinedHashTreeRecord.h |  67 +++++++
 llvm/lib/CMakeLists.txt                       |   1 +
 llvm/lib/CodeGenData/CMakeLists.txt           |  14 ++
 llvm/lib/CodeGenData/OutlinedHashTree.cpp     | 106 +++++++++++
 .../CodeGenData/OutlinedHashTreeRecord.cpp    | 166 ++++++++++++++++++
 llvm/unittests/CMakeLists.txt                 |   1 +
 llvm/unittests/CodeGenData/CMakeLists.txt     |  14 ++
 .../OutlinedHashTreeRecordTest.cpp            | 118 +++++++++++++
 .../CodeGenData/OutlinedHashTreeTest.cpp      |  81 +++++++++
 10 files changed, 699 insertions(+)
 create mode 100644 llvm/include/llvm/CodeGenData/OutlinedHashTree.h
 create mode 100644 llvm/include/llvm/CodeGenData/OutlinedHashTreeRecord.h
 create mode 100644 llvm/lib/CodeGenData/CMakeLists.txt
 create mode 100644 llvm/lib/CodeGenData/OutlinedHashTree.cpp
 create mode 100644 llvm/lib/CodeGenData/OutlinedHashTreeRecord.cpp
 create mode 100644 llvm/unittests/CodeGenData/CMakeLists.txt
 create mode 100644 llvm/unittests/CodeGenData/OutlinedHashTreeRecordTest.cpp
 create mode 100644 llvm/unittests/CodeGenData/OutlinedHashTreeTest.cpp

diff --git a/llvm/include/llvm/CodeGenData/OutlinedHashTree.h b/llvm/include/llvm/CodeGenData/OutlinedHashTree.h
new file mode 100644
index 00000000000000..84fa8aba827207
--- /dev/null
+++ b/llvm/include/llvm/CodeGenData/OutlinedHashTree.h
@@ -0,0 +1,131 @@
+//===- OutlinedHashTree.h --------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===---------------------------------------------------------------------===//
+//
+// This defines the OutlinedHashTree class. It contains sequences of stable
+// hash values of instructions that have been outlined. This OutlinedHashTree
+// can be used to track the outlined instruction sequences across modules.
+//
+//===---------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGENDATA_OUTLINEDHASHTREE_H
+#define LLVM_CODEGENDATA_OUTLINEDHASHTREE_H
+
+#include "llvm/ADT/StableHashing.h"
+#include "llvm/ObjectYAML/YAML.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <unordered_map>
+#include <vector>
+
+namespace llvm {
+
+/// A HashNode is an entry in an OutlinedHashTree, holding a hash value
+/// and a collection of Successors (other HashNodes). If a HashNode has
+/// a positive terminal value (Terminals > 0), it signifies the end of
+/// a hash sequence with that occurrence count.
+struct HashNode {
+  /// The hash value of the node.
+  stable_hash Hash;
+  /// The number of terminals in the sequence ending at this node.
+  unsigned Terminals;
+  /// The successors of this node.
+  std::unordered_map<stable_hash, std::unique_ptr<HashNode>> Successors;
+};
+
+/// HashNodeStable is the serialized, stable, and compact representation
+/// of a HashNode.
+struct HashNodeStable {
+  llvm::yaml::Hex64 Hash;
+  unsigned Terminals;
+  std::vector<unsigned> SuccessorIds;
+};
+
+class OutlinedHashTree {
+
+  using EdgeCallbackFn =
+      std::function<void(const HashNode *, const HashNode *)>;
+  using NodeCallbackFn = std::function<void(const HashNode *)>;
+
+  using HashSequence = std::vector<stable_hash>;
+  using HashSequencePair = std::pair<std::vector<stable_hash>, unsigned>;
+
+  /// Walks every edge and node in the OutlinedHashTree and calls CallbackEdge
+  /// for the edges and CallbackNode for the nodes with the stable_hash for
+  /// the source and the stable_hash of the sink for an edge. These generic
+  /// callbacks can be used to traverse a OutlinedHashTree for the purpose of
+  /// print debugging or serializing it.
+  void walkGraph(EdgeCallbackFn CallbackEdge,
+                 NodeCallbackFn CallbackNode) const;
+
+public:
+  /// Walks the nodes of a OutlinedHashTree using walkGraph.
+  void walkVertices(NodeCallbackFn Callback) const {
+    walkGraph([](const HashNode *A, const HashNode *B) {}, Callback);
+  }
+
+  /// Release all hash nodes except the root hash node.
+  void clear() {
+    assert(getRoot()->Hash == 0 && getRoot()->Terminals == 0);
+    getRoot()->Successors.clear();
+  }
+
+  /// \returns true if the hash tree has only the root node.
+  bool empty() { return size() == 1; }
+
+  /// \returns the size of a OutlinedHashTree by traversing it. If
+  /// \p GetTerminalCountOnly is true, it only counts the terminal nodes
+  /// (meaning it returns the size of the number of hash sequences in a
+  /// OutlinedHashTree).
+  size_t size(bool GetTerminalCountOnly = false) const {
+    size_t Size = 0;
+    walkVertices([&Size, GetTerminalCountOnly](const HashNode *N) {
+      Size += (N && (!GetTerminalCountOnly || N->Terminals));
+    });
+    return Size;
+  }
+
+  /// \returns the depth of a OutlinedHashTree by traversing it.
+  size_t depth() const {
+    size_t Size = 0;
+    std::unordered_map<const HashNode *, size_t> DepthMap;
+
+    walkGraph(
+        [&DepthMap](const HashNode *Src, const HashNode *Dst) {
+          size_t Depth = DepthMap[Src];
+          DepthMap[Dst] = Depth + 1;
+        },
+        [&Size, &DepthMap](const HashNode *N) {
+          Size = std::max(Size, DepthMap[N]);
+        });
+
+    return Size;
+  }
+
+  /// \returns the root hash node of a OutlinedHashTree.
+  const HashNode *getRoot() const { return Root.get(); }
+  HashNode *getRoot() { return Root.get(); }
+
+  /// Inserts a \p Sequence into the this tree. The last node in the sequence
+  /// will increase Terminals.
+  void insert(const HashSequencePair &SequencePair);
+
+  /// Merge a \p OtherTree into this Tree.
+  void merge(const OutlinedHashTree *OtherTree);
+
+  /// \returns the matching count if \p Sequence exists in a OutlinedHashTree.
+  unsigned find(const HashSequence &Sequence) const;
+
+  OutlinedHashTree() { Root = std::make_unique<HashNode>(); }
+
+private:
+  std::unique_ptr<HashNode> Root;
+};
+
+} // namespace llvm
+
+#endif
diff --git a/llvm/include/llvm/CodeGenData/OutlinedHashTreeRecord.h b/llvm/include/llvm/CodeGenData/OutlinedHashTreeRecord.h
new file mode 100644
index 00000000000000..ccd2ad26dd0871
--- /dev/null
+++ b/llvm/include/llvm/CodeGenData/OutlinedHashTreeRecord.h
@@ -0,0 +1,67 @@
+//===- OutlinedHashTreeRecord.h --------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===---------------------------------------------------------------------===//
+//
+// This defines the OutlinedHashTreeRecord class. This class holds the outlined
+// hash tree for both serialization and deserialization processes. It utilizes
+// two data formats for serialization: raw binary data and YAML.
+// These two formats can be used interchangeably.
+//
+//===---------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGENDATA_OUTLINEDHASHTREERECORD_H
+#define LLVM_CODEGENDATA_OUTLINEDHASHTREERECORD_H
+
+#include "llvm/CodeGenData/OutlinedHashTree.h"
+
+namespace llvm {
+
+using IdHashNodeStableMapTy = std::map<unsigned, HashNodeStable>;
+using IdHashNodeMapTy = std::map<unsigned, HashNode *>;
+using HashNodeIdMapTy = std::unordered_map<const HashNode *, unsigned>;
+
+struct OutlinedHashTreeRecord {
+  std::unique_ptr<OutlinedHashTree> HashTree;
+
+  OutlinedHashTreeRecord() { HashTree = std::make_unique<OutlinedHashTree>(); }
+  OutlinedHashTreeRecord(std::unique_ptr<OutlinedHashTree> HashTree)
+      : HashTree(std::move(HashTree)){};
+
+  /// Serialize the outlined hash tree to a raw_ostream.
+  void serialize(raw_ostream &OS) const;
+  /// Deserialize the outlined hash tree from a raw_ostream.
+  void deserialize(const unsigned char *&Ptr);
+  /// Serialize the outlined hash tree to a YAML stream.
+  void serializeYAML(yaml::Output &YOS) const;
+  /// Deserialize the outlined hash tree from a YAML stream.
+  void deserializeYAML(yaml::Input &YIS);
+
+  /// Merge the other outlined hash tree into this one.
+  void merge(const OutlinedHashTreeRecord &Other) {
+    HashTree->merge(Other.HashTree.get());
+  }
+
+  /// \returns true if the outlined hash tree is empty.
+  bool empty() const { return HashTree->empty(); }
+
+  /// Print the outlined hash tree in a YAML format.
+  void print(raw_ostream &OS = llvm::errs()) const {
+    yaml::Output YOS(OS);
+    serializeYAML(YOS);
+  }
+
+private:
+  /// Convert the outlined hash tree to stable data.
+  void convertToStableData(IdHashNodeStableMapTy &IdNodeStableMap) const;
+
+  /// Convert the stable data back to the outlined hash tree.
+  void convertFromStableData(const IdHashNodeStableMapTy &IdNodeStableMap);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_CODEGENDATA_OUTLINEDHASHTREERECORD_H
diff --git a/llvm/lib/CMakeLists.txt b/llvm/lib/CMakeLists.txt
index 74e2d03c07953d..2ac0b0dc026e16 100644
--- a/llvm/lib/CMakeLists.txt
+++ b/llvm/lib/CMakeLists.txt
@@ -10,6 +10,7 @@ add_subdirectory(InterfaceStub)
 add_subdirectory(IRPrinter)
 add_subdirectory(IRReader)
 add_subdirectory(CodeGen)
+add_subdirectory(CodeGenData)
 add_subdirectory(CodeGenTypes)
 add_subdirectory(BinaryFormat)
 add_subdirectory(Bitcode)
diff --git a/llvm/lib/CodeGenData/CMakeLists.txt b/llvm/lib/CodeGenData/CMakeLists.txt
new file mode 100644
index 00000000000000..3ba90f96cc86d4
--- /dev/null
+++ b/llvm/lib/CodeGenData/CMakeLists.txt
@@ -0,0 +1,14 @@
+add_llvm_component_library(LLVMCodeGenData
+  OutlinedHashTree.cpp
+  OutlinedHashTreeRecord.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/CodeGenData
+
+  DEPENDS
+  intrinsics_gen
+
+  LINK_COMPONENTS
+  Core
+  Support
+  )
diff --git a/llvm/lib/CodeGenData/OutlinedHashTree.cpp b/llvm/lib/CodeGenData/OutlinedHashTree.cpp
new file mode 100644
index 00000000000000..945014550a3886
--- /dev/null
+++ b/llvm/lib/CodeGenData/OutlinedHashTree.cpp
@@ -0,0 +1,106 @@
+//===-- OutlinedHashTree.cpp ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// An OutlinedHashTree is a Trie that contains sequences of stable hash values
+// of instructions that have been outlined. This OutlinedHashTree can be used
+// to understand the outlined instruction sequences collected across modules.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGenData/OutlinedHashTree.h"
+
+#include <stack>
+#include <tuple>
+
+#define DEBUG_TYPE "outlined-hash-tree"
+
+using namespace llvm;
+
+void OutlinedHashTree::walkGraph(EdgeCallbackFn CallbackEdge,
+                                 NodeCallbackFn CallbackNode) const {
+  std::stack<const HashNode *> Stack;
+  Stack.push(getRoot());
+
+  while (!Stack.empty()) {
+    const auto *Current = Stack.top();
+    Stack.pop();
+    CallbackNode(Current);
+
+    // Sorted walk for the stable output.
+    std::map<stable_hash, const HashNode *> SortedSuccessors;
+    for (const auto &P : Current->Successors)
+      SortedSuccessors[P.first] = P.second.get();
+
+    for (const auto &P : SortedSuccessors) {
+      CallbackEdge(Current, P.second);
+      Stack.push(P.second);
+    }
+  }
+}
+
+void OutlinedHashTree::insert(const HashSequencePair &SequencePair) {
+  const auto &Sequence = SequencePair.first;
+  unsigned Count = SequencePair.second;
+
+  HashNode *Current = getRoot();
+  for (stable_hash StableHash : Sequence) {
+    auto I = Current->Successors.find(StableHash);
+    if (I == Current->Successors.end()) {
+      std::unique_ptr<HashNode> Next = std::make_unique<HashNode>();
+      HashNode *NextPtr = Next.get();
+      NextPtr->Hash = StableHash;
+      Current->Successors.emplace(StableHash, std::move(Next));
+      Current = NextPtr;
+      continue;
+    }
+    Current = I->second.get();
+  }
+  Current->Terminals += Count;
+}
+
+void OutlinedHashTree::merge(const OutlinedHashTree *Tree) {
+  HashNode *Dst = getRoot();
+  const HashNode *Src = Tree->getRoot();
+
+  std::stack<std::pair<HashNode *, const HashNode *>> Stack;
+  Stack.push({Dst, Src});
+
+  while (!Stack.empty()) {
+    auto [DstNode, SrcNode] = Stack.top();
+    Stack.pop();
+
+    if (!SrcNode)
+      continue;
+    DstNode->Terminals += SrcNode->Terminals;
+
+    for (auto &[Hash, NextSrcNode] : SrcNode->Successors) {
+      HashNode *NextDstNode;
+      auto I = DstNode->Successors.find(Hash);
+      if (I == DstNode->Successors.end()) {
+        auto NextDst = std::make_unique<HashNode>();
+        NextDstNode = NextDst.get();
+        NextDstNode->Hash = Hash;
+        DstNode->Successors.emplace(Hash, std::move(NextDst));
+      } else
+        NextDstNode = I->second.get();
+
+      Stack.push({NextDstNode, NextSrcNode.get()});
+    }
+  }
+}
+
+unsigned OutlinedHashTree::find(const HashSequence &Sequence) const {
+  const HashNode *Current = getRoot();
+  for (stable_hash StableHash : Sequence) {
+    const auto I = Current->Successors.find(StableHash);
+    if (I == Current->Successors.end())
+      return 0;
+    Current = I->second.get();
+  }
+  return Current->Terminals;
+}
diff --git a/llvm/lib/CodeGenData/OutlinedHashTreeRecord.cpp b/llvm/lib/CodeGenData/OutlinedHashTreeRecord.cpp
new file mode 100644
index 00000000000000..26dcd70cf50667
--- /dev/null
+++ b/llvm/lib/CodeGenData/OutlinedHashTreeRecord.cpp
@@ -0,0 +1,166 @@
+//===-- OutlinedHashTreeRecord.cpp ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This defines the OutlinedHashTreeRecord class. This class holds the outlined
+// hash tree for both serialization and deserialization processes. It utilizes
+// two data formats for serialization: raw binary data and YAML.
+// These two formats can be used interchangeably.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGenData/OutlinedHashTreeRecord.h"
+#include "llvm/CodeGenData/OutlinedHashTree.h"
+#include "llvm/ObjectYAML/YAML.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/EndianStream.h"
+
+#define DEBUG_TYPE "outlined-hash-tree"
+
+using namespace llvm;
+using namespace llvm::support;
+
+namespace llvm {
+namespace yaml {
+
+template <> struct MappingTraits<HashNodeStable> {
+  static void mapping(IO &io, HashNodeStable &res) {
+    io.mapRequired("Hash", res.Hash);
+    io.mapRequired("Terminals", res.Terminals);
+    io.mapRequired("SuccessorIds", res.SuccessorIds);
+  }
+};
+
+template <> struct CustomMappingTraits<IdHashNodeStableMapTy> {
+  static void inputOne(IO &io, StringRef Key, IdHashNodeStableMapTy &V) {
+    HashNodeStable NodeStable;
+    io.mapRequired(Key.str().c_str(), NodeStable);
+    unsigned Id;
+    if (Key.getAsInteger(0, Id)) {
+      io.setError("Id not an integer");
+      return;
+    }
+    V.insert({Id, NodeStable});
+  }
+
+  static void output(IO &io, IdHashNodeStableMapTy &V) {
+    for (auto Iter = V.begin(); Iter != V.end(); ++Iter)
+      io.mapRequired(utostr(Iter->first).c_str(), Iter->second);
+  }
+};
+
+} // namespace yaml
+} // namespace llvm
+
+void OutlinedHashTreeRecord::serialize(raw_ostream &OS) const {
+  IdHashNodeStableMapTy IdNodeStableMap;
+  convertToStableData(IdNodeStableMap);
+
+  support::endian::Writer Writer(OS, endianness::little);
+  Writer.write<uint32_t>(IdNodeStableMap.size());
+  for (const auto &[Id, NodeStable] : IdNodeStableMap) {
+    Writer.write<uint32_t>(Id);
+    Writer.write<uint64_t>(NodeStable.Hash);
+    Writer.write<uint32_t>(NodeStable.Terminals);
+    Writer.write<uint32_t>(NodeStable.SuccessorIds.size());
+    for (auto SuccessorId : NodeStable.SuccessorIds)
+      Writer.write<uint32_t>(SuccessorId);
+  }
+}
+
+void OutlinedHashTreeRecord::deserialize(const unsigned char *&Ptr) {
+  IdHashNodeStableMapTy IdNodeStableMap;
+
+  auto NumIdNodeStableMap =
+      endian::readNext<uint32_t, endianness::little, unaligned>(Ptr);
+  for (unsigned I = 0; I < NumIdNodeStableMap; ++I) {
+    auto Id = endian::readNext<uint32_t, endianness::little, unaligned>(Ptr);
+    HashNodeStable NodeStable;
+    NodeStable.Hash =
+        endian::readNext<uint64_t, endianness::little, unaligned>(Ptr);
+    NodeStable.Terminals =
+        endian::readNext<uint32_t, endianness::little, unaligned>(Ptr);
+    auto NumSuccessorIds =
+        endian::readNext<uint32_t, endianness::little, unaligned>(Ptr);
+    for (unsigned J = 0; J < NumSuccessorIds; ++J)
+      NodeStable.SuccessorIds.push_back(
+          endian::readNext<uint32_t, endianness::little, unaligned>(Ptr));
+
+    IdNodeStableMap[Id] = std::move(NodeStable);
+  }
+
+  convertFromStableData(IdNodeStableMap);
+}
+
+void OutlinedHashTreeRecord::serializeYAML(yaml::Output &YOS) const {
+  IdHashNodeStableMapTy IdNodeStableMap;
+  convertToStableData(IdNodeStableMap);
+
+  YOS << IdNodeStableMap;
+}
+
+void OutlinedHashTreeRecord::deserializeYAML(yaml::Input &YIS) {
+  IdHashNodeStableMapTy IdNodeStableMap;
+
+  YIS >> IdNodeStableMap;
+  YIS.nextDocument();
+
+  convertFromStableData(IdNodeStableMap);
+}
+
+void OutlinedHashTreeRecord::convertToStableData(
+    IdHashNodeStableMapTy &IdNodeStableMap) const {
+  // Build NodeIdMap
+  HashNodeIdMapTy NodeIdMap;
+  HashTree->walkVertices([&NodeIdMap](const HashNode *Current) {
+    size_t Index = NodeIdMap.size();
+    NodeIdMap[Current] = Index;
+    assert(Index = NodeIdMap.size() + 1 &&
+                   "Expected size of NodeMap to increment by 1");
+  });
+
+  // Convert NodeIdMap to NodeStableMap
+  for (auto &P : NodeIdMap) {
+    auto *Node = P.first;
+    auto Id = P.second;
+    HashNodeStable NodeStable;
+    NodeStable.Hash = Node->Hash;
+    NodeStable.Terminals = Node->Terminals;
+    for (auto &P : Node->Successors)
+      NodeStable.SuccessorIds.push_back(NodeIdMap[P.second.get()]);
+    IdNodeStableMap[Id] = NodeStable;
+  }
+
+  // Sort the Successors so that they come out in the same order as in the map.
+  for (auto &P : IdNodeStableMap)
+    std::sort(P.second.SuccessorIds.begin(), P.second.SuccessorIds.end());
+}
+
+void OutlinedHashTreeRecord::convertFromStableData(
+    const IdHashNodeStableMapTy &IdNodeStableMap) {
+  IdHashNodeMapTy IdNodeMap;
+  // Initialize the root node at 0.
+  IdNodeMap[0] = HashTree->getRoot();
+  assert(IdNodeMap[0]->Successors.empty());
+
+  for (auto &P : IdNodeStableMap) {
+    auto Id = P.first;
+    const HashNodeStable &NodeStable = P.second;
+    assert(IdNodeMap.count(Id));
+    HashNode *Curr = IdNodeMap[Id];
+    Curr->Hash = NodeStable.Hash;
+    Curr->Terminals = NodeStable.Terminals;
+    auto &Successors = Curr->Successors;
+    assert(Successors.empty());
+    for (auto SuccessorId : NodeStable.SuccessorIds) {
+      auto Sucessor = std::make_unique<HashNode>();
+      IdNodeMap[SuccessorId] = Sucessor.get();
+      auto Hash = IdNodeStableMap.at(SuccessorId).Hash;
+      Successors[Hash] = std::move(Sucessor);
+    }
+  }
+}
diff --git a/llvm/unittests/CMakeLists.txt b/llvm/unittests/CMakeLists.txt
index 46f30ff398e10d..cb4b8513e6d02e 100644
--- a/llvm/unittests/CMakeLists.txt
+++ b/llvm/unittests/CMakeLists.txt
@@ -21,6 +21,7 @@ add_subdirectory(BinaryFormat)
 add_subdirectory(Bitcode)
 add_subdirectory(Bitstream)
 add_subdirectory(CodeGen)
+add_subdirectory(CodeGenData)
 add_subdirectory(DebugInfo)
 add_subdirectory(Debuginfod)
 add_subdirectory(Demangle)
diff --git a/llvm/unittests/CodeGenData/CMakeLists.txt b/llvm/unittests/CodeGenData/CMakeLists.txt
new file mode 100644
index 00000000000000..3d821b87e29d8c
--- /dev/null
+++ b/llvm/unittests/CodeGenData/CMakeLists.txt
@@ -0,0 +1,14 @@
+set(LLVM_LINK_COMPONENTS
+  ${LLVM_TARGETS_TO_BUILD}
+  CodeGen
+  CodeGenData
+  Core
+  Support
+  )
+
+add_llvm_unittest(CodeGenDataTests
+  OutlinedHashTreeRecordTest.cpp
+  OutlinedHashTreeTest.cpp
+  )
+
+target_link_libraries(CodeGenDataTests PRIVATE LLVMTestingSupport)
diff --git a/llvm/unittests/CodeGenData/OutlinedHashTreeRecordTest.cpp b/llvm/unittests/CodeGenData/OutlinedHashTreeRecordTest.cpp
new file mode 100644
index 00000000000000..aa7ad4a33754ff
--- /dev/null
+++ b/llvm/unittests/CodeGenData/OutlinedHashTreeRecordTest.cpp
@@ -0,0 +1,118 @@
+//===- OutlinedHashTreeRecordTest.cpp -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGenData/OutlinedHashTreeRecord.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+namespace {
+
+TEST(OutlinedHashTreeRecordTest, Empty) {
+  OutlinedHashTreeRecord HashTreeRecord;
+  ASSERT_TRUE(HashTreeRecord.empty());
+}
+
+TEST(OutlinedHashTreeRecordTest, Print) {
+  OutlinedHashTreeRecord HashTreeRecord;
+  HashTreeRecord.HashTree->insert({{1, 2}, 3});
+
+  const char *ExpectedTreeStr = R"(---
+0:
+  Hash:            0x0
+  Terminals:       0
+  SuccessorIds:    [ 1 ]
+1:
+  Hash:            0x1
+  Terminals:       0
+  SuccessorIds:    [ 2 ]
+2:
+  Hash:            0x2
+  Terminals:       3
+  SuccessorIds:    [  ]
+...
+)";
+  std::string TreeDump;
+  raw_string_ostream OS(TreeDump);
+  HashTreeRecord.print(OS);
+  EXPECT_EQ(ExpectedTreeStr, TreeDump);
+}
+
+TEST(OutlinedHashTreeRecordTest, Stable) {
+  OutlinedHashTreeRecord HashTreeRecord1;
+  HashTreeRecord1.HashTree->insert({{1, 2}, 4});
+  HashTreeRecord1.HashTree->insert({{1, 3}, 5});
+
+  OutlinedHashTreeRecord HashTreeRecord2;
+  HashTreeRecord2.HashTree->insert({{1, 3}, 5});
+  HashTreeRecord2.HashTree->insert({{1, 2}, 4});
+
+  // Output is stable regardless of insertion order.
+  std::string TreeDump1;
+  raw_string_ostream OS1(TreeDump1);
+  HashTreeRecord1.print(OS1);
+  std::string TreeDump2;
+  raw_string_ostream OS2(TreeDump2);
+  HashTreeRecord2.print(OS2);
+
+  EXPECT_EQ(TreeDump1, TreeDump2);
+}
+
+TEST(OutlinedHashTreeRecordTest, Serialize) {
+  OutlinedHashTreeRecord HashTreeRecord1;
+  HashTreeRecord1.HashTree->insert({{1, 2}, 4});
+  HashTreeRecord1.HashTree->insert({{1, 3}, 5});
+
+  // Serialize and deserialize the tree.
+  SmallVector<char> Out;
+  raw_svector_ostream OS(Out);
+  HashTreeRecord1.serialize(OS);
+
+  OutlinedHashTreeRecord HashTreeRecord2;
+  const uint8_t *Data = reinterpret_cast<const uint8_t *>(Out.data());
+  HashTreeRecord2.deserialize(Data);
+
+  // Two trees should be identical.
+  std::string TreeDump1;
+  raw_string_ostream OS1(TreeDump1);
+  HashTreeRecord1.print(OS1);
+  std::string TreeDump2;
+  raw_string_ostream OS2(TreeDump2);
+  HashTreeRecord2.print(OS2);
+
+  EXPECT_EQ(TreeDump1, TreeDump2);
+}
+
+TEST(OutlinedHashTreeRecordTest, SerializeYAML) {
+  OutlinedHashTreeRecord HashTreeRecord1;
+  HashTreeRecord1.HashTree->insert({{1, 2}, 4});
+  HashTreeRecord1.HashTree->insert({{1, 3}, 5});
+
+  // Serialize and deserialize the tree in a YAML format.
+  std::string Out;
+  raw_string_ostream OS(Out);
+  yaml::Output YOS(OS);
+  HashTreeRecord1.serializeYAML(YOS);
+
+  OutlinedHashTreeRecord HashTreeRecord2;
+  yaml::Input YIS(StringRef(Out.data(), Out.size()));
+  HashTreeRecord2.deserializeYAML(YIS);
+
+  // Two trees should be identical.
+  std::string TreeDump1;
+  raw_string_ostream OS1(TreeDump1);
+  HashTreeRecord1.print(OS1);
+  std::string TreeDump2;
+  raw_string_ostream OS2(TreeDump2);
+  HashTreeRecord2.print(OS2);
+
+  EXPECT_EQ(TreeDump1, TreeDump2);
+}
+
+} // end namespace
diff --git a/llvm/unittests/CodeGenData/OutlinedHashTreeTest.cpp b/llvm/unittests/CodeGenData/OutlinedHashTreeTest.cpp
new file mode 100644
index 00000000000000..d11618cf8e4fae
--- /dev/null
+++ b/llvm/unittests/CodeGenData/OutlinedHashTreeTest.cpp
@@ -0,0 +1,81 @@
+//===- OutlinedHashTreeTest.cpp -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGenData/OutlinedHashTree.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+namespace {
+
+TEST(OutlinedHashTreeTest, Empty) {
+  OutlinedHashTree HashTree;
+  ASSERT_TRUE(HashTree.empty());
+  // The header node is always present.
+  ASSERT_TRUE(HashTree.size() == 1);
+  ASSERT_TRUE(HashTree.depth() == 0);
+}
+
+TEST(OutlinedHashTreeTest, Insert) {
+  OutlinedHashTree HashTree;
+  HashTree.insert({{1, 2, 3}, 1});
+  // The node count is 4 (including the root node).
+  ASSERT_TRUE(HashTree.size() == 4);
+  // The terminal count is 1.
+  ASSERT_TRUE(HashTree.size(/*GetTerminalCountOnly=*/true) == 1);
+  // The depth is 3.
+  ASSERT_TRUE(HashTree.depth() == 3);
+
+  HashTree.clear();
+  ASSERT_TRUE(HashTree.empty());
+
+  HashTree.insert({{1, 2, 3}, 1});
+  HashTree.insert({{1, 2, 4}, 2});
+  // The nodes of 1 and 2 are shared with the same prefix.
+  // The nodes are root, 1, 2, 3 and 4, whose counts are 5.
+  ASSERT_TRUE(HashTree.size() == 5);
+}
+
+TEST(OutlinedHashTreeTest, Find) {
+  OutlinedHashTree HashTree;
+  HashTree.insert({{1, 2, 3}, 1});
+  HashTree.insert({{1, 2, 3}, 2});
+
+  // The node count does not change as the same sequences are added.
+  ASSERT_TRUE(HashTree.size() == 4);
+  // The terminal counts are accumulated from two same sequences.
+  ASSERT_TRUE(HashTree.find({1, 2, 3}) == 3);
+  ASSERT_TRUE(HashTree.find({1, 2}) == 0);
+}
+
+TEST(OutlinedHashTreeTest, Merge) {
+  // Build HashTree1 inserting 2 sequences.
+  OutlinedHashTree HashTree1;
+
+  HashTree1.insert({{1, 2}, 20});
+  HashTree1.insert({{1, 4}, 30});
+
+  // Build HashTree2 and HashTree3 for each
+  OutlinedHashTree HashTree2;
+  HashTree2.insert({{1, 2}, 20});
+  OutlinedHashTree HashTree3;
+  HashTree3.insert({{1, 4}, 30});
+
+  // Merge HashTree3 into HashTree2.
+  HashTree2.merge(&HashTree3);
+
+  // Compare HashTree1 and HashTree2.
+  EXPECT_EQ(HashTree1.size(), HashTree2.size());
+  EXPECT_EQ(HashTree1.depth(), HashTree2.depth());
+  EXPECT_EQ(HashTree1.find({1, 2}), HashTree2.find({1, 2}));
+  EXPECT_EQ(HashTree1.find({1, 4}), HashTree2.find({1, 4}));
+  EXPECT_EQ(HashTree1.find({1, 3}), HashTree2.find({1, 3}));
+}
+
+} // end namespace

>From 11148d8e51cd0c3671c2ba982436d2df8f6747fd Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Tue, 23 Apr 2024 14:22:14 -0700
Subject: [PATCH 2/2] [CGData] llvm-cgdata

The llvm-cgdata tool has been introduced to handle reading and writing of codegen data. This data includes an optimistic codegen summary that can be utilized to enhance subsequent codegen. Currently, the tool supports saving and restoring the outlined hash tree, facilitating machine function outlining across modules. Additional codegen summaries can be incorporated into separate sections as required. This patch primarily establishes basic support for the reader and writer, similar to llvm-profdata.

The high-level operations of llvm-cgdata are as follows:
1. It reads local raw codegen data from a custom section (for example, __llvm_outline)  embedded in native binary files
2. It merges local raw codegen data into an indexed codegen data, complete with a suitable header.
3. It handles reading and writing of the indexed codegen data into a standalone file.
---
 llvm/include/llvm/CodeGenData/CodeGenData.h   | 202 +++++++++++++
 llvm/include/llvm/CodeGenData/CodeGenData.inc |  46 +++
 .../llvm/CodeGenData/CodeGenDataReader.h      | 154 ++++++++++
 .../llvm/CodeGenData/CodeGenDataWriter.h      |  68 +++++
 llvm/lib/CodeGenData/CMakeLists.txt           |   3 +
 llvm/lib/CodeGenData/CodeGenData.cpp          | 197 +++++++++++++
 llvm/lib/CodeGenData/CodeGenDataReader.cpp    | 174 +++++++++++
 llvm/lib/CodeGenData/CodeGenDataWriter.cpp    | 161 +++++++++++
 llvm/test/CMakeLists.txt                      |   1 +
 llvm/test/lit.cfg.py                          |   1 +
 llvm/test/tools/llvm-cgdata/dump.test         |  31 ++
 llvm/test/tools/llvm-cgdata/empty.test        |  32 +++
 llvm/test/tools/llvm-cgdata/error.test        |  38 +++
 .../test/tools/llvm-cgdata/merge-archive.test |  75 +++++
 llvm/test/tools/llvm-cgdata/merge-concat.test |  68 +++++
 llvm/test/tools/llvm-cgdata/merge-double.test |  74 +++++
 llvm/test/tools/llvm-cgdata/merge-single.test |  43 +++
 llvm/test/tools/llvm-cgdata/show.test         |  30 ++
 llvm/tools/llvm-cgdata/CMakeLists.txt         |  15 +
 llvm/tools/llvm-cgdata/llvm-cgdata.cpp        | 270 ++++++++++++++++++
 20 files changed, 1683 insertions(+)
 create mode 100644 llvm/include/llvm/CodeGenData/CodeGenData.h
 create mode 100644 llvm/include/llvm/CodeGenData/CodeGenData.inc
 create mode 100644 llvm/include/llvm/CodeGenData/CodeGenDataReader.h
 create mode 100644 llvm/include/llvm/CodeGenData/CodeGenDataWriter.h
 create mode 100644 llvm/lib/CodeGenData/CodeGenData.cpp
 create mode 100644 llvm/lib/CodeGenData/CodeGenDataReader.cpp
 create mode 100644 llvm/lib/CodeGenData/CodeGenDataWriter.cpp
 create mode 100644 llvm/test/tools/llvm-cgdata/dump.test
 create mode 100644 llvm/test/tools/llvm-cgdata/empty.test
 create mode 100644 llvm/test/tools/llvm-cgdata/error.test
 create mode 100644 llvm/test/tools/llvm-cgdata/merge-archive.test
 create mode 100644 llvm/test/tools/llvm-cgdata/merge-concat.test
 create mode 100644 llvm/test/tools/llvm-cgdata/merge-double.test
 create mode 100644 llvm/test/tools/llvm-cgdata/merge-single.test
 create mode 100644 llvm/test/tools/llvm-cgdata/show.test
 create mode 100644 llvm/tools/llvm-cgdata/CMakeLists.txt
 create mode 100644 llvm/tools/llvm-cgdata/llvm-cgdata.cpp

diff --git a/llvm/include/llvm/CodeGenData/CodeGenData.h b/llvm/include/llvm/CodeGenData/CodeGenData.h
new file mode 100644
index 00000000000000..118fb9841d27e8
--- /dev/null
+++ b/llvm/include/llvm/CodeGenData/CodeGenData.h
@@ -0,0 +1,202 @@
+//===- CodeGenData.h --------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for codegen data that has stable summary which
+// can be used to optimize the code in the subsequent codegen.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGENDATA_CODEGENDATA_H
+#define LLVM_CODEGENDATA_CODEGENDATA_H
+
+#include "llvm/ADT/BitmaskEnum.h"
+#include "llvm/Bitcode/BitcodeReader.h"
+#include "llvm/CodeGenData/OutlinedHashTree.h"
+#include "llvm/CodeGenData/OutlinedHashTreeRecord.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/TargetParser/Triple.h"
+#include <mutex>
+
+namespace llvm {
+
+enum CGDataSectKind {
+#define CG_DATA_SECT_ENTRY(Kind, SectNameCommon, SectNameCoff, Prefix) Kind,
+#include "llvm/CodeGenData/CodeGenData.inc"
+};
+
+std::string getCodeGenDataSectionName(CGDataSectKind CGSK,
+                                      Triple::ObjectFormatType OF,
+                                      bool AddSegmentInfo = true);
+
+enum class CGDataKind {
+  Unknown = 0x0,
+  // A function outlining info.
+  FunctionOutlinedHashTree = 0x1,
+  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/FunctionOutlinedHashTree)
+};
+
+const std::error_category &cgdata_category();
+
+enum class cgdata_error {
+  success = 0,
+  eof,
+  bad_magic,
+  bad_header,
+  empty_cgdata,
+  malformed,
+  unsupported_version,
+};
+
+inline std::error_code make_error_code(cgdata_error E) {
+  return std::error_code(static_cast<int>(E), cgdata_category());
+}
+
+class CGDataError : public ErrorInfo<CGDataError> {
+public:
+  CGDataError(cgdata_error Err, const Twine &ErrStr = Twine())
+      : Err(Err), Msg(ErrStr.str()) {
+    assert(Err != cgdata_error::success && "Not an error");
+  }
+
+  std::string message() const override;
+
+  void log(raw_ostream &OS) const override { OS << message(); }
+
+  std::error_code convertToErrorCode() const override {
+    return make_error_code(Err);
+  }
+
+  cgdata_error get() const { return Err; }
+  const std::string &getMessage() const { return Msg; }
+
+  /// Consume an Error and return the raw enum value contained within it, and
+  /// the optional error message. The Error must either be a success value, or
+  /// contain a single CGDataError.
+  static std::pair<cgdata_error, std::string> take(Error E) {
+    auto Err = cgdata_error::success;
+    std::string Msg = "";
+    handleAllErrors(std::move(E), [&Err, &Msg](const CGDataError &IPE) {
+      assert(Err == cgdata_error::success && "Multiple errors encountered");
+      Err = IPE.get();
+      Msg = IPE.getMessage();
+    });
+    return {Err, Msg};
+  }
+
+  static char ID;
+
+private:
+  cgdata_error Err;
+  std::string Msg;
+};
+
+enum CGDataMode {
+  None,
+  Read,
+  Write,
+};
+
+class CodeGenData {
+  /// Global outlined hash tree that has oulined hash sequences across modules.
+  std::unique_ptr<OutlinedHashTree> PublishedHashTree;
+
+  /// This flag is set when -fcgdata-generate is passed.
+  /// Or, it can be mutated with -ftwo-codegen-rounds during two codegen runs.
+  bool EmitCGData;
+
+  /// This is a singleton instance which is thread-safe. Unlike profile data
+  /// which is largely function-based, codegen data describes the whole module.
+  /// Therefore, this can be initialized once, and can be used across modules
+  /// instead of constructing the same one for each codegen backend.
+  static std::unique_ptr<CodeGenData> Instance;
+  static std::once_flag OnceFlag;
+
+  CodeGenData() = default;
+
+public:
+  ~CodeGenData() = default;
+
+  static CodeGenData &getInstance();
+
+  /// Returns true if we have a valid outlined hash tree.
+  bool hasOutlinedHashTree() {
+    return PublishedHashTree && !PublishedHashTree->empty();
+  }
+
+  /// Returns the outlined hash tree. This can be globally used in a read-only
+  /// manner.
+  const OutlinedHashTree *getOutlinedHashTree() {
+    return PublishedHashTree.get();
+  }
+
+  /// Returns true if we should write codegen data.
+  bool emitCGData() { return EmitCGData; }
+
+  /// Publish the (globally) merged or read outlined hash tree.
+  void publishOutlinedHashTree(std::unique_ptr<OutlinedHashTree> HashTree) {
+    PublishedHashTree = std::move(HashTree);
+    // Ensure we disable emitCGData as we do not want to read and write both.
+    EmitCGData = false;
+  }
+};
+
+namespace cgdata {
+
+inline bool hasOutlinedHashTree() {
+  return CodeGenData::getInstance().hasOutlinedHashTree();
+}
+
+inline const OutlinedHashTree *getOutlinedHashTree() {
+  return CodeGenData::getInstance().getOutlinedHashTree();
+}
+
+inline bool emitCGData() { return CodeGenData::getInstance().emitCGData(); }
+
+inline void
+publishOutlinedHashTree(std::unique_ptr<OutlinedHashTree> HashTree) {
+  CodeGenData::getInstance().publishOutlinedHashTree(std::move(HashTree));
+}
+
+void warn(Error E, StringRef Whence = "");
+void warn(Twine Message, std::string Whence = "", std::string Hint = "");
+
+} // end namespace cgdata
+
+namespace IndexedCGData {
+
+const uint64_t Magic = 0x81617461646763ff; // "\xffcgdata\x81"
+
+enum CGDataVersion {
+  // Version 1 is the first version. This version support the outlined
+  // hash tree.
+  Version1 = 1,
+  CurrentVersion = CG_DATA_INDEX_VERSION
+};
+const uint64_t Version = CGDataVersion::CurrentVersion;
+
+struct Header {
+  uint64_t Magic;
+  uint32_t Version;
+  uint32_t DataKind;
+  uint64_t OutlinedHashTreeOffset;
+
+  // New fields should only be added at the end to ensure that the size
+  // computation is correct. The methods below need to be updated to ensure that
+  // the new field is read correctly.
+
+  // Reads a header struct from the buffer.
+  static Expected<Header> readFromBuffer(const unsigned char *Curr);
+};
+
+} // end namespace IndexedCGData
+
+} // end namespace llvm
+
+#endif // LLVM_CODEGEN_PREPARE_H
diff --git a/llvm/include/llvm/CodeGenData/CodeGenData.inc b/llvm/include/llvm/CodeGenData/CodeGenData.inc
new file mode 100644
index 00000000000000..5f6df5c0bf1065
--- /dev/null
+++ b/llvm/include/llvm/CodeGenData/CodeGenData.inc
@@ -0,0 +1,46 @@
+/*===-- CodeGenData.inc ----------------------------------------*- C++ -*-=== *\
+|*
+|* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+|* See https://llvm.org/LICENSE.txt for license information.
+|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+|*
+\*===----------------------------------------------------------------------===*/
+/*
+ * This is the main file that defines all the data structure, signature,
+ * constant literals that are shared across compiler, host tools (reader/writer)
+ * to support codegen data.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifdef CG_DATA_SECT_ENTRY
+#define CG_DATA_DEFINED
+CG_DATA_SECT_ENTRY(CG_outline, CG_DATA_QUOTE(CG_DATA_OUTLINE_COMMON),
+                   CG_DATA_OUTLINE_COFF, "__DATA,")
+
+#undef CG_DATA_SECT_ENTRY
+#endif
+
+/* section name strings common to all targets other
+   than WIN32 */
+#define CG_DATA_OUTLINE_COMMON __llvm_outline
+/* Since cg data sections are not allocated, we don't need to
+ * access them at runtime.
+ */
+#define CG_DATA_OUTLINE_COFF ".loutline"
+
+#ifdef _WIN32
+/* Runtime section names and name strings.  */
+#define CG_DATA_SECT_NAME CG_DATA_OUTLINE_COFF
+
+#else
+/* Runtime section names and name strings.  */
+#define CG_DATA_SECT_NAME INSTR_PROF_QUOTE(CG_DATA_OUTLINE_COMMON)
+
+#endif
+
+/* Indexed codegen data format version (start from 1). */
+#define CG_DATA_INDEX_VERSION 1
+
+/* Helper macros.  */
+#define CG_DATA_SIMPLE_QUOTE(x) #x
+#define CG_DATA_QUOTE(x) CG_DATA_SIMPLE_QUOTE(x)
diff --git a/llvm/include/llvm/CodeGenData/CodeGenDataReader.h b/llvm/include/llvm/CodeGenData/CodeGenDataReader.h
new file mode 100644
index 00000000000000..df4ae3ed24e79a
--- /dev/null
+++ b/llvm/include/llvm/CodeGenData/CodeGenDataReader.h
@@ -0,0 +1,154 @@
+//===- CodeGenDataReader.h --------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for reading codegen data.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGENDATA_CODEGENDATAREADER_H
+#define LLVM_CODEGENDATA_CODEGENDATAREADER_H
+
+#include "llvm/CodeGenData/CodeGenData.h"
+#include "llvm/CodeGenData/OutlinedHashTreeRecord.h"
+#include "llvm/Support/LineIterator.h"
+#include "llvm/Support/VirtualFileSystem.h"
+
+namespace llvm {
+
+class CodeGenDataReader {
+  cgdata_error LastError = cgdata_error::success;
+  std::string LastErrorMsg;
+
+public:
+  CodeGenDataReader() = default;
+  virtual ~CodeGenDataReader() = default;
+
+  /// Read the header.  Required before reading first record.
+  virtual Error read() = 0;
+  /// Return the codegen data version.
+  virtual uint32_t getVersion() const = 0;
+  /// Return the codegen data kind.
+  virtual CGDataKind getDataKind() const = 0;
+  /// Return true if the data has an outlined hash tree.
+  virtual bool hasOutlinedHashTree() const = 0;
+  /// Return the outlined hash tree that is released from the reader.
+  std::unique_ptr<OutlinedHashTree> releaseOutlinedHashTree() {
+    return std::move(HashTreeRecord.HashTree);
+  }
+
+  /// Factory method to create an appropriately typed reader for the given
+  /// codegen data file path and file system.
+  static Expected<std::unique_ptr<CodeGenDataReader>>
+  create(const Twine &Path, vfs::FileSystem &FS);
+
+  /// Factory method to create an appropriately typed reader for the given
+  /// memory buffer.
+  static Expected<std::unique_ptr<CodeGenDataReader>>
+  create(std::unique_ptr<MemoryBuffer> Buffer);
+
+  /// Extract the cgdata embedded in sections from the given object file and
+  /// merge them into the GlobalOutlineRecord. This is a static helper that
+  /// is used by `llvm-cgdata merge` or ThinLTO's two-codegen rounds.
+  static Error mergeFromObjectFile(const object::ObjectFile *Obj,
+                                   OutlinedHashTreeRecord &GlobalOutlineRecord);
+
+protected:
+  /// The outlined hash tree that has been read. When it's released by
+  /// releaseOutlinedHashTree(), it's no longer valid.
+  OutlinedHashTreeRecord HashTreeRecord;
+
+  /// Set the current error and return same.
+  Error error(cgdata_error Err, const std::string &ErrMsg = "") {
+    LastError = Err;
+    LastErrorMsg = ErrMsg;
+    if (Err == cgdata_error::success)
+      return Error::success();
+    return make_error<CGDataError>(Err, ErrMsg);
+  }
+
+  Error error(Error &&E) {
+    handleAllErrors(std::move(E), [&](const CGDataError &IPE) {
+      LastError = IPE.get();
+      LastErrorMsg = IPE.getMessage();
+    });
+    return make_error<CGDataError>(LastError, LastErrorMsg);
+  }
+
+  /// Clear the current error and return a successful one.
+  Error success() { return error(cgdata_error::success); }
+};
+
+class IndexedCodeGenDataReader : public CodeGenDataReader {
+  /// The codegen data file contents.
+  std::unique_ptr<MemoryBuffer> DataBuffer;
+  /// The header
+  IndexedCGData::Header Header;
+
+public:
+  IndexedCodeGenDataReader(std::unique_ptr<MemoryBuffer> DataBuffer)
+      : DataBuffer(std::move(DataBuffer)) {}
+  IndexedCodeGenDataReader(const IndexedCodeGenDataReader &) = delete;
+  IndexedCodeGenDataReader &
+  operator=(const IndexedCodeGenDataReader &) = delete;
+
+  /// Return true if the given buffer is in binary codegen data format.
+  static bool hasFormat(const MemoryBuffer &Buffer);
+  /// Read the contents including the header.
+  Error read() override;
+  /// Return the codegen data version.
+  uint32_t getVersion() const override { return Header.Version; }
+  /// Return the codegen data kind.
+  CGDataKind getDataKind() const override {
+    return static_cast<CGDataKind>(Header.DataKind);
+  }
+  /// Return true if the header indicates the data has an outlined hash tree.
+  /// This does not mean that the data is still available.
+  bool hasOutlinedHashTree() const override {
+    return Header.DataKind &
+           static_cast<uint32_t>(CGDataKind::FunctionOutlinedHashTree);
+  }
+};
+
+/// This format is a simple text format that's suitable for test data.
+/// The header is a custom format starting with `:` per line to indicate which
+/// codegen data is recorded. `#` is used to indicate a comment.
+/// The subsequent data is a YAML format per each codegen data in order.
+/// Currently, it only has a function outlined hash tree.
+class TextCodeGenDataReader : public CodeGenDataReader {
+  /// The codegen data file contents.
+  std::unique_ptr<MemoryBuffer> DataBuffer;
+  /// Iterator over the profile data.
+  line_iterator Line;
+  /// Describe the kind of the codegen data.
+  CGDataKind DataKind = CGDataKind::Unknown;
+
+public:
+  TextCodeGenDataReader(std::unique_ptr<MemoryBuffer> DataBuffer_)
+      : DataBuffer(std::move(DataBuffer_)), Line(*DataBuffer, true, '#') {}
+  TextCodeGenDataReader(const TextCodeGenDataReader &) = delete;
+  TextCodeGenDataReader &operator=(const TextCodeGenDataReader &) = delete;
+
+  /// Return true if the given buffer is in text codegen data format.
+  static bool hasFormat(const MemoryBuffer &Buffer);
+  /// Read the contents including the header.
+  Error read() override;
+  /// Text format does not have version, so return 0.
+  uint32_t getVersion() const override { return 0; }
+  /// Return the codegen data kind.
+  CGDataKind getDataKind() const override { return DataKind; }
+  /// Return true if the header indicates the data has an outlined hash tree.
+  /// This does not mean that the data is still available.
+  bool hasOutlinedHashTree() const override {
+    return static_cast<uint32_t>(DataKind) &
+           static_cast<uint32_t>(CGDataKind::FunctionOutlinedHashTree);
+  }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_CODEGENDATA_CODEGENDATAREADER_H
diff --git a/llvm/include/llvm/CodeGenData/CodeGenDataWriter.h b/llvm/include/llvm/CodeGenData/CodeGenDataWriter.h
new file mode 100644
index 00000000000000..e17ffc3482ec91
--- /dev/null
+++ b/llvm/include/llvm/CodeGenData/CodeGenDataWriter.h
@@ -0,0 +1,68 @@
+//===- CodeGenDataWriter.h --------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing codegen data.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGENDATA_CODEGENDATAWRITER_H
+#define LLVM_CODEGENDATA_CODEGENDATAWRITER_H
+
+#include "llvm/CodeGenData/CodeGenData.h"
+#include "llvm/CodeGenData/OutlinedHashTreeRecord.h"
+#include "llvm/Support/Error.h"
+
+namespace llvm {
+
+class CGDataOStream;
+
+class CodeGenDataWriter {
+  /// The outlined hash tree to be written.
+  OutlinedHashTreeRecord HashTreeRecord;
+
+  /// A bit mask describing the kind of the codegen data.
+  CGDataKind DataKind = CGDataKind::Unknown;
+
+public:
+  CodeGenDataWriter() = default;
+  ~CodeGenDataWriter() = default;
+
+  /// Add the outlined hash tree record. The input Record is released.
+  void addRecord(OutlinedHashTreeRecord &Record);
+
+  /// Write the codegen data to \c OS
+  Error write(raw_fd_ostream &OS);
+
+  /// Write the codegen data in text format to \c OS
+  Error writeText(raw_fd_ostream &OS);
+
+  /// Return the attributes of the current CGData.
+  CGDataKind getCGDataKind() const { return DataKind; }
+
+  /// Return true if the header indicates the data has an outlined hash tree.
+  bool hasOutlinedHashTree() const {
+    return static_cast<uint32_t>(DataKind) &
+           static_cast<uint32_t>(CGDataKind::FunctionOutlinedHashTree);
+  }
+
+private:
+  /// The offset of the outlined hash tree in the file.
+  uint64_t OutlinedHashTreeOffset;
+
+  /// Write the codegen data header to \c COS
+  Error writeHeader(CGDataOStream &COS);
+
+  /// Write the codegen data header in text to \c OS
+  Error writeHeaderText(raw_fd_ostream &OS);
+
+  Error writeImpl(CGDataOStream &COS);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_CODEGENDATA_CODEGENDATAWRITER_H
diff --git a/llvm/lib/CodeGenData/CMakeLists.txt b/llvm/lib/CodeGenData/CMakeLists.txt
index 3ba90f96cc86d4..1156d53afb2e0f 100644
--- a/llvm/lib/CodeGenData/CMakeLists.txt
+++ b/llvm/lib/CodeGenData/CMakeLists.txt
@@ -1,4 +1,7 @@
 add_llvm_component_library(LLVMCodeGenData
+  CodeGenData.cpp
+  CodeGenDataReader.cpp
+  CodeGenDataWriter.cpp
   OutlinedHashTree.cpp
   OutlinedHashTreeRecord.cpp
 
diff --git a/llvm/lib/CodeGenData/CodeGenData.cpp b/llvm/lib/CodeGenData/CodeGenData.cpp
new file mode 100644
index 00000000000000..3bd21c97c7de7a
--- /dev/null
+++ b/llvm/lib/CodeGenData/CodeGenData.cpp
@@ -0,0 +1,197 @@
+//===-- CodeGenData.cpp ---------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for codegen data that has stable summary which
+// can be used to optimize the code in the subsequent codegen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Bitcode/BitcodeWriter.h"
+#include "llvm/CodeGenData/CodeGenDataReader.h"
+#include "llvm/CodeGenData/OutlinedHashTreeRecord.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/WithColor.h"
+
+#define DEBUG_TYPE "cg-data"
+
+using namespace llvm;
+using namespace cgdata;
+
+static std::string getCGDataErrString(cgdata_error Err,
+                                      const std::string &ErrMsg = "") {
+  std::string Msg;
+  raw_string_ostream OS(Msg);
+
+  switch (Err) {
+  case cgdata_error::success:
+    OS << "success";
+    break;
+  case cgdata_error::eof:
+    OS << "end of File";
+    break;
+  case cgdata_error::bad_magic:
+    OS << "invalid codegen data (bad magic)";
+    break;
+  case cgdata_error::bad_header:
+    OS << "invalid codegen data (file header is corrupt)";
+    break;
+  case cgdata_error::empty_cgdata:
+    OS << "empty codegen data";
+    break;
+  case cgdata_error::malformed:
+    OS << "malformed codegen data";
+    break;
+  case cgdata_error::unsupported_version:
+    OS << "unsupported codegen data version";
+    break;
+  }
+
+  // If optional error message is not empty, append it to the message.
+  if (!ErrMsg.empty())
+    OS << ": " << ErrMsg;
+
+  return OS.str();
+}
+
+namespace {
+
+// FIXME: This class is only here to support the transition to llvm::Error. It
+// will be removed once this transition is complete. Clients should prefer to
+// deal with the Error value directly, rather than converting to error_code.
+class CGDataErrorCategoryType : public std::error_category {
+  const char *name() const noexcept override { return "llvm.cgdata"; }
+
+  std::string message(int IE) const override {
+    return getCGDataErrString(static_cast<cgdata_error>(IE));
+  }
+};
+
+} // end anonymous namespace
+
+const std::error_category &llvm::cgdata_category() {
+  static CGDataErrorCategoryType ErrorCategory;
+  return ErrorCategory;
+}
+
+std::string CGDataError::message() const {
+  return getCGDataErrString(Err, Msg);
+}
+
+char CGDataError::ID = 0;
+
+namespace {
+
+const char *CodeGenDataSectNameCommon[] = {
+#define CG_DATA_SECT_ENTRY(Kind, SectNameCommon, SectNameCoff, Prefix)         \
+  SectNameCommon,
+#include "llvm/CodeGenData/CodeGenData.inc"
+};
+
+const char *CodeGenDataSectNameCoff[] = {
+#define CG_DATA_SECT_ENTRY(Kind, SectNameCommon, SectNameCoff, Prefix)         \
+  SectNameCoff,
+#include "llvm/CodeGenData/CodeGenData.inc"
+};
+
+const char *CodeGenDataSectNamePrefix[] = {
+#define CG_DATA_SECT_ENTRY(Kind, SectNameCommon, SectNameCoff, Prefix) Prefix,
+#include "llvm/CodeGenData/CodeGenData.inc"
+};
+
+} // namespace
+
+namespace llvm {
+
+std::string getCodeGenDataSectionName(CGDataSectKind CGSK,
+                                      Triple::ObjectFormatType OF,
+                                      bool AddSegmentInfo) {
+  std::string SectName;
+
+  if (OF == Triple::MachO && AddSegmentInfo)
+    SectName = CodeGenDataSectNamePrefix[CGSK];
+
+  if (OF == Triple::COFF)
+    SectName += CodeGenDataSectNameCoff[CGSK];
+  else
+    SectName += CodeGenDataSectNameCommon[CGSK];
+
+  return SectName;
+}
+
+std::unique_ptr<CodeGenData> CodeGenData::Instance = nullptr;
+std::once_flag CodeGenData::OnceFlag;
+
+CodeGenData &CodeGenData::getInstance() {
+  std::call_once(CodeGenData::OnceFlag, []() {
+    auto *CGD = new CodeGenData();
+    Instance.reset(CGD);
+
+    // TODO: Initialize writer or reader mode for the client optimization.
+  });
+  return *(Instance.get());
+}
+
+namespace IndexedCGData {
+
+Expected<Header> Header::readFromBuffer(const unsigned char *Curr) {
+  using namespace support;
+
+  static_assert(std::is_standard_layout_v<llvm::IndexedCGData::Header>,
+                "The header should be standard layout type since we use offset "
+                "of fields to read.");
+  Header H;
+  H.Magic = endian::readNext<uint64_t, endianness::little, unaligned>(Curr);
+  if (H.Magic != IndexedCGData::Magic)
+    return make_error<CGDataError>(cgdata_error::bad_magic);
+  H.Version = endian::readNext<uint32_t, endianness::little, unaligned>(Curr);
+  if (H.Version > IndexedCGData::CGDataVersion::CurrentVersion)
+    return make_error<CGDataError>(cgdata_error::unsupported_version);
+  H.DataKind = endian::readNext<uint32_t, endianness::little, unaligned>(Curr);
+
+  switch (H.Version) {
+    // When a new field is added to the header add a case statement here to
+    // compute the size as offset of the new field + size of the new field. This
+    // relies on the field being added to the end of the list.
+    static_assert(IndexedCGData::CGDataVersion::CurrentVersion == Version1,
+                  "Please update the size computation below if a new field has "
+                  "been added to the header, if not add a case statement to "
+                  "fall through to the latest version.");
+  case 1ull:
+    H.OutlinedHashTreeOffset =
+        endian::readNext<uint64_t, endianness::little, unaligned>(Curr);
+  }
+
+  return H;
+}
+
+} // end namespace IndexedCGData
+
+namespace cgdata {
+
+void warn(Twine Message, std::string Whence, std::string Hint) {
+  WithColor::warning();
+  if (!Whence.empty())
+    errs() << Whence << ": ";
+  errs() << Message << "\n";
+  if (!Hint.empty())
+    WithColor::note() << Hint << "\n";
+}
+
+void warn(Error E, StringRef Whence) {
+  if (E.isA<CGDataError>()) {
+    handleAllErrors(std::move(E), [&](const CGDataError &IPE) {
+      warn(IPE.message(), std::string(Whence), std::string(""));
+    });
+  }
+}
+
+} // end namespace cgdata
+
+} // end namespace llvm
diff --git a/llvm/lib/CodeGenData/CodeGenDataReader.cpp b/llvm/lib/CodeGenData/CodeGenDataReader.cpp
new file mode 100644
index 00000000000000..1b08085dec2f25
--- /dev/null
+++ b/llvm/lib/CodeGenData/CodeGenDataReader.cpp
@@ -0,0 +1,174 @@
+//===- CodeGenDataReader.cpp ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for reading codegen data.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGenData/CodeGenDataReader.h"
+#include "llvm/CodeGenData/OutlinedHashTreeRecord.h"
+#include "llvm/Support/MemoryBuffer.h"
+
+#define DEBUG_TYPE "cg-data-reader"
+
+using namespace llvm;
+
+namespace llvm {
+
+static Expected<std::unique_ptr<MemoryBuffer>>
+setupMemoryBuffer(const Twine &Filename, vfs::FileSystem &FS) {
+  auto BufferOrErr = Filename.str() == "-" ? MemoryBuffer::getSTDIN()
+                                           : FS.getBufferForFile(Filename);
+  if (std::error_code EC = BufferOrErr.getError())
+    return errorCodeToError(EC);
+  return std::move(BufferOrErr.get());
+}
+
+Error CodeGenDataReader::mergeFromObjectFile(
+    const object::ObjectFile *Obj,
+    OutlinedHashTreeRecord &GlobalOutlineRecord) {
+  Triple TT = Obj->makeTriple();
+  auto CGOutLineName =
+      getCodeGenDataSectionName(CG_outline, TT.getObjectFormat(), false);
+
+  for (auto &Section : Obj->sections()) {
+    Expected<StringRef> NameOrErr = Section.getName();
+    if (!NameOrErr)
+      return NameOrErr.takeError();
+    Expected<StringRef> ContentsOrErr = Section.getContents();
+    if (!ContentsOrErr)
+      return ContentsOrErr.takeError();
+    auto *Data = reinterpret_cast<const unsigned char *>(ContentsOrErr->data());
+    auto *EndData = Data + ContentsOrErr->size();
+
+    if (*NameOrErr == CGOutLineName) {
+      // In case dealing with an executable that has concatenaed cgdata,
+      // we want to merge them into a single cgdata.
+      // Although it's not a typical workflow, we support this scenario.
+      while (Data != EndData) {
+        OutlinedHashTreeRecord LocalOutlineRecord;
+        LocalOutlineRecord.deserialize(Data);
+        GlobalOutlineRecord.merge(LocalOutlineRecord);
+      }
+    }
+    // TODO: Add support for other cgdata sections.
+  }
+
+  return Error::success();
+}
+
+Error IndexedCodeGenDataReader::read() {
+  using namespace support;
+
+  // The smallest header with the version 1 is 24 bytes
+  const unsigned MinHeaderSize = 24;
+  if (DataBuffer->getBufferSize() < MinHeaderSize)
+    return error(cgdata_error::bad_header);
+
+  auto *Start =
+      reinterpret_cast<const unsigned char *>(DataBuffer->getBufferStart());
+  auto *End =
+      reinterpret_cast<const unsigned char *>(DataBuffer->getBufferEnd());
+  auto HeaderOr = IndexedCGData::Header::readFromBuffer(Start);
+  if (!HeaderOr)
+    return HeaderOr.takeError();
+  Header = HeaderOr.get();
+
+  if (hasOutlinedHashTree()) {
+    const unsigned char *Ptr = Start + Header.OutlinedHashTreeOffset;
+    if (Ptr >= End)
+      return error(cgdata_error::eof);
+    HashTreeRecord.deserialize(Ptr);
+  }
+
+  return success();
+}
+
+Expected<std::unique_ptr<CodeGenDataReader>>
+CodeGenDataReader::create(const Twine &Path, vfs::FileSystem &FS) {
+  // Set up the buffer to read.
+  auto BufferOrError = setupMemoryBuffer(Path, FS);
+  if (Error E = BufferOrError.takeError())
+    return std::move(E);
+  return CodeGenDataReader::create(std::move(BufferOrError.get()));
+}
+
+Expected<std::unique_ptr<CodeGenDataReader>>
+CodeGenDataReader::create(std::unique_ptr<MemoryBuffer> Buffer) {
+  if (Buffer->getBufferSize() == 0)
+    return make_error<CGDataError>(cgdata_error::empty_cgdata);
+
+  std::unique_ptr<CodeGenDataReader> Reader;
+  // Create the reader.
+  if (IndexedCodeGenDataReader::hasFormat(*Buffer))
+    Reader.reset(new IndexedCodeGenDataReader(std::move(Buffer)));
+  else if (TextCodeGenDataReader::hasFormat(*Buffer))
+    Reader.reset(new TextCodeGenDataReader(std::move(Buffer)));
+  else
+    return make_error<CGDataError>(cgdata_error::malformed);
+
+  // Initialize the reader and return the result.
+  if (Error E = Reader->read())
+    return std::move(E);
+
+  return std::move(Reader);
+}
+
+bool IndexedCodeGenDataReader::hasFormat(const MemoryBuffer &DataBuffer) {
+  using namespace support;
+  if (DataBuffer.getBufferSize() < 8)
+    return false;
+
+  uint64_t Magic = endian::read<uint64_t, llvm::endianness::little, aligned>(
+      DataBuffer.getBufferStart());
+  // Verify that it's magical.
+  return Magic == IndexedCGData::Magic;
+}
+
+bool TextCodeGenDataReader::hasFormat(const MemoryBuffer &Buffer) {
+  // Verify that this really looks like plain ASCII text by checking a
+  // 'reasonable' number of characters (up to profile magic size).
+  size_t count = std::min(Buffer.getBufferSize(), sizeof(uint64_t));
+  StringRef buffer = Buffer.getBufferStart();
+  return count == 0 ||
+         std::all_of(buffer.begin(), buffer.begin() + count,
+                     [](char c) { return isPrint(c) || isSpace(c); });
+}
+Error TextCodeGenDataReader::read() {
+  using namespace support;
+
+  // Parse the custom header line by line.
+  while (Line->starts_with(":")) {
+    StringRef Str = Line->substr(1);
+    if (Str.equals_insensitive("outlined_hash_tree"))
+      DataKind |= CGDataKind::FunctionOutlinedHashTree;
+    else
+      return error(cgdata_error::bad_header);
+    ++Line;
+  }
+
+  // We treat an empty header (that as a comment # only) as a valid header.
+  if (Line.is_at_eof()) {
+    if (DataKind != CGDataKind::Unknown)
+      return error(cgdata_error::bad_header);
+    return Error::success();
+  }
+
+  // The YAML docs follow after the header.
+  const char *Pos = (*Line).data();
+  size_t Size = reinterpret_cast<size_t>(DataBuffer->getBufferEnd()) -
+                reinterpret_cast<size_t>(Pos);
+  yaml::Input YOS(StringRef(Pos, Size));
+  if (hasOutlinedHashTree())
+    HashTreeRecord.deserializeYAML(YOS);
+
+  // TODO: Add more yaml cgdata in order
+
+  return Error::success();
+}
+} // end namespace llvm
diff --git a/llvm/lib/CodeGenData/CodeGenDataWriter.cpp b/llvm/lib/CodeGenData/CodeGenDataWriter.cpp
new file mode 100644
index 00000000000000..2b6b87d66f4563
--- /dev/null
+++ b/llvm/lib/CodeGenData/CodeGenDataWriter.cpp
@@ -0,0 +1,161 @@
+//===- CodeGenDataWriter.cpp ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing codegen data.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGenData/CodeGenDataWriter.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/EndianStream.h"
+
+#define DEBUG_TYPE "cg-data-writer"
+
+using namespace llvm;
+
+namespace llvm {
+
+/// A struct to define how the data stream should be patched.
+struct CGDataPatchItem {
+  uint64_t Pos; // Where to patch.
+  uint64_t *D;  // Pointer to an array of source data.
+  int N;        // Number of elements in \c D array.
+};
+
+// A wrapper class to abstract writer stream with support of bytes
+// back patching.
+class CGDataOStream {
+public:
+  CGDataOStream(raw_fd_ostream &FD)
+      : IsFDOStream(true), OS(FD), LE(FD, llvm::endianness::little) {}
+  CGDataOStream(raw_string_ostream &STR)
+      : IsFDOStream(false), OS(STR), LE(STR, llvm::endianness::little) {}
+
+  uint64_t tell() { return OS.tell(); }
+  void write(uint64_t V) { LE.write<uint64_t>(V); }
+  void write32(uint32_t V) { LE.write<uint32_t>(V); }
+  void write8(uint8_t V) { LE.write<uint8_t>(V); }
+
+  // \c patch can only be called when all data is written and flushed.
+  // For raw_string_ostream, the patch is done on the target string
+  // directly and it won't be reflected in the stream's internal buffer.
+  void patch(ArrayRef<CGDataPatchItem> P) {
+    using namespace support;
+
+    if (IsFDOStream) {
+      raw_fd_ostream &FDOStream = static_cast<raw_fd_ostream &>(OS);
+      const uint64_t LastPos = FDOStream.tell();
+      for (const auto &K : P) {
+        FDOStream.seek(K.Pos);
+        for (int I = 0; I < K.N; I++)
+          write(K.D[I]);
+      }
+      // Reset the stream to the last position after patching so that users
+      // don't accidentally overwrite data. This makes it consistent with
+      // the string stream below which replaces the data directly.
+      FDOStream.seek(LastPos);
+    } else {
+      raw_string_ostream &SOStream = static_cast<raw_string_ostream &>(OS);
+      std::string &Data = SOStream.str(); // with flush
+      for (const auto &K : P) {
+        for (int I = 0; I < K.N; I++) {
+          uint64_t Bytes =
+              endian::byte_swap<uint64_t, llvm::endianness::little>(K.D[I]);
+          Data.replace(K.Pos + I * sizeof(uint64_t), sizeof(uint64_t),
+                       (const char *)&Bytes, sizeof(uint64_t));
+        }
+      }
+    }
+  }
+
+  // If \c OS is an instance of \c raw_fd_ostream, this field will be
+  // true. Otherwise, \c OS will be an raw_string_ostream.
+  bool IsFDOStream;
+  raw_ostream &OS;
+  support::endian::Writer LE;
+};
+
+} // end namespace llvm
+
+void CodeGenDataWriter::addRecord(OutlinedHashTreeRecord &Record) {
+  assert(Record.HashTree && "empty hash tree in the record");
+  HashTreeRecord.HashTree = std::move(Record.HashTree);
+
+  DataKind |= CGDataKind::FunctionOutlinedHashTree;
+}
+
+Error CodeGenDataWriter::write(raw_fd_ostream &OS) {
+  CGDataOStream COS(OS);
+  return writeImpl(COS);
+}
+
+Error CodeGenDataWriter::writeHeader(CGDataOStream &COS) {
+  using namespace support;
+  IndexedCGData::Header Header;
+  Header.Magic = IndexedCGData::Magic;
+  Header.Version = IndexedCGData::Version;
+
+  // Set the CGDataKind depending on the kind.
+  if (static_cast<bool>(DataKind & CGDataKind::FunctionOutlinedHashTree))
+    Header.DataKind |=
+        static_cast<uint32_t>(CGDataKind::FunctionOutlinedHashTree);
+
+  Header.OutlinedHashTreeOffset = 0;
+
+  // Only write out up to the CGDataKind. We need to remember the offest of the
+  // remaing fields to allow back patching later.
+  COS.write(Header.Magic);
+  COS.write32(Header.Version);
+  COS.write32(Header.DataKind);
+
+  // Save the location of Header.OutlinedHashTreeOffset field in \c COS.
+  OutlinedHashTreeOffset = COS.tell();
+
+  // Reserve the space for OutlinedHashTreeOffset field.
+  COS.write(0);
+
+  return Error::success();
+}
+
+Error CodeGenDataWriter::writeImpl(CGDataOStream &COS) {
+  if (Error E = writeHeader(COS))
+    return E;
+
+  uint64_t OutlinedHashTreeFieldStart = COS.tell();
+  if (hasOutlinedHashTree())
+    HashTreeRecord.serialize(COS.OS);
+
+  // Back patch the offsets.
+  CGDataPatchItem PatchItems[] = {
+      {OutlinedHashTreeOffset, &OutlinedHashTreeFieldStart, 1}};
+  COS.patch(PatchItems);
+
+  return Error::success();
+}
+
+Error CodeGenDataWriter::writeHeaderText(raw_fd_ostream &OS) {
+  if (hasOutlinedHashTree())
+    OS << "# Outlined stable hash tree\n:outlined_hash_tree\n";
+
+  // TODO: Add more data types in this header
+
+  return Error::success();
+}
+
+Error CodeGenDataWriter::writeText(raw_fd_ostream &OS) {
+  if (Error E = writeHeaderText(OS))
+    return E;
+
+  yaml::Output YOS(OS);
+  if (hasOutlinedHashTree())
+    HashTreeRecord.serializeYAML(YOS);
+
+  // TODO: Write more yaml cgdata in order
+
+  return Error::success();
+}
diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt
index 6127b76db06b7f..be777ce650e874 100644
--- a/llvm/test/CMakeLists.txt
+++ b/llvm/test/CMakeLists.txt
@@ -73,6 +73,7 @@ set(LLVM_TEST_DEPENDS
           llvm-c-test
           llvm-cat
           llvm-cfi-verify
+          llvm-cgdata
           llvm-config
           llvm-cov
           llvm-cvtres
diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index 4c05317036d1a3..cc7e9d535a9c33 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -180,6 +180,7 @@ def get_asan_rtlib():
         "llvm-addr2line",
         "llvm-bcanalyzer",
         "llvm-bitcode-strip",
+        "llvm-cgdata",
         "llvm-config",
         "llvm-cov",
         "llvm-cxxdump",
diff --git a/llvm/test/tools/llvm-cgdata/dump.test b/llvm/test/tools/llvm-cgdata/dump.test
new file mode 100644
index 00000000000000..971cf0a795e211
--- /dev/null
+++ b/llvm/test/tools/llvm-cgdata/dump.test
@@ -0,0 +1,31 @@
+# Test dump between the binary and text formats.
+
+RUN: split-file %s %t
+
+RUN: llvm-cgdata dump -binary %t/dump.cgtext -o %t/dump.cgdata
+RUN: llvm-cgdata dump -text %t/dump.cgdata -o %t/dump-round.cgtext
+RUN: llvm-cgdata dump -binary %t/dump.cgdata -o %t/dump-round.cgdata
+RUN: diff %t/dump.cgtext %t/dump-round.cgtext
+RUN: diff %t/dump.cgdata %t/dump-round.cgdata
+
+;--- dump.cgtext
+# Outlined stable hash tree
+:outlined_hash_tree
+---
+0:
+  Hash:            0x0
+  Terminals:       0
+  SuccessorIds:    [ 1 ]
+1:
+  Hash:            0x1
+  Terminals:       0
+  SuccessorIds:    [ 2, 3 ]
+2:
+  Hash:            0x3
+  Terminals:       5
+  SuccessorIds:    [  ]
+3:
+  Hash:            0x2
+  Terminals:       4
+  SuccessorIds:    [  ]
+...
diff --git a/llvm/test/tools/llvm-cgdata/empty.test b/llvm/test/tools/llvm-cgdata/empty.test
new file mode 100644
index 00000000000000..d5e201b9eec17f
--- /dev/null
+++ b/llvm/test/tools/llvm-cgdata/empty.test
@@ -0,0 +1,32 @@
+# Test for empty cgdata file, which is invalid.
+RUN: touch %t_emptyfile.cgtext
+RUN: not llvm-cgdata dump %t_emptyfile.cgtext -text -o - 2>&1 | FileCheck %s --check-prefix ERROR
+ERROR: {{.}}emptyfile.cgtext: empty codegen data
+
+# Test for empty header in the text format. It can be converted to a valid binary file.
+RUN: printf '#' > %t_emptyheader.cgtext
+RUN: llvm-cgdata dump %t_emptyheader.cgtext -binary -o %t_emptyheader.cgdata
+
+# Without any cgdata other than the header, no data shows by default.
+RUN: llvm-cgdata show %t_emptyheader.cgdata | FileCheck %s --allow-empty --check-prefix EMPTY
+EMPTY-NOT: any
+
+# The version number appears when asked, as it's in the header
+RUN: llvm-cgdata show --cgdata-version %t_emptyheader.cgdata | FileCheck %s --check-prefix VERSION
+VERSION: Version: {{.}}
+
+# When converting a binary file (w/ the header only) to a text file, it's an empty file as the text format does not have an explicit header.
+RUN: llvm-cgdata dump %t_emptyheader.cgdata -text -o - | FileCheck %s --allow-empty --check-prefix EMPTY
+
+# Synthesize a header only cgdata.
+# struct Header {
+#   uint64_t Magic;
+#   uint32_t Version;
+#   uint32_t DataKind;
+#   uint64_t OutlinedHashTreeOffset;
+# }
+RUN: printf '\xffcgdata\x81' > %t_header.cgdata
+RUN: printf '\x01\x00\x00\x00' >> %t_header.cgdata
+RUN: printf '\x00\x00\x00\x00' >> %t_header.cgdata
+RUN: printf '\x18\x00\x00\x00\x00\x00\x00\x00' >> %t_header.cgdata
+RUN: diff %t_header.cgdata %t_emptyheader.cgdata
diff --git a/llvm/test/tools/llvm-cgdata/error.test b/llvm/test/tools/llvm-cgdata/error.test
new file mode 100644
index 00000000000000..5e1b14de5e509d
--- /dev/null
+++ b/llvm/test/tools/llvm-cgdata/error.test
@@ -0,0 +1,38 @@
+# Test various error cases
+
+# Synthesize a header only cgdata.
+# struct Header {
+#   uint64_t Magic;
+#   uint32_t Version;
+#   uint32_t DataKind;
+#   uint64_t OutlinedHashTreeOffset;
+# }
+RUN: touch %t_empty.cgdata
+RUN: not llvm-cgdata show %t_empty.cgdata 2>&1 | FileCheck %s --check-prefix EMPTY
+EMPTY: {{.}}cgdata: empty codegen data
+
+# Not a magic.
+RUN: printf '\xff' > %t_malformed.cgdata
+RUN: not llvm-cgdata show %t_malformed.cgdata 2>&1 | FileCheck %s --check-prefix MALFORMED
+MALFORMED: {{.}}cgdata: malformed codegen data
+
+# The minimum header size is 24.
+RUN: printf '\xffcgdata\x81' > %t_corrupt.cgdata
+RUN: not llvm-cgdata show %t_corrupt.cgdata 2>&1 | FileCheck %s  --check-prefix CORRUPT
+CORRUPT: {{.}}cgdata: invalid codegen data (file header is corrupt)
+
+# The current version 1 while the header says 2.
+RUN: printf '\xffcgdata\x81' > %t_version.cgdata
+RUN: printf '\x02\x00\x00\x00' >> %t_version.cgdata
+RUN: printf '\x00\x00\x00\x00' >> %t_version.cgdata
+RUN: printf '\x18\x00\x00\x00\x00\x00\x00\x00' >> %t_version.cgdata
+RUN: not llvm-cgdata show %t_version.cgdata 2>&1 | FileCheck %s  --check-prefix BAD_VERSION
+BAD_VERSION: {{.}}cgdata: unsupported codegen data version
+
+# Header says an outlined hash tree, but the file ends after the header.
+RUN: printf '\xffcgdata\x81' > %t_eof.cgdata
+RUN: printf '\x01\x00\x00\x00' >> %t_eof.cgdata
+RUN: printf '\x01\x00\x00\x00' >> %t_eof.cgdata
+RUN: printf '\x18\x00\x00\x00\x00\x00\x00\x00' >> %t_eof.cgdata
+RUN: not llvm-cgdata show %t_eof.cgdata 2>&1 | FileCheck %s  --check-prefix EOF
+EOF: {{.}}cgdata: end of File
diff --git a/llvm/test/tools/llvm-cgdata/merge-archive.test b/llvm/test/tools/llvm-cgdata/merge-archive.test
new file mode 100644
index 00000000000000..a27d6c2a16f4ab
--- /dev/null
+++ b/llvm/test/tools/llvm-cgdata/merge-archive.test
@@ -0,0 +1,75 @@
+# Merge an archive that has two object files having cgdata (__llvm_outline)
+
+RUN: split-file %s %t
+
+RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-1.ll -o %t/merge-1.o
+RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-2.ll -o %t/merge-2.o
+RUN: llvm-ar rcs %t/merge-archive.a %t/merge-1.o %t/merge-2.o
+RUN: llvm-cgdata merge %t/merge-archive.a -o %t/merge-archive.cgdata
+RUN: llvm-cgdata show %t/merge-archive.cgdata | FileCheck %s
+CHECK: Outlined hash tree:
+CHECK-NEXT:  Total Node Count: 4
+CHECK-NEXT:  Terminal Node Count: 2
+CHECK-NEXT:  Depth: 2
+
+RUN: llvm-cgdata dump %t/merge-archive.cgdata | FileCheck %s --check-prefix TREE
+TREE: # Outlined stable hash tree
+TREE-NEXT: :outlined_hash_tree
+TREE-NEXT: ---
+TREE-NEXT: 0:
+TREE-NEXT:   Hash:            0x0
+TREE-NEXT:   Terminals:       0
+TREE-NEXT:   SuccessorIds:    [ 1 ]
+TREE-NEXT: 1:
+TREE-NEXT:   Hash:            0x1
+TREE-NEXT:   Terminals:       0
+TREE-NEXT:   SuccessorIds:    [ 2, 3 ]
+TREE-NEXT: 2:
+TREE-NEXT:   Hash:            0x3
+TREE-NEXT:   Terminals:       5
+TREE-NEXT:   SuccessorIds:    [  ]
+TREE-NEXT: 3:
+TREE-NEXT:   Hash:            0x2
+TREE-NEXT:   Terminals:       4
+TREE-NEXT:   SuccessorIds:    [  ]
+TREE-NEXT: ...
+
+;--- merge-1.ll
+
+; The .data is encoded in a binary form based on the following yaml form. See serialize() in OutlinedHashTreeRecord.cpp
+;---
+;0:
+;  Hash:            0x0
+;  Terminals:       0
+;  SuccessorIds:    [ 1 ]
+;1:
+;  Hash:            0x1
+;  Terminals:       0
+;  SuccessorIds:    [ 2 ]
+;2:
+;  Hash:            0x2
+;  Terminals:       4
+;  SuccessorIds:    [  ]
+;...
+
+ at .data = private unnamed_addr constant [72 x i8] c"\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\02\00\00\00\02\00\00\00\02\00\00\00\00\00\00\00\04\00\00\00\00\00\00\00", section "__DATA,__llvm_outline"
+
+;--- merge-2.ll
+
+; The .data is encoded in a binary form based on the following yaml form. See serialize() in OutlinedHashTreeRecord.cpp
+;---
+;0:
+;  Hash:            0x0
+;  Terminals:       0
+;  SuccessorIds:    [ 1 ]
+;1:
+;  Hash:            0x1
+;  Terminals:       0
+;  SuccessorIds:    [ 2 ]
+;2:
+;  Hash:            0x3
+;  Terminals:       5
+;  SuccessorIds:    [  ]
+;...
+
+ at .data = private unnamed_addr constant [72 x i8] c"\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\02\00\00\00\02\00\00\00\03\00\00\00\00\00\00\00\05\00\00\00\00\00\00\00", section "__DATA,__llvm_outline"
diff --git a/llvm/test/tools/llvm-cgdata/merge-concat.test b/llvm/test/tools/llvm-cgdata/merge-concat.test
new file mode 100644
index 00000000000000..3411133cb7aacb
--- /dev/null
+++ b/llvm/test/tools/llvm-cgdata/merge-concat.test
@@ -0,0 +1,68 @@
+# Merge a binary file (e.g., a linked executable) having concatnated cgdata (__llvm_outline)
+
+RUN: split-file %s %t
+
+RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-concat.ll -o %t/merge-concat.o
+RUN: llvm-cgdata merge %t/merge-concat.o -o %t/merge-concat.cgdata
+RUN: llvm-cgdata show %t/merge-concat.cgdata | FileCheck %s
+CHECK: Outlined hash tree:
+CHECK-NEXT:  Total Node Count: 4
+CHECK-NEXT:  Terminal Node Count: 2
+CHECK-NEXT:  Depth: 2
+
+RUN: llvm-cgdata dump %t/merge-concat.cgdata | FileCheck %s --check-prefix TREE
+TREE: # Outlined stable hash tree
+TREE-NEXT: :outlined_hash_tree
+TREE-NEXT: ---
+TREE-NEXT: 0:
+TREE-NEXT:   Hash:            0x0
+TREE-NEXT:   Terminals:       0
+TREE-NEXT:   SuccessorIds:    [ 1 ]
+TREE-NEXT: 1:
+TREE-NEXT:   Hash:            0x1
+TREE-NEXT:   Terminals:       0
+TREE-NEXT:   SuccessorIds:    [ 2, 3 ]
+TREE-NEXT: 2:
+TREE-NEXT:   Hash:            0x3
+TREE-NEXT:   Terminals:       5
+TREE-NEXT:   SuccessorIds:    [  ]
+TREE-NEXT: 3:
+TREE-NEXT:   Hash:            0x2
+TREE-NEXT:   Terminals:       4
+TREE-NEXT:   SuccessorIds:    [  ]
+TREE-NEXT: ...
+
+;--- merge-concat.ll
+
+; In an linked executable (as opposed to an object file), cgdata in __llvm_outline might be concatenated. Although this is not a typical workflow, we simply support this case to parse cgdata that is concatenated. In other word, the following two trees are encoded back-to-back in a binary format.
+;---
+;0:
+;  Hash:            0x0
+;  Terminals:       0
+;  SuccessorIds:    [ 1 ]
+;1:
+;  Hash:            0x1
+;  Terminals:       0
+;  SuccessorIds:    [ 2 ]
+;2:
+;  Hash:            0x2
+;  Terminals:       4
+;  SuccessorIds:    [  ]
+;...
+;---
+;0:
+;  Hash:            0x0
+;  Terminals:       0
+;  SuccessorIds:    [ 1 ]
+;1:
+;  Hash:            0x1
+;  Terminals:       0
+;  SuccessorIds:    [ 2 ]
+;2:
+;  Hash:            0x3
+;  Terminals:       5
+;  SuccessorIds:    [  ]
+;...
+
+ at .data1 = private unnamed_addr constant [72 x i8] c"\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\02\00\00\00\02\00\00\00\02\00\00\00\00\00\00\00\04\00\00\00\00\00\00\00", section "__DATA,__llvm_outline"
+ at .data2 = private unnamed_addr constant [72 x i8] c"\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\02\00\00\00\02\00\00\00\03\00\00\00\00\00\00\00\05\00\00\00\00\00\00\00", section "__DATA,__llvm_outline"
diff --git a/llvm/test/tools/llvm-cgdata/merge-double.test b/llvm/test/tools/llvm-cgdata/merge-double.test
new file mode 100644
index 00000000000000..6ce358cd72325b
--- /dev/null
+++ b/llvm/test/tools/llvm-cgdata/merge-double.test
@@ -0,0 +1,74 @@
+# Merge two object files having cgdata (__llvm_outline)
+
+RUN: split-file %s %t
+
+RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-1.ll -o %t/merge-1.o
+RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-2.ll -o %t/merge-2.o
+RUN: llvm-cgdata merge %t/merge-1.o %t/merge-2.o -o %t/merge.cgdata
+RUN: llvm-cgdata show %t/merge.cgdata | FileCheck %s
+CHECK: Outlined hash tree:
+CHECK-NEXT:  Total Node Count: 4
+CHECK-NEXT:  Terminal Node Count: 2
+CHECK-NEXT:  Depth: 2
+
+RUN: llvm-cgdata dump %t/merge.cgdata | FileCheck %s --check-prefix TREE
+TREE: # Outlined stable hash tree
+TREE-NEXT: :outlined_hash_tree
+TREE-NEXT: ---
+TREE-NEXT: 0:
+TREE-NEXT:   Hash:            0x0
+TREE-NEXT:   Terminals:       0
+TREE-NEXT:   SuccessorIds:    [ 1 ]
+TREE-NEXT: 1:
+TREE-NEXT:   Hash:            0x1
+TREE-NEXT:   Terminals:       0
+TREE-NEXT:   SuccessorIds:    [ 2, 3 ]
+TREE-NEXT: 2:
+TREE-NEXT:   Hash:            0x3
+TREE-NEXT:   Terminals:       5
+TREE-NEXT:   SuccessorIds:    [  ]
+TREE-NEXT: 3:
+TREE-NEXT:   Hash:            0x2
+TREE-NEXT:   Terminals:       4
+TREE-NEXT:   SuccessorIds:    [  ]
+TREE-NEXT: ...
+
+;--- merge-1.ll
+
+; The .data is encoded in a binary form based on the following yaml form. See serialize() in OutlinedHashTreeRecord.cpp
+;---
+;0:
+;  Hash:            0x0
+;  Terminals:       0
+;  SuccessorIds:    [ 1 ]
+;1:
+;  Hash:            0x1
+;  Terminals:       0
+;  SuccessorIds:    [ 2 ]
+;2:
+;  Hash:            0x2
+;  Terminals:       4
+;  SuccessorIds:    [  ]
+;...
+
+ at .data = private unnamed_addr constant [72 x i8] c"\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\02\00\00\00\02\00\00\00\02\00\00\00\00\00\00\00\04\00\00\00\00\00\00\00", section "__DATA,__llvm_outline"
+
+;--- merge-2.ll
+
+; The .data is encoded in a binary form based on the following yaml form. See serialize() in OutlinedHashTreeRecord.cpp
+;---
+;0:
+;  Hash:            0x0
+;  Terminals:       0
+;  SuccessorIds:    [ 1 ]
+;1:
+;  Hash:            0x1
+;  Terminals:       0
+;  SuccessorIds:    [ 2 ]
+;2:
+;  Hash:            0x3
+;  Terminals:       5
+;  SuccessorIds:    [  ]
+;...
+
+ at .data = private unnamed_addr constant [72 x i8] c"\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\02\00\00\00\02\00\00\00\03\00\00\00\00\00\00\00\05\00\00\00\00\00\00\00", section "__DATA,__llvm_outline"
diff --git a/llvm/test/tools/llvm-cgdata/merge-single.test b/llvm/test/tools/llvm-cgdata/merge-single.test
new file mode 100644
index 00000000000000..73bdd9800dbe1d
--- /dev/null
+++ b/llvm/test/tools/llvm-cgdata/merge-single.test
@@ -0,0 +1,43 @@
+# Test merge a single object file into a cgdata
+
+RUN: split-file %s %t
+
+# Merge an object file that has no cgdata (__llvm_outline). It still produces a header only cgdata.
+RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-empty.ll -o %t/merge-empty.o
+RUN: llvm-cgdata merge %t/merge-empty.o -o %t/merge-empty.cgdata
+RUN: llvm-cgdata show %t/merge-empty.cgdata | FileCheck %s --allow-empty --check-prefix EMPTY
+EMPTY-NOT: any
+
+
+# Merge an object file having cgdata (__llvm_outline)
+RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-single.ll -o %t/merge-single.o
+RUN: llvm-cgdata merge %t/merge-single.o -o %t/merge-single.cgdata
+RUN: llvm-cgdata show %t/merge-single.cgdata | FileCheck %s
+CHECK: Outlined hash tree:
+CHECK-NEXT:  Total Node Count: 3
+CHECK-NEXT:  Terminal Node Count: 1
+CHECK-NEXT:  Depth: 2
+
+;--- merge-empty.ll
+ at .data = private unnamed_addr constant [1 x i8] c"\01"
+
+;--- merge-single.ll
+
+; The .data is encoded in a binary form based on the following yaml form. See serialize() in OutlinedHashTreeRecord.cpp
+;---
+;0:
+;  Hash:            0x0
+;  Terminals:       0
+;  SuccessorIds:    [ 1 ]
+;1:
+;  Hash:            0x1
+;  Terminals:       0
+;  SuccessorIds:    [ 2 ]
+;2:
+;  Hash:            0x2
+;  Terminals:       4
+;  SuccessorIds:    [  ]
+;...
+
+ at .data = private unnamed_addr constant [72 x i8] c"\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\02\00\00\00\02\00\00\00\02\00\00\00\00\00\00\00\04\00\00\00\00\00\00\00", section "__DATA,__llvm_outline"
+
diff --git a/llvm/test/tools/llvm-cgdata/show.test b/llvm/test/tools/llvm-cgdata/show.test
new file mode 100644
index 00000000000000..accb4b77ede246
--- /dev/null
+++ b/llvm/test/tools/llvm-cgdata/show.test
@@ -0,0 +1,30 @@
+# Test show
+
+RUN: split-file %s %t
+RUN: llvm-cgdata show %t/show.cgtext | FileCheck %s
+
+CHECK: Outlined hash tree:
+CHECK-NEXT:   Total Node Count: 3
+CHECK-NEXT:   Terminal Node Count: 1
+CHECK-NEXT:   Depth: 2
+
+# Convert the text file to the binary file
+RUN: llvm-cgdata dump -binary %t/show.cgtext -o %t/show.cgdata
+RUN: llvm-cgdata show %t/show.cgdata | FileCheck %s
+
+;--- show.cgtext
+:outlined_hash_tree
+---
+0:
+  Hash:            0x0
+  Terminals:       0
+  SuccessorIds:    [ 1 ]
+1:
+  Hash:            0x1
+  Terminals:       0
+  SuccessorIds:    [ 2 ]
+2:
+  Hash:            0x2
+  Terminals:       3
+  SuccessorIds:    [  ]
+...
diff --git a/llvm/tools/llvm-cgdata/CMakeLists.txt b/llvm/tools/llvm-cgdata/CMakeLists.txt
new file mode 100644
index 00000000000000..4f1f7ff635bc3c
--- /dev/null
+++ b/llvm/tools/llvm-cgdata/CMakeLists.txt
@@ -0,0 +1,15 @@
+set(LLVM_LINK_COMPONENTS
+  CodeGen
+  CodeGenData
+  Core
+  Object
+  Support
+  )
+
+add_llvm_tool(llvm-cgdata
+  llvm-cgdata.cpp
+
+  DEPENDS
+  intrinsics_gen
+  GENERATE_DRIVER
+  )
diff --git a/llvm/tools/llvm-cgdata/llvm-cgdata.cpp b/llvm/tools/llvm-cgdata/llvm-cgdata.cpp
new file mode 100644
index 00000000000000..f6189d4be76f5b
--- /dev/null
+++ b/llvm/tools/llvm-cgdata/llvm-cgdata.cpp
@@ -0,0 +1,270 @@
+//===-- llvm-cgdata.cpp - LLVM CodeGen Data Tool --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// llvm-cgdata parses raw codegen data embedded in compiled binary files, and
+// merges them into a single .cgdata file. It can also inspect and maninuplate
+// a .cgdata file. This .cgdata can contain various codegen data like outlining
+// information, and it can be used to optimize the code in the subsequent build.
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGenData/CodeGenDataReader.h"
+#include "llvm/CodeGenData/CodeGenDataWriter.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Object/Archive.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/LLVMDriver.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/VirtualFileSystem.h"
+#include "llvm/Support/WithColor.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace llvm::object;
+
+// TODO: https://llvm.org/docs/CommandGuide/llvm-cgdata.html has documentations
+// on each subcommand.
+cl::SubCommand DumpSubcommand(
+    "dump",
+    "Dump the (indexed) codegen data file in either text or binary format.");
+cl::SubCommand MergeSubcommand(
+    "merge", "Takes binary files having raw codegen data in custom sections, "
+             "and merge them into an index codegen data file.");
+cl::SubCommand
+    ShowSubcommand("show", "Show summary of the (indexed) codegen data file.");
+
+enum CGDataFormat {
+  CD_None = 0,
+  CD_Text,
+  CD_Binary,
+};
+
+cl::opt<std::string> OutputFilename("output", cl::value_desc("output"),
+                                    cl::init("-"), cl::desc("Output file"),
+                                    cl::sub(DumpSubcommand),
+                                    cl::sub(MergeSubcommand));
+cl::alias OutputFilenameA("o", cl::desc("Alias for --output"),
+                          cl::aliasopt(OutputFilename));
+
+cl::opt<std::string> Filename(cl::Positional, cl::desc("<cgdata-file>"),
+                              cl::sub(DumpSubcommand), cl::sub(ShowSubcommand));
+cl::list<std::string> InputFilenames(cl::Positional, cl::sub(MergeSubcommand),
+                                     cl::desc("<binary-files...>"));
+cl::opt<CGDataFormat> OutputFormat(
+    cl::desc("Format of output data"), cl::sub(DumpSubcommand),
+    cl::init(CD_Text),
+    cl::values(clEnumValN(CD_Text, "text", "Text encoding"),
+               clEnumValN(CD_Binary, "binary", "Binary encoding")));
+
+cl::opt<bool> ShowCGDataVersion("cgdata-version", cl::init(false),
+                                cl::desc("Show cgdata version. "),
+                                cl::sub(ShowSubcommand));
+
+static void exitWithError(Twine Message, std::string Whence = "",
+                          std::string Hint = "") {
+  WithColor::error();
+  if (!Whence.empty())
+    errs() << Whence << ": ";
+  errs() << Message << "\n";
+  if (!Hint.empty())
+    WithColor::note() << Hint << "\n";
+  ::exit(1);
+}
+
+static void exitWithError(Error E, StringRef Whence = "") {
+  if (E.isA<CGDataError>()) {
+    handleAllErrors(std::move(E), [&](const CGDataError &IPE) {
+      exitWithError(IPE.message(), std::string(Whence));
+    });
+    return;
+  }
+
+  exitWithError(toString(std::move(E)), std::string(Whence));
+}
+
+static void exitWithErrorCode(std::error_code EC, StringRef Whence = "") {
+  exitWithError(EC.message(), std::string(Whence));
+}
+
+static int dump_main(int argc, const char *argv[]) {
+  if (Filename == OutputFilename) {
+    errs() << sys::path::filename(argv[0]) << " " << argv[1]
+           << ": Input file name cannot be the same as the output file name!\n";
+    return 1;
+  }
+
+  std::error_code EC;
+  raw_fd_ostream OS(OutputFilename.data(), EC,
+                    OutputFormat == CD_Text ? sys::fs::OF_TextWithCRLF
+                                            : sys::fs::OF_None);
+  if (EC)
+    exitWithErrorCode(EC, OutputFilename);
+
+  auto FS = vfs::getRealFileSystem();
+  auto ReaderOrErr = CodeGenDataReader::create(Filename, *FS);
+  if (Error E = ReaderOrErr.takeError())
+    exitWithError(std::move(E), Filename);
+
+  CodeGenDataWriter Writer;
+  auto Reader = ReaderOrErr->get();
+  if (Reader->hasOutlinedHashTree()) {
+    OutlinedHashTreeRecord Record(Reader->releaseOutlinedHashTree());
+    Writer.addRecord(Record);
+  }
+
+  if (OutputFormat == CD_Text) {
+    if (Error E = Writer.writeText(OS))
+      exitWithError(std::move(E));
+  } else {
+    if (Error E = Writer.write(OS))
+      exitWithError(std::move(E));
+  }
+  OS.flush();
+
+  return 0;
+}
+
+static bool handleBuffer(StringRef Filename, MemoryBufferRef Buffer,
+                         OutlinedHashTreeRecord &GlobalOutlineRecord);
+
+static bool handleArchive(StringRef Filename, Archive &Arch,
+                          OutlinedHashTreeRecord &GlobalOutlineRecord) {
+  bool Result = true;
+  Error Err = Error::success();
+  for (const auto &Child : Arch.children(Err)) {
+    auto BuffOrErr = Child.getMemoryBufferRef();
+    if (Error E = BuffOrErr.takeError())
+      exitWithError(std::move(E), Filename);
+    auto NameOrErr = Child.getName();
+    if (Error E = NameOrErr.takeError())
+      exitWithError(std::move(E), Filename);
+    std::string Name = (Filename + "(" + NameOrErr.get() + ")").str();
+    Result &= handleBuffer(Name, BuffOrErr.get(), GlobalOutlineRecord);
+  }
+  if (Err)
+    exitWithError(std::move(Err), Filename);
+  return Result;
+}
+
+static bool handleBuffer(StringRef Filename, MemoryBufferRef Buffer,
+                         OutlinedHashTreeRecord &GlobalOutlineRecord) {
+  Expected<std::unique_ptr<Binary>> BinOrErr = object::createBinary(Buffer);
+  if (Error E = BinOrErr.takeError())
+    exitWithError(std::move(E), Filename);
+
+  bool Result = true;
+  if (auto *Obj = dyn_cast<ObjectFile>(BinOrErr->get())) {
+    if (Error E =
+            CodeGenDataReader::mergeFromObjectFile(Obj, GlobalOutlineRecord))
+      exitWithError(std::move(E), Filename);
+  } else if (auto *Arch = dyn_cast<Archive>(BinOrErr->get())) {
+    Result &= handleArchive(Filename, *Arch, GlobalOutlineRecord);
+  } else {
+    // TODO: Support for the MachO universal binary format.
+    errs() << "Error: unsupported binary file: " << Filename << "\n";
+    Result = false;
+  }
+
+  return Result;
+}
+
+static bool handleFile(StringRef Filename,
+                       OutlinedHashTreeRecord &GlobalOutlineRecord) {
+  ErrorOr<std::unique_ptr<MemoryBuffer>> BuffOrErr =
+      MemoryBuffer::getFileOrSTDIN(Filename);
+  if (std::error_code EC = BuffOrErr.getError())
+    exitWithErrorCode(EC, Filename);
+  return handleBuffer(Filename, *BuffOrErr.get(), GlobalOutlineRecord);
+}
+
+static int merge_main(int argc, const char *argv[]) {
+  bool Result = true;
+  OutlinedHashTreeRecord GlobalOutlineRecord;
+  for (auto &Filename : InputFilenames)
+    Result &= handleFile(Filename, GlobalOutlineRecord);
+
+  if (!Result) {
+    errs() << "Error: failed to merge codegen data files.\n";
+    return 1;
+  }
+
+  CodeGenDataWriter Writer;
+  if (!GlobalOutlineRecord.empty())
+    Writer.addRecord(GlobalOutlineRecord);
+
+  std::error_code EC;
+  raw_fd_ostream Output(OutputFilename, EC, sys::fs::OF_None);
+  if (EC)
+    exitWithErrorCode(EC, OutputFilename);
+
+  if (auto E = Writer.write(Output))
+    exitWithError(std::move(E));
+  Output.flush();
+
+  return 0;
+}
+
+static int show_main(int argc, const char *argv[]) {
+  if (Filename == OutputFilename) {
+    errs() << sys::path::filename(argv[0]) << " " << argv[1]
+           << ": Input file name cannot be the same as the output file name!\n";
+    return 1;
+  }
+
+  std::error_code EC;
+  raw_fd_ostream OS(OutputFilename.data(), EC, sys::fs::OF_TextWithCRLF);
+  if (EC)
+    exitWithErrorCode(EC, OutputFilename);
+
+  auto FS = vfs::getRealFileSystem();
+  auto ReaderOrErr = CodeGenDataReader::create(Filename, *FS);
+  if (Error E = ReaderOrErr.takeError())
+    exitWithError(std::move(E), Filename);
+
+  auto Reader = ReaderOrErr->get();
+  if (ShowCGDataVersion)
+    OS << "Version: " << Reader->getVersion() << "\n";
+
+  if (Reader->hasOutlinedHashTree()) {
+    auto Tree = std::move(Reader->releaseOutlinedHashTree());
+    OS << "Outlined hash tree:\n";
+    OS << "  Total Node Count: " << Tree->size() << "\n";
+    OS << "  Terminal Node Count: " << Tree->size(/*GetTerminalCountOnly=*/true)
+       << "\n";
+    OS << "  Depth: " << Tree->depth() << "\n";
+  }
+
+  return 0;
+}
+
+int llvm_cgdata_main(int argc, char **argvNonConst, const llvm::ToolContext &) {
+  const char **argv = const_cast<const char **>(argvNonConst);
+
+  StringRef ProgName(sys::path::filename(argv[0]));
+
+  if (argc < 2) {
+    errs() << ProgName
+           << ": No subcommand specified! Run llvm-cgdata --help for usage.\n";
+    return 1;
+  }
+
+  cl::ParseCommandLineOptions(argc, argv, "LLVM codegen data\n");
+
+  if (DumpSubcommand)
+    return dump_main(argc, argv);
+
+  if (MergeSubcommand)
+    return merge_main(argc, argv);
+
+  if (ShowSubcommand)
+    return show_main(argc, argv);
+
+  errs() << ProgName
+         << ": Unknown command. Run llvm-cgdata --help for usage.\n";
+  return 1;
+}