[llvm] Reland [CGData] llvm-cgdata #89884 (PR #101461)

Fri Aug 16 10:25:57 PDT 2024

https://github.com/kyulee-com updated https://github.com/llvm/llvm-project/pull/101461

>From d14066be2825efb8fc301da9fa828948388d7ee5 Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Tue, 23 Apr 2024 14:22:14 -0700
Subject: [PATCH 01/12] [CGData] llvm-cgdata

The llvm-cgdata tool has been introduced to handle reading and writing of codegen data. This data includes an optimistic codegen summary that can be utilized to enhance subsequent codegen. Currently, the tool supports saving and restoring the outlined hash tree, facilitating machine function outlining across modules. Additional codegen summaries can be incorporated into separate sections as required. This patch primarily establishes basic support for the reader and writer, similar to llvm-profdata.

The high-level operations of llvm-cgdata are as follows:
1. It reads local raw codegen data from a custom section (for example, __llvm_outline) embedded in native binary files.
2. It merges local raw codegen data into indexed codegen data, complete with a suitable header.
3. It handles reading and writing of the indexed codegen data into a standalone file.
---
 llvm/include/llvm/CodeGenData/CodeGenData.h   | 202 +++++++++++++
 llvm/include/llvm/CodeGenData/CodeGenData.inc |  46 +++
 .../llvm/CodeGenData/CodeGenDataReader.h      | 154 ++++++++++
 .../llvm/CodeGenData/CodeGenDataWriter.h      |  68 +++++
 llvm/lib/CodeGenData/CMakeLists.txt           |   3 +
 llvm/lib/CodeGenData/CodeGenData.cpp          | 197 +++++++++++++
 llvm/lib/CodeGenData/CodeGenDataReader.cpp    | 174 ++++++++++++
 llvm/lib/CodeGenData/CodeGenDataWriter.cpp    | 162 +++++++++++
 llvm/test/CMakeLists.txt                      |   1 +
 llvm/test/lit.cfg.py                          |   1 +
 llvm/test/tools/llvm-cgdata/dump.test         |  30 ++
 llvm/test/tools/llvm-cgdata/empty.test        |  32 +++
 llvm/test/tools/llvm-cgdata/error.test        |  38 +++
 .../test/tools/llvm-cgdata/merge-archive.test |  75 +++++
 llvm/test/tools/llvm-cgdata/merge-concat.test |  68 +++++
 llvm/test/tools/llvm-cgdata/merge-double.test |  74 +++++
 llvm/test/tools/llvm-cgdata/merge-single.test |  43 +++
 llvm/test/tools/llvm-cgdata/show.test         |  30 ++
 llvm/tools/llvm-cgdata/CMakeLists.txt         |  15 +
 llvm/tools/llvm-cgdata/llvm-cgdata.cpp        | 268 ++++++++++++++++++
 20 files changed, 1681 insertions(+)
 create mode 100644 llvm/include/llvm/CodeGenData/CodeGenData.h
 create mode 100644 llvm/include/llvm/CodeGenData/CodeGenData.inc
 create mode 100644 llvm/include/llvm/CodeGenData/CodeGenDataReader.h
 create mode 100644 llvm/include/llvm/CodeGenData/CodeGenDataWriter.h
 create mode 100644 llvm/lib/CodeGenData/CodeGenData.cpp
 create mode 100644 llvm/lib/CodeGenData/CodeGenDataReader.cpp
 create mode 100644 llvm/lib/CodeGenData/CodeGenDataWriter.cpp
 create mode 100644 llvm/test/tools/llvm-cgdata/dump.test
 create mode 100644 llvm/test/tools/llvm-cgdata/empty.test
 create mode 100644 llvm/test/tools/llvm-cgdata/error.test
 create mode 100644 llvm/test/tools/llvm-cgdata/merge-archive.test
 create mode 100644 llvm/test/tools/llvm-cgdata/merge-concat.test
 create mode 100644 llvm/test/tools/llvm-cgdata/merge-double.test
 create mode 100644 llvm/test/tools/llvm-cgdata/merge-single.test
 create mode 100644 llvm/test/tools/llvm-cgdata/show.test
 create mode 100644 llvm/tools/llvm-cgdata/CMakeLists.txt
 create mode 100644 llvm/tools/llvm-cgdata/llvm-cgdata.cpp

diff --git a/llvm/include/llvm/CodeGenData/CodeGenData.h b/llvm/include/llvm/CodeGenData/CodeGenData.h
new file mode 100644
index 00000000000000..118fb9841d27e8
--- /dev/null
+++ b/llvm/include/llvm/CodeGenData/CodeGenData.h
@@ -0,0 +1,202 @@
+//===- CodeGenData.h --------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for codegen data that has stable summary which
+// can be used to optimize the code in the subsequent codegen.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGENDATA_CODEGENDATA_H
+#define LLVM_CODEGENDATA_CODEGENDATA_H
+
+#include "llvm/ADT/BitmaskEnum.h"
+#include "llvm/Bitcode/BitcodeReader.h"
+#include "llvm/CodeGenData/OutlinedHashTree.h"
+#include "llvm/CodeGenData/OutlinedHashTreeRecord.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/TargetParser/Triple.h"
+#include <mutex>
+
+namespace llvm {
+
+enum CGDataSectKind {
+#define CG_DATA_SECT_ENTRY(Kind, SectNameCommon, SectNameCoff, Prefix) Kind,
+#include "llvm/CodeGenData/CodeGenData.inc"
+};
+
+std::string getCodeGenDataSectionName(CGDataSectKind CGSK,
+                                      Triple::ObjectFormatType OF,
+                                      bool AddSegmentInfo = true);
+
+enum class CGDataKind {
+  Unknown = 0x0,
+  // A function outlining info.
+  FunctionOutlinedHashTree = 0x1,
+  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/FunctionOutlinedHashTree)
+};
+
+const std::error_category &cgdata_category();
+
+enum class cgdata_error {
+  success = 0,
+  eof,
+  bad_magic,
+  bad_header,
+  empty_cgdata,
+  malformed,
+  unsupported_version,
+};
+
+inline std::error_code make_error_code(cgdata_error E) {
+  return std::error_code(static_cast<int>(E), cgdata_category());
+}
+
+class CGDataError : public ErrorInfo<CGDataError> {
+public:
+  CGDataError(cgdata_error Err, const Twine &ErrStr = Twine())
+      : Err(Err), Msg(ErrStr.str()) {
+    assert(Err != cgdata_error::success && "Not an error");
+  }
+
+  std::string message() const override;
+
+  void log(raw_ostream &OS) const override { OS << message(); }
+
+  std::error_code convertToErrorCode() const override {
+    return make_error_code(Err);
+  }
+
+  cgdata_error get() const { return Err; }
+  const std::string &getMessage() const { return Msg; }
+
+  /// Consume an Error and return the raw enum value contained within it, and
+  /// the optional error message. The Error must either be a success value, or
+  /// contain a single CGDataError.
+  static std::pair<cgdata_error, std::string> take(Error E) {
+    auto Err = cgdata_error::success;
+    std::string Msg = "";
+    handleAllErrors(std::move(E), [&Err, &Msg](const CGDataError &IPE) {
+      assert(Err == cgdata_error::success && "Multiple errors encountered");
+      Err = IPE.get();
+      Msg = IPE.getMessage();
+    });
+    return {Err, Msg};
+  }
+
+  static char ID;
+
+private:
+  cgdata_error Err;
+  std::string Msg;
+};
+
+enum CGDataMode {
+  None,
+  Read,
+  Write,
+};
+
+class CodeGenData {
+  /// Global outlined hash tree that has outlined hash sequences across modules.
+  std::unique_ptr<OutlinedHashTree> PublishedHashTree;
+
+  /// This flag is set when -fcgdata-generate is passed.
+  /// Or, it can be mutated with -ftwo-codegen-rounds during two codegen runs.
+  bool EmitCGData = false;
+
+  /// This is a singleton instance which is thread-safe. Unlike profile data
+  /// which is largely function-based, codegen data describes the whole module.
+  /// Therefore, this can be initialized once, and can be used across modules
+  /// instead of constructing the same one for each codegen backend.
+  static std::unique_ptr<CodeGenData> Instance;
+  static std::once_flag OnceFlag;
+
+  CodeGenData() = default;
+
+public:
+  ~CodeGenData() = default;
+
+  static CodeGenData &getInstance();
+
+  /// Returns true if a hash tree has been published and it is not empty.
+  bool hasOutlinedHashTree() const {
+    return PublishedHashTree && !PublishedHashTree->empty();
+  }
+
+  /// Returns the published outlined hash tree for read-only use, or nullptr
+  /// when none has been published.
+  const OutlinedHashTree *getOutlinedHashTree() const {
+    return PublishedHashTree.get();
+  }
+
+  /// Returns true if we should write codegen data.
+  bool emitCGData() const { return EmitCGData; }
+
+  /// Publish the (globally) merged or read outlined hash tree.
+  void publishOutlinedHashTree(std::unique_ptr<OutlinedHashTree> HashTree) {
+    PublishedHashTree = std::move(HashTree);
+    // Ensure we disable emitCGData as we do not want to read and write both.
+    EmitCGData = false;
+  }
+};
+
+namespace cgdata {
+
+inline bool hasOutlinedHashTree() {
+  return CodeGenData::getInstance().hasOutlinedHashTree();
+}
+
+inline const OutlinedHashTree *getOutlinedHashTree() {
+  return CodeGenData::getInstance().getOutlinedHashTree();
+}
+
+inline bool emitCGData() { return CodeGenData::getInstance().emitCGData(); }
+
+inline void
+publishOutlinedHashTree(std::unique_ptr<OutlinedHashTree> HashTree) {
+  CodeGenData::getInstance().publishOutlinedHashTree(std::move(HashTree));
+}
+
+void warn(Error E, StringRef Whence = "");
+void warn(Twine Message, std::string Whence = "", std::string Hint = "");
+
+} // end namespace cgdata
+
+namespace IndexedCGData {
+
+const uint64_t Magic = 0x81617461646763ff; // "\xffcgdata\x81"
+
+enum CGDataVersion {
+  // Version 1 is the first version. This version supports the outlined
+  // hash tree.
+  Version1 = 1,
+  CurrentVersion = CG_DATA_INDEX_VERSION
+};
+const uint64_t Version = CGDataVersion::CurrentVersion;
+
+struct Header {
+  uint64_t Magic;
+  uint32_t Version;
+  uint32_t DataKind;
+  uint64_t OutlinedHashTreeOffset;
+
+  // New fields should only be added at the end to ensure that the size
+  // computation is correct. The methods below need to be updated to ensure that
+  // the new field is read correctly.
+
+  // Reads a header struct from the buffer.
+  static Expected<Header> readFromBuffer(const unsigned char *Curr);
+};
+
+} // end namespace IndexedCGData
+
+} // end namespace llvm
+
+#endif // LLVM_CODEGENDATA_CODEGENDATA_H
diff --git a/llvm/include/llvm/CodeGenData/CodeGenData.inc b/llvm/include/llvm/CodeGenData/CodeGenData.inc
new file mode 100644
index 00000000000000..5f6df5c0bf1065
--- /dev/null
+++ b/llvm/include/llvm/CodeGenData/CodeGenData.inc
@@ -0,0 +1,46 @@
+/*===-- CodeGenData.inc ----------------------------------------*- C++ -*-=== *\
+|*
+|* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+|* See https://llvm.org/LICENSE.txt for license information.
+|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+|*
+\*===----------------------------------------------------------------------===*/
+/*
+ * This is the main file that defines all the data structure, signature,
+ * constant literals that are shared across compiler, host tools (reader/writer)
+ * to support codegen data.
+ *
+\*===----------------------------------------------------------------------===*/
+
+#ifdef CG_DATA_SECT_ENTRY
+#define CG_DATA_DEFINED
+CG_DATA_SECT_ENTRY(CG_outline, CG_DATA_QUOTE(CG_DATA_OUTLINE_COMMON),
+                   CG_DATA_OUTLINE_COFF, "__DATA,")
+
+#undef CG_DATA_SECT_ENTRY
+#endif
+
+/* section name strings common to all targets other
+   than WIN32 */
+#define CG_DATA_OUTLINE_COMMON __llvm_outline
+/* Since cg data sections are not allocated, we don't need to
+ * access them at runtime.
+ */
+#define CG_DATA_OUTLINE_COFF ".loutline"
+
+#ifdef _WIN32
+/* On Windows the section name is the COFF name, which is already a string. */
+#define CG_DATA_SECT_NAME CG_DATA_OUTLINE_COFF
+
+#else
+/* Elsewhere, stringize the common name with this file's own quote helper. */
+#define CG_DATA_SECT_NAME CG_DATA_QUOTE(CG_DATA_OUTLINE_COMMON)
+
+#endif
+
+/* Indexed codegen data format version (start from 1). */
+#define CG_DATA_INDEX_VERSION 1
+
+/* Helper macros.  */
+#define CG_DATA_SIMPLE_QUOTE(x) #x
+#define CG_DATA_QUOTE(x) CG_DATA_SIMPLE_QUOTE(x)
diff --git a/llvm/include/llvm/CodeGenData/CodeGenDataReader.h b/llvm/include/llvm/CodeGenData/CodeGenDataReader.h
new file mode 100644
index 00000000000000..df4ae3ed24e79a
--- /dev/null
+++ b/llvm/include/llvm/CodeGenData/CodeGenDataReader.h
@@ -0,0 +1,154 @@
+//===- CodeGenDataReader.h --------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for reading codegen data.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGENDATA_CODEGENDATAREADER_H
+#define LLVM_CODEGENDATA_CODEGENDATAREADER_H
+
+#include "llvm/CodeGenData/CodeGenData.h"
+#include "llvm/CodeGenData/OutlinedHashTreeRecord.h"
+#include "llvm/Support/LineIterator.h"
+#include "llvm/Support/VirtualFileSystem.h"
+
+namespace llvm {
+
+class CodeGenDataReader {
+  cgdata_error LastError = cgdata_error::success;
+  std::string LastErrorMsg;
+
+public:
+  CodeGenDataReader() = default;
+  virtual ~CodeGenDataReader() = default;
+
+  /// Read the header.  Required before reading first record.
+  virtual Error read() = 0;
+  /// Return the codegen data version.
+  virtual uint32_t getVersion() const = 0;
+  /// Return the codegen data kind.
+  virtual CGDataKind getDataKind() const = 0;
+  /// Return true if the data has an outlined hash tree.
+  virtual bool hasOutlinedHashTree() const = 0;
+  /// Return the outlined hash tree that is released from the reader.
+  std::unique_ptr<OutlinedHashTree> releaseOutlinedHashTree() {
+    return std::move(HashTreeRecord.HashTree);
+  }
+
+  /// Factory method to create an appropriately typed reader for the given
+  /// codegen data file path and file system.
+  static Expected<std::unique_ptr<CodeGenDataReader>>
+  create(const Twine &Path, vfs::FileSystem &FS);
+
+  /// Factory method to create an appropriately typed reader for the given
+  /// memory buffer.
+  static Expected<std::unique_ptr<CodeGenDataReader>>
+  create(std::unique_ptr<MemoryBuffer> Buffer);
+
+  /// Extract the cgdata embedded in sections from the given object file and
+  /// merge them into the GlobalOutlineRecord. This is a static helper that
+  /// is used by `llvm-cgdata merge` or ThinLTO's two-codegen rounds.
+  static Error mergeFromObjectFile(const object::ObjectFile *Obj,
+                                   OutlinedHashTreeRecord &GlobalOutlineRecord);
+
+protected:
+  /// The outlined hash tree that has been read. When it's released by
+  /// releaseOutlinedHashTree(), it's no longer valid.
+  OutlinedHashTreeRecord HashTreeRecord;
+
+  /// Set the current error and return same.
+  Error error(cgdata_error Err, const std::string &ErrMsg = "") {
+    LastError = Err;
+    LastErrorMsg = ErrMsg;
+    if (Err == cgdata_error::success)
+      return Error::success();
+    return make_error<CGDataError>(Err, ErrMsg);
+  }
+
+  Error error(Error &&E) {
+    handleAllErrors(std::move(E), [&](const CGDataError &IPE) {
+      LastError = IPE.get();
+      LastErrorMsg = IPE.getMessage();
+    });
+    return make_error<CGDataError>(LastError, LastErrorMsg);
+  }
+
+  /// Clear the current error and return a successful one.
+  Error success() { return error(cgdata_error::success); }
+};
+
+class IndexedCodeGenDataReader : public CodeGenDataReader {
+  /// The codegen data file contents.
+  std::unique_ptr<MemoryBuffer> DataBuffer;
+  /// The header
+  IndexedCGData::Header Header;
+
+public:
+  IndexedCodeGenDataReader(std::unique_ptr<MemoryBuffer> DataBuffer)
+      : DataBuffer(std::move(DataBuffer)) {}
+  IndexedCodeGenDataReader(const IndexedCodeGenDataReader &) = delete;
+  IndexedCodeGenDataReader &
+  operator=(const IndexedCodeGenDataReader &) = delete;
+
+  /// Return true if the given buffer is in binary codegen data format.
+  static bool hasFormat(const MemoryBuffer &Buffer);
+  /// Read the contents including the header.
+  Error read() override;
+  /// Return the codegen data version.
+  uint32_t getVersion() const override { return Header.Version; }
+  /// Return the codegen data kind.
+  CGDataKind getDataKind() const override {
+    return static_cast<CGDataKind>(Header.DataKind);
+  }
+  /// Return true if the header indicates the data has an outlined hash tree.
+  /// This does not mean that the data is still available.
+  bool hasOutlinedHashTree() const override {
+    return Header.DataKind &
+           static_cast<uint32_t>(CGDataKind::FunctionOutlinedHashTree);
+  }
+};
+
+/// This format is a simple text format that's suitable for test data.
+/// The header is a custom format starting with `:` per line to indicate which
+/// codegen data is recorded. `#` is used to indicate a comment.
+/// The subsequent data is a YAML format per each codegen data in order.
+/// Currently, it only has a function outlined hash tree.
+class TextCodeGenDataReader : public CodeGenDataReader {
+  /// The codegen data file contents.
+  std::unique_ptr<MemoryBuffer> DataBuffer;
+  /// Iterator over the lines of the codegen data.
+  line_iterator Line;
+  /// Describe the kind of the codegen data.
+  CGDataKind DataKind = CGDataKind::Unknown;
+
+public:
+  TextCodeGenDataReader(std::unique_ptr<MemoryBuffer> DataBuffer_)
+      : DataBuffer(std::move(DataBuffer_)), Line(*DataBuffer, true, '#') {}
+  TextCodeGenDataReader(const TextCodeGenDataReader &) = delete;
+  TextCodeGenDataReader &operator=(const TextCodeGenDataReader &) = delete;
+
+  /// Return true if the given buffer is in text codegen data format.
+  static bool hasFormat(const MemoryBuffer &Buffer);
+  /// Read the contents including the header.
+  Error read() override;
+  /// Text format does not have version, so return 0.
+  uint32_t getVersion() const override { return 0; }
+  /// Return the codegen data kind.
+  CGDataKind getDataKind() const override { return DataKind; }
+  /// Return true if the header indicates the data has an outlined hash tree.
+  /// This does not mean that the data is still available.
+  bool hasOutlinedHashTree() const override {
+    return static_cast<uint32_t>(DataKind) &
+           static_cast<uint32_t>(CGDataKind::FunctionOutlinedHashTree);
+  }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_CODEGENDATA_CODEGENDATAREADER_H
diff --git a/llvm/include/llvm/CodeGenData/CodeGenDataWriter.h b/llvm/include/llvm/CodeGenData/CodeGenDataWriter.h
new file mode 100644
index 00000000000000..e17ffc3482ec91
--- /dev/null
+++ b/llvm/include/llvm/CodeGenData/CodeGenDataWriter.h
@@ -0,0 +1,68 @@
+//===- CodeGenDataWriter.h --------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing codegen data.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGENDATA_CODEGENDATAWRITER_H
+#define LLVM_CODEGENDATA_CODEGENDATAWRITER_H
+
+#include "llvm/CodeGenData/CodeGenData.h"
+#include "llvm/CodeGenData/OutlinedHashTreeRecord.h"
+#include "llvm/Support/Error.h"
+
+namespace llvm {
+
+class CGDataOStream;
+
+class CodeGenDataWriter {
+  /// The outlined hash tree to be written.
+  OutlinedHashTreeRecord HashTreeRecord;
+
+  /// A bit mask describing the kind of the codegen data.
+  CGDataKind DataKind = CGDataKind::Unknown;
+
+public:
+  CodeGenDataWriter() = default;
+  ~CodeGenDataWriter() = default;
+
+  /// Add the outlined hash tree record. The input Record is released.
+  void addRecord(OutlinedHashTreeRecord &Record);
+
+  /// Write the codegen data to \c OS
+  Error write(raw_fd_ostream &OS);
+
+  /// Write the codegen data in text format to \c OS
+  Error writeText(raw_fd_ostream &OS);
+
+  /// Return the attributes of the current CGData.
+  CGDataKind getCGDataKind() const { return DataKind; }
+
+  /// Return true if the header indicates the data has an outlined hash tree.
+  bool hasOutlinedHashTree() const {
+    return static_cast<uint32_t>(DataKind) &
+           static_cast<uint32_t>(CGDataKind::FunctionOutlinedHashTree);
+  }
+
+private:
+  /// The offset of the outlined hash tree in the file.
+  uint64_t OutlinedHashTreeOffset;
+
+  /// Write the codegen data header to \c COS
+  Error writeHeader(CGDataOStream &COS);
+
+  /// Write the codegen data header in text to \c OS
+  Error writeHeaderText(raw_fd_ostream &OS);
+
+  Error writeImpl(CGDataOStream &COS);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_CODEGENDATA_CODEGENDATAWRITER_H
diff --git a/llvm/lib/CodeGenData/CMakeLists.txt b/llvm/lib/CodeGenData/CMakeLists.txt
index f9d107f52a7153..0a231d6214fea1 100644
--- a/llvm/lib/CodeGenData/CMakeLists.txt
+++ b/llvm/lib/CodeGenData/CMakeLists.txt
@@ -1,4 +1,7 @@
 add_llvm_component_library(LLVMCodeGenData
+  CodeGenData.cpp
+  CodeGenDataReader.cpp
+  CodeGenDataWriter.cpp
   OutlinedHashTree.cpp
   OutlinedHashTreeRecord.cpp
 
diff --git a/llvm/lib/CodeGenData/CodeGenData.cpp b/llvm/lib/CodeGenData/CodeGenData.cpp
new file mode 100644
index 00000000000000..3bd21c97c7de7a
--- /dev/null
+++ b/llvm/lib/CodeGenData/CodeGenData.cpp
@@ -0,0 +1,197 @@
+//===-- CodeGenData.cpp ---------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for codegen data that has stable summary which
+// can be used to optimize the code in the subsequent codegen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Bitcode/BitcodeWriter.h"
+#include "llvm/CodeGenData/CodeGenDataReader.h"
+#include "llvm/CodeGenData/OutlinedHashTreeRecord.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/WithColor.h"
+
+#define DEBUG_TYPE "cg-data"
+
+using namespace llvm;
+using namespace cgdata;
+
+static std::string getCGDataErrString(cgdata_error Err,
+                                      const std::string &ErrMsg = "") {
+  std::string Msg;
+  raw_string_ostream OS(Msg);
+
+  switch (Err) {
+  case cgdata_error::success:
+    OS << "success";
+    break;
+  case cgdata_error::eof:
+    OS << "end of File";
+    break;
+  case cgdata_error::bad_magic:
+    OS << "invalid codegen data (bad magic)";
+    break;
+  case cgdata_error::bad_header:
+    OS << "invalid codegen data (file header is corrupt)";
+    break;
+  case cgdata_error::empty_cgdata:
+    OS << "empty codegen data";
+    break;
+  case cgdata_error::malformed:
+    OS << "malformed codegen data";
+    break;
+  case cgdata_error::unsupported_version:
+    OS << "unsupported codegen data version";
+    break;
+  }
+
+  // If optional error message is not empty, append it to the message.
+  if (!ErrMsg.empty())
+    OS << ": " << ErrMsg;
+
+  return OS.str();
+}
+
+namespace {
+
+// FIXME: This class is only here to support the transition to llvm::Error. It
+// will be removed once this transition is complete. Clients should prefer to
+// deal with the Error value directly, rather than converting to error_code.
+class CGDataErrorCategoryType : public std::error_category {
+  const char *name() const noexcept override { return "llvm.cgdata"; }
+
+  std::string message(int IE) const override {
+    return getCGDataErrString(static_cast<cgdata_error>(IE));
+  }
+};
+
+} // end anonymous namespace
+
+const std::error_category &llvm::cgdata_category() {
+  static CGDataErrorCategoryType ErrorCategory;
+  return ErrorCategory;
+}
+
+std::string CGDataError::message() const {
+  return getCGDataErrString(Err, Msg);
+}
+
+char CGDataError::ID = 0;
+
+namespace {
+
+const char *CodeGenDataSectNameCommon[] = {
+#define CG_DATA_SECT_ENTRY(Kind, SectNameCommon, SectNameCoff, Prefix)         \
+  SectNameCommon,
+#include "llvm/CodeGenData/CodeGenData.inc"
+};
+
+const char *CodeGenDataSectNameCoff[] = {
+#define CG_DATA_SECT_ENTRY(Kind, SectNameCommon, SectNameCoff, Prefix)         \
+  SectNameCoff,
+#include "llvm/CodeGenData/CodeGenData.inc"
+};
+
+const char *CodeGenDataSectNamePrefix[] = {
+#define CG_DATA_SECT_ENTRY(Kind, SectNameCommon, SectNameCoff, Prefix) Prefix,
+#include "llvm/CodeGenData/CodeGenData.inc"
+};
+
+} // namespace
+
+namespace llvm {
+
+std::string getCodeGenDataSectionName(CGDataSectKind CGSK,
+                                      Triple::ObjectFormatType OF,
+                                      bool AddSegmentInfo) {
+  // Only Mach-O names get the per-kind prefix, and only when requested.
+  std::string Name;
+  if (AddSegmentInfo && OF == Triple::MachO)
+    Name = CodeGenDataSectNamePrefix[CGSK];
+
+  // COFF has its own name table; every other format uses the common one.
+  const char *const *Table = (OF == Triple::COFF) ? CodeGenDataSectNameCoff
+                                                  : CodeGenDataSectNameCommon;
+  Name += Table[CGSK];
+
+  return Name;
+}
+
+std::unique_ptr<CodeGenData> CodeGenData::Instance = nullptr;
+std::once_flag CodeGenData::OnceFlag;
+
+CodeGenData &CodeGenData::getInstance() {
+  // Build the singleton on the first call only; later calls reuse it.
+  std::call_once(OnceFlag, [] {
+    Instance.reset(new CodeGenData());
+
+    // TODO: Initialize writer or reader mode for the client optimization.
+  });
+  return *Instance;
+}
+
+namespace IndexedCGData {
+
+Expected<Header> Header::readFromBuffer(const unsigned char *Curr) {
+  using namespace support;
+
+  static_assert(std::is_standard_layout_v<llvm::IndexedCGData::Header>,
+                "The header should be standard layout type since we use offset "
+                "of fields to read.");
+  Header H;
+  H.Magic = endian::readNext<uint64_t, endianness::little, unaligned>(Curr);
+  if (H.Magic != IndexedCGData::Magic)
+    return make_error<CGDataError>(cgdata_error::bad_magic);
+  H.Version = endian::readNext<uint32_t, endianness::little, unaligned>(Curr);
+  if (H.Version > IndexedCGData::CGDataVersion::CurrentVersion)
+    return make_error<CGDataError>(cgdata_error::unsupported_version);
+  H.DataKind = endian::readNext<uint32_t, endianness::little, unaligned>(Curr);
+
+  switch (H.Version) {
+    // When a new field is added to the header add a case statement here to
+    // compute the size as offset of the new field + size of the new field. This
+    // relies on the field being added to the end of the list.
+    static_assert(IndexedCGData::CGDataVersion::CurrentVersion == Version1,
+                  "Please update the size computation below if a new field has "
+                  "been added to the header, if not add a case statement to "
+                  "fall through to the latest version.");
+  case 1ull:
+    H.OutlinedHashTreeOffset =
+        endian::readNext<uint64_t, endianness::little, unaligned>(Curr);
+  }
+
+  return H;
+}
+
+} // end namespace IndexedCGData
+
+namespace cgdata {
+
+void warn(Twine Message, std::string Whence, std::string Hint) {
+  WithColor::warning();
+  if (!Whence.empty())
+    errs() << Whence << ": ";
+  errs() << Message << "\n";
+  if (!Hint.empty())
+    WithColor::note() << Hint << "\n";
+}
+
+void warn(Error E, StringRef Whence) {
+  // Emit a warning for every payload in E (not only CGDataError) so the Error
+  // is always consumed; dropping an unhandled Error aborts in checked builds.
+  handleAllErrors(std::move(E), [&](const ErrorInfoBase &EI) {
+    warn(EI.message(), std::string(Whence), std::string(""));
+  });
+}
+
+} // end namespace cgdata
+
+} // end namespace llvm
diff --git a/llvm/lib/CodeGenData/CodeGenDataReader.cpp b/llvm/lib/CodeGenData/CodeGenDataReader.cpp
new file mode 100644
index 00000000000000..1b08085dec2f25
--- /dev/null
+++ b/llvm/lib/CodeGenData/CodeGenDataReader.cpp
@@ -0,0 +1,174 @@
+//===- CodeGenDataReader.cpp ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for reading codegen data.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGenData/CodeGenDataReader.h"
+#include "llvm/CodeGenData/OutlinedHashTreeRecord.h"
+#include "llvm/Support/MemoryBuffer.h"
+
+#define DEBUG_TYPE "cg-data-reader"
+
+using namespace llvm;
+
+namespace llvm {
+
+static Expected<std::unique_ptr<MemoryBuffer>>
+setupMemoryBuffer(const Twine &Filename, vfs::FileSystem &FS) {
+  auto BufferOrErr = Filename.str() == "-" ? MemoryBuffer::getSTDIN()
+                                           : FS.getBufferForFile(Filename);
+  if (std::error_code EC = BufferOrErr.getError())
+    return errorCodeToError(EC);
+  return std::move(BufferOrErr.get());
+}
+
+Error CodeGenDataReader::mergeFromObjectFile(
+    const object::ObjectFile *Obj,
+    OutlinedHashTreeRecord &GlobalOutlineRecord) {
+  Triple TT = Obj->makeTriple();
+  auto CGOutLineName =
+      getCodeGenDataSectionName(CG_outline, TT.getObjectFormat(), false);
+
+  for (auto &Section : Obj->sections()) {
+    Expected<StringRef> NameOrErr = Section.getName();
+    if (!NameOrErr)
+      return NameOrErr.takeError();
+    Expected<StringRef> ContentsOrErr = Section.getContents();
+    if (!ContentsOrErr)
+      return ContentsOrErr.takeError();
+    auto *Data = reinterpret_cast<const unsigned char *>(ContentsOrErr->data());
+    auto *EndData = Data + ContentsOrErr->size();
+
+    if (*NameOrErr == CGOutLineName) {
+      // In case dealing with an executable that has concatenated cgdata,
+      // we want to merge them into a single cgdata.
+      // Although it's not a typical workflow, we support this scenario.
+      while (Data != EndData) {
+        OutlinedHashTreeRecord LocalOutlineRecord;
+        LocalOutlineRecord.deserialize(Data);
+        GlobalOutlineRecord.merge(LocalOutlineRecord);
+      }
+    }
+    // TODO: Add support for other cgdata sections.
+  }
+
+  return Error::success();
+}
+
+Error IndexedCodeGenDataReader::read() {
+  using namespace support;
+
+  // Reject buffers shorter than the smallest (version 1) header: 24 bytes.
+  const unsigned MinHeaderSize = 24;
+  if (DataBuffer->getBufferSize() < MinHeaderSize)
+    return error(cgdata_error::bad_header);
+
+  auto *Start =
+      reinterpret_cast<const unsigned char *>(DataBuffer->getBufferStart());
+  auto HeaderOr = IndexedCGData::Header::readFromBuffer(Start);
+  if (!HeaderOr)
+    return HeaderOr.takeError();
+  Header = HeaderOr.get();
+
+  if (hasOutlinedHashTree()) {
+    // Bounds-check the file-provided offset as an integer before forming the
+    // pointer: `Start + Offset` beyond the buffer is UB and could wrap around.
+    if (Header.OutlinedHashTreeOffset >= DataBuffer->getBufferSize())
+      return error(cgdata_error::eof);
+    const unsigned char *Ptr = Start + Header.OutlinedHashTreeOffset;
+    HashTreeRecord.deserialize(Ptr);
+  }
+
+  return success();
+}
+
+Expected<std::unique_ptr<CodeGenDataReader>>
+CodeGenDataReader::create(const Twine &Path, vfs::FileSystem &FS) {
+  // Set up the buffer to read.
+  auto BufferOrError = setupMemoryBuffer(Path, FS);
+  if (Error E = BufferOrError.takeError())
+    return std::move(E);
+  return CodeGenDataReader::create(std::move(BufferOrError.get()));
+}
+
+Expected<std::unique_ptr<CodeGenDataReader>>
+CodeGenDataReader::create(std::unique_ptr<MemoryBuffer> Buffer) {
+  if (Buffer->getBufferSize() == 0)
+    return make_error<CGDataError>(cgdata_error::empty_cgdata);
+
+  std::unique_ptr<CodeGenDataReader> Reader;
+  // Create the reader.
+  if (IndexedCodeGenDataReader::hasFormat(*Buffer))
+    Reader.reset(new IndexedCodeGenDataReader(std::move(Buffer)));
+  else if (TextCodeGenDataReader::hasFormat(*Buffer))
+    Reader.reset(new TextCodeGenDataReader(std::move(Buffer)));
+  else
+    return make_error<CGDataError>(cgdata_error::malformed);
+
+  // Initialize the reader and return the result.
+  if (Error E = Reader->read())
+    return std::move(E);
+
+  return std::move(Reader);
+}
+
+bool IndexedCodeGenDataReader::hasFormat(const MemoryBuffer &DataBuffer) {
+  using namespace support;
+  if (DataBuffer.getBufferSize() < 8)
+    return false;
+  // The magic is the first 8 bytes (little-endian). Read it unaligned, like
+  // Header::readFromBuffer, since the buffer start may not be 8-byte aligned.
+  uint64_t Magic = endian::read<uint64_t, endianness::little, unaligned>(
+      DataBuffer.getBufferStart());
+  return Magic == IndexedCGData::Magic;
+}
+
+bool TextCodeGenDataReader::hasFormat(const MemoryBuffer &Buffer) {
+  // Verify that this really looks like plain ASCII text by checking a
+  // 'reasonable' number of characters (up to the codegen data magic size).
+  // getBuffer() carries the real size, so no strlen() scan and no dependence
+  // on a terminating NUL; an empty prefix trivially passes all_of().
+  StringRef Prefix = Buffer.getBuffer().take_front(sizeof(uint64_t));
+  return std::all_of(Prefix.begin(), Prefix.end(),
+                     [](char c) { return isPrint(c) || isSpace(c); });
+}
+Error TextCodeGenDataReader::read() {
+  using namespace support;
+
+  // Parse the custom header line by line.
+  while (Line->starts_with(":")) {
+    StringRef Str = Line->substr(1);
+    if (Str.equals_insensitive("outlined_hash_tree"))
+      DataKind |= CGDataKind::FunctionOutlinedHashTree;
+    else
+      return error(cgdata_error::bad_header);
+    ++Line;
+  }
+
+  // We treat an empty header (one that has only `#` comments) as valid.
+  if (Line.is_at_eof()) {
+    if (DataKind != CGDataKind::Unknown)
+      return error(cgdata_error::bad_header);
+    return Error::success();
+  }
+
+  // The YAML docs follow after the header.
+  const char *Pos = (*Line).data();
+  size_t Size = reinterpret_cast<size_t>(DataBuffer->getBufferEnd()) -
+                reinterpret_cast<size_t>(Pos);
+  yaml::Input YOS(StringRef(Pos, Size));
+  if (hasOutlinedHashTree())
+    HashTreeRecord.deserializeYAML(YOS);
+
+  // TODO: Add more yaml cgdata in order
+
+  return Error::success();
+}
+} // end namespace llvm
diff --git a/llvm/lib/CodeGenData/CodeGenDataWriter.cpp b/llvm/lib/CodeGenData/CodeGenDataWriter.cpp
new file mode 100644
index 00000000000000..9aa0d86223f714
--- /dev/null
+++ b/llvm/lib/CodeGenData/CodeGenDataWriter.cpp
@@ -0,0 +1,162 @@
+//===- CodeGenDataWriter.cpp ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing codegen data.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGenData/CodeGenDataWriter.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/EndianStream.h"
+
+#define DEBUG_TYPE "cg-data-writer"
+
+using namespace llvm;
+
+namespace llvm {
+
+/// A struct to define how the data stream should be patched.
+struct CGDataPatchItem {
+  uint64_t Pos; // Where to patch.
+  uint64_t *D;  // Pointer to an array of source data.
+  int N;        // Number of elements in \c D array.
+};
+
+// A wrapper class to abstract writer stream with support of bytes
+// back patching.
+class CGDataOStream {
+public:
+  CGDataOStream(raw_fd_ostream &FD)
+      : IsFDOStream(true), OS(FD), LE(FD, llvm::endianness::little) {}
+  CGDataOStream(raw_string_ostream &STR)
+      : IsFDOStream(false), OS(STR), LE(STR, llvm::endianness::little) {}
+
+  uint64_t tell() { return OS.tell(); }
+  void write(uint64_t V) { LE.write<uint64_t>(V); }
+  void write32(uint32_t V) { LE.write<uint32_t>(V); }
+  void write8(uint8_t V) { LE.write<uint8_t>(V); }
+
+  // \c patch can only be called when all data is written and flushed.
+  // For raw_string_ostream, the patch is done on the target string
+  // directly and it won't be reflected in the stream's internal buffer.
+  void patch(ArrayRef<CGDataPatchItem> P) {
+    using namespace support;
+
+    if (IsFDOStream) {
+      raw_fd_ostream &FDOStream = static_cast<raw_fd_ostream &>(OS);
+      const uint64_t LastPos = FDOStream.tell();
+      for (const auto &K : P) {
+        FDOStream.seek(K.Pos);
+        for (int I = 0; I < K.N; I++)
+          write(K.D[I]);
+      }
+      // Reset the stream to the last position after patching so that users
+      // don't accidentally overwrite data. This makes it consistent with
+      // the string stream below which replaces the data directly.
+      FDOStream.seek(LastPos);
+    } else {
+      raw_string_ostream &SOStream = static_cast<raw_string_ostream &>(OS);
+      std::string &Data = SOStream.str(); // with flush
+      for (const auto &K : P) {
+        for (int I = 0; I < K.N; I++) {
+          uint64_t Bytes =
+              endian::byte_swap<uint64_t, llvm::endianness::little>(K.D[I]);
+          Data.replace(K.Pos + I * sizeof(uint64_t), sizeof(uint64_t),
+                       (const char *)&Bytes, sizeof(uint64_t));
+        }
+      }
+    }
+  }
+
+  // If \c OS is an instance of \c raw_fd_ostream, this field will be
+  // true. Otherwise, \c OS will be a raw_string_ostream.
+  bool IsFDOStream;
+  raw_ostream &OS;
+  support::endian::Writer LE;
+};
+
+} // end namespace llvm
+
+void CodeGenDataWriter::addRecord(OutlinedHashTreeRecord &Record) {
+  assert(Record.HashTree && "empty hash tree in the record");
+  HashTreeRecord.HashTree = std::move(Record.HashTree);
+
+  DataKind |= CGDataKind::FunctionOutlinedHashTree;
+}
+
+Error CodeGenDataWriter::write(raw_fd_ostream &OS) {
+  CGDataOStream COS(OS);
+  return writeImpl(COS);
+}
+
+Error CodeGenDataWriter::writeHeader(CGDataOStream &COS) {
+  using namespace support;
+  IndexedCGData::Header Header;
+  Header.Magic = IndexedCGData::Magic;
+  Header.Version = IndexedCGData::Version;
+
+  // Set the CGDataKind depending on the kind.
+  Header.DataKind = 0;
+  if (static_cast<bool>(DataKind & CGDataKind::FunctionOutlinedHashTree))
+    Header.DataKind |=
+        static_cast<uint32_t>(CGDataKind::FunctionOutlinedHashTree);
+
+  Header.OutlinedHashTreeOffset = 0;
+
+  // Only write out up to the CGDataKind. We need to remember the offset of the
+  // remaining fields to allow back patching later.
+  COS.write(Header.Magic);
+  COS.write32(Header.Version);
+  COS.write32(Header.DataKind);
+
+  // Save the location of Header.OutlinedHashTreeOffset field in \c COS.
+  OutlinedHashTreeOffset = COS.tell();
+
+  // Reserve the space for OutlinedHashTreeOffset field.
+  COS.write(0);
+
+  return Error::success();
+}
+
+Error CodeGenDataWriter::writeImpl(CGDataOStream &COS) {
+  if (Error E = writeHeader(COS))
+    return E;
+
+  uint64_t OutlinedHashTreeFieldStart = COS.tell();
+  if (hasOutlinedHashTree())
+    HashTreeRecord.serialize(COS.OS);
+
+  // Back patch the offsets.
+  CGDataPatchItem PatchItems[] = {
+      {OutlinedHashTreeOffset, &OutlinedHashTreeFieldStart, 1}};
+  COS.patch(PatchItems);
+
+  return Error::success();
+}
+
+Error CodeGenDataWriter::writeHeaderText(raw_fd_ostream &OS) {
+  if (hasOutlinedHashTree())
+    OS << "# Outlined stable hash tree\n:outlined_hash_tree\n";
+
+  // TODO: Add more data types in this header
+
+  return Error::success();
+}
+
+Error CodeGenDataWriter::writeText(raw_fd_ostream &OS) {
+  if (Error E = writeHeaderText(OS))
+    return E;
+
+  yaml::Output YOS(OS);
+  if (hasOutlinedHashTree())
+    HashTreeRecord.serializeYAML(YOS);
+
+  // TODO: Write more yaml cgdata in order
+
+  return Error::success();
+}
diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt
index 8abc1533362512..0f449fa2d45be9 100644
--- a/llvm/test/CMakeLists.txt
+++ b/llvm/test/CMakeLists.txt
@@ -74,6 +74,7 @@ set(LLVM_TEST_DEPENDS
           llvm-c-test
           llvm-cat
           llvm-cfi-verify
+          llvm-cgdata
           llvm-config
           llvm-cov
           llvm-ctxprof-util
diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index e5e3dc7e1b4bd0..bee7aa3903a1f5 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -180,6 +180,7 @@ def get_asan_rtlib():
         "llvm-addr2line",
         "llvm-bcanalyzer",
         "llvm-bitcode-strip",
+        "llvm-cgdata",
         "llvm-config",
         "llvm-cov",
         "llvm-ctxprof-util",
diff --git a/llvm/test/tools/llvm-cgdata/dump.test b/llvm/test/tools/llvm-cgdata/dump.test
new file mode 100644
index 00000000000000..ce2ad27a5ff81c
--- /dev/null
+++ b/llvm/test/tools/llvm-cgdata/dump.test
@@ -0,0 +1,30 @@
+# Test dump between the binary and text formats.
+
+RUN: split-file %s %t
+
+RUN: llvm-cgdata dump -binary %t/dump.cgtext -o %t/dump.cgdata
+RUN: llvm-cgdata dump -text %t/dump.cgdata -o %t/dump-round.cgtext
+RUN: llvm-cgdata dump -binary %t/dump-round.cgtext -o %t/dump-round.cgdata
+RUN: diff %t/dump.cgdata %t/dump-round.cgdata
+
+;--- dump.cgtext
+# Outlined stable hash tree
+:outlined_hash_tree
+---
+0:
+  Hash:            0x0
+  Terminals:       0
+  SuccessorIds:    [ 1 ]
+1:
+  Hash:            0x1
+  Terminals:       0
+  SuccessorIds:    [ 2, 3 ]
+2:
+  Hash:            0x3
+  Terminals:       5
+  SuccessorIds:    [  ]
+3:
+  Hash:            0x2
+  Terminals:       4
+  SuccessorIds:    [  ]
+...
diff --git a/llvm/test/tools/llvm-cgdata/empty.test b/llvm/test/tools/llvm-cgdata/empty.test
new file mode 100644
index 00000000000000..d5e201b9eec17f
--- /dev/null
+++ b/llvm/test/tools/llvm-cgdata/empty.test
@@ -0,0 +1,32 @@
+# Test for empty cgdata file, which is invalid.
+RUN: touch %t_emptyfile.cgtext
+RUN: not llvm-cgdata dump %t_emptyfile.cgtext -text -o - 2>&1 | FileCheck %s --check-prefix ERROR
+ERROR: {{.}}emptyfile.cgtext: empty codegen data
+
+# Test for empty header in the text format. It can be converted to a valid binary file.
+RUN: printf '#' > %t_emptyheader.cgtext
+RUN: llvm-cgdata dump %t_emptyheader.cgtext -binary -o %t_emptyheader.cgdata
+
+# Without any cgdata other than the header, no data shows by default.
+RUN: llvm-cgdata show %t_emptyheader.cgdata | FileCheck %s --allow-empty --check-prefix EMPTY
+EMPTY-NOT: any
+
+# The version number appears when asked, as it's in the header
+RUN: llvm-cgdata show --cgdata-version %t_emptyheader.cgdata | FileCheck %s --check-prefix VERSION
+VERSION: Version: {{.}}
+
+# When converting a binary file (w/ the header only) to a text file, it's an empty file as the text format does not have an explicit header.
+RUN: llvm-cgdata dump %t_emptyheader.cgdata -text -o - | FileCheck %s --allow-empty --check-prefix EMPTY
+
+# Synthesize a header only cgdata.
+# struct Header {
+#   uint64_t Magic;
+#   uint32_t Version;
+#   uint32_t DataKind;
+#   uint64_t OutlinedHashTreeOffset;
+# }
+RUN: printf '\xffcgdata\x81' > %t_header.cgdata
+RUN: printf '\x01\x00\x00\x00' >> %t_header.cgdata
+RUN: printf '\x00\x00\x00\x00' >> %t_header.cgdata
+RUN: printf '\x18\x00\x00\x00\x00\x00\x00\x00' >> %t_header.cgdata
+RUN: diff %t_header.cgdata %t_emptyheader.cgdata
diff --git a/llvm/test/tools/llvm-cgdata/error.test b/llvm/test/tools/llvm-cgdata/error.test
new file mode 100644
index 00000000000000..5e1b14de5e509d
--- /dev/null
+++ b/llvm/test/tools/llvm-cgdata/error.test
@@ -0,0 +1,38 @@
+# Test various error cases
+
+# Synthesize a header only cgdata.
+# struct Header {
+#   uint64_t Magic;
+#   uint32_t Version;
+#   uint32_t DataKind;
+#   uint64_t OutlinedHashTreeOffset;
+# }
+RUN: touch %t_empty.cgdata
+RUN: not llvm-cgdata show %t_empty.cgdata 2>&1 | FileCheck %s --check-prefix EMPTY
+EMPTY: {{.}}cgdata: empty codegen data
+
+# Not a magic.
+RUN: printf '\xff' > %t_malformed.cgdata
+RUN: not llvm-cgdata show %t_malformed.cgdata 2>&1 | FileCheck %s --check-prefix MALFORMED
+MALFORMED: {{.}}cgdata: malformed codegen data
+
+# The minimum header size is 24.
+RUN: printf '\xffcgdata\x81' > %t_corrupt.cgdata
+RUN: not llvm-cgdata show %t_corrupt.cgdata 2>&1 | FileCheck %s  --check-prefix CORRUPT
+CORRUPT: {{.}}cgdata: invalid codegen data (file header is corrupt)
+
+# The current version 1 while the header says 2.
+RUN: printf '\xffcgdata\x81' > %t_version.cgdata
+RUN: printf '\x02\x00\x00\x00' >> %t_version.cgdata
+RUN: printf '\x00\x00\x00\x00' >> %t_version.cgdata
+RUN: printf '\x18\x00\x00\x00\x00\x00\x00\x00' >> %t_version.cgdata
+RUN: not llvm-cgdata show %t_version.cgdata 2>&1 | FileCheck %s  --check-prefix BAD_VERSION
+BAD_VERSION: {{.}}cgdata: unsupported codegen data version
+
+# Header says an outlined hash tree, but the file ends after the header.
+RUN: printf '\xffcgdata\x81' > %t_eof.cgdata
+RUN: printf '\x01\x00\x00\x00' >> %t_eof.cgdata
+RUN: printf '\x01\x00\x00\x00' >> %t_eof.cgdata
+RUN: printf '\x18\x00\x00\x00\x00\x00\x00\x00' >> %t_eof.cgdata
+RUN: not llvm-cgdata show %t_eof.cgdata 2>&1 | FileCheck %s  --check-prefix EOF
+EOF: {{.}}cgdata: end of File
diff --git a/llvm/test/tools/llvm-cgdata/merge-archive.test b/llvm/test/tools/llvm-cgdata/merge-archive.test
new file mode 100644
index 00000000000000..a27d6c2a16f4ab
--- /dev/null
+++ b/llvm/test/tools/llvm-cgdata/merge-archive.test
@@ -0,0 +1,75 @@
+# Merge an archive that has two object files having cgdata (__llvm_outline)
+
+RUN: split-file %s %t
+
+RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-1.ll -o %t/merge-1.o
+RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-2.ll -o %t/merge-2.o
+RUN: llvm-ar rcs %t/merge-archive.a %t/merge-1.o %t/merge-2.o
+RUN: llvm-cgdata merge %t/merge-archive.a -o %t/merge-archive.cgdata
+RUN: llvm-cgdata show %t/merge-archive.cgdata | FileCheck %s
+CHECK: Outlined hash tree:
+CHECK-NEXT:  Total Node Count: 4
+CHECK-NEXT:  Terminal Node Count: 2
+CHECK-NEXT:  Depth: 2
+
+RUN: llvm-cgdata dump %t/merge-archive.cgdata | FileCheck %s --check-prefix TREE
+TREE: # Outlined stable hash tree
+TREE-NEXT: :outlined_hash_tree
+TREE-NEXT: ---
+TREE-NEXT: 0:
+TREE-NEXT:   Hash:            0x0
+TREE-NEXT:   Terminals:       0
+TREE-NEXT:   SuccessorIds:    [ 1 ]
+TREE-NEXT: 1:
+TREE-NEXT:   Hash:            0x1
+TREE-NEXT:   Terminals:       0
+TREE-NEXT:   SuccessorIds:    [ 2, 3 ]
+TREE-NEXT: 2:
+TREE-NEXT:   Hash:            0x3
+TREE-NEXT:   Terminals:       5
+TREE-NEXT:   SuccessorIds:    [  ]
+TREE-NEXT: 3:
+TREE-NEXT:   Hash:            0x2
+TREE-NEXT:   Terminals:       4
+TREE-NEXT:   SuccessorIds:    [  ]
+TREE-NEXT: ...
+
+;--- merge-1.ll
+
+; The .data is encoded in a binary form based on the following yaml form. See serialize() in OutlinedHashTreeRecord.cpp
+;---
+;0:
+;  Hash:            0x0
+;  Terminals:       0
+;  SuccessorIds:    [ 1 ]
+;1:
+;  Hash:            0x1
+;  Terminals:       0
+;  SuccessorIds:    [ 2 ]
+;2:
+;  Hash:            0x2
+;  Terminals:       4
+;  SuccessorIds:    [  ]
+;...
+
+ at .data = private unnamed_addr constant [72 x i8] c"\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\02\00\00\00\02\00\00\00\02\00\00\00\00\00\00\00\04\00\00\00\00\00\00\00", section "__DATA,__llvm_outline"
+
+;--- merge-2.ll
+
+; The .data is encoded in a binary form based on the following yaml form. See serialize() in OutlinedHashTreeRecord.cpp
+;---
+;0:
+;  Hash:            0x0
+;  Terminals:       0
+;  SuccessorIds:    [ 1 ]
+;1:
+;  Hash:            0x1
+;  Terminals:       0
+;  SuccessorIds:    [ 2 ]
+;2:
+;  Hash:            0x3
+;  Terminals:       5
+;  SuccessorIds:    [  ]
+;...
+
+ at .data = private unnamed_addr constant [72 x i8] c"\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\02\00\00\00\02\00\00\00\03\00\00\00\00\00\00\00\05\00\00\00\00\00\00\00", section "__DATA,__llvm_outline"
diff --git a/llvm/test/tools/llvm-cgdata/merge-concat.test b/llvm/test/tools/llvm-cgdata/merge-concat.test
new file mode 100644
index 00000000000000..3411133cb7aacb
--- /dev/null
+++ b/llvm/test/tools/llvm-cgdata/merge-concat.test
@@ -0,0 +1,68 @@
+# Merge a binary file (e.g., a linked executable) having concatenated cgdata (__llvm_outline)
+
+RUN: split-file %s %t
+
+RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-concat.ll -o %t/merge-concat.o
+RUN: llvm-cgdata merge %t/merge-concat.o -o %t/merge-concat.cgdata
+RUN: llvm-cgdata show %t/merge-concat.cgdata | FileCheck %s
+CHECK: Outlined hash tree:
+CHECK-NEXT:  Total Node Count: 4
+CHECK-NEXT:  Terminal Node Count: 2
+CHECK-NEXT:  Depth: 2
+
+RUN: llvm-cgdata dump %t/merge-concat.cgdata | FileCheck %s --check-prefix TREE
+TREE: # Outlined stable hash tree
+TREE-NEXT: :outlined_hash_tree
+TREE-NEXT: ---
+TREE-NEXT: 0:
+TREE-NEXT:   Hash:            0x0
+TREE-NEXT:   Terminals:       0
+TREE-NEXT:   SuccessorIds:    [ 1 ]
+TREE-NEXT: 1:
+TREE-NEXT:   Hash:            0x1
+TREE-NEXT:   Terminals:       0
+TREE-NEXT:   SuccessorIds:    [ 2, 3 ]
+TREE-NEXT: 2:
+TREE-NEXT:   Hash:            0x3
+TREE-NEXT:   Terminals:       5
+TREE-NEXT:   SuccessorIds:    [  ]
+TREE-NEXT: 3:
+TREE-NEXT:   Hash:            0x2
+TREE-NEXT:   Terminals:       4
+TREE-NEXT:   SuccessorIds:    [  ]
+TREE-NEXT: ...
+
+;--- merge-concat.ll
+
+; In a linked executable (as opposed to an object file), cgdata in __llvm_outline might be concatenated. Although this is not a typical workflow, we simply support this case to parse cgdata that is concatenated. In other words, the following two trees are encoded back-to-back in a binary format.
+;---
+;0:
+;  Hash:            0x0
+;  Terminals:       0
+;  SuccessorIds:    [ 1 ]
+;1:
+;  Hash:            0x1
+;  Terminals:       0
+;  SuccessorIds:    [ 2 ]
+;2:
+;  Hash:            0x2
+;  Terminals:       4
+;  SuccessorIds:    [  ]
+;...
+;---
+;0:
+;  Hash:            0x0
+;  Terminals:       0
+;  SuccessorIds:    [ 1 ]
+;1:
+;  Hash:            0x1
+;  Terminals:       0
+;  SuccessorIds:    [ 2 ]
+;2:
+;  Hash:            0x3
+;  Terminals:       5
+;  SuccessorIds:    [  ]
+;...
+
+ at .data1 = private unnamed_addr constant [72 x i8] c"\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\02\00\00\00\02\00\00\00\02\00\00\00\00\00\00\00\04\00\00\00\00\00\00\00", section "__DATA,__llvm_outline"
+ at .data2 = private unnamed_addr constant [72 x i8] c"\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\02\00\00\00\02\00\00\00\03\00\00\00\00\00\00\00\05\00\00\00\00\00\00\00", section "__DATA,__llvm_outline"
diff --git a/llvm/test/tools/llvm-cgdata/merge-double.test b/llvm/test/tools/llvm-cgdata/merge-double.test
new file mode 100644
index 00000000000000..6ce358cd72325b
--- /dev/null
+++ b/llvm/test/tools/llvm-cgdata/merge-double.test
@@ -0,0 +1,74 @@
+# Merge two object files having cgdata (__llvm_outline)
+
+RUN: split-file %s %t
+
+RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-1.ll -o %t/merge-1.o
+RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-2.ll -o %t/merge-2.o
+RUN: llvm-cgdata merge %t/merge-1.o %t/merge-2.o -o %t/merge.cgdata
+RUN: llvm-cgdata show %t/merge.cgdata | FileCheck %s
+CHECK: Outlined hash tree:
+CHECK-NEXT:  Total Node Count: 4
+CHECK-NEXT:  Terminal Node Count: 2
+CHECK-NEXT:  Depth: 2
+
+RUN: llvm-cgdata dump %t/merge.cgdata | FileCheck %s --check-prefix TREE
+TREE: # Outlined stable hash tree
+TREE-NEXT: :outlined_hash_tree
+TREE-NEXT: ---
+TREE-NEXT: 0:
+TREE-NEXT:   Hash:            0x0
+TREE-NEXT:   Terminals:       0
+TREE-NEXT:   SuccessorIds:    [ 1 ]
+TREE-NEXT: 1:
+TREE-NEXT:   Hash:            0x1
+TREE-NEXT:   Terminals:       0
+TREE-NEXT:   SuccessorIds:    [ 2, 3 ]
+TREE-NEXT: 2:
+TREE-NEXT:   Hash:            0x3
+TREE-NEXT:   Terminals:       5
+TREE-NEXT:   SuccessorIds:    [  ]
+TREE-NEXT: 3:
+TREE-NEXT:   Hash:            0x2
+TREE-NEXT:   Terminals:       4
+TREE-NEXT:   SuccessorIds:    [  ]
+TREE-NEXT: ...
+
+;--- merge-1.ll
+
+; The .data is encoded in a binary form based on the following yaml form. See serialize() in OutlinedHashTreeRecord.cpp
+;---
+;0:
+;  Hash:            0x0
+;  Terminals:       0
+;  SuccessorIds:    [ 1 ]
+;1:
+;  Hash:            0x1
+;  Terminals:       0
+;  SuccessorIds:    [ 2 ]
+;2:
+;  Hash:            0x2
+;  Terminals:       4
+;  SuccessorIds:    [  ]
+;...
+
+ at .data = private unnamed_addr constant [72 x i8] c"\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\02\00\00\00\02\00\00\00\02\00\00\00\00\00\00\00\04\00\00\00\00\00\00\00", section "__DATA,__llvm_outline"
+
+;--- merge-2.ll
+
+; The .data is encoded in a binary form based on the following yaml form. See serialize() in OutlinedHashTreeRecord.cpp
+;---
+;0:
+;  Hash:            0x0
+;  Terminals:       0
+;  SuccessorIds:    [ 1 ]
+;1:
+;  Hash:            0x1
+;  Terminals:       0
+;  SuccessorIds:    [ 2 ]
+;2:
+;  Hash:            0x3
+;  Terminals:       5
+;  SuccessorIds:    [  ]
+;...
+
+ at .data = private unnamed_addr constant [72 x i8] c"\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\02\00\00\00\02\00\00\00\03\00\00\00\00\00\00\00\05\00\00\00\00\00\00\00", section "__DATA,__llvm_outline"
diff --git a/llvm/test/tools/llvm-cgdata/merge-single.test b/llvm/test/tools/llvm-cgdata/merge-single.test
new file mode 100644
index 00000000000000..73bdd9800dbe1d
--- /dev/null
+++ b/llvm/test/tools/llvm-cgdata/merge-single.test
@@ -0,0 +1,43 @@
+# Test merge a single object file into a cgdata
+
+RUN: split-file %s %t
+
+# Merge an object file that has no cgdata (__llvm_outline). It still produces a header only cgdata.
+RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-empty.ll -o %t/merge-empty.o
+RUN: llvm-cgdata merge %t/merge-empty.o -o %t/merge-empty.cgdata
+RUN: llvm-cgdata show %t/merge-empty.cgdata | FileCheck %s --allow-empty --check-prefix EMPTY
+EMPTY-NOT: any
+
+
+# Merge an object file having cgdata (__llvm_outline)
+RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-single.ll -o %t/merge-single.o
+RUN: llvm-cgdata merge %t/merge-single.o -o %t/merge-single.cgdata
+RUN: llvm-cgdata show %t/merge-single.cgdata | FileCheck %s
+CHECK: Outlined hash tree:
+CHECK-NEXT:  Total Node Count: 3
+CHECK-NEXT:  Terminal Node Count: 1
+CHECK-NEXT:  Depth: 2
+
+;--- merge-empty.ll
+ at .data = private unnamed_addr constant [1 x i8] c"\01"
+
+;--- merge-single.ll
+
+; The .data is encoded in a binary form based on the following yaml form. See serialize() in OutlinedHashTreeRecord.cpp
+;---
+;0:
+;  Hash:            0x0
+;  Terminals:       0
+;  SuccessorIds:    [ 1 ]
+;1:
+;  Hash:            0x1
+;  Terminals:       0
+;  SuccessorIds:    [ 2 ]
+;2:
+;  Hash:            0x2
+;  Terminals:       4
+;  SuccessorIds:    [  ]
+;...
+
+ at .data = private unnamed_addr constant [72 x i8] c"\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\02\00\00\00\02\00\00\00\02\00\00\00\00\00\00\00\04\00\00\00\00\00\00\00", section "__DATA,__llvm_outline"
+
diff --git a/llvm/test/tools/llvm-cgdata/show.test b/llvm/test/tools/llvm-cgdata/show.test
new file mode 100644
index 00000000000000..accb4b77ede246
--- /dev/null
+++ b/llvm/test/tools/llvm-cgdata/show.test
@@ -0,0 +1,30 @@
+# Test show
+
+RUN: split-file %s %t
+RUN: llvm-cgdata show %t/show.cgtext | FileCheck %s
+
+CHECK: Outlined hash tree:
+CHECK-NEXT:   Total Node Count: 3
+CHECK-NEXT:   Terminal Node Count: 1
+CHECK-NEXT:   Depth: 2
+
+# Convert the text file to the binary file
+RUN: llvm-cgdata dump -binary %t/show.cgtext -o %t/show.cgdata
+RUN: llvm-cgdata show %t/show.cgdata | FileCheck %s
+
+;--- show.cgtext
+:outlined_hash_tree
+---
+0:
+  Hash:            0x0
+  Terminals:       0
+  SuccessorIds:    [ 1 ]
+1:
+  Hash:            0x1
+  Terminals:       0
+  SuccessorIds:    [ 2 ]
+2:
+  Hash:            0x2
+  Terminals:       3
+  SuccessorIds:    [  ]
+...
diff --git a/llvm/tools/llvm-cgdata/CMakeLists.txt b/llvm/tools/llvm-cgdata/CMakeLists.txt
new file mode 100644
index 00000000000000..4f1f7ff635bc3c
--- /dev/null
+++ b/llvm/tools/llvm-cgdata/CMakeLists.txt
@@ -0,0 +1,15 @@
+set(LLVM_LINK_COMPONENTS
+  CodeGen
+  CodeGenData
+  Core
+  Object
+  Support
+  )
+
+add_llvm_tool(llvm-cgdata
+  llvm-cgdata.cpp
+
+  DEPENDS
+  intrinsics_gen
+  GENERATE_DRIVER
+  )
diff --git a/llvm/tools/llvm-cgdata/llvm-cgdata.cpp b/llvm/tools/llvm-cgdata/llvm-cgdata.cpp
new file mode 100644
index 00000000000000..195f066fd6b872
--- /dev/null
+++ b/llvm/tools/llvm-cgdata/llvm-cgdata.cpp
@@ -0,0 +1,268 @@
+//===-- llvm-cgdata.cpp - LLVM CodeGen Data Tool --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// llvm-cgdata parses raw codegen data embedded in compiled binary files, and
+// merges them into a single .cgdata file. It can also inspect and manipulate
+// a .cgdata file. This .cgdata can contain various codegen data like outlining
+// information, and it can be used to optimize the code in the subsequent build.
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGenData/CodeGenDataReader.h"
+#include "llvm/CodeGenData/CodeGenDataWriter.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Object/Archive.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/LLVMDriver.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/VirtualFileSystem.h"
+#include "llvm/Support/WithColor.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace llvm::object;
+
+// TODO: https://llvm.org/docs/CommandGuide/llvm-cgdata.html has documentation
+// on each subcommand.
+cl::SubCommand DumpSubcommand(
+    "dump",
+    "Dump the (indexed) codegen data file in either text or binary format.");
+cl::SubCommand MergeSubcommand(
+    "merge", "Takes binary files having raw codegen data in custom sections, "
+             "and merge them into an index codegen data file.");
+cl::SubCommand
+    ShowSubcommand("show", "Show summary of the (indexed) codegen data file.");
+
+enum CGDataFormat {
+  CD_None = 0,
+  CD_Text,
+  CD_Binary,
+};
+
+cl::opt<std::string> OutputFilename("output", cl::value_desc("output"),
+                                    cl::init("-"), cl::desc("Output file"),
+                                    cl::sub(DumpSubcommand),
+                                    cl::sub(MergeSubcommand));
+cl::alias OutputFilenameA("o", cl::desc("Alias for --output"),
+                          cl::aliasopt(OutputFilename));
+
+cl::opt<std::string> Filename(cl::Positional, cl::desc("<cgdata-file>"),
+                              cl::sub(DumpSubcommand), cl::sub(ShowSubcommand));
+cl::list<std::string> InputFilenames(cl::Positional, cl::sub(MergeSubcommand),
+                                     cl::desc("<binary-files...>"));
+cl::opt<CGDataFormat> OutputFormat(
+    cl::desc("Format of output data"), cl::sub(DumpSubcommand),
+    cl::init(CD_Text),
+    cl::values(clEnumValN(CD_Text, "text", "Text encoding"),
+               clEnumValN(CD_Binary, "binary", "Binary encoding")));
+
+cl::opt<bool> ShowCGDataVersion("cgdata-version", cl::init(false),
+                                cl::desc("Show cgdata version. "),
+                                cl::sub(ShowSubcommand));
+
+static void exitWithError(Twine Message, std::string Whence = "",
+                          std::string Hint = "") {
+  WithColor::error();
+  if (!Whence.empty())
+    errs() << Whence << ": ";
+  errs() << Message << "\n";
+  if (!Hint.empty())
+    WithColor::note() << Hint << "\n";
+  ::exit(1);
+}
+
+static void exitWithError(Error E, StringRef Whence = "") {
+  if (E.isA<CGDataError>()) {
+    handleAllErrors(std::move(E), [&](const CGDataError &IPE) {
+      exitWithError(IPE.message(), std::string(Whence));
+    });
+    return;
+  }
+
+  exitWithError(toString(std::move(E)), std::string(Whence));
+}
+
+static void exitWithErrorCode(std::error_code EC, StringRef Whence = "") {
+  exitWithError(EC.message(), std::string(Whence));
+}
+
+static int dump_main(int argc, const char *argv[]) {
+  if (Filename == OutputFilename) {
+    errs() << sys::path::filename(argv[0]) << " " << argv[1]
+           << ": Input file name cannot be the same as the output file name!\n";
+    return 1;
+  }
+
+  std::error_code EC;
+  raw_fd_ostream OS(OutputFilename.data(), EC,
+                    OutputFormat == CD_Text ? sys::fs::OF_TextWithCRLF
+                                            : sys::fs::OF_None);
+  if (EC)
+    exitWithErrorCode(EC, OutputFilename);
+
+  auto FS = vfs::getRealFileSystem();
+  auto ReaderOrErr = CodeGenDataReader::create(Filename, *FS);
+  if (Error E = ReaderOrErr.takeError())
+    exitWithError(std::move(E), Filename);
+
+  CodeGenDataWriter Writer;
+  auto Reader = ReaderOrErr->get();
+  if (Reader->hasOutlinedHashTree()) {
+    OutlinedHashTreeRecord Record(Reader->releaseOutlinedHashTree());
+    Writer.addRecord(Record);
+  }
+
+  if (OutputFormat == CD_Text) {
+    if (Error E = Writer.writeText(OS))
+      exitWithError(std::move(E));
+  } else {
+    if (Error E = Writer.write(OS))
+      exitWithError(std::move(E));
+  }
+
+  return 0;
+}
+
+static bool handleBuffer(StringRef Filename, MemoryBufferRef Buffer,
+                         OutlinedHashTreeRecord &GlobalOutlineRecord);
+
+static bool handleArchive(StringRef Filename, Archive &Arch,
+                          OutlinedHashTreeRecord &GlobalOutlineRecord) {
+  bool Result = true;
+  Error Err = Error::success();
+  for (const auto &Child : Arch.children(Err)) {
+    auto BuffOrErr = Child.getMemoryBufferRef();
+    if (Error E = BuffOrErr.takeError())
+      exitWithError(std::move(E), Filename);
+    auto NameOrErr = Child.getName();
+    if (Error E = NameOrErr.takeError())
+      exitWithError(std::move(E), Filename);
+    std::string Name = (Filename + "(" + NameOrErr.get() + ")").str();
+    Result &= handleBuffer(Name, BuffOrErr.get(), GlobalOutlineRecord);
+  }
+  if (Err)
+    exitWithError(std::move(Err), Filename);
+  return Result;
+}
+
+static bool handleBuffer(StringRef Filename, MemoryBufferRef Buffer,
+                         OutlinedHashTreeRecord &GlobalOutlineRecord) {
+  Expected<std::unique_ptr<Binary>> BinOrErr = object::createBinary(Buffer);
+  if (Error E = BinOrErr.takeError())
+    exitWithError(std::move(E), Filename);
+
+  bool Result = true;
+  if (auto *Obj = dyn_cast<ObjectFile>(BinOrErr->get())) {
+    if (Error E =
+            CodeGenDataReader::mergeFromObjectFile(Obj, GlobalOutlineRecord))
+      exitWithError(std::move(E), Filename);
+  } else if (auto *Arch = dyn_cast<Archive>(BinOrErr->get())) {
+    Result &= handleArchive(Filename, *Arch, GlobalOutlineRecord);
+  } else {
+    // TODO: Support for the MachO universal binary format.
+    errs() << "Error: unsupported binary file: " << Filename << "\n";
+    Result = false;
+  }
+
+  return Result;
+}
+
+static bool handleFile(StringRef Filename,
+                       OutlinedHashTreeRecord &GlobalOutlineRecord) {
+  ErrorOr<std::unique_ptr<MemoryBuffer>> BuffOrErr =
+      MemoryBuffer::getFileOrSTDIN(Filename);
+  if (std::error_code EC = BuffOrErr.getError())
+    exitWithErrorCode(EC, Filename);
+  return handleBuffer(Filename, *BuffOrErr.get(), GlobalOutlineRecord);
+}
+
+static int merge_main(int argc, const char *argv[]) {
+  bool Result = true;
+  OutlinedHashTreeRecord GlobalOutlineRecord;
+  for (auto &Filename : InputFilenames)
+    Result &= handleFile(Filename, GlobalOutlineRecord);
+
+  if (!Result) {
+    errs() << "Error: failed to merge codegen data files.\n";
+    return 1;
+  }
+
+  CodeGenDataWriter Writer;
+  if (!GlobalOutlineRecord.empty())
+    Writer.addRecord(GlobalOutlineRecord);
+
+  std::error_code EC;
+  raw_fd_ostream Output(OutputFilename, EC, sys::fs::OF_None);
+  if (EC)
+    exitWithErrorCode(EC, OutputFilename);
+
+  if (auto E = Writer.write(Output))
+    exitWithError(std::move(E));
+
+  return 0;
+}
+
+static int show_main(int argc, const char *argv[]) {
+  if (Filename == OutputFilename) {
+    errs() << sys::path::filename(argv[0]) << " " << argv[1]
+           << ": Input file name cannot be the same as the output file name!\n";
+    return 1;
+  }
+
+  std::error_code EC;
+  raw_fd_ostream OS(OutputFilename.data(), EC, sys::fs::OF_TextWithCRLF);
+  if (EC)
+    exitWithErrorCode(EC, OutputFilename);
+
+  auto FS = vfs::getRealFileSystem();
+  auto ReaderOrErr = CodeGenDataReader::create(Filename, *FS);
+  if (Error E = ReaderOrErr.takeError())
+    exitWithError(std::move(E), Filename);
+
+  auto Reader = ReaderOrErr->get();
+  if (ShowCGDataVersion)
+    OS << "Version: " << Reader->getVersion() << "\n";
+
+  if (Reader->hasOutlinedHashTree()) {
+    auto Tree = Reader->releaseOutlinedHashTree();
+    OS << "Outlined hash tree:\n";
+    OS << "  Total Node Count: " << Tree->size() << "\n";
+    OS << "  Terminal Node Count: " << Tree->size(/*GetTerminalCountOnly=*/true)
+       << "\n";
+    OS << "  Depth: " << Tree->depth() << "\n";
+  }
+
+  return 0;
+}
+
+int llvm_cgdata_main(int argc, char **argvNonConst, const llvm::ToolContext &) {
+  const char **argv = const_cast<const char **>(argvNonConst);
+
+  StringRef ProgName(sys::path::filename(argv[0]));
+
+  if (argc < 2) {
+    errs() << ProgName
+           << ": No subcommand specified! Run llvm-cgdata --help for usage.\n";
+    return 1;
+  }
+
+  cl::ParseCommandLineOptions(argc, argv, "LLVM codegen data\n");
+
+  if (DumpSubcommand)
+    return dump_main(argc, argv);
+
+  if (MergeSubcommand)
+    return merge_main(argc, argv);
+
+  if (ShowSubcommand)
+    return show_main(argc, argv);
+
+  errs() << ProgName
+         << ": Unknown command. Run llvm-cgdata --help for usage.\n";
+  return 1;
+}

>From a7addac4e698acda1c8fa1cb4492ca8f7b8c6a0f Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Sat, 4 May 2024 22:19:34 -0700
Subject: [PATCH 02/12] Address comments from Ellis

---
 llvm/include/llvm/CodeGenData/CodeGenData.h   |  6 +-
 llvm/lib/CodeGenData/CodeGenDataReader.cpp    |  6 +-
 llvm/lib/CodeGenData/CodeGenDataWriter.cpp    |  4 +-
 llvm/test/tools/llvm-cgdata/dump.test         |  2 +
 llvm/test/tools/llvm-cgdata/empty.test        | 17 ++--
 llvm/test/tools/llvm-cgdata/error.test        | 10 +--
 .../test/tools/llvm-cgdata/merge-archive.test | 85 +++++++++++--------
 llvm/test/tools/llvm-cgdata/merge-concat.test | 81 +++++++++++-------
 llvm/test/tools/llvm-cgdata/merge-double.test | 85 +++++++++++--------
 llvm/test/tools/llvm-cgdata/merge-single.test | 49 ++++++-----
 llvm/tools/llvm-cgdata/llvm-cgdata.cpp        |  4 +-
 11 files changed, 201 insertions(+), 148 deletions(-)

diff --git a/llvm/include/llvm/CodeGenData/CodeGenData.h b/llvm/include/llvm/CodeGenData/CodeGenData.h
index 118fb9841d27e8..f46dc0c28cbc74 100644
--- a/llvm/include/llvm/CodeGenData/CodeGenData.h
+++ b/llvm/include/llvm/CodeGenData/CodeGenData.h
@@ -107,8 +107,8 @@ class CodeGenData {
   /// Global outlined hash tree that has oulined hash sequences across modules.
   std::unique_ptr<OutlinedHashTree> PublishedHashTree;
 
-  /// This flag is set when -fcgdata-generate is passed.
-  /// Or, it can be mutated with -ftwo-codegen-rounds during two codegen runs.
+  /// This flag is set when -fcodegen-data-generate is passed.
+  /// Or, it can be mutated with -fcodegen-data-thinlto-two-rounds.
   bool EmitCGData;
 
   /// This is a singleton instance which is thread-safe. Unlike profile data
@@ -174,7 +174,7 @@ namespace IndexedCGData {
 const uint64_t Magic = 0x81617461646763ff; // "\xffcgdata\x81"
 
 enum CGDataVersion {
-  // Version 1 is the first version. This version support the outlined
+  // Version 1 is the first version. This version supports the outlined
   // hash tree.
   Version1 = 1,
   CurrentVersion = CG_DATA_INDEX_VERSION
diff --git a/llvm/lib/CodeGenData/CodeGenDataReader.cpp b/llvm/lib/CodeGenData/CodeGenDataReader.cpp
index 1b08085dec2f25..ea73cce4894acc 100644
--- a/llvm/lib/CodeGenData/CodeGenDataReader.cpp
+++ b/llvm/lib/CodeGenData/CodeGenDataReader.cpp
@@ -144,7 +144,7 @@ Error TextCodeGenDataReader::read() {
 
   // Parse the custom header line by line.
   while (Line->starts_with(":")) {
-    StringRef Str = Line->substr(1);
+    StringRef Str = Line->drop_front().rtrim();
     if (Str.equals_insensitive("outlined_hash_tree"))
       DataKind |= CGDataKind::FunctionOutlinedHashTree;
     else
@@ -152,7 +152,7 @@ Error TextCodeGenDataReader::read() {
     ++Line;
   }
 
-  // We treat an empty header (that as a comment # only) as a valid header.
+  // We treat an empty header (that is a comment # only) as a valid header.
   if (Line.is_at_eof()) {
     if (DataKind != CGDataKind::Unknown)
       return error(cgdata_error::bad_header);
@@ -160,7 +160,7 @@ Error TextCodeGenDataReader::read() {
   }
 
   // The YAML docs follow after the header.
-  const char *Pos = (*Line).data();
+  const char *Pos = Line->data();
   size_t Size = reinterpret_cast<size_t>(DataBuffer->getBufferEnd()) -
                 reinterpret_cast<size_t>(Pos);
   yaml::Input YOS(StringRef(Pos, Size));
diff --git a/llvm/lib/CodeGenData/CodeGenDataWriter.cpp b/llvm/lib/CodeGenData/CodeGenDataWriter.cpp
index 9aa0d86223f714..3c91a1b3034503 100644
--- a/llvm/lib/CodeGenData/CodeGenDataWriter.cpp
+++ b/llvm/lib/CodeGenData/CodeGenDataWriter.cpp
@@ -108,8 +108,8 @@ Error CodeGenDataWriter::writeHeader(CGDataOStream &COS) {
 
   Header.OutlinedHashTreeOffset = 0;
 
-  // Only write out up to the CGDataKind. We need to remember the offest of the
-  // remaing fields to allow back patching later.
+  // Only write up to the CGDataKind. We need to remember the offset of the
+  // remaining fields to allow back-patching later.
   COS.write(Header.Magic);
   COS.write32(Header.Version);
   COS.write32(Header.DataKind);
diff --git a/llvm/test/tools/llvm-cgdata/dump.test b/llvm/test/tools/llvm-cgdata/dump.test
index ce2ad27a5ff81c..20e0b654973c25 100644
--- a/llvm/test/tools/llvm-cgdata/dump.test
+++ b/llvm/test/tools/llvm-cgdata/dump.test
@@ -5,7 +5,9 @@ RUN: split-file %s %t
 RUN: llvm-cgdata dump -binary %t/dump.cgtext -o %t/dump.cgdata
 RUN: llvm-cgdata dump -text %t/dump.cgdata -o %t/dump-round.cgtext
 RUN: llvm-cgdata dump -binary %t/dump-round.cgtext -o %t/dump-round.cgdata
+RUN: llvm-cgdata dump -text %t/dump-round.cgtext -o %t/dump-round-round.cgtext
 RUN: diff %t/dump.cgdata %t/dump-round.cgdata
+RUN: diff %t/dump-round.cgtext %t/dump-round-round.cgtext
 
 ;--- dump.cgtext
 # Outlined stable hash tree
diff --git a/llvm/test/tools/llvm-cgdata/empty.test b/llvm/test/tools/llvm-cgdata/empty.test
index d5e201b9eec17f..6e41f33ade9c39 100644
--- a/llvm/test/tools/llvm-cgdata/empty.test
+++ b/llvm/test/tools/llvm-cgdata/empty.test
@@ -1,22 +1,25 @@
+# Test no input file
+RUN: not llvm-cgdata dump -o - 2>&1 | FileCheck %s --check-prefix=NOFILE --ignore-case
+NOFILE: error: No such file or directory
+
 # Test for empty cgdata file, which is invalid.
 RUN: touch %t_emptyfile.cgtext
-RUN: not llvm-cgdata dump %t_emptyfile.cgtext -text -o - 2>&1 | FileCheck %s --check-prefix ERROR
-ERROR: {{.}}emptyfile.cgtext: empty codegen data
+RUN: not llvm-cgdata dump %t_emptyfile.cgtext -text 2>&1 | FileCheck %s --check-prefix=EMPTY
+EMPTY: {{.}}emptyfile.cgtext: empty codegen data
 
 # Test for empty header in the text format. It can be converted to a valid binary file.
 RUN: printf '#' > %t_emptyheader.cgtext
 RUN: llvm-cgdata dump %t_emptyheader.cgtext -binary -o %t_emptyheader.cgdata
 
 # Without any cgdata other than the header, no data shows by default.
-RUN: llvm-cgdata show %t_emptyheader.cgdata | FileCheck %s --allow-empty --check-prefix EMPTY
-EMPTY-NOT: any
+RUN: llvm-cgdata show %t_emptyheader.cgdata | count 0
 
 # The version number appears when asked, as it's in the header
-RUN: llvm-cgdata show --cgdata-version %t_emptyheader.cgdata | FileCheck %s --check-prefix VERSION
-VERSION: Version: {{.}}
+RUN: llvm-cgdata show --cgdata-version %t_emptyheader.cgdata | FileCheck %s --check-prefix=VERSION
+VERSION: Version: 1
 
 # When converting a binary file (w/ the header only) to a text file, it's an empty file as the text format does not have an explicit header.
-RUN: llvm-cgdata dump %t_emptyheader.cgdata -text -o - | FileCheck %s --allow-empty --check-prefix EMPTY
+RUN: llvm-cgdata dump %t_emptyheader.cgdata -text | count 0
 
 # Synthesize a header only cgdata.
 # struct Header {
diff --git a/llvm/test/tools/llvm-cgdata/error.test b/llvm/test/tools/llvm-cgdata/error.test
index 5e1b14de5e509d..4da22498ea3902 100644
--- a/llvm/test/tools/llvm-cgdata/error.test
+++ b/llvm/test/tools/llvm-cgdata/error.test
@@ -8,17 +8,17 @@
 #   uint64_t OutlinedHashTreeOffset;
 # }
 RUN: touch %t_empty.cgdata
-RUN: not llvm-cgdata show %t_empty.cgdata 2>&1 | FileCheck %s --check-prefix EMPTY
+RUN: not llvm-cgdata show %t_empty.cgdata 2>&1 | FileCheck %s --check-prefix=EMPTY
 EMPTY: {{.}}cgdata: empty codegen data
 
 # Not a magic.
 RUN: printf '\xff' > %t_malformed.cgdata
-RUN: not llvm-cgdata show %t_malformed.cgdata 2>&1 | FileCheck %s --check-prefix MALFORMED
+RUN: not llvm-cgdata show %t_malformed.cgdata 2>&1 | FileCheck %s --check-prefix=MALFORMED
 MALFORMED: {{.}}cgdata: malformed codegen data
 
 # The minimum header size is 24.
 RUN: printf '\xffcgdata\x81' > %t_corrupt.cgdata
-RUN: not llvm-cgdata show %t_corrupt.cgdata 2>&1 | FileCheck %s  --check-prefix CORRUPT
+RUN: not llvm-cgdata show %t_corrupt.cgdata 2>&1 | FileCheck %s  --check-prefix=CORRUPT
 CORRUPT: {{.}}cgdata: invalid codegen data (file header is corrupt)
 
 # The current version 1 while the header says 2.
@@ -26,7 +26,7 @@ RUN: printf '\xffcgdata\x81' > %t_version.cgdata
 RUN: printf '\x02\x00\x00\x00' >> %t_version.cgdata
 RUN: printf '\x00\x00\x00\x00' >> %t_version.cgdata
 RUN: printf '\x18\x00\x00\x00\x00\x00\x00\x00' >> %t_version.cgdata
-RUN: not llvm-cgdata show %t_version.cgdata 2>&1 | FileCheck %s  --check-prefix BAD_VERSION
+RUN: not llvm-cgdata show %t_version.cgdata 2>&1 | FileCheck %s  --check-prefix=BAD_VERSION
 BAD_VERSION: {{.}}cgdata: unsupported codegen data version
 
 # Header says an outlined hash tree, but the file ends after the header.
@@ -34,5 +34,5 @@ RUN: printf '\xffcgdata\x81' > %t_eof.cgdata
 RUN: printf '\x01\x00\x00\x00' >> %t_eof.cgdata
 RUN: printf '\x01\x00\x00\x00' >> %t_eof.cgdata
 RUN: printf '\x18\x00\x00\x00\x00\x00\x00\x00' >> %t_eof.cgdata
-RUN: not llvm-cgdata show %t_eof.cgdata 2>&1 | FileCheck %s  --check-prefix EOF
+RUN: not llvm-cgdata show %t_eof.cgdata 2>&1 | FileCheck %s  --check-prefix=EOF
 EOF: {{.}}cgdata: end of File
diff --git a/llvm/test/tools/llvm-cgdata/merge-archive.test b/llvm/test/tools/llvm-cgdata/merge-archive.test
index a27d6c2a16f4ab..378e7a76058813 100644
--- a/llvm/test/tools/llvm-cgdata/merge-archive.test
+++ b/llvm/test/tools/llvm-cgdata/merge-archive.test
@@ -1,10 +1,26 @@
+# REQUIRES: shell
+# UNSUPPORTED: system-windows
+
 # Merge an archive that has two object files having cgdata (__llvm_outline)
 
 RUN: split-file %s %t
 
+# Synthesize raw cgdata without the header (24 byte) from the indexed cgdata.
+RUN: llvm-cgdata dump -binary %t/raw-1.cgtext -o %t/raw-1.cgdata
+RUN: od -t x1 -j 24 -An %t/raw-1.cgdata | tr -d '\n' | sed 's/ /\\\\/g' > %t/raw-1-bytes.txt
+RUN: sed -i "s/<RAW_1_BYTES>/$(cat %t/raw-1-bytes.txt)/g" %t/merge-1.ll
 RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-1.ll -o %t/merge-1.o
+
+# Synthesize raw cgdata without the header (24 byte) from the indexed cgdata.
+RUN: llvm-cgdata dump -binary %t/raw-2.cgtext -o %t/raw-2.cgdata
+RUN: od -t x1 -j 24 -An %t/raw-2.cgdata | tr -d '\n' | sed 's/ /\\\\/g' > %t/raw-2-bytes.txt
+RUN: sed -i "s/<RAW_2_BYTES>/$(cat %t/raw-2-bytes.txt)/g" %t/merge-2.ll
 RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-2.ll -o %t/merge-2.o
+
+# Make an archive from two object files
 RUN: llvm-ar rcs %t/merge-archive.a %t/merge-1.o %t/merge-2.o
+
+# Merge the archive into the codegen data file.
 RUN: llvm-cgdata merge %t/merge-archive.a -o %t/merge-archive.cgdata
 RUN: llvm-cgdata show %t/merge-archive.cgdata | FileCheck %s
 CHECK: Outlined hash tree:
@@ -12,7 +28,7 @@ CHECK-NEXT:  Total Node Count: 4
 CHECK-NEXT:  Terminal Node Count: 2
 CHECK-NEXT:  Depth: 2
 
-RUN: llvm-cgdata dump %t/merge-archive.cgdata | FileCheck %s --check-prefix TREE
+RUN: llvm-cgdata dump %t/merge-archive.cgdata | FileCheck %s --check-prefix=TREE
 TREE: # Outlined stable hash tree
 TREE-NEXT: :outlined_hash_tree
 TREE-NEXT: ---
@@ -34,42 +50,41 @@ TREE-NEXT:   Terminals:       4
 TREE-NEXT:   SuccessorIds:    [  ]
 TREE-NEXT: ...
 
+;--- raw-1.cgtext
+:outlined_hash_tree
+0:
+  Hash:            0x0
+  Terminals:       0
+  SuccessorIds:    [ 1 ]
+1:
+  Hash:            0x1
+  Terminals:       0
+  SuccessorIds:    [ 2 ]
+2:
+  Hash:            0x2
+  Terminals:       4
+  SuccessorIds:    [  ]
+...
+
 ;--- merge-1.ll
+ at .data = private unnamed_addr constant [72 x i8] c"<RAW_1_BYTES>", section "__DATA,__llvm_outline"
 
-; The .data is encoded in a binary form based on the following yaml form. See serialize() in OutlinedHashTreeRecord.cpp
-;---
-;0:
-;  Hash:            0x0
-;  Terminals:       0
-;  SuccessorIds:    [ 1 ]
-;1:
-;  Hash:            0x1
-;  Terminals:       0
-;  SuccessorIds:    [ 2 ]
-;2:
-;  Hash:            0x2
-;  Terminals:       4
-;  SuccessorIds:    [  ]
-;...
 
- at .data = private unnamed_addr constant [72 x i8] c"\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\02\00\00\00\02\00\00\00\02\00\00\00\00\00\00\00\04\00\00\00\00\00\00\00", section "__DATA,__llvm_outline"
+;--- raw-2.cgtext
+:outlined_hash_tree
+0:
+  Hash:            0x0
+  Terminals:       0
+  SuccessorIds:    [ 1 ]
+1:
+  Hash:            0x1
+  Terminals:       0
+  SuccessorIds:    [ 2 ]
+2:
+  Hash:            0x3
+  Terminals:       5
+  SuccessorIds:    [  ]
+...
 
 ;--- merge-2.ll
-
-; The .data is encoded in a binary form based on the following yaml form. See serialize() in OutlinedHashTreeRecord.cpp
-;---
-;0:
-;  Hash:            0x0
-;  Terminals:       0
-;  SuccessorIds:    [ 1 ]
-;1:
-;  Hash:            0x1
-;  Terminals:       0
-;  SuccessorIds:    [ 2 ]
-;2:
-;  Hash:            0x3
-;  Terminals:       5
-;  SuccessorIds:    [  ]
-;...
-
- at .data = private unnamed_addr constant [72 x i8] c"\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\02\00\00\00\02\00\00\00\03\00\00\00\00\00\00\00\05\00\00\00\00\00\00\00", section "__DATA,__llvm_outline"
+ at .data = private unnamed_addr constant [72 x i8] c"<RAW_2_BYTES>", section "__DATA,__llvm_outline"
diff --git a/llvm/test/tools/llvm-cgdata/merge-concat.test b/llvm/test/tools/llvm-cgdata/merge-concat.test
index 3411133cb7aacb..4a361d86999c24 100644
--- a/llvm/test/tools/llvm-cgdata/merge-concat.test
+++ b/llvm/test/tools/llvm-cgdata/merge-concat.test
@@ -1,7 +1,19 @@
+# REQUIRES: shell
+# UNSUPPORTED: system-windows
+
 # Merge a binary file (e.g., a linked executable) having concatnated cgdata (__llvm_outline)
 
 RUN: split-file %s %t
 
+# Synthesize two set of raw cgdata without the header (24 byte) from the indexed cgdata.
+# Concatenate them in merge-concat.ll
+RUN: llvm-cgdata dump -binary %t/raw-1.cgtext -o %t/raw-1.cgdata
+RUN: od -t x1 -j 24 -An %t/raw-1.cgdata | tr -d '\n' | sed 's/ /\\\\/g' > %t/raw-1-bytes.txt
+RUN: sed -i "s/<RAW_1_BYTES>/$(cat %t/raw-1-bytes.txt)/g" %t/merge-concat.ll
+RUN: llvm-cgdata dump -binary %t/raw-2.cgtext -o %t/raw-2.cgdata
+RUN: od -t x1 -j 24 -An %t/raw-2.cgdata | tr -d '\n' | sed 's/ /\\\\/g' > %t/raw-2-bytes.txt
+RUN: sed -i "s/<RAW_2_BYTES>/$(cat %t/raw-2-bytes.txt)/g" %t/merge-concat.ll
+
 RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-concat.ll -o %t/merge-concat.o
 RUN: llvm-cgdata merge %t/merge-concat.o -o %t/merge-concat.cgdata
 RUN: llvm-cgdata show %t/merge-concat.cgdata | FileCheck %s
@@ -10,7 +22,7 @@ CHECK-NEXT:  Total Node Count: 4
 CHECK-NEXT:  Terminal Node Count: 2
 CHECK-NEXT:  Depth: 2
 
-RUN: llvm-cgdata dump %t/merge-concat.cgdata | FileCheck %s --check-prefix TREE
+RUN: llvm-cgdata dump %t/merge-concat.cgdata | FileCheck %s --check-prefix=TREE
 TREE: # Outlined stable hash tree
 TREE-NEXT: :outlined_hash_tree
 TREE-NEXT: ---
@@ -32,37 +44,40 @@ TREE-NEXT:   Terminals:       4
 TREE-NEXT:   SuccessorIds:    [  ]
 TREE-NEXT: ...
 
-;--- merge-concat.ll
+;--- raw-1.cgtext
+:outlined_hash_tree
+0:
+  Hash:            0x0
+  Terminals:       0
+  SuccessorIds:    [ 1 ]
+1:
+  Hash:            0x1
+  Terminals:       0
+  SuccessorIds:    [ 2 ]
+2:
+  Hash:            0x2
+  Terminals:       4
+  SuccessorIds:    [  ]
+...
 
-; In an linked executable (as opposed to an object file), cgdata in __llvm_outline might be concatenated. Although this is not a typical workflow, we simply support this case to parse cgdata that is concatenated. In other word, the following two trees are encoded back-to-back in a binary format.
-;---
-;0:
-;  Hash:            0x0
-;  Terminals:       0
-;  SuccessorIds:    [ 1 ]
-;1:
-;  Hash:            0x1
-;  Terminals:       0
-;  SuccessorIds:    [ 2 ]
-;2:
-;  Hash:            0x2
-;  Terminals:       4
-;  SuccessorIds:    [  ]
-;...
-;---
-;0:
-;  Hash:            0x0
-;  Terminals:       0
-;  SuccessorIds:    [ 1 ]
-;1:
-;  Hash:            0x1
-;  Terminals:       0
-;  SuccessorIds:    [ 2 ]
-;2:
-;  Hash:            0x3
-;  Terminals:       5
-;  SuccessorIds:    [  ]
-;...
+;--- raw-2.cgtext
+:outlined_hash_tree
+0:
+  Hash:            0x0
+  Terminals:       0
+  SuccessorIds:    [ 1 ]
+1:
+  Hash:            0x1
+  Terminals:       0
+  SuccessorIds:    [ 2 ]
+2:
+  Hash:            0x3
+  Terminals:       5
+  SuccessorIds:    [  ]
+...
+
+;--- merge-concat.ll
 
- at .data1 = private unnamed_addr constant [72 x i8] c"\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\02\00\00\00\02\00\00\00\02\00\00\00\00\00\00\00\04\00\00\00\00\00\00\00", section "__DATA,__llvm_outline"
- at .data2 = private unnamed_addr constant [72 x i8] c"\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\02\00\00\00\02\00\00\00\03\00\00\00\00\00\00\00\05\00\00\00\00\00\00\00", section "__DATA,__llvm_outline"
+; In an linked executable (as opposed to an object file), cgdata in __llvm_outline might be concatenated. Although this is not a typical workflow, we simply support this case to parse cgdata that is concatenated. In other words, the following two trees are encoded back-to-back in a binary format.
+ at .data1 = private unnamed_addr constant [72 x i8] c"<RAW_1_BYTES>", section "__DATA,__llvm_outline"
+ at .data2 = private unnamed_addr constant [72 x i8] c"<RAW_2_BYTES>", section "__DATA,__llvm_outline"
diff --git a/llvm/test/tools/llvm-cgdata/merge-double.test b/llvm/test/tools/llvm-cgdata/merge-double.test
index 6ce358cd72325b..4fe9403aabdff1 100644
--- a/llvm/test/tools/llvm-cgdata/merge-double.test
+++ b/llvm/test/tools/llvm-cgdata/merge-double.test
@@ -1,17 +1,32 @@
+# REQUIRES: shell
+# UNSUPPORTED: system-windows
+
 # Merge two object files having cgdata (__llvm_outline)
 
 RUN: split-file %s %t
 
+# Synthesize raw cgdata without the header (24 byte) from the indexed cgdata.
+RUN: llvm-cgdata dump -binary %t/raw-1.cgtext -o %t/raw-1.cgdata
+RUN: od -t x1 -j 24 -An %t/raw-1.cgdata | tr -d '\n' | sed 's/ /\\\\/g' > %t/raw-1-bytes.txt
+RUN: sed -i "s/<RAW_1_BYTES>/$(cat %t/raw-1-bytes.txt)/g" %t/merge-1.ll
 RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-1.ll -o %t/merge-1.o
+
+# Synthesize raw cgdata without the header (24 byte) from the indexed cgdata.
+RUN: llvm-cgdata dump -binary %t/raw-2.cgtext -o %t/raw-2.cgdata
+RUN: od -t x1 -j 24 -An %t/raw-2.cgdata | tr -d '\n' | sed 's/ /\\\\/g' > %t/raw-2-bytes.txt
+RUN: sed -i "s/<RAW_2_BYTES>/$(cat %t/raw-2-bytes.txt)/g" %t/merge-2.ll
 RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-2.ll -o %t/merge-2.o
+
+# Merge two object files into the codegen data file.
 RUN: llvm-cgdata merge %t/merge-1.o %t/merge-2.o -o %t/merge.cgdata
+
 RUN: llvm-cgdata show %t/merge.cgdata | FileCheck %s
 CHECK: Outlined hash tree:
 CHECK-NEXT:  Total Node Count: 4
 CHECK-NEXT:  Terminal Node Count: 2
 CHECK-NEXT:  Depth: 2
 
-RUN: llvm-cgdata dump %t/merge.cgdata | FileCheck %s --check-prefix TREE
+RUN: llvm-cgdata dump %t/merge.cgdata | FileCheck %s --check-prefix=TREE
 TREE: # Outlined stable hash tree
 TREE-NEXT: :outlined_hash_tree
 TREE-NEXT: ---
@@ -33,42 +48,40 @@ TREE-NEXT:   Terminals:       4
 TREE-NEXT:   SuccessorIds:    [  ]
 TREE-NEXT: ...
 
-;--- merge-1.ll
+;--- raw-1.cgtext
+:outlined_hash_tree
+0:
+  Hash:            0x0
+  Terminals:       0
+  SuccessorIds:    [ 1 ]
+1:
+  Hash:            0x1
+  Terminals:       0
+  SuccessorIds:    [ 2 ]
+2:
+  Hash:            0x2
+  Terminals:       4
+  SuccessorIds:    [  ]
+...
 
-; The .data is encoded in a binary form based on the following yaml form. See serialize() in OutlinedHashTreeRecord.cpp
-;---
-;0:
-;  Hash:            0x0
-;  Terminals:       0
-;  SuccessorIds:    [ 1 ]
-;1:
-;  Hash:            0x1
-;  Terminals:       0
-;  SuccessorIds:    [ 2 ]
-;2:
-;  Hash:            0x2
-;  Terminals:       4
-;  SuccessorIds:    [  ]
-;...
+;--- merge-1.ll
+ at .data = private unnamed_addr constant [72 x i8] c"<RAW_1_BYTES>", section "__DATA,__llvm_outline"
 
- at .data = private unnamed_addr constant [72 x i8] c"\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\02\00\00\00\02\00\00\00\02\00\00\00\00\00\00\00\04\00\00\00\00\00\00\00", section "__DATA,__llvm_outline"
+;--- raw-2.cgtext
+:outlined_hash_tree
+0:
+  Hash:            0x0
+  Terminals:       0
+  SuccessorIds:    [ 1 ]
+1:
+  Hash:            0x1
+  Terminals:       0
+  SuccessorIds:    [ 2 ]
+2:
+  Hash:            0x3
+  Terminals:       5
+  SuccessorIds:    [  ]
+...
 
 ;--- merge-2.ll
-
-; The .data is encoded in a binary form based on the following yaml form. See serialize() in OutlinedHashTreeRecord.cpp
-;---
-;0:
-;  Hash:            0x0
-;  Terminals:       0
-;  SuccessorIds:    [ 1 ]
-;1:
-;  Hash:            0x1
-;  Terminals:       0
-;  SuccessorIds:    [ 2 ]
-;2:
-;  Hash:            0x3
-;  Terminals:       5
-;  SuccessorIds:    [  ]
-;...
-
- at .data = private unnamed_addr constant [72 x i8] c"\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\02\00\00\00\02\00\00\00\03\00\00\00\00\00\00\00\05\00\00\00\00\00\00\00", section "__DATA,__llvm_outline"
+ at .data = private unnamed_addr constant [72 x i8] c"<RAW_2_BYTES>", section "__DATA,__llvm_outline"
diff --git a/llvm/test/tools/llvm-cgdata/merge-single.test b/llvm/test/tools/llvm-cgdata/merge-single.test
index 73bdd9800dbe1d..a591ca89be9595 100644
--- a/llvm/test/tools/llvm-cgdata/merge-single.test
+++ b/llvm/test/tools/llvm-cgdata/merge-single.test
@@ -1,3 +1,6 @@
+# REQUIRES: shell
+# UNSUPPORTED: system-windows
+
 # Test merge a single object file into a cgdata
 
 RUN: split-file %s %t
@@ -5,12 +8,16 @@ RUN: split-file %s %t
 # Merge an object file that has no cgdata (__llvm_outline). It still produces a header only cgdata.
 RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-empty.ll -o %t/merge-empty.o
 RUN: llvm-cgdata merge %t/merge-empty.o -o %t/merge-empty.cgdata
-RUN: llvm-cgdata show %t/merge-empty.cgdata | FileCheck %s --allow-empty --check-prefix EMPTY
-EMPTY-NOT: any
+# No summary appear with the header only cgdata.
+RUN: llvm-cgdata show %t/merge-empty.cgdata | count 0
 
+# Synthesize raw cgdata without the header (24 byte) from the indexed cgdata.
+RUN: llvm-cgdata dump -binary %t/raw-single.cgtext -o %t/raw-single.cgdata
+RUN: od -t x1 -j 24 -An %t/raw-single.cgdata | tr -d '\n' | sed 's/ /\\\\/g' > %t/raw-single-bytes.txt
+RUN: sed -i "s/<RAW_1_BYTES>/$(cat %t/raw-single-bytes.txt)/g" %t/merge-single.ll
+RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-single.ll -o %t/merge-single.o
 
 # Merge an object file having cgdata (__llvm_outline)
-RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-single.ll -o %t/merge-single.o
 RUN: llvm-cgdata merge %t/merge-single.o -o %t/merge-single.cgdata
 RUN: llvm-cgdata show %t/merge-single.cgdata | FileCheck %s
 CHECK: Outlined hash tree:
@@ -21,23 +28,21 @@ CHECK-NEXT:  Depth: 2
 ;--- merge-empty.ll
 @.data = private unnamed_addr constant [1 x i8] c"\01"
 
-;--- merge-single.ll
-
-; The .data is encoded in a binary form based on the following yaml form. See serialize() in OutlinedHashTreeRecord.cpp
-;---
-;0:
-;  Hash:            0x0
-;  Terminals:       0
-;  SuccessorIds:    [ 1 ]
-;1:
-;  Hash:            0x1
-;  Terminals:       0
-;  SuccessorIds:    [ 2 ]
-;2:
-;  Hash:            0x2
-;  Terminals:       4
-;  SuccessorIds:    [  ]
-;...
-
- at .data = private unnamed_addr constant [72 x i8] c"\03\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\01\00\00\00\00\00\00\00\00\00\00\00\01\00\00\00\02\00\00\00\02\00\00\00\02\00\00\00\00\00\00\00\04\00\00\00\00\00\00\00", section "__DATA,__llvm_outline"
+;--- raw-single.cgtext
+:outlined_hash_tree
+0:
+  Hash:            0x0
+  Terminals:       0
+  SuccessorIds:    [ 1 ]
+1:
+  Hash:            0x1
+  Terminals:       0
+  SuccessorIds:    [ 2 ]
+2:
+  Hash:            0x2
+  Terminals:       4
+  SuccessorIds:    [  ]
+...
 
+;--- merge-single.ll
+ at .data = private unnamed_addr constant [72 x i8] c"<RAW_1_BYTES>", section "__DATA,__llvm_outline"
diff --git a/llvm/tools/llvm-cgdata/llvm-cgdata.cpp b/llvm/tools/llvm-cgdata/llvm-cgdata.cpp
index 195f066fd6b872..3303ffd9d863bd 100644
--- a/llvm/tools/llvm-cgdata/llvm-cgdata.cpp
+++ b/llvm/tools/llvm-cgdata/llvm-cgdata.cpp
@@ -61,8 +61,8 @@ cl::opt<CGDataFormat> OutputFormat(
     cl::values(clEnumValN(CD_Text, "text", "Text encoding"),
                clEnumValN(CD_Binary, "binary", "Binary encoding")));
 
-cl::opt<bool> ShowCGDataVersion("cgdata-version", cl::init(false),
-                                cl::desc("Show cgdata version. "),
+cl::opt<bool> ShowCGDataVersion("cgdata-version",
+                                cl::desc("Show cgdata version."),
                                 cl::sub(ShowSubcommand));
 
 static void exitWithError(Twine Message, std::string Whence = "",

>From 58decfcd75d33ee73c367eaaf3c1d08d2a2b9050 Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at fb.com>
Date: Sun, 7 Jul 2024 09:01:47 +0900
Subject: [PATCH 03/12] test fix on macosx

---
 llvm/test/tools/llvm-cgdata/merge-archive.test | 8 ++++----
 llvm/test/tools/llvm-cgdata/merge-concat.test  | 8 ++++----
 llvm/test/tools/llvm-cgdata/merge-double.test  | 8 ++++----
 llvm/test/tools/llvm-cgdata/merge-single.test  | 5 +++--
 4 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/llvm/test/tools/llvm-cgdata/merge-archive.test b/llvm/test/tools/llvm-cgdata/merge-archive.test
index 378e7a76058813..54e6b5f589e0c6 100644
--- a/llvm/test/tools/llvm-cgdata/merge-archive.test
+++ b/llvm/test/tools/llvm-cgdata/merge-archive.test
@@ -7,14 +7,14 @@ RUN: split-file %s %t
 
 # Synthesize raw cgdata without the header (24 byte) from the indexed cgdata.
 RUN: llvm-cgdata dump -binary %t/raw-1.cgtext -o %t/raw-1.cgdata
-RUN: od -t x1 -j 24 -An %t/raw-1.cgdata | tr -d '\n' | sed 's/ /\\\\/g' > %t/raw-1-bytes.txt
-RUN: sed -i "s/<RAW_1_BYTES>/$(cat %t/raw-1-bytes.txt)/g" %t/merge-1.ll
+RUN: od -t x1 -j 24 -An %t/raw-1.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-1-bytes.txt
+RUN: sed -ie "s/<RAW_1_BYTES>/$(cat %t/raw-1-bytes.txt)/g" %t/merge-1.ll
 RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-1.ll -o %t/merge-1.o
 
 # Synthesize raw cgdata without the header (24 byte) from the indexed cgdata.
 RUN: llvm-cgdata dump -binary %t/raw-2.cgtext -o %t/raw-2.cgdata
-RUN: od -t x1 -j 24 -An %t/raw-2.cgdata | tr -d '\n' | sed 's/ /\\\\/g' > %t/raw-2-bytes.txt
-RUN: sed -i "s/<RAW_2_BYTES>/$(cat %t/raw-2-bytes.txt)/g" %t/merge-2.ll
+RUN: od -t x1 -j 24 -An %t/raw-2.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-2-bytes.txt
+RUN: sed -ie "s/<RAW_2_BYTES>/$(cat %t/raw-2-bytes.txt)/g" %t/merge-2.ll
 RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-2.ll -o %t/merge-2.o
 
 # Make an archive from two object files
diff --git a/llvm/test/tools/llvm-cgdata/merge-concat.test b/llvm/test/tools/llvm-cgdata/merge-concat.test
index 4a361d86999c24..13599c972d6f17 100644
--- a/llvm/test/tools/llvm-cgdata/merge-concat.test
+++ b/llvm/test/tools/llvm-cgdata/merge-concat.test
@@ -8,11 +8,11 @@ RUN: split-file %s %t
 # Synthesize two set of raw cgdata without the header (24 byte) from the indexed cgdata.
 # Concatenate them in merge-concat.ll
 RUN: llvm-cgdata dump -binary %t/raw-1.cgtext -o %t/raw-1.cgdata
-RUN: od -t x1 -j 24 -An %t/raw-1.cgdata | tr -d '\n' | sed 's/ /\\\\/g' > %t/raw-1-bytes.txt
-RUN: sed -i "s/<RAW_1_BYTES>/$(cat %t/raw-1-bytes.txt)/g" %t/merge-concat.ll
+RUN: od -t x1 -j 24 -An %t/raw-1.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-1-bytes.txt
+RUN: sed -ie "s/<RAW_1_BYTES>/$(cat %t/raw-1-bytes.txt)/g" %t/merge-concat.ll
 RUN: llvm-cgdata dump -binary %t/raw-2.cgtext -o %t/raw-2.cgdata
-RUN: od -t x1 -j 24 -An %t/raw-2.cgdata | tr -d '\n' | sed 's/ /\\\\/g' > %t/raw-2-bytes.txt
-RUN: sed -i "s/<RAW_2_BYTES>/$(cat %t/raw-2-bytes.txt)/g" %t/merge-concat.ll
+RUN: od -t x1 -j 24 -An %t/raw-2.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-2-bytes.txt
+RUN: sed -ie "s/<RAW_2_BYTES>/$(cat %t/raw-2-bytes.txt)/g" %t/merge-concat.ll
 
 RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-concat.ll -o %t/merge-concat.o
 RUN: llvm-cgdata merge %t/merge-concat.o -o %t/merge-concat.cgdata
diff --git a/llvm/test/tools/llvm-cgdata/merge-double.test b/llvm/test/tools/llvm-cgdata/merge-double.test
index 4fe9403aabdff1..0259d57f1625af 100644
--- a/llvm/test/tools/llvm-cgdata/merge-double.test
+++ b/llvm/test/tools/llvm-cgdata/merge-double.test
@@ -7,14 +7,14 @@ RUN: split-file %s %t
 
 # Synthesize raw cgdata without the header (24 byte) from the indexed cgdata.
 RUN: llvm-cgdata dump -binary %t/raw-1.cgtext -o %t/raw-1.cgdata
-RUN: od -t x1 -j 24 -An %t/raw-1.cgdata | tr -d '\n' | sed 's/ /\\\\/g' > %t/raw-1-bytes.txt
-RUN: sed -i "s/<RAW_1_BYTES>/$(cat %t/raw-1-bytes.txt)/g" %t/merge-1.ll
+RUN: od -t x1 -j 24 -An %t/raw-1.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-1-bytes.txt
+RUN: sed -ie "s/<RAW_1_BYTES>/$(cat %t/raw-1-bytes.txt)/g" %t/merge-1.ll
 RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-1.ll -o %t/merge-1.o
 
 # Synthesize raw cgdata without the header (24 byte) from the indexed cgdata.
 RUN: llvm-cgdata dump -binary %t/raw-2.cgtext -o %t/raw-2.cgdata
-RUN: od -t x1 -j 24 -An %t/raw-2.cgdata | tr -d '\n' | sed 's/ /\\\\/g' > %t/raw-2-bytes.txt
-RUN: sed -i "s/<RAW_2_BYTES>/$(cat %t/raw-2-bytes.txt)/g" %t/merge-2.ll
+RUN: od -t x1 -j 24 -An %t/raw-2.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-2-bytes.txt
+RUN: sed -ie "s/<RAW_2_BYTES>/$(cat %t/raw-2-bytes.txt)/g" %t/merge-2.ll
 RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-2.ll -o %t/merge-2.o
 
 # Merge two object files into the codegen data file.
diff --git a/llvm/test/tools/llvm-cgdata/merge-single.test b/llvm/test/tools/llvm-cgdata/merge-single.test
index a591ca89be9595..ecaafaa7d38e78 100644
--- a/llvm/test/tools/llvm-cgdata/merge-single.test
+++ b/llvm/test/tools/llvm-cgdata/merge-single.test
@@ -13,8 +13,9 @@ RUN: llvm-cgdata show %t/merge-empty.cgdata | count 0
 
 # Synthesize raw cgdata without the header (24 byte) from the indexed cgdata.
 RUN: llvm-cgdata dump -binary %t/raw-single.cgtext -o %t/raw-single.cgdata
-RUN: od -t x1 -j 24 -An %t/raw-single.cgdata | tr -d '\n' | sed 's/ /\\\\/g' > %t/raw-single-bytes.txt
-RUN: sed -i "s/<RAW_1_BYTES>/$(cat %t/raw-single-bytes.txt)/g" %t/merge-single.ll
+RUN: od -t x1 -j 24 -An %t/raw-single.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-single-bytes.txt
+
+RUN: sed -ie "s/<RAW_1_BYTES>/$(cat %t/raw-single-bytes.txt)/g" %t/merge-single.ll
 RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-single.ll -o %t/merge-single.o
 
 # Merge an object file having cgdata (__llvm_outline)

>From 4f932b9daca38155b87f00a68a9baa6a870dda12 Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Wed, 10 Jul 2024 16:46:31 -0700
Subject: [PATCH 04/12] Address feedback from Alex

---
 llvm/include/llvm/CodeGenData/CodeGenData.h | 2 +-
 llvm/lib/CodeGenData/CodeGenDataReader.cpp  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/CodeGenData/CodeGenData.h b/llvm/include/llvm/CodeGenData/CodeGenData.h
index f46dc0c28cbc74..934ad20dd652db 100644
--- a/llvm/include/llvm/CodeGenData/CodeGenData.h
+++ b/llvm/include/llvm/CodeGenData/CodeGenData.h
@@ -81,7 +81,7 @@ class CGDataError : public ErrorInfo<CGDataError> {
   /// contain a single CGDataError.
   static std::pair<cgdata_error, std::string> take(Error E) {
     auto Err = cgdata_error::success;
-    std::string Msg = "";
+    std::string Msg;
     handleAllErrors(std::move(E), [&Err, &Msg](const CGDataError &IPE) {
       assert(Err == cgdata_error::success && "Multiple errors encountered");
       Err = IPE.get();
diff --git a/llvm/lib/CodeGenData/CodeGenDataReader.cpp b/llvm/lib/CodeGenData/CodeGenDataReader.cpp
index ea73cce4894acc..9ef182deda55eb 100644
--- a/llvm/lib/CodeGenData/CodeGenDataReader.cpp
+++ b/llvm/lib/CodeGenData/CodeGenDataReader.cpp
@@ -121,7 +121,7 @@ CodeGenDataReader::create(std::unique_ptr<MemoryBuffer> Buffer) {
 
 bool IndexedCodeGenDataReader::hasFormat(const MemoryBuffer &DataBuffer) {
   using namespace support;
-  if (DataBuffer.getBufferSize() < 8)
+  if (DataBuffer.getBufferSize() < sizeof(IndexedCGData::Magic))
     return false;
 
   uint64_t Magic = endian::read<uint64_t, llvm::endianness::little, aligned>(

>From 93de5482108be18e225e3362b9cbe429619891fa Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Sat, 20 Jul 2024 18:48:01 -0700
Subject: [PATCH 05/12] Address comments from Vincent

---
 llvm/include/llvm/CodeGenData/CodeGenData.h   | 4 +++-
 llvm/test/tools/llvm-cgdata/merge-concat.test | 4 ++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/llvm/include/llvm/CodeGenData/CodeGenData.h b/llvm/include/llvm/CodeGenData/CodeGenData.h
index 934ad20dd652db..659008c78abd93 100644
--- a/llvm/include/llvm/CodeGenData/CodeGenData.h
+++ b/llvm/include/llvm/CodeGenData/CodeGenData.h
@@ -171,7 +171,9 @@ void warn(Twine Message, std::string Whence = "", std::string Hint = "");
 
 namespace IndexedCGData {
 
-const uint64_t Magic = 0x81617461646763ff; // "\xffcgdata\x81"
+// A signature for data validation, representing "\xffcgdata\x81" in
+// little-endian order
+const uint64_t Magic = 0x81617461646763ff;
 
 enum CGDataVersion {
   // Version 1 is the first version. This version supports the outlined
diff --git a/llvm/test/tools/llvm-cgdata/merge-concat.test b/llvm/test/tools/llvm-cgdata/merge-concat.test
index 13599c972d6f17..0113bccbfedf9e 100644
--- a/llvm/test/tools/llvm-cgdata/merge-concat.test
+++ b/llvm/test/tools/llvm-cgdata/merge-concat.test
@@ -1,11 +1,11 @@
 # REQUIRES: shell
 # UNSUPPORTED: system-windows
 
-# Merge a binary file (e.g., a linked executable) having concatnated cgdata (__llvm_outline)
+# Merge a binary file (e.g., a linked executable) having concatenated cgdata (__llvm_outline)
 
 RUN: split-file %s %t
 
-# Synthesize two set of raw cgdata without the header (24 byte) from the indexed cgdata.
+# Synthesize two sets of raw cgdata without the header (24 byte) from the indexed cgdata.
 # Concatenate them in merge-concat.ll
 RUN: llvm-cgdata dump -binary %t/raw-1.cgtext -o %t/raw-1.cgdata
 RUN: od -t x1 -j 24 -An %t/raw-1.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-1-bytes.txt

>From 6e528b727b256b99e758ab93c170c4ad0cb0361b Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Sat, 20 Jul 2024 19:45:59 -0700
Subject: [PATCH 06/12] Address comments from Ellis #2

---
 llvm/include/llvm/CodeGenData/CodeGenData.inc | 10 +++---
 llvm/lib/CodeGenData/CodeGenData.cpp          |  5 ++-
 llvm/lib/CodeGenData/CodeGenDataReader.cpp    | 36 +++++++++----------
 3 files changed, 25 insertions(+), 26 deletions(-)

diff --git a/llvm/include/llvm/CodeGenData/CodeGenData.inc b/llvm/include/llvm/CodeGenData/CodeGenData.inc
index 5f6df5c0bf1065..08ec14ea051a0c 100644
--- a/llvm/include/llvm/CodeGenData/CodeGenData.inc
+++ b/llvm/include/llvm/CodeGenData/CodeGenData.inc
@@ -12,6 +12,10 @@
  *
 \*===----------------------------------------------------------------------===*/
 
+/* Helper macros.  */
+#define CG_DATA_SIMPLE_QUOTE(x) #x
+#define CG_DATA_QUOTE(x) CG_DATA_SIMPLE_QUOTE(x)
+
 #ifdef CG_DATA_SECT_ENTRY
 #define CG_DATA_DEFINED
 CG_DATA_SECT_ENTRY(CG_outline, CG_DATA_QUOTE(CG_DATA_OUTLINE_COMMON),
@@ -34,13 +38,9 @@ CG_DATA_SECT_ENTRY(CG_outline, CG_DATA_QUOTE(CG_DATA_OUTLINE_COMMON),
 
 #else
 /* Runtime section names and name strings.  */
-#define CG_DATA_SECT_NAME INSTR_PROF_QUOTE(CG_DATA_OUTLINE_COMMON)
+#define CG_DATA_SECT_NAME CG_DATA_QUOTE(CG_DATA_OUTLINE_COMMON)
 
 #endif
 
 /* Indexed codegen data format version (start from 1). */
 #define CG_DATA_INDEX_VERSION 1
-
-/* Helper macros.  */
-#define CG_DATA_SIMPLE_QUOTE(x) #x
-#define CG_DATA_QUOTE(x) CG_DATA_SIMPLE_QUOTE(x)
diff --git a/llvm/lib/CodeGenData/CodeGenData.cpp b/llvm/lib/CodeGenData/CodeGenData.cpp
index 3bd21c97c7de7a..49b7447440959f 100644
--- a/llvm/lib/CodeGenData/CodeGenData.cpp
+++ b/llvm/lib/CodeGenData/CodeGenData.cpp
@@ -130,8 +130,7 @@ std::once_flag CodeGenData::OnceFlag;
 
 CodeGenData &CodeGenData::getInstance() {
   std::call_once(CodeGenData::OnceFlag, []() {
-    auto *CGD = new CodeGenData();
-    Instance.reset(CGD);
+    Instance = std::unique_ptr<CodeGenData>(new CodeGenData());
 
     // TODO: Initialize writer or reader mode for the client optimization.
   });
@@ -187,7 +186,7 @@ void warn(Twine Message, std::string Whence, std::string Hint) {
 void warn(Error E, StringRef Whence) {
   if (E.isA<CGDataError>()) {
     handleAllErrors(std::move(E), [&](const CGDataError &IPE) {
-      warn(IPE.message(), std::string(Whence), std::string(""));
+      warn(IPE.message(), Whence.str(), "");
     });
   }
 }
diff --git a/llvm/lib/CodeGenData/CodeGenDataReader.cpp b/llvm/lib/CodeGenData/CodeGenDataReader.cpp
index 9ef182deda55eb..2174e2ab5783ea 100644
--- a/llvm/lib/CodeGenData/CodeGenDataReader.cpp
+++ b/llvm/lib/CodeGenData/CodeGenDataReader.cpp
@@ -47,7 +47,7 @@ Error CodeGenDataReader::mergeFromObjectFile(
     auto *EndData = Data + ContentsOrErr->size();
 
     if (*NameOrErr == CGOutLineName) {
-      // In case dealing with an executable that has concatenaed cgdata,
+      // In case dealing with an executable that has concatenated cgdata,
       // we want to merge them into a single cgdata.
       // Although it's not a typical workflow, we support this scenario.
       while (Data != EndData) {
@@ -74,10 +74,8 @@ Error IndexedCodeGenDataReader::read() {
       reinterpret_cast<const unsigned char *>(DataBuffer->getBufferStart());
   auto *End =
       reinterpret_cast<const unsigned char *>(DataBuffer->getBufferEnd());
-  auto HeaderOr = IndexedCGData::Header::readFromBuffer(Start);
-  if (!HeaderOr)
-    return HeaderOr.takeError();
-  Header = HeaderOr.get();
+  if (auto E = IndexedCGData::Header::readFromBuffer(Start).moveInto(Header))
+    return std::move(E);
 
   if (hasOutlinedHashTree()) {
     const unsigned char *Ptr = Start + Header.OutlinedHashTreeOffset;
@@ -106,9 +104,9 @@ CodeGenDataReader::create(std::unique_ptr<MemoryBuffer> Buffer) {
   std::unique_ptr<CodeGenDataReader> Reader;
   // Create the reader.
   if (IndexedCodeGenDataReader::hasFormat(*Buffer))
-    Reader.reset(new IndexedCodeGenDataReader(std::move(Buffer)));
+    Reader = std::make_unique<IndexedCodeGenDataReader>(std::move(Buffer));
   else if (TextCodeGenDataReader::hasFormat(*Buffer))
-    Reader.reset(new TextCodeGenDataReader(std::move(Buffer)));
+    Reader = std::make_unique<TextCodeGenDataReader>(std::move(Buffer));
   else
     return make_error<CGDataError>(cgdata_error::malformed);
 
@@ -132,31 +130,33 @@ bool IndexedCodeGenDataReader::hasFormat(const MemoryBuffer &DataBuffer) {
 
 bool TextCodeGenDataReader::hasFormat(const MemoryBuffer &Buffer) {
   // Verify that this really looks like plain ASCII text by checking a
-  // 'reasonable' number of characters (up to profile magic size).
-  size_t count = std::min(Buffer.getBufferSize(), sizeof(uint64_t));
-  StringRef buffer = Buffer.getBufferStart();
-  return count == 0 ||
-         std::all_of(buffer.begin(), buffer.begin() + count,
-                     [](char c) { return isPrint(c) || isSpace(c); });
+  // 'reasonable' number of characters (up to the magic size).
+  StringRef Prefix = Buffer.getBuffer().take_front(sizeof(uint64_t));
+  return llvm::all_of(Prefix, [](char c) { return isPrint(c) || isSpace(c); });
 }
 Error TextCodeGenDataReader::read() {
   using namespace support;
 
   // Parse the custom header line by line.
-  while (Line->starts_with(":")) {
+  for (; !Line.is_at_eof(); ++Line) {
+    // Skip empty or whitespace-only lines.
+    if (Line->trim().empty())
+      continue;
+
+    if (!Line->starts_with(":"))
+      break;
     StringRef Str = Line->drop_front().rtrim();
     if (Str.equals_insensitive("outlined_hash_tree"))
       DataKind |= CGDataKind::FunctionOutlinedHashTree;
     else
       return error(cgdata_error::bad_header);
-    ++Line;
   }
 
   // We treat an empty header (that is a comment # only) as a valid header.
   if (Line.is_at_eof()) {
-    if (DataKind != CGDataKind::Unknown)
-      return error(cgdata_error::bad_header);
-    return Error::success();
+    if (DataKind == CGDataKind::Unknown)
+      return Error::success();
+    return error(cgdata_error::bad_header);
   }
 
   // The YAML docs follow after the header.

>From 911820bc19a131da41d4915aec9145121ff44a30 Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at fb.com>
Date: Wed, 24 Jul 2024 09:27:06 +0900
Subject: [PATCH 07/12] Fix for build breaks

---
 llvm/lib/CodeGenData/CodeGenDataReader.cpp     | 3 ++-
 llvm/test/tools/llvm-cgdata/merge-archive.test | 2 +-
 llvm/test/tools/llvm-cgdata/merge-concat.test  | 2 +-
 llvm/test/tools/llvm-cgdata/merge-double.test  | 2 +-
 llvm/test/tools/llvm-cgdata/merge-single.test  | 2 +-
 5 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/CodeGenData/CodeGenDataReader.cpp b/llvm/lib/CodeGenData/CodeGenDataReader.cpp
index 2174e2ab5783ea..bcd61047079ffa 100644
--- a/llvm/lib/CodeGenData/CodeGenDataReader.cpp
+++ b/llvm/lib/CodeGenData/CodeGenDataReader.cpp
@@ -12,6 +12,7 @@
 
 #include "llvm/CodeGenData/CodeGenDataReader.h"
 #include "llvm/CodeGenData/OutlinedHashTreeRecord.h"
+#include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/MemoryBuffer.h"
 
 #define DEBUG_TYPE "cg-data-reader"
@@ -75,7 +76,7 @@ Error IndexedCodeGenDataReader::read() {
   auto *End =
       reinterpret_cast<const unsigned char *>(DataBuffer->getBufferEnd());
   if (auto E = IndexedCGData::Header::readFromBuffer(Start).moveInto(Header))
-    return std::move(E);
+    return E;
 
   if (hasOutlinedHashTree()) {
     const unsigned char *Ptr = Start + Header.OutlinedHashTreeOffset;
diff --git a/llvm/test/tools/llvm-cgdata/merge-archive.test b/llvm/test/tools/llvm-cgdata/merge-archive.test
index 54e6b5f589e0c6..e17422fdcd140f 100644
--- a/llvm/test/tools/llvm-cgdata/merge-archive.test
+++ b/llvm/test/tools/llvm-cgdata/merge-archive.test
@@ -1,4 +1,4 @@
-# REQUIRES: shell
+# REQUIRES: shell, aarch64-registered-target
 # UNSUPPORTED: system-windows
 
 # Merge an archive that has two object files having cgdata (__llvm_outline)
diff --git a/llvm/test/tools/llvm-cgdata/merge-concat.test b/llvm/test/tools/llvm-cgdata/merge-concat.test
index 0113bccbfedf9e..68ce20503e58de 100644
--- a/llvm/test/tools/llvm-cgdata/merge-concat.test
+++ b/llvm/test/tools/llvm-cgdata/merge-concat.test
@@ -1,4 +1,4 @@
-# REQUIRES: shell
+# REQUIRES: shell, aarch64-registered-target
 # UNSUPPORTED: system-windows
 
 # Merge a binary file (e.g., a linked executable) having concatenated cgdata (__llvm_outline)
diff --git a/llvm/test/tools/llvm-cgdata/merge-double.test b/llvm/test/tools/llvm-cgdata/merge-double.test
index 0259d57f1625af..c88e95ba68ea42 100644
--- a/llvm/test/tools/llvm-cgdata/merge-double.test
+++ b/llvm/test/tools/llvm-cgdata/merge-double.test
@@ -1,4 +1,4 @@
-# REQUIRES: shell
+# REQUIRES: shell, aarch64-registered-target
 # UNSUPPORTED: system-windows
 
 # Merge two object files having cgdata (__llvm_outline)
diff --git a/llvm/test/tools/llvm-cgdata/merge-single.test b/llvm/test/tools/llvm-cgdata/merge-single.test
index ecaafaa7d38e78..37532eff6b9c89 100644
--- a/llvm/test/tools/llvm-cgdata/merge-single.test
+++ b/llvm/test/tools/llvm-cgdata/merge-single.test
@@ -1,4 +1,4 @@
-# REQUIRES: shell
+# REQUIRES: shell, aarch64-registered-target
 # UNSUPPORTED: system-windows
 
 # Test merge a single object file into a cgdata

>From fd5669480563855a9d73b824213223703f690efe Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Wed, 31 Jul 2024 22:51:04 -0700
Subject: [PATCH 08/12] Use Opt to re-configure llvm-cgdata

- An action option is now required: -convert, -show, or -merge.
- A -format option is added to select the output format: text or binary.
---
 .../llvm/CodeGenData/CodeGenDataReader.h      |   2 +-
 .../llvm-cgdata/{dump.test => convert.test}   |   8 +-
 llvm/test/tools/llvm-cgdata/empty.test        |  14 +-
 llvm/test/tools/llvm-cgdata/error.test        |  10 +-
 .../test/tools/llvm-cgdata/merge-archive.test |  10 +-
 llvm/test/tools/llvm-cgdata/merge-concat.test |  10 +-
 llvm/test/tools/llvm-cgdata/merge-double.test |  10 +-
 llvm/test/tools/llvm-cgdata/merge-single.test |  10 +-
 llvm/test/tools/llvm-cgdata/show.test         |   6 +-
 llvm/tools/llvm-cgdata/CMakeLists.txt         |   6 +
 llvm/tools/llvm-cgdata/Opts.td                |  28 +++
 llvm/tools/llvm-cgdata/llvm-cgdata.cpp        | 232 ++++++++++++------
 12 files changed, 232 insertions(+), 114 deletions(-)
 rename llvm/test/tools/llvm-cgdata/{dump.test => convert.test} (62%)
 create mode 100644 llvm/tools/llvm-cgdata/Opts.td

diff --git a/llvm/include/llvm/CodeGenData/CodeGenDataReader.h b/llvm/include/llvm/CodeGenData/CodeGenDataReader.h
index df4ae3ed24e79a..87602adcaf0a56 100644
--- a/llvm/include/llvm/CodeGenData/CodeGenDataReader.h
+++ b/llvm/include/llvm/CodeGenData/CodeGenDataReader.h
@@ -53,7 +53,7 @@ class CodeGenDataReader {
 
   /// Extract the cgdata embedded in sections from the given object file and
   /// merge them into the GlobalOutlineRecord. This is a static helper that
-  /// is used by `llvm-cgdata merge` or ThinLTO's two-codegen rounds.
+  /// is used by `llvm-cgdata -merge` or ThinLTO's two-codegen rounds.
   static Error mergeFromObjectFile(const object::ObjectFile *Obj,
                                    OutlinedHashTreeRecord &GlobalOutlineRecord);
 
diff --git a/llvm/test/tools/llvm-cgdata/dump.test b/llvm/test/tools/llvm-cgdata/convert.test
similarity index 62%
rename from llvm/test/tools/llvm-cgdata/dump.test
rename to llvm/test/tools/llvm-cgdata/convert.test
index 20e0b654973c25..0227bc5116cac7 100644
--- a/llvm/test/tools/llvm-cgdata/dump.test
+++ b/llvm/test/tools/llvm-cgdata/convert.test
@@ -2,10 +2,10 @@
 
 RUN: split-file %s %t
 
-RUN: llvm-cgdata dump -binary %t/dump.cgtext -o %t/dump.cgdata
-RUN: llvm-cgdata dump -text %t/dump.cgdata -o %t/dump-round.cgtext
-RUN: llvm-cgdata dump -binary %t/dump-round.cgtext -o %t/dump-round.cgdata
-RUN: llvm-cgdata dump -text %t/dump-round.cgtext -o %t/dump-round-round.cgtext
+RUN: llvm-cgdata -convert -format binary %t/dump.cgtext -o %t/dump.cgdata
+RUN: llvm-cgdata -convert -format text %t/dump.cgdata -o %t/dump-round.cgtext
+RUN: llvm-cgdata -convert -format binary %t/dump-round.cgtext -o %t/dump-round.cgdata
+RUN: llvm-cgdata -convert -format text %t/dump-round.cgtext -o %t/dump-round-round.cgtext
 RUN: diff %t/dump.cgdata %t/dump-round.cgdata
 RUN: diff %t/dump-round.cgtext %t/dump-round-round.cgtext
 
diff --git a/llvm/test/tools/llvm-cgdata/empty.test b/llvm/test/tools/llvm-cgdata/empty.test
index 6e41f33ade9c39..cb0af2480244ed 100644
--- a/llvm/test/tools/llvm-cgdata/empty.test
+++ b/llvm/test/tools/llvm-cgdata/empty.test
@@ -1,25 +1,25 @@
 # Test no input file
-RUN: not llvm-cgdata dump -o - 2>&1 | FileCheck %s --check-prefix=NOFILE --ignore-case
-NOFILE: error: No such file or directory
+RUN: not llvm-cgdata -convert -o - 2>&1 | FileCheck %s --check-prefix=NOFILE --ignore-case
+NOFILE: error: No input file is specified.
 
 # Test for empty cgdata file, which is invalid.
 RUN: touch %t_emptyfile.cgtext
-RUN: not llvm-cgdata dump %t_emptyfile.cgtext -text 2>&1 | FileCheck %s --check-prefix=EMPTY
+RUN: not llvm-cgdata -convert %t_emptyfile.cgtext -format text 2>&1 | FileCheck %s --check-prefix=EMPTY
 EMPTY: {{.}}emptyfile.cgtext: empty codegen data
 
 # Test for empty header in the text format. It can be converted to a valid binary file.
 RUN: printf '#' > %t_emptyheader.cgtext
-RUN: llvm-cgdata dump %t_emptyheader.cgtext -binary -o %t_emptyheader.cgdata
+RUN: llvm-cgdata -convert %t_emptyheader.cgtext -format binary -o %t_emptyheader.cgdata
 
 # Without any cgdata other than the header, no data shows by default.
-RUN: llvm-cgdata show %t_emptyheader.cgdata | count 0
+RUN: llvm-cgdata -show %t_emptyheader.cgdata | count 0
 
 # The version number appears when asked, as it's in the header
-RUN: llvm-cgdata show --cgdata-version %t_emptyheader.cgdata | FileCheck %s --check-prefix=VERSION
+RUN: llvm-cgdata -show --cgdata-version %t_emptyheader.cgdata | FileCheck %s --check-prefix=VERSION
 VERSION: Version: 1
 
 # When converting a binary file (w/ the header only) to a text file, it's an empty file as the text format does not have an explicit header.
-RUN: llvm-cgdata dump %t_emptyheader.cgdata -text | count 0
+RUN: llvm-cgdata -convert %t_emptyheader.cgdata -format text | count 0
 
 # Synthesize a header only cgdata.
 # struct Header {
diff --git a/llvm/test/tools/llvm-cgdata/error.test b/llvm/test/tools/llvm-cgdata/error.test
index 4da22498ea3902..ef53070b1da875 100644
--- a/llvm/test/tools/llvm-cgdata/error.test
+++ b/llvm/test/tools/llvm-cgdata/error.test
@@ -8,17 +8,17 @@
 #   uint64_t OutlinedHashTreeOffset;
 # }
 RUN: touch %t_empty.cgdata
-RUN: not llvm-cgdata show %t_empty.cgdata 2>&1 | FileCheck %s --check-prefix=EMPTY
+RUN: not llvm-cgdata -show %t_empty.cgdata 2>&1 | FileCheck %s --check-prefix=EMPTY
 EMPTY: {{.}}cgdata: empty codegen data
 
 # Not a magic.
 RUN: printf '\xff' > %t_malformed.cgdata
-RUN: not llvm-cgdata show %t_malformed.cgdata 2>&1 | FileCheck %s --check-prefix=MALFORMED
+RUN: not llvm-cgdata -show %t_malformed.cgdata 2>&1 | FileCheck %s --check-prefix=MALFORMED
 MALFORMED: {{.}}cgdata: malformed codegen data
 
 # The minimum header size is 24.
 RUN: printf '\xffcgdata\x81' > %t_corrupt.cgdata
-RUN: not llvm-cgdata show %t_corrupt.cgdata 2>&1 | FileCheck %s  --check-prefix=CORRUPT
+RUN: not llvm-cgdata -show %t_corrupt.cgdata 2>&1 | FileCheck %s  --check-prefix=CORRUPT
 CORRUPT: {{.}}cgdata: invalid codegen data (file header is corrupt)
 
 # The current version 1 while the header says 2.
@@ -26,7 +26,7 @@ RUN: printf '\xffcgdata\x81' > %t_version.cgdata
 RUN: printf '\x02\x00\x00\x00' >> %t_version.cgdata
 RUN: printf '\x00\x00\x00\x00' >> %t_version.cgdata
 RUN: printf '\x18\x00\x00\x00\x00\x00\x00\x00' >> %t_version.cgdata
-RUN: not llvm-cgdata show %t_version.cgdata 2>&1 | FileCheck %s  --check-prefix=BAD_VERSION
+RUN: not llvm-cgdata -show %t_version.cgdata 2>&1 | FileCheck %s  --check-prefix=BAD_VERSION
 BAD_VERSION: {{.}}cgdata: unsupported codegen data version
 
 # Header says an outlined hash tree, but the file ends after the header.
@@ -34,5 +34,5 @@ RUN: printf '\xffcgdata\x81' > %t_eof.cgdata
 RUN: printf '\x01\x00\x00\x00' >> %t_eof.cgdata
 RUN: printf '\x01\x00\x00\x00' >> %t_eof.cgdata
 RUN: printf '\x18\x00\x00\x00\x00\x00\x00\x00' >> %t_eof.cgdata
-RUN: not llvm-cgdata show %t_eof.cgdata 2>&1 | FileCheck %s  --check-prefix=EOF
+RUN: not llvm-cgdata -show %t_eof.cgdata 2>&1 | FileCheck %s  --check-prefix=EOF
 EOF: {{.}}cgdata: end of File
diff --git a/llvm/test/tools/llvm-cgdata/merge-archive.test b/llvm/test/tools/llvm-cgdata/merge-archive.test
index e17422fdcd140f..9f778d074bd072 100644
--- a/llvm/test/tools/llvm-cgdata/merge-archive.test
+++ b/llvm/test/tools/llvm-cgdata/merge-archive.test
@@ -6,13 +6,13 @@
 RUN: split-file %s %t
 
 # Synthesize raw cgdata without the header (24 byte) from the indexed cgdata.
-RUN: llvm-cgdata dump -binary %t/raw-1.cgtext -o %t/raw-1.cgdata
+RUN: llvm-cgdata -convert -format binary %t/raw-1.cgtext -o %t/raw-1.cgdata
 RUN: od -t x1 -j 24 -An %t/raw-1.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-1-bytes.txt
 RUN: sed -ie "s/<RAW_1_BYTES>/$(cat %t/raw-1-bytes.txt)/g" %t/merge-1.ll
 RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-1.ll -o %t/merge-1.o
 
 # Synthesize raw cgdata without the header (24 byte) from the indexed cgdata.
-RUN: llvm-cgdata dump -binary %t/raw-2.cgtext -o %t/raw-2.cgdata
+RUN: llvm-cgdata -convert -format binary %t/raw-2.cgtext -o %t/raw-2.cgdata
 RUN: od -t x1 -j 24 -An %t/raw-2.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-2-bytes.txt
 RUN: sed -ie "s/<RAW_2_BYTES>/$(cat %t/raw-2-bytes.txt)/g" %t/merge-2.ll
 RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-2.ll -o %t/merge-2.o
@@ -21,14 +21,14 @@ RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-2.ll -o %t/merge-2.o
 RUN: llvm-ar rcs %t/merge-archive.a %t/merge-1.o %t/merge-2.o
 
 # Merge the archive into the codegen data file.
-RUN: llvm-cgdata merge %t/merge-archive.a -o %t/merge-archive.cgdata
-RUN: llvm-cgdata show %t/merge-archive.cgdata | FileCheck %s
+RUN: llvm-cgdata -merge %t/merge-archive.a -o %t/merge-archive.cgdata
+RUN: llvm-cgdata -show %t/merge-archive.cgdata | FileCheck %s
 CHECK: Outlined hash tree:
 CHECK-NEXT:  Total Node Count: 4
 CHECK-NEXT:  Terminal Node Count: 2
 CHECK-NEXT:  Depth: 2
 
-RUN: llvm-cgdata dump %t/merge-archive.cgdata | FileCheck %s --check-prefix=TREE
+RUN: llvm-cgdata -convert %t/merge-archive.cgdata | FileCheck %s --check-prefix=TREE
 TREE: # Outlined stable hash tree
 TREE-NEXT: :outlined_hash_tree
 TREE-NEXT: ---
diff --git a/llvm/test/tools/llvm-cgdata/merge-concat.test b/llvm/test/tools/llvm-cgdata/merge-concat.test
index 68ce20503e58de..6da8a102b01697 100644
--- a/llvm/test/tools/llvm-cgdata/merge-concat.test
+++ b/llvm/test/tools/llvm-cgdata/merge-concat.test
@@ -7,22 +7,22 @@ RUN: split-file %s %t
 
 # Synthesize two sets of raw cgdata without the header (24 byte) from the indexed cgdata.
 # Concatenate them in merge-concat.ll
-RUN: llvm-cgdata dump -binary %t/raw-1.cgtext -o %t/raw-1.cgdata
+RUN: llvm-cgdata -convert -format binary %t/raw-1.cgtext -o %t/raw-1.cgdata
 RUN: od -t x1 -j 24 -An %t/raw-1.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-1-bytes.txt
 RUN: sed -ie "s/<RAW_1_BYTES>/$(cat %t/raw-1-bytes.txt)/g" %t/merge-concat.ll
-RUN: llvm-cgdata dump -binary %t/raw-2.cgtext -o %t/raw-2.cgdata
+RUN: llvm-cgdata -convert -format binary %t/raw-2.cgtext -o %t/raw-2.cgdata
 RUN: od -t x1 -j 24 -An %t/raw-2.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-2-bytes.txt
 RUN: sed -ie "s/<RAW_2_BYTES>/$(cat %t/raw-2-bytes.txt)/g" %t/merge-concat.ll
 
 RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-concat.ll -o %t/merge-concat.o
-RUN: llvm-cgdata merge %t/merge-concat.o -o %t/merge-concat.cgdata
-RUN: llvm-cgdata show %t/merge-concat.cgdata | FileCheck %s
+RUN: llvm-cgdata -merge %t/merge-concat.o -o %t/merge-concat.cgdata
+RUN: llvm-cgdata -show %t/merge-concat.cgdata | FileCheck %s
 CHECK: Outlined hash tree:
 CHECK-NEXT:  Total Node Count: 4
 CHECK-NEXT:  Terminal Node Count: 2
 CHECK-NEXT:  Depth: 2
 
-RUN: llvm-cgdata dump %t/merge-concat.cgdata | FileCheck %s --check-prefix=TREE
+RUN: llvm-cgdata -convert %t/merge-concat.cgdata | FileCheck %s --check-prefix=TREE
 TREE: # Outlined stable hash tree
 TREE-NEXT: :outlined_hash_tree
 TREE-NEXT: ---
diff --git a/llvm/test/tools/llvm-cgdata/merge-double.test b/llvm/test/tools/llvm-cgdata/merge-double.test
index c88e95ba68ea42..e55d3cfa11020e 100644
--- a/llvm/test/tools/llvm-cgdata/merge-double.test
+++ b/llvm/test/tools/llvm-cgdata/merge-double.test
@@ -6,27 +6,27 @@
 RUN: split-file %s %t
 
 # Synthesize raw cgdata without the header (24 byte) from the indexed cgdata.
-RUN: llvm-cgdata dump -binary %t/raw-1.cgtext -o %t/raw-1.cgdata
+RUN: llvm-cgdata -convert -format binary %t/raw-1.cgtext -o %t/raw-1.cgdata
 RUN: od -t x1 -j 24 -An %t/raw-1.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-1-bytes.txt
 RUN: sed -ie "s/<RAW_1_BYTES>/$(cat %t/raw-1-bytes.txt)/g" %t/merge-1.ll
 RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-1.ll -o %t/merge-1.o
 
 # Synthesize raw cgdata without the header (24 byte) from the indexed cgdata.
-RUN: llvm-cgdata dump -binary %t/raw-2.cgtext -o %t/raw-2.cgdata
+RUN: llvm-cgdata -convert -format binary %t/raw-2.cgtext -o %t/raw-2.cgdata
 RUN: od -t x1 -j 24 -An %t/raw-2.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-2-bytes.txt
 RUN: sed -ie "s/<RAW_2_BYTES>/$(cat %t/raw-2-bytes.txt)/g" %t/merge-2.ll
 RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-2.ll -o %t/merge-2.o
 
 # Merge two object files into the codegen data file.
-RUN: llvm-cgdata merge %t/merge-1.o %t/merge-2.o -o %t/merge.cgdata
+RUN: llvm-cgdata -merge %t/merge-1.o %t/merge-2.o -o %t/merge.cgdata
 
-RUN: llvm-cgdata show %t/merge.cgdata | FileCheck %s
+RUN: llvm-cgdata -show %t/merge.cgdata | FileCheck %s
 CHECK: Outlined hash tree:
 CHECK-NEXT:  Total Node Count: 4
 CHECK-NEXT:  Terminal Node Count: 2
 CHECK-NEXT:  Depth: 2
 
-RUN: llvm-cgdata dump %t/merge.cgdata | FileCheck %s --check-prefix=TREE
+RUN: llvm-cgdata -convert %t/merge.cgdata | FileCheck %s --check-prefix=TREE
 TREE: # Outlined stable hash tree
 TREE-NEXT: :outlined_hash_tree
 TREE-NEXT: ---
diff --git a/llvm/test/tools/llvm-cgdata/merge-single.test b/llvm/test/tools/llvm-cgdata/merge-single.test
index 37532eff6b9c89..1f44fc06149581 100644
--- a/llvm/test/tools/llvm-cgdata/merge-single.test
+++ b/llvm/test/tools/llvm-cgdata/merge-single.test
@@ -7,20 +7,20 @@ RUN: split-file %s %t
 
 # Merge an object file that has no cgdata (__llvm_outline). It still produces a header only cgdata.
 RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-empty.ll -o %t/merge-empty.o
-RUN: llvm-cgdata merge %t/merge-empty.o -o %t/merge-empty.cgdata
+RUN: llvm-cgdata -merge %t/merge-empty.o -o %t/merge-empty.cgdata
 # No summary appear with the header only cgdata.
-RUN: llvm-cgdata show %t/merge-empty.cgdata | count 0
+RUN: llvm-cgdata -show %t/merge-empty.cgdata | count 0
 
 # Synthesize raw cgdata without the header (24 byte) from the indexed cgdata.
-RUN: llvm-cgdata dump -binary %t/raw-single.cgtext -o %t/raw-single.cgdata
+RUN: llvm-cgdata -convert -format binary %t/raw-single.cgtext -o %t/raw-single.cgdata
 RUN: od -t x1 -j 24 -An %t/raw-single.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-single-bytes.txt
 
 RUN: sed -ie "s/<RAW_1_BYTES>/$(cat %t/raw-single-bytes.txt)/g" %t/merge-single.ll
 RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-single.ll -o %t/merge-single.o
 
 # Merge an object file having cgdata (__llvm_outline)
-RUN: llvm-cgdata merge %t/merge-single.o -o %t/merge-single.cgdata
-RUN: llvm-cgdata show %t/merge-single.cgdata | FileCheck %s
+RUN: llvm-cgdata -merge %t/merge-single.o -o %t/merge-single.cgdata
+RUN: llvm-cgdata -show %t/merge-single.cgdata | FileCheck %s
 CHECK: Outlined hash tree:
 CHECK-NEXT:  Total Node Count: 3
 CHECK-NEXT:  Terminal Node Count: 1
diff --git a/llvm/test/tools/llvm-cgdata/show.test b/llvm/test/tools/llvm-cgdata/show.test
index accb4b77ede246..fc57229647bd30 100644
--- a/llvm/test/tools/llvm-cgdata/show.test
+++ b/llvm/test/tools/llvm-cgdata/show.test
@@ -1,7 +1,7 @@
 # Test show
 
 RUN: split-file %s %t
-RUN: llvm-cgdata show %t/show.cgtext | FileCheck %s
+RUN: llvm-cgdata -show %t/show.cgtext | FileCheck %s
 
 CHECK: Outlined hash tree:
 CHECK-NEXT:   Total Node Count: 3
@@ -9,8 +9,8 @@ CHECK-NEXT:   Terminal Node Count: 1
 CHECK-NEXT:   Depth: 2
 
 # Convert the text file to the binary file
-RUN: llvm-cgdata dump -binary %t/show.cgtext -o %t/show.cgdata
-RUN: llvm-cgdata show %t/show.cgdata | FileCheck %s
+RUN: llvm-cgdata -convert -format binary %t/show.cgtext -o %t/show.cgdata
+RUN: llvm-cgdata -show %t/show.cgdata | FileCheck %s
 
 ;--- show.cgtext
 :outlined_hash_tree
diff --git a/llvm/tools/llvm-cgdata/CMakeLists.txt b/llvm/tools/llvm-cgdata/CMakeLists.txt
index 4f1f7ff635bc3c..275fa107a8bbb9 100644
--- a/llvm/tools/llvm-cgdata/CMakeLists.txt
+++ b/llvm/tools/llvm-cgdata/CMakeLists.txt
@@ -3,13 +3,19 @@ set(LLVM_LINK_COMPONENTS
   CodeGenData
   Core
   Object
+  Option
   Support
   )
 
+set(LLVM_TARGET_DEFINITIONS Opts.td)
+tablegen(LLVM Opts.inc -gen-opt-parser-defs)
+add_public_tablegen_target(CGDataOptsTableGen)
+
 add_llvm_tool(llvm-cgdata
   llvm-cgdata.cpp
 
   DEPENDS
   intrinsics_gen
+  CGDataOptsTableGen
   GENERATE_DRIVER
   )
diff --git a/llvm/tools/llvm-cgdata/Opts.td b/llvm/tools/llvm-cgdata/Opts.td
new file mode 100644
index 00000000000000..c3b374e8e24350
--- /dev/null
+++ b/llvm/tools/llvm-cgdata/Opts.td
@@ -0,0 +1,28 @@
+include "llvm/Option/OptParser.td"
+
+class F<string letter, string help> : Flag<["-"], letter>, HelpText<help>;
+
+// General options
+def generic_group : OptionGroup<"Genric Options">, HelpText<"Generic Options">;
+def help : F<"help", "Display this help">, Group<generic_group>;
+def : Flag<["-"], "h">, Alias<help>, HelpText<"Alias for --help">, Group<generic_group>;
+def version : F<"version", "Display the version">, Group<generic_group>;
+def : Flag<["-"], "v">, Alias<version>, HelpText<"Alias for --version">, Group<generic_group>;
+
+// Action options
+def action_group : OptionGroup<"Action">, HelpText<"Action (required)">;
+def show : F<"show", "Show summary of the (indexed) codegen data file.">,
+  Group<action_group>;
+def convert : F<"convert", "Convert the (indexed) codegen data file in either text or binary format.">,
+  Group<action_group>;
+def merge : F<"merge", "Takes binary files having raw codegen data in custom sections, and merge them into an index codegen data file.">,
+  Group<action_group>;
+def cgdata_version : Flag<["-", "--"], "cgdata-version">, HelpText<"Display the cgdata version">;
+
+// Output options
+def output : Option<["-", "--"], "output", KIND_SEPARATE>,
+             HelpText<"Create output file with specified name">, MetaVarName<"<file>">;
+def o : JoinedOrSeparate<["-"], "o">, Alias<output>;
+def format : Option<["-", "--"], "format", KIND_SEPARATE>,
+             HelpText<"Specify the output format (text or binary)">, MetaVarName<"<value>">;
+def f : JoinedOrSeparate<["-"], "f">, Alias<format>;
diff --git a/llvm/tools/llvm-cgdata/llvm-cgdata.cpp b/llvm/tools/llvm-cgdata/llvm-cgdata.cpp
index 3303ffd9d863bd..598e4bbac60a30 100644
--- a/llvm/tools/llvm-cgdata/llvm-cgdata.cpp
+++ b/llvm/tools/llvm-cgdata/llvm-cgdata.cpp
@@ -17,6 +17,9 @@
 #include "llvm/CodeGenData/CodeGenDataWriter.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/Object/Archive.h"
+#include "llvm/Object/Binary.h"
+#include "llvm/Option/ArgList.h"
+#include "llvm/Option/Option.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/LLVMDriver.h"
 #include "llvm/Support/Path.h"
@@ -27,43 +30,57 @@
 using namespace llvm;
 using namespace llvm::object;
 
-// TODO: https://llvm.org/docs/CommandGuide/llvm-cgdata.html has documentations
-// on each subcommand.
-cl::SubCommand DumpSubcommand(
-    "dump",
-    "Dump the (indexed) codegen data file in either text or binary format.");
-cl::SubCommand MergeSubcommand(
-    "merge", "Takes binary files having raw codegen data in custom sections, "
-             "and merge them into an index codegen data file.");
-cl::SubCommand
-    ShowSubcommand("show", "Show summary of the (indexed) codegen data file.");
-
 enum CGDataFormat {
-  CD_None = 0,
-  CD_Text,
-  CD_Binary,
+  Invalid,
+  Text,
+  Binary,
+};
+
+enum CGDataAction {
+  Convert,
+  Merge,
+  Show,
+};
+
+// Command-line option boilerplate.
+namespace {
+enum ID {
+  OPT_INVALID = 0, // This is not an option ID.
+#define OPTION(...) LLVM_MAKE_OPT_ID(__VA_ARGS__),
+#include "Opts.inc"
+#undef OPTION
+};
+
+#define PREFIX(NAME, VALUE)                                                    \
+  static constexpr StringLiteral NAME##_init[] = VALUE;                        \
+  static constexpr ArrayRef<StringLiteral> NAME(NAME##_init,                   \
+                                                std::size(NAME##_init) - 1);
+#include "Opts.inc"
+#undef PREFIX
+
+using namespace llvm::opt;
+static constexpr opt::OptTable::Info InfoTable[] = {
+#define OPTION(...) LLVM_CONSTRUCT_OPT_INFO(__VA_ARGS__),
+#include "Opts.inc"
+#undef OPTION
+};
+
+class CGDataOptTable : public opt::GenericOptTable {
+public:
+  CGDataOptTable() : GenericOptTable(InfoTable) {}
 };
+} // end anonymous namespace
+
+// Options
+static std::string ToolName;
+static std::string OutputFilename = "-";
+static std::string Filename;
+static bool ShowCGDataVersion;
+static CGDataAction Action;
+static std::optional<CGDataFormat> OutputFormat;
+static std::vector<std::string> InputFilenames;
 
-cl::opt<std::string> OutputFilename("output", cl::value_desc("output"),
-                                    cl::init("-"), cl::desc("Output file"),
-                                    cl::sub(DumpSubcommand),
-                                    cl::sub(MergeSubcommand));
-cl::alias OutputFilenameA("o", cl::desc("Alias for --output"),
-                          cl::aliasopt(OutputFilename));
-
-cl::opt<std::string> Filename(cl::Positional, cl::desc("<cgdata-file>"),
-                              cl::sub(DumpSubcommand), cl::sub(ShowSubcommand));
-cl::list<std::string> InputFilenames(cl::Positional, cl::sub(MergeSubcommand),
-                                     cl::desc("<binary-files...>"));
-cl::opt<CGDataFormat> OutputFormat(
-    cl::desc("Format of output data"), cl::sub(DumpSubcommand),
-    cl::init(CD_Text),
-    cl::values(clEnumValN(CD_Text, "text", "Text encoding"),
-               clEnumValN(CD_Binary, "binary", "Binary encoding")));
-
-cl::opt<bool> ShowCGDataVersion("cgdata-version",
-                                cl::desc("Show cgdata version."),
-                                cl::sub(ShowSubcommand));
+// TODO: Add the command guide documentation at
+// https://llvm.org/docs/CommandGuide/llvm-cgdata.html
 
 static void exitWithError(Twine Message, std::string Whence = "",
                           std::string Hint = "") {
@@ -91,17 +108,12 @@ static void exitWithErrorCode(std::error_code EC, StringRef Whence = "") {
   exitWithError(EC.message(), std::string(Whence));
 }
 
-static int dump_main(int argc, const char *argv[]) {
-  if (Filename == OutputFilename) {
-    errs() << sys::path::filename(argv[0]) << " " << argv[1]
-           << ": Input file name cannot be the same as the output file name!\n";
-    return 1;
-  }
-
+static int convert_main(int argc, const char *argv[]) {
   std::error_code EC;
-  raw_fd_ostream OS(OutputFilename.data(), EC,
-                    OutputFormat == CD_Text ? sys::fs::OF_TextWithCRLF
-                                            : sys::fs::OF_None);
+  raw_fd_ostream OS(OutputFilename, EC,
+                    OutputFormat == CGDataFormat::Text
+                        ? sys::fs::OF_TextWithCRLF
+                        : sys::fs::OF_None);
   if (EC)
     exitWithErrorCode(EC, OutputFilename);
 
@@ -117,7 +129,7 @@ static int dump_main(int argc, const char *argv[]) {
     Writer.addRecord(Record);
   }
 
-  if (OutputFormat == CD_Text) {
+  if (OutputFormat == CGDataFormat::Text) {
     if (Error E = Writer.writeText(OS))
       exitWithError(std::move(E));
   } else {
@@ -152,7 +164,8 @@ static bool handleArchive(StringRef Filename, Archive &Arch,
 
 static bool handleBuffer(StringRef Filename, MemoryBufferRef Buffer,
                          OutlinedHashTreeRecord &GlobalOutlineRecord) {
-  Expected<std::unique_ptr<Binary>> BinOrErr = object::createBinary(Buffer);
+  Expected<std::unique_ptr<object::Binary>> BinOrErr =
+      object::createBinary(Buffer);
   if (Error E = BinOrErr.takeError())
     exitWithError(std::move(E), Filename);
 
@@ -187,33 +200,33 @@ static int merge_main(int argc, const char *argv[]) {
   for (auto &Filename : InputFilenames)
     Result &= handleFile(Filename, GlobalOutlineRecord);
 
-  if (!Result) {
-    errs() << "Error: failed to merge codegen data files.\n";
-    return 1;
-  }
+  if (!Result)
+    exitWithError("failed to merge codegen data files.");
 
   CodeGenDataWriter Writer;
   if (!GlobalOutlineRecord.empty())
     Writer.addRecord(GlobalOutlineRecord);
 
   std::error_code EC;
-  raw_fd_ostream Output(OutputFilename, EC, sys::fs::OF_None);
+  raw_fd_ostream OS(OutputFilename, EC,
+                    OutputFormat == CGDataFormat::Text
+                        ? sys::fs::OF_TextWithCRLF
+                        : sys::fs::OF_None);
   if (EC)
     exitWithErrorCode(EC, OutputFilename);
 
-  if (auto E = Writer.write(Output))
-    exitWithError(std::move(E));
+  if (OutputFormat == CGDataFormat::Text) {
+    if (Error E = Writer.writeText(OS))
+      exitWithError(std::move(E));
+  } else {
+    if (Error E = Writer.write(OS))
+      exitWithError(std::move(E));
+  }
 
   return 0;
 }
 
 static int show_main(int argc, const char *argv[]) {
-  if (Filename == OutputFilename) {
-    errs() << sys::path::filename(argv[0]) << " " << argv[1]
-           << ": Input file name cannot be the same as the output file name!\n";
-    return 1;
-  }
-
   std::error_code EC;
   raw_fd_ostream OS(OutputFilename.data(), EC, sys::fs::OF_TextWithCRLF);
   if (EC)
@@ -240,29 +253,100 @@ static int show_main(int argc, const char *argv[]) {
   return 0;
 }
 
-int llvm_cgdata_main(int argc, char **argvNonConst, const llvm::ToolContext &) {
-  const char **argv = const_cast<const char **>(argvNonConst);
+static void parseArgs(int argc, char **argv) {
+  CGDataOptTable Tbl;
+  ToolName = argv[0];
+  llvm::BumpPtrAllocator A;
+  llvm::StringSaver Saver{A};
+  llvm::opt::InputArgList Args =
+      Tbl.parseArgs(argc, argv, OPT_UNKNOWN, Saver, [&](StringRef Msg) {
+        llvm::errs() << Msg << '\n';
+        std::exit(1);
+      });
+
+  if (Args.hasArg(OPT_help)) {
+    Tbl.printHelp(
+        llvm::outs(),
+        "llvm-cgdata [options] <action> (<binary files>|<.cgdata file>)",
+        ToolName.c_str());
+    std::exit(0);
+  }
+  if (Args.hasArg(OPT_version)) {
+    cl::PrintVersionMessage();
+    std::exit(0);
+  }
 
-  StringRef ProgName(sys::path::filename(argv[0]));
+  ShowCGDataVersion = Args.hasArg(OPT_cgdata_version);
+
+  auto parseFormat = [](const StringRef FT) {
+    return StringSwitch<CGDataFormat>(FT)
+        .Case("text", CGDataFormat::Text)
+        .Case("binary", CGDataFormat::Binary)
+        .Default(CGDataFormat::Invalid);
+  };
+  if (opt::Arg *A = Args.getLastArg(OPT_format)) {
+    StringRef OF = A->getValue();
+    OutputFormat = parseFormat(OF);
+    if (OutputFormat == CGDataFormat::Invalid)
+      exitWithError("unsupported format '" + OF + "'");
+  }
 
-  if (argc < 2) {
-    errs() << ProgName
-           << ": No subcommand specified! Run llvm-cgdata --help for usage.\n";
-    return 1;
+  InputFilenames = Args.getAllArgValues(OPT_INPUT);
+  if (InputFilenames.empty())
+    exitWithError("No input file is specified.");
+  Filename = InputFilenames[0];
+
+  if (Args.hasArg(OPT_output)) {
+    OutputFilename = Args.getLastArgValue(OPT_output);
+    for (auto &Filename : InputFilenames)
+      if (Filename == OutputFilename)
+        exitWithError(
+            "Input file name cannot be the same as the output file name!\n");
   }
 
-  cl::ParseCommandLineOptions(argc, argv, "LLVM codegen data\n");
+  SmallVector<opt::Arg *, 1> ActionArgs(Args.filtered(OPT_action_group));
+  if (ActionArgs.size() != 1)
+    exitWithError("Only one action is required.");
+
+  switch (ActionArgs[0]->getOption().getID()) {
+  case OPT_show:
+    if (InputFilenames.size() != 1)
+      exitWithError("only one input file is allowed.");
+    Action = CGDataAction::Show;
+    break;
+  case OPT_convert:
+    // The default output format is text for convert.
+    if (!OutputFormat)
+      OutputFormat = CGDataFormat::Text;
+    if (InputFilenames.size() != 1)
+      exitWithError("only one input file is allowed.");
+    Action = CGDataAction::Convert;
+    break;
+  case OPT_merge:
+    // The default output format is binary for merge.
+    if (!OutputFormat)
+      OutputFormat = CGDataFormat::Binary;
+    Action = CGDataAction::Merge;
+    break;
+  default:
+    llvm_unreachable("unrecognized action");
+  }
+}
 
-  if (DumpSubcommand)
-    return dump_main(argc, argv);
+int llvm_cgdata_main(int argc, char **argvNonConst, const llvm::ToolContext &) {
+  const char **argv = const_cast<const char **>(argvNonConst);
+  parseArgs(argc, argvNonConst);
 
-  if (MergeSubcommand)
+  switch (Action) {
+  case CGDataAction::Convert:
+    return convert_main(argc, argv);
+  case CGDataAction::Merge:
     return merge_main(argc, argv);
-
-  if (ShowSubcommand)
+  case CGDataAction::Show:
     return show_main(argc, argv);
+  default:
+    llvm_unreachable("unrecognized action");
+  }
 
-  errs() << ProgName
-         << ": Unknown command. Run llvm-cgdata --help for usage.\n";
   return 1;
 }

>From 74125fa5ef3e730e53a2685ad7d930a8c685fff9 Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Thu, 1 Aug 2024 11:44:39 -0700
Subject: [PATCH 09/12] Address comments from MaskRay

---
 .../llvm/CodeGenData/CodeGenDataReader.h      |  2 +-
 llvm/test/tools/llvm-cgdata/convert.test      |  8 ++---
 llvm/test/tools/llvm-cgdata/empty.test        | 12 +++----
 llvm/test/tools/llvm-cgdata/error.test        | 10 +++---
 .../test/tools/llvm-cgdata/merge-archive.test | 10 +++---
 llvm/test/tools/llvm-cgdata/merge-concat.test | 10 +++---
 llvm/test/tools/llvm-cgdata/merge-double.test | 10 +++---
 llvm/test/tools/llvm-cgdata/merge-single.test | 10 +++---
 llvm/test/tools/llvm-cgdata/show.test         |  6 ++--
 llvm/tools/llvm-cgdata/Opts.td                | 32 +++++++++++--------
 llvm/tools/llvm-cgdata/llvm-cgdata.cpp        | 10 +++---
 11 files changed, 62 insertions(+), 58 deletions(-)

diff --git a/llvm/include/llvm/CodeGenData/CodeGenDataReader.h b/llvm/include/llvm/CodeGenData/CodeGenDataReader.h
index 87602adcaf0a56..24b7a40ce9ff78 100644
--- a/llvm/include/llvm/CodeGenData/CodeGenDataReader.h
+++ b/llvm/include/llvm/CodeGenData/CodeGenDataReader.h
@@ -53,7 +53,7 @@ class CodeGenDataReader {
 
   /// Extract the cgdata embedded in sections from the given object file and
   /// merge them into the GlobalOutlineRecord. This is a static helper that
-  /// is used by `llvm-cgdata -merge` or ThinLTO's two-codegen rounds.
+  /// is used by `llvm-cgdata --merge` or ThinLTO's two-codegen rounds.
   static Error mergeFromObjectFile(const object::ObjectFile *Obj,
                                    OutlinedHashTreeRecord &GlobalOutlineRecord);
 
diff --git a/llvm/test/tools/llvm-cgdata/convert.test b/llvm/test/tools/llvm-cgdata/convert.test
index 0227bc5116cac7..632a7366d56a05 100644
--- a/llvm/test/tools/llvm-cgdata/convert.test
+++ b/llvm/test/tools/llvm-cgdata/convert.test
@@ -2,10 +2,10 @@
 
 RUN: split-file %s %t
 
-RUN: llvm-cgdata -convert -format binary %t/dump.cgtext -o %t/dump.cgdata
-RUN: llvm-cgdata -convert -format text %t/dump.cgdata -o %t/dump-round.cgtext
-RUN: llvm-cgdata -convert -format binary %t/dump-round.cgtext -o %t/dump-round.cgdata
-RUN: llvm-cgdata -convert -format text %t/dump-round.cgtext -o %t/dump-round-round.cgtext
+RUN: llvm-cgdata --convert --format binary %t/dump.cgtext -o %t/dump.cgdata
+RUN: llvm-cgdata --convert --format text %t/dump.cgdata -o %t/dump-round.cgtext
+RUN: llvm-cgdata -c -f binary %t/dump-round.cgtext -o %t/dump-round.cgdata
+RUN: llvm-cgdata -c -f text %t/dump-round.cgtext -o %t/dump-round-round.cgtext
 RUN: diff %t/dump.cgdata %t/dump-round.cgdata
 RUN: diff %t/dump-round.cgtext %t/dump-round-round.cgtext
 
diff --git a/llvm/test/tools/llvm-cgdata/empty.test b/llvm/test/tools/llvm-cgdata/empty.test
index cb0af2480244ed..70d5ea4b800630 100644
--- a/llvm/test/tools/llvm-cgdata/empty.test
+++ b/llvm/test/tools/llvm-cgdata/empty.test
@@ -1,25 +1,25 @@
 # Test no input file
-RUN: not llvm-cgdata -convert -o - 2>&1 | FileCheck %s --check-prefix=NOFILE --ignore-case
+RUN: not llvm-cgdata --convert --output - 2>&1 | FileCheck %s --check-prefix=NOFILE --ignore-case
 NOFILE: error: No input file is specified.
 
 # Test for empty cgdata file, which is invalid.
 RUN: touch %t_emptyfile.cgtext
-RUN: not llvm-cgdata -convert %t_emptyfile.cgtext -format text 2>&1 | FileCheck %s --check-prefix=EMPTY
+RUN: not llvm-cgdata --convert %t_emptyfile.cgtext --format text 2>&1 | FileCheck %s --check-prefix=EMPTY
 EMPTY: {{.}}emptyfile.cgtext: empty codegen data
 
 # Test for empty header in the text format. It can be converted to a valid binary file.
 RUN: printf '#' > %t_emptyheader.cgtext
-RUN: llvm-cgdata -convert %t_emptyheader.cgtext -format binary -o %t_emptyheader.cgdata
+RUN: llvm-cgdata --convert %t_emptyheader.cgtext --format binary -o %t_emptyheader.cgdata
 
 # Without any cgdata other than the header, no data shows by default.
-RUN: llvm-cgdata -show %t_emptyheader.cgdata | count 0
+RUN: llvm-cgdata --show %t_emptyheader.cgdata | count 0
 
 # The version number appears when asked, as it's in the header
-RUN: llvm-cgdata -show --cgdata-version %t_emptyheader.cgdata | FileCheck %s --check-prefix=VERSION
+RUN: llvm-cgdata --show --cgdata-version %t_emptyheader.cgdata | FileCheck %s --check-prefix=VERSION
 VERSION: Version: 1
 
 # When converting a binary file (w/ the header only) to a text file, it's an empty file as the text format does not have an explicit header.
-RUN: llvm-cgdata -convert %t_emptyheader.cgdata -format text | count 0
+RUN: llvm-cgdata --convert %t_emptyheader.cgdata --format text | count 0
 
 # Synthesize a header only cgdata.
 # struct Header {
diff --git a/llvm/test/tools/llvm-cgdata/error.test b/llvm/test/tools/llvm-cgdata/error.test
index ef53070b1da875..c992174505c1ad 100644
--- a/llvm/test/tools/llvm-cgdata/error.test
+++ b/llvm/test/tools/llvm-cgdata/error.test
@@ -8,17 +8,17 @@
 #   uint64_t OutlinedHashTreeOffset;
 # }
 RUN: touch %t_empty.cgdata
-RUN: not llvm-cgdata -show %t_empty.cgdata 2>&1 | FileCheck %s --check-prefix=EMPTY
+RUN: not llvm-cgdata --show %t_empty.cgdata 2>&1 | FileCheck %s --check-prefix=EMPTY
 EMPTY: {{.}}cgdata: empty codegen data
 
 # Not a magic.
 RUN: printf '\xff' > %t_malformed.cgdata
-RUN: not llvm-cgdata -show %t_malformed.cgdata 2>&1 | FileCheck %s --check-prefix=MALFORMED
+RUN: not llvm-cgdata --show %t_malformed.cgdata 2>&1 | FileCheck %s --check-prefix=MALFORMED
 MALFORMED: {{.}}cgdata: malformed codegen data
 
 # The minimum header size is 24.
 RUN: printf '\xffcgdata\x81' > %t_corrupt.cgdata
-RUN: not llvm-cgdata -show %t_corrupt.cgdata 2>&1 | FileCheck %s  --check-prefix=CORRUPT
+RUN: not llvm-cgdata --show %t_corrupt.cgdata 2>&1 | FileCheck %s  --check-prefix=CORRUPT
 CORRUPT: {{.}}cgdata: invalid codegen data (file header is corrupt)
 
 # The current version 1 while the header says 2.
@@ -26,7 +26,7 @@ RUN: printf '\xffcgdata\x81' > %t_version.cgdata
 RUN: printf '\x02\x00\x00\x00' >> %t_version.cgdata
 RUN: printf '\x00\x00\x00\x00' >> %t_version.cgdata
 RUN: printf '\x18\x00\x00\x00\x00\x00\x00\x00' >> %t_version.cgdata
-RUN: not llvm-cgdata -show %t_version.cgdata 2>&1 | FileCheck %s  --check-prefix=BAD_VERSION
+RUN: not llvm-cgdata --show %t_version.cgdata 2>&1 | FileCheck %s  --check-prefix=BAD_VERSION
 BAD_VERSION: {{.}}cgdata: unsupported codegen data version
 
 # Header says an outlined hash tree, but the file ends after the header.
@@ -34,5 +34,5 @@ RUN: printf '\xffcgdata\x81' > %t_eof.cgdata
 RUN: printf '\x01\x00\x00\x00' >> %t_eof.cgdata
 RUN: printf '\x01\x00\x00\x00' >> %t_eof.cgdata
 RUN: printf '\x18\x00\x00\x00\x00\x00\x00\x00' >> %t_eof.cgdata
-RUN: not llvm-cgdata -show %t_eof.cgdata 2>&1 | FileCheck %s  --check-prefix=EOF
+RUN: not llvm-cgdata --show %t_eof.cgdata 2>&1 | FileCheck %s  --check-prefix=EOF
 EOF: {{.}}cgdata: end of File
diff --git a/llvm/test/tools/llvm-cgdata/merge-archive.test b/llvm/test/tools/llvm-cgdata/merge-archive.test
index 9f778d074bd072..d70ac7c3c938d8 100644
--- a/llvm/test/tools/llvm-cgdata/merge-archive.test
+++ b/llvm/test/tools/llvm-cgdata/merge-archive.test
@@ -6,13 +6,13 @@
 RUN: split-file %s %t
 
 # Synthesize raw cgdata without the header (24 byte) from the indexed cgdata.
-RUN: llvm-cgdata -convert -format binary %t/raw-1.cgtext -o %t/raw-1.cgdata
+RUN: llvm-cgdata --convert --format binary %t/raw-1.cgtext -o %t/raw-1.cgdata
 RUN: od -t x1 -j 24 -An %t/raw-1.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-1-bytes.txt
 RUN: sed -ie "s/<RAW_1_BYTES>/$(cat %t/raw-1-bytes.txt)/g" %t/merge-1.ll
 RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-1.ll -o %t/merge-1.o
 
 # Synthesize raw cgdata without the header (24 byte) from the indexed cgdata.
-RUN: llvm-cgdata -convert -format binary %t/raw-2.cgtext -o %t/raw-2.cgdata
+RUN: llvm-cgdata --convert --format binary %t/raw-2.cgtext -o %t/raw-2.cgdata
 RUN: od -t x1 -j 24 -An %t/raw-2.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-2-bytes.txt
 RUN: sed -ie "s/<RAW_2_BYTES>/$(cat %t/raw-2-bytes.txt)/g" %t/merge-2.ll
 RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-2.ll -o %t/merge-2.o
@@ -21,14 +21,14 @@ RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-2.ll -o %t/merge-2.o
 RUN: llvm-ar rcs %t/merge-archive.a %t/merge-1.o %t/merge-2.o
 
 # Merge the archive into the codegen data file.
-RUN: llvm-cgdata -merge %t/merge-archive.a -o %t/merge-archive.cgdata
-RUN: llvm-cgdata -show %t/merge-archive.cgdata | FileCheck %s
+RUN: llvm-cgdata --merge %t/merge-archive.a -o %t/merge-archive.cgdata
+RUN: llvm-cgdata --show %t/merge-archive.cgdata | FileCheck %s
 CHECK: Outlined hash tree:
 CHECK-NEXT:  Total Node Count: 4
 CHECK-NEXT:  Terminal Node Count: 2
 CHECK-NEXT:  Depth: 2
 
-RUN: llvm-cgdata -convert %t/merge-archive.cgdata | FileCheck %s --check-prefix=TREE
+RUN: llvm-cgdata --convert %t/merge-archive.cgdata | FileCheck %s --check-prefix=TREE
 TREE: # Outlined stable hash tree
 TREE-NEXT: :outlined_hash_tree
 TREE-NEXT: ---
diff --git a/llvm/test/tools/llvm-cgdata/merge-concat.test b/llvm/test/tools/llvm-cgdata/merge-concat.test
index 6da8a102b01697..cc39c673cf9a5e 100644
--- a/llvm/test/tools/llvm-cgdata/merge-concat.test
+++ b/llvm/test/tools/llvm-cgdata/merge-concat.test
@@ -7,22 +7,22 @@ RUN: split-file %s %t
 
 # Synthesize two sets of raw cgdata without the header (24 byte) from the indexed cgdata.
 # Concatenate them in merge-concat.ll
-RUN: llvm-cgdata -convert -format binary %t/raw-1.cgtext -o %t/raw-1.cgdata
+RUN: llvm-cgdata --convert --format binary %t/raw-1.cgtext -o %t/raw-1.cgdata
 RUN: od -t x1 -j 24 -An %t/raw-1.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-1-bytes.txt
 RUN: sed -ie "s/<RAW_1_BYTES>/$(cat %t/raw-1-bytes.txt)/g" %t/merge-concat.ll
-RUN: llvm-cgdata -convert -format binary %t/raw-2.cgtext -o %t/raw-2.cgdata
+RUN: llvm-cgdata --convert --format binary %t/raw-2.cgtext -o %t/raw-2.cgdata
 RUN: od -t x1 -j 24 -An %t/raw-2.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-2-bytes.txt
 RUN: sed -ie "s/<RAW_2_BYTES>/$(cat %t/raw-2-bytes.txt)/g" %t/merge-concat.ll
 
 RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-concat.ll -o %t/merge-concat.o
-RUN: llvm-cgdata -merge %t/merge-concat.o -o %t/merge-concat.cgdata
-RUN: llvm-cgdata -show %t/merge-concat.cgdata | FileCheck %s
+RUN: llvm-cgdata --merge %t/merge-concat.o -o %t/merge-concat.cgdata
+RUN: llvm-cgdata --show %t/merge-concat.cgdata | FileCheck %s
 CHECK: Outlined hash tree:
 CHECK-NEXT:  Total Node Count: 4
 CHECK-NEXT:  Terminal Node Count: 2
 CHECK-NEXT:  Depth: 2
 
-RUN: llvm-cgdata -convert %t/merge-concat.cgdata | FileCheck %s --check-prefix=TREE
+RUN: llvm-cgdata --convert %t/merge-concat.cgdata | FileCheck %s --check-prefix=TREE
 TREE: # Outlined stable hash tree
 TREE-NEXT: :outlined_hash_tree
 TREE-NEXT: ---
diff --git a/llvm/test/tools/llvm-cgdata/merge-double.test b/llvm/test/tools/llvm-cgdata/merge-double.test
index e55d3cfa11020e..950a88c66f7bb4 100644
--- a/llvm/test/tools/llvm-cgdata/merge-double.test
+++ b/llvm/test/tools/llvm-cgdata/merge-double.test
@@ -6,27 +6,27 @@
 RUN: split-file %s %t
 
 # Synthesize raw cgdata without the header (24 byte) from the indexed cgdata.
-RUN: llvm-cgdata -convert -format binary %t/raw-1.cgtext -o %t/raw-1.cgdata
+RUN: llvm-cgdata --convert --format binary %t/raw-1.cgtext -o %t/raw-1.cgdata
 RUN: od -t x1 -j 24 -An %t/raw-1.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-1-bytes.txt
 RUN: sed -ie "s/<RAW_1_BYTES>/$(cat %t/raw-1-bytes.txt)/g" %t/merge-1.ll
 RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-1.ll -o %t/merge-1.o
 
 # Synthesize raw cgdata without the header (24 byte) from the indexed cgdata.
-RUN: llvm-cgdata -convert -format binary %t/raw-2.cgtext -o %t/raw-2.cgdata
+RUN: llvm-cgdata --convert --format binary %t/raw-2.cgtext -o %t/raw-2.cgdata
 RUN: od -t x1 -j 24 -An %t/raw-2.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-2-bytes.txt
 RUN: sed -ie "s/<RAW_2_BYTES>/$(cat %t/raw-2-bytes.txt)/g" %t/merge-2.ll
 RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-2.ll -o %t/merge-2.o
 
 # Merge two object files into the codegen data file.
-RUN: llvm-cgdata -merge %t/merge-1.o %t/merge-2.o -o %t/merge.cgdata
+RUN: llvm-cgdata --merge %t/merge-1.o %t/merge-2.o -o %t/merge.cgdata
 
-RUN: llvm-cgdata -show %t/merge.cgdata | FileCheck %s
+RUN: llvm-cgdata --show %t/merge.cgdata | FileCheck %s
 CHECK: Outlined hash tree:
 CHECK-NEXT:  Total Node Count: 4
 CHECK-NEXT:  Terminal Node Count: 2
 CHECK-NEXT:  Depth: 2
 
-RUN: llvm-cgdata -convert %t/merge.cgdata | FileCheck %s --check-prefix=TREE
+RUN: llvm-cgdata --convert %t/merge.cgdata | FileCheck %s --check-prefix=TREE
 TREE: # Outlined stable hash tree
 TREE-NEXT: :outlined_hash_tree
 TREE-NEXT: ---
diff --git a/llvm/test/tools/llvm-cgdata/merge-single.test b/llvm/test/tools/llvm-cgdata/merge-single.test
index 1f44fc06149581..783c7b979f541e 100644
--- a/llvm/test/tools/llvm-cgdata/merge-single.test
+++ b/llvm/test/tools/llvm-cgdata/merge-single.test
@@ -7,20 +7,20 @@ RUN: split-file %s %t
 
 # Merge an object file that has no cgdata (__llvm_outline). It still produces a header only cgdata.
 RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-empty.ll -o %t/merge-empty.o
-RUN: llvm-cgdata -merge %t/merge-empty.o -o %t/merge-empty.cgdata
+RUN: llvm-cgdata --merge %t/merge-empty.o --output %t/merge-empty.cgdata
 # No summary appear with the header only cgdata.
-RUN: llvm-cgdata -show %t/merge-empty.cgdata | count 0
+RUN: llvm-cgdata --show %t/merge-empty.cgdata | count 0
 
 # Synthesize raw cgdata without the header (24 byte) from the indexed cgdata.
-RUN: llvm-cgdata -convert -format binary %t/raw-single.cgtext -o %t/raw-single.cgdata
+RUN: llvm-cgdata --convert --format binary %t/raw-single.cgtext -o %t/raw-single.cgdata
 RUN: od -t x1 -j 24 -An %t/raw-single.cgdata | tr -d '\n\r\t' | sed 's/[ ]*$//' | sed 's/[ ][ ]*/\\\\/g' > %t/raw-single-bytes.txt
 
 RUN: sed -ie "s/<RAW_1_BYTES>/$(cat %t/raw-single-bytes.txt)/g" %t/merge-single.ll
 RUN: llc -filetype=obj -mtriple arm64-apple-darwin %t/merge-single.ll -o %t/merge-single.o
 
 # Merge an object file having cgdata (__llvm_outline)
-RUN: llvm-cgdata -merge %t/merge-single.o -o %t/merge-single.cgdata
-RUN: llvm-cgdata -show %t/merge-single.cgdata | FileCheck %s
+RUN: llvm-cgdata -m %t/merge-single.o -o %t/merge-single.cgdata
+RUN: llvm-cgdata -s %t/merge-single.cgdata | FileCheck %s
 CHECK: Outlined hash tree:
 CHECK-NEXT:  Total Node Count: 3
 CHECK-NEXT:  Terminal Node Count: 1
diff --git a/llvm/test/tools/llvm-cgdata/show.test b/llvm/test/tools/llvm-cgdata/show.test
index fc57229647bd30..b47ad4978ef0b3 100644
--- a/llvm/test/tools/llvm-cgdata/show.test
+++ b/llvm/test/tools/llvm-cgdata/show.test
@@ -1,7 +1,7 @@
 # Test show
 
 RUN: split-file %s %t
-RUN: llvm-cgdata -show %t/show.cgtext | FileCheck %s
+RUN: llvm-cgdata --show %t/show.cgtext | FileCheck %s
 
 CHECK: Outlined hash tree:
 CHECK-NEXT:   Total Node Count: 3
@@ -9,8 +9,8 @@ CHECK-NEXT:   Terminal Node Count: 1
 CHECK-NEXT:   Depth: 2
 
 # Convert the text file to the binary file
-RUN: llvm-cgdata -convert -format binary %t/show.cgtext -o %t/show.cgdata
-RUN: llvm-cgdata -show %t/show.cgdata | FileCheck %s
+RUN: llvm-cgdata --convert --format binary %t/show.cgtext -o %t/show.cgdata
+RUN: llvm-cgdata --show %t/show.cgdata | FileCheck %s
 
 ;--- show.cgtext
 :outlined_hash_tree
diff --git a/llvm/tools/llvm-cgdata/Opts.td b/llvm/tools/llvm-cgdata/Opts.td
index c3b374e8e24350..b2cfc6a85bbd32 100644
--- a/llvm/tools/llvm-cgdata/Opts.td
+++ b/llvm/tools/llvm-cgdata/Opts.td
@@ -1,28 +1,32 @@
 include "llvm/Option/OptParser.td"
 
 class F<string letter, string help> : Flag<["-"], letter>, HelpText<help>;
+class FF<string name, string help> : Flag<["--"], name>, HelpText<help>;
 
 // General options
 def generic_group : OptionGroup<"Genric Options">, HelpText<"Generic Options">;
-def help : F<"help", "Display this help">, Group<generic_group>;
-def : Flag<["-"], "h">, Alias<help>, HelpText<"Alias for --help">, Group<generic_group>;
-def version : F<"version", "Display the version">, Group<generic_group>;
-def : Flag<["-"], "v">, Alias<version>, HelpText<"Alias for --version">, Group<generic_group>;
+def help : FF<"help", "Display this help">, Group<generic_group>;
+def : F<"h", "Alias for --help">, Alias<help>, Group<generic_group>;
+def version : FF<"version", "Display the LLVM version">, Group<generic_group>;
+def : F<"v", "Alias for --version">, Alias<version>, Group<generic_group>;
 
 // Action options
 def action_group : OptionGroup<"Action">, HelpText<"Action (required)">;
-def show : F<"show", "Show summary of the (indexed) codegen data file.">,
+def show : FF<"show", "Show summary of the (indexed) codegen data file.">,
   Group<action_group>;
-def convert : F<"convert", "Convert the (indexed) codegen data file in either text or binary format.">,
+def : F<"s", "Alias for --show">, Alias<show>, Group<action_group>;
+def convert : FF<"convert", "Convert the (indexed) codegen data file in either text or binary format.">,
   Group<action_group>;
-def merge : F<"merge", "Takes binary files having raw codegen data in custom sections, and merge them into an index codegen data file.">,
+def : F<"c", "Alias for --convert">, Alias<convert>, Group<action_group>;
+def merge : FF<"merge", "Take binary files having raw codegen data in custom sections, and merge them into an indexed codegen data file.">,
   Group<action_group>;
-def cgdata_version : Flag<["-", "--"], "cgdata-version">, HelpText<"Display the cgdata version">;
+def : F<"m", "Alias for --merge">, Alias<merge>, Group<action_group>;
 
-// Output options
-def output : Option<["-", "--"], "output", KIND_SEPARATE>,
-             HelpText<"Create output file with specified name">, MetaVarName<"<file>">;
-def o : JoinedOrSeparate<["-"], "o">, Alias<output>;
-def format : Option<["-", "--"], "format", KIND_SEPARATE>,
+// Additional options
+def cgdata_version : FF<"cgdata-version", "Display the cgdata version">;
+def output : Option<["--"], "output", KIND_SEPARATE>,
+             HelpText<"Specify the name for the output file to be created">, MetaVarName<"<file>">;
+def : JoinedOrSeparate<["-"], "o">, Alias<output>, MetaVarName<"<file>">, HelpText<"Alias for --output">;
+def format : Option<["--"], "format", KIND_SEPARATE>,
              HelpText<"Specify the output format (text or binary)">, MetaVarName<"<value>">;
-def f : JoinedOrSeparate<["-"], "f">, Alias<format>;
+def : JoinedOrSeparate<["-"], "f">, Alias<format>, HelpText<"Alias for --format">;
diff --git a/llvm/tools/llvm-cgdata/llvm-cgdata.cpp b/llvm/tools/llvm-cgdata/llvm-cgdata.cpp
index 598e4bbac60a30..1aab2177ef7e66 100644
--- a/llvm/tools/llvm-cgdata/llvm-cgdata.cpp
+++ b/llvm/tools/llvm-cgdata/llvm-cgdata.cpp
@@ -72,9 +72,9 @@ class CGDataOptTable : public opt::GenericOptTable {
 } // end anonymous namespace
 
 // Options
-static std::string ToolName;
-static std::string OutputFilename = "-";
-static std::string Filename;
+static StringRef ToolName;
+static StringRef OutputFilename = "-";
+static StringRef Filename;
 static bool ShowCGDataVersion;
 static CGDataAction Action;
 static std::optional<CGDataFormat> OutputFormat;
@@ -267,8 +267,8 @@ static void parseArgs(int argc, char **argv) {
   if (Args.hasArg(OPT_help)) {
     Tbl.printHelp(
         llvm::outs(),
-        "llvm-cgdata [options] <action> (<binary files>|<.cgdata file>)",
-        ToolName.c_str());
+        "llvm-cgdata <action> [options] (<binary files>|<.cgdata file>)",
+        ToolName.str().c_str());
     std::exit(0);
   }
   if (Args.hasArg(OPT_version)) {

>From c1b56a773bae9a97b4b3cdf1a7b525ccce151b09 Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Mon, 5 Aug 2024 17:01:35 -0700
Subject: [PATCH 10/12] Address comments from ellishg #3

---
 llvm/lib/CodeGenData/CodeGenDataWriter.cpp |  3 ++-
 llvm/tools/llvm-cgdata/llvm-cgdata.cpp     | 24 ++++++++++++----------
 2 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/CodeGenData/CodeGenDataWriter.cpp b/llvm/lib/CodeGenData/CodeGenDataWriter.cpp
index 3c91a1b3034503..4e6fc459b19b80 100644
--- a/llvm/lib/CodeGenData/CodeGenDataWriter.cpp
+++ b/llvm/lib/CodeGenData/CodeGenDataWriter.cpp
@@ -67,7 +67,8 @@ class CGDataOStream {
           uint64_t Bytes =
               endian::byte_swap<uint64_t, llvm::endianness::little>(K.D[I]);
           Data.replace(K.Pos + I * sizeof(uint64_t), sizeof(uint64_t),
-                       (const char *)&Bytes, sizeof(uint64_t));
+                       reinterpret_cast<const char *>(&Bytes),
+                       sizeof(uint64_t));
         }
       }
     }
diff --git a/llvm/tools/llvm-cgdata/llvm-cgdata.cpp b/llvm/tools/llvm-cgdata/llvm-cgdata.cpp
index 1aab2177ef7e66..9cdbce71946378 100644
--- a/llvm/tools/llvm-cgdata/llvm-cgdata.cpp
+++ b/llvm/tools/llvm-cgdata/llvm-cgdata.cpp
@@ -278,15 +278,12 @@ static void parseArgs(int argc, char **argv) {
 
   ShowCGDataVersion = Args.hasArg(OPT_cgdata_version);
 
-  auto parseFormat = [](const StringRef FT) {
-    return StringSwitch<CGDataFormat>(FT)
-        .Case("text", CGDataFormat::Text)
-        .Case("binary", CGDataFormat::Binary)
-        .Default(CGDataFormat::Invalid);
-  };
   if (opt::Arg *A = Args.getLastArg(OPT_format)) {
     StringRef OF = A->getValue();
-    OutputFormat = parseFormat(OF);
+    OutputFormat = StringSwitch<CGDataFormat>(OF)
+                       .Case("text", CGDataFormat::Text)
+                       .Case("binary", CGDataFormat::Binary)
+                       .Default(CGDataFormat::Invalid);
     if (OutputFormat == CGDataFormat::Invalid)
       exitWithError("unsupported format '" + OF + "'");
   }
@@ -304,11 +301,16 @@ static void parseArgs(int argc, char **argv) {
             "Input file name cannot be the same as the output file name!\n");
   }
 
-  SmallVector<opt::Arg *, 1> ActionArgs(Args.filtered(OPT_action_group));
-  if (ActionArgs.size() != 1)
-    exitWithError("Only one action is required.");
+  opt::Arg *ActionArg = nullptr;
+  for (opt::Arg *Arg : Args.filtered(OPT_action_group)) {
+    if (ActionArg)
+      exitWithError("Only one action is allowed.");
+    ActionArg = Arg;
+  }
+  if (!ActionArg)
+    exitWithError("One action is required.");
 
-  switch (ActionArgs[0]->getOption().getID()) {
+  switch (ActionArg->getOption().getID()) {
   case OPT_show:
     if (InputFilenames.size() != 1)
       exitWithError("only one input file is allowed.");

>From 4478278a558408c6bf1dc06c76bbe89d6fa60e2e Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Mon, 5 Aug 2024 20:26:16 -0700
Subject: [PATCH 11/12] Address comments from MaskRay #2

---
 .../llvm/CodeGenData/CodeGenDataWriter.h      | 34 ++++++-
 llvm/lib/CodeGenData/CodeGenDataWriter.cpp    | 88 ++++++-------------
 2 files changed, 58 insertions(+), 64 deletions(-)

diff --git a/llvm/include/llvm/CodeGenData/CodeGenDataWriter.h b/llvm/include/llvm/CodeGenData/CodeGenDataWriter.h
index e17ffc3482ec91..7e6c3eed5a84b0 100644
--- a/llvm/include/llvm/CodeGenData/CodeGenDataWriter.h
+++ b/llvm/include/llvm/CodeGenData/CodeGenDataWriter.h
@@ -15,11 +15,43 @@
 
 #include "llvm/CodeGenData/CodeGenData.h"
 #include "llvm/CodeGenData/OutlinedHashTreeRecord.h"
+#include "llvm/Support/EndianStream.h"
 #include "llvm/Support/Error.h"
 
 namespace llvm {
 
-class CGDataOStream;
+/// A struct to define how the data stream should be patched.
+struct CGDataPatchItem {
+  uint64_t Pos; // Where to patch.
+  uint64_t *D;  // Pointer to an array of source data.
+  int N;        // Number of elements in \c D array.
+};
+
+/// A wrapper class to abstract writer stream with support of bytes
+/// back patching.
+class CGDataOStream {
+public:
+  CGDataOStream(raw_fd_ostream &FD)
+      : IsFDOStream(true), OS(FD), LE(FD, llvm::endianness::little) {}
+  CGDataOStream(raw_string_ostream &STR)
+      : IsFDOStream(false), OS(STR), LE(STR, llvm::endianness::little) {}
+
+  uint64_t tell() { return OS.tell(); }
+  void write(uint64_t V) { LE.write<uint64_t>(V); }
+  void write32(uint32_t V) { LE.write<uint32_t>(V); }
+  void write8(uint8_t V) { LE.write<uint8_t>(V); }
+
+  // \c patch can only be called when all data is written and flushed.
+  // For raw_string_ostream, the patch is done on the target string
+  // directly and it won't be reflected in the stream's internal buffer.
+  void patch(ArrayRef<CGDataPatchItem> P);
+
+  // If \c OS is an instance of \c raw_fd_ostream, this field will be
+  // true. Otherwise, \c OS will be a raw_string_ostream.
+  bool IsFDOStream;
+  raw_ostream &OS;
+  support::endian::Writer LE;
+};
 
 class CodeGenDataWriter {
   /// The outlined hash tree to be written.
diff --git a/llvm/lib/CodeGenData/CodeGenDataWriter.cpp b/llvm/lib/CodeGenData/CodeGenDataWriter.cpp
index 4e6fc459b19b80..a60963b8b203b6 100644
--- a/llvm/lib/CodeGenData/CodeGenDataWriter.cpp
+++ b/llvm/lib/CodeGenData/CodeGenDataWriter.cpp
@@ -11,77 +11,39 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGenData/CodeGenDataWriter.h"
-#include "llvm/Support/Endian.h"
-#include "llvm/Support/EndianStream.h"
 
 #define DEBUG_TYPE "cg-data-writer"
 
 using namespace llvm;
 
-namespace llvm {
-
-/// A struct to define how the data stream should be patched.
-struct CGDataPatchItem {
-  uint64_t Pos; // Where to patch.
-  uint64_t *D;  // Pointer to an array of source data.
-  int N;        // Number of elements in \c D array.
-};
-
-// A wrapper class to abstract writer stream with support of bytes
-// back patching.
-class CGDataOStream {
-public:
-  CGDataOStream(raw_fd_ostream &FD)
-      : IsFDOStream(true), OS(FD), LE(FD, llvm::endianness::little) {}
-  CGDataOStream(raw_string_ostream &STR)
-      : IsFDOStream(false), OS(STR), LE(STR, llvm::endianness::little) {}
-
-  uint64_t tell() { return OS.tell(); }
-  void write(uint64_t V) { LE.write<uint64_t>(V); }
-  void write32(uint32_t V) { LE.write<uint32_t>(V); }
-  void write8(uint8_t V) { LE.write<uint8_t>(V); }
-
-  // \c patch can only be called when all data is written and flushed.
-  // For raw_string_ostream, the patch is done on the target string
-  // directly and it won't be reflected in the stream's internal buffer.
-  void patch(ArrayRef<CGDataPatchItem> P) {
-    using namespace support;
-
-    if (IsFDOStream) {
-      raw_fd_ostream &FDOStream = static_cast<raw_fd_ostream &>(OS);
-      const uint64_t LastPos = FDOStream.tell();
-      for (const auto &K : P) {
-        FDOStream.seek(K.Pos);
-        for (int I = 0; I < K.N; I++)
-          write(K.D[I]);
-      }
-      // Reset the stream to the last position after patching so that users
-      // don't accidentally overwrite data. This makes it consistent with
-      // the string stream below which replaces the data directly.
-      FDOStream.seek(LastPos);
-    } else {
-      raw_string_ostream &SOStream = static_cast<raw_string_ostream &>(OS);
-      std::string &Data = SOStream.str(); // with flush
-      for (const auto &K : P) {
-        for (int I = 0; I < K.N; I++) {
-          uint64_t Bytes =
-              endian::byte_swap<uint64_t, llvm::endianness::little>(K.D[I]);
-          Data.replace(K.Pos + I * sizeof(uint64_t), sizeof(uint64_t),
-                       reinterpret_cast<const char *>(&Bytes),
-                       sizeof(uint64_t));
-        }
+void CGDataOStream::patch(ArrayRef<CGDataPatchItem> P) {
+  using namespace support;
+
+  if (IsFDOStream) {
+    raw_fd_ostream &FDOStream = static_cast<raw_fd_ostream &>(OS);
+    const uint64_t LastPos = FDOStream.tell();
+    for (const auto &K : P) {
+      FDOStream.seek(K.Pos);
+      for (int I = 0; I < K.N; I++)
+        write(K.D[I]);
+    }
+    // Reset the stream to the last position after patching so that users
+    // don't accidentally overwrite data. This makes it consistent with
+    // the string stream below which replaces the data directly.
+    FDOStream.seek(LastPos);
+  } else {
+    raw_string_ostream &SOStream = static_cast<raw_string_ostream &>(OS);
+    std::string &Data = SOStream.str(); // with flush
+    for (const auto &K : P) {
+      for (int I = 0; I < K.N; I++) {
+        uint64_t Bytes =
+            endian::byte_swap<uint64_t, llvm::endianness::little>(K.D[I]);
+        Data.replace(K.Pos + I * sizeof(uint64_t), sizeof(uint64_t),
+                     reinterpret_cast<const char *>(&Bytes), sizeof(uint64_t));
       }
     }
   }
-
-  // If \c OS is an instance of \c raw_fd_ostream, this field will be
-  // true. Otherwise, \c OS will be an raw_string_ostream.
-  bool IsFDOStream;
-  raw_ostream &OS;
-  support::endian::Writer LE;
-};
-
-} // end namespace llvm
+}
 
 void CodeGenDataWriter::addRecord(OutlinedHashTreeRecord &Record) {
   assert(Record.HashTree && "empty hash tree in the record");

>From 36f01e2d729eeecabcb0f0bf2f4a25033d87934b Mon Sep 17 00:00:00 2001
From: Kyungwoo Lee <kyulee at meta.com>
Date: Fri, 16 Aug 2024 10:24:30 -0700
Subject: [PATCH 12/12] Address comments from MaskRay #3

 - Rename the library from `CodeGenData` to `CGData`
---
 .../include/llvm/{CodeGenData => CGData}/CodeGenData.h | 10 +++++-----
 .../llvm/{CodeGenData => CGData}/CodeGenData.inc       |  0
 .../llvm/{CodeGenData => CGData}/CodeGenDataReader.h   | 10 +++++-----
 .../llvm/{CodeGenData => CGData}/CodeGenDataWriter.h   | 10 +++++-----
 .../llvm/{CodeGenData => CGData}/OutlinedHashTree.h    |  4 ++--
 .../{CodeGenData => CGData}/OutlinedHashTreeRecord.h   |  8 ++++----
 llvm/lib/{CodeGenData => CGData}/CMakeLists.txt        |  4 ++--
 llvm/lib/{CodeGenData => CGData}/CodeGenData.cpp       | 10 +++++-----
 llvm/lib/{CodeGenData => CGData}/CodeGenDataReader.cpp |  4 ++--
 llvm/lib/{CodeGenData => CGData}/CodeGenDataWriter.cpp |  2 +-
 llvm/lib/{CodeGenData => CGData}/OutlinedHashTree.cpp  |  2 +-
 .../{CodeGenData => CGData}/OutlinedHashTreeRecord.cpp |  2 +-
 llvm/lib/CMakeLists.txt                                |  2 +-
 llvm/tools/llvm-cgdata/CMakeLists.txt                  |  2 +-
 llvm/tools/llvm-cgdata/llvm-cgdata.cpp                 |  4 ++--
 llvm/unittests/{CodeGenData => CGData}/CMakeLists.txt  |  2 +-
 .../OutlinedHashTreeRecordTest.cpp                     |  2 +-
 .../{CodeGenData => CGData}/OutlinedHashTreeTest.cpp   |  2 +-
 llvm/unittests/CMakeLists.txt                          |  2 +-
 19 files changed, 41 insertions(+), 41 deletions(-)
 rename llvm/include/llvm/{CodeGenData => CGData}/CodeGenData.h (96%)
 rename llvm/include/llvm/{CodeGenData => CGData}/CodeGenData.inc (100%)
 rename llvm/include/llvm/{CodeGenData => CGData}/CodeGenDataReader.h (96%)
 rename llvm/include/llvm/{CodeGenData => CGData}/CodeGenDataWriter.h (93%)
 rename llvm/include/llvm/{CodeGenData => CGData}/OutlinedHashTree.h (97%)
 rename llvm/include/llvm/{CodeGenData => CGData}/OutlinedHashTreeRecord.h (92%)
 rename llvm/lib/{CodeGenData => CGData}/CMakeLists.txt (71%)
 rename llvm/lib/{CodeGenData => CGData}/CodeGenData.cpp (96%)
 rename llvm/lib/{CodeGenData => CGData}/CodeGenDataReader.cpp (98%)
 rename llvm/lib/{CodeGenData => CGData}/CodeGenDataWriter.cpp (98%)
 rename llvm/lib/{CodeGenData => CGData}/OutlinedHashTree.cpp (98%)
 rename llvm/lib/{CodeGenData => CGData}/OutlinedHashTreeRecord.cpp (99%)
 rename llvm/unittests/{CodeGenData => CGData}/CMakeLists.txt (94%)
 rename llvm/unittests/{CodeGenData => CGData}/OutlinedHashTreeRecordTest.cpp (98%)
 rename llvm/unittests/{CodeGenData => CGData}/OutlinedHashTreeTest.cpp (98%)

diff --git a/llvm/include/llvm/CodeGenData/CodeGenData.h b/llvm/include/llvm/CGData/CodeGenData.h
similarity index 96%
rename from llvm/include/llvm/CodeGenData/CodeGenData.h
rename to llvm/include/llvm/CGData/CodeGenData.h
index 659008c78abd93..84133a433170fe 100644
--- a/llvm/include/llvm/CodeGenData/CodeGenData.h
+++ b/llvm/include/llvm/CGData/CodeGenData.h
@@ -11,13 +11,13 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CODEGENDATA_CODEGENDATA_H
-#define LLVM_CODEGENDATA_CODEGENDATA_H
+#ifndef LLVM_CGDATA_CODEGENDATA_H
+#define LLVM_CGDATA_CODEGENDATA_H
 
 #include "llvm/ADT/BitmaskEnum.h"
 #include "llvm/Bitcode/BitcodeReader.h"
-#include "llvm/CodeGenData/OutlinedHashTree.h"
-#include "llvm/CodeGenData/OutlinedHashTreeRecord.h"
+#include "llvm/CGData/OutlinedHashTree.h"
+#include "llvm/CGData/OutlinedHashTreeRecord.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -28,7 +28,7 @@ namespace llvm {
 
 enum CGDataSectKind {
 #define CG_DATA_SECT_ENTRY(Kind, SectNameCommon, SectNameCoff, Prefix) Kind,
-#include "llvm/CodeGenData/CodeGenData.inc"
+#include "llvm/CGData/CodeGenData.inc"
 };
 
 std::string getCodeGenDataSectionName(CGDataSectKind CGSK,
diff --git a/llvm/include/llvm/CodeGenData/CodeGenData.inc b/llvm/include/llvm/CGData/CodeGenData.inc
similarity index 100%
rename from llvm/include/llvm/CodeGenData/CodeGenData.inc
rename to llvm/include/llvm/CGData/CodeGenData.inc
diff --git a/llvm/include/llvm/CodeGenData/CodeGenDataReader.h b/llvm/include/llvm/CGData/CodeGenDataReader.h
similarity index 96%
rename from llvm/include/llvm/CodeGenData/CodeGenDataReader.h
rename to llvm/include/llvm/CGData/CodeGenDataReader.h
index 24b7a40ce9ff78..1ee4bfbe480233 100644
--- a/llvm/include/llvm/CodeGenData/CodeGenDataReader.h
+++ b/llvm/include/llvm/CGData/CodeGenDataReader.h
@@ -10,11 +10,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CODEGENDATA_CODEGENDATAREADER_H
-#define LLVM_CODEGENDATA_CODEGENDATAREADER_H
+#ifndef LLVM_CGDATA_CODEGENDATAREADER_H
+#define LLVM_CGDATA_CODEGENDATAREADER_H
 
-#include "llvm/CodeGenData/CodeGenData.h"
-#include "llvm/CodeGenData/OutlinedHashTreeRecord.h"
+#include "llvm/CGData/CodeGenData.h"
+#include "llvm/CGData/OutlinedHashTreeRecord.h"
 #include "llvm/Support/LineIterator.h"
 #include "llvm/Support/VirtualFileSystem.h"
 
@@ -151,4 +151,4 @@ class TextCodeGenDataReader : public CodeGenDataReader {
 
 } // end namespace llvm
 
-#endif // LLVM_CODEGENDATA_CODEGENDATAREADER_H
+#endif // LLVM_CGDATA_CODEGENDATAREADER_H
diff --git a/llvm/include/llvm/CodeGenData/CodeGenDataWriter.h b/llvm/include/llvm/CGData/CodeGenDataWriter.h
similarity index 93%
rename from llvm/include/llvm/CodeGenData/CodeGenDataWriter.h
rename to llvm/include/llvm/CGData/CodeGenDataWriter.h
index 7e6c3eed5a84b0..5cb8377b1d07e5 100644
--- a/llvm/include/llvm/CodeGenData/CodeGenDataWriter.h
+++ b/llvm/include/llvm/CGData/CodeGenDataWriter.h
@@ -10,11 +10,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CODEGENDATA_CODEGENDATAWRITER_H
-#define LLVM_CODEGENDATA_CODEGENDATAWRITER_H
+#ifndef LLVM_CGDATA_CODEGENDATAWRITER_H
+#define LLVM_CGDATA_CODEGENDATAWRITER_H
 
-#include "llvm/CodeGenData/CodeGenData.h"
-#include "llvm/CodeGenData/OutlinedHashTreeRecord.h"
+#include "llvm/CGData/CodeGenData.h"
+#include "llvm/CGData/OutlinedHashTreeRecord.h"
 #include "llvm/Support/EndianStream.h"
 #include "llvm/Support/Error.h"
 
@@ -97,4 +97,4 @@ class CodeGenDataWriter {
 
 } // end namespace llvm
 
-#endif // LLVM_CODEGENDATA_CODEGENDATAWRITER_H
+#endif // LLVM_CGDATA_CODEGENDATAWRITER_H
diff --git a/llvm/include/llvm/CodeGenData/OutlinedHashTree.h b/llvm/include/llvm/CGData/OutlinedHashTree.h
similarity index 97%
rename from llvm/include/llvm/CodeGenData/OutlinedHashTree.h
rename to llvm/include/llvm/CGData/OutlinedHashTree.h
index 2c8a9288f8a8c7..9ab36df863eef0 100644
--- a/llvm/include/llvm/CodeGenData/OutlinedHashTree.h
+++ b/llvm/include/llvm/CGData/OutlinedHashTree.h
@@ -12,8 +12,8 @@
 //
 //===---------------------------------------------------------------------===//
 
-#ifndef LLVM_CODEGENDATA_OUTLINEDHASHTREE_H
-#define LLVM_CODEGENDATA_OUTLINEDHASHTREE_H
+#ifndef LLVM_CGDATA_OUTLINEDHASHTREE_H
+#define LLVM_CGDATA_OUTLINEDHASHTREE_H
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/StableHashing.h"
diff --git a/llvm/include/llvm/CodeGenData/OutlinedHashTreeRecord.h b/llvm/include/llvm/CGData/OutlinedHashTreeRecord.h
similarity index 92%
rename from llvm/include/llvm/CodeGenData/OutlinedHashTreeRecord.h
rename to llvm/include/llvm/CGData/OutlinedHashTreeRecord.h
index de397c9ca5e70d..dd599ff6a7a624 100644
--- a/llvm/include/llvm/CodeGenData/OutlinedHashTreeRecord.h
+++ b/llvm/include/llvm/CGData/OutlinedHashTreeRecord.h
@@ -13,10 +13,10 @@
 //
 //===---------------------------------------------------------------------===//
 
-#ifndef LLVM_CODEGENDATA_OUTLINEDHASHTREERECORD_H
-#define LLVM_CODEGENDATA_OUTLINEDHASHTREERECORD_H
+#ifndef LLVM_CGDATA_OUTLINEDHASHTREERECORD_H
+#define LLVM_CGDATA_OUTLINEDHASHTREERECORD_H
 
-#include "llvm/CodeGenData/OutlinedHashTree.h"
+#include "llvm/CGData/OutlinedHashTree.h"
 
 namespace llvm {
 
@@ -72,4 +72,4 @@ struct OutlinedHashTreeRecord {
 
 } // end namespace llvm
 
-#endif // LLVM_CODEGENDATA_OUTLINEDHASHTREERECORD_H
+#endif // LLVM_CGDATA_OUTLINEDHASHTREERECORD_H
diff --git a/llvm/lib/CodeGenData/CMakeLists.txt b/llvm/lib/CGData/CMakeLists.txt
similarity index 71%
rename from llvm/lib/CodeGenData/CMakeLists.txt
rename to llvm/lib/CGData/CMakeLists.txt
index 0a231d6214fea1..ff1aab920e7a8c 100644
--- a/llvm/lib/CodeGenData/CMakeLists.txt
+++ b/llvm/lib/CGData/CMakeLists.txt
@@ -1,4 +1,4 @@
-add_llvm_component_library(LLVMCodeGenData
+add_llvm_component_library(LLVMCGData
   CodeGenData.cpp
   CodeGenDataReader.cpp
   CodeGenDataWriter.cpp
@@ -6,7 +6,7 @@ add_llvm_component_library(LLVMCodeGenData
   OutlinedHashTreeRecord.cpp
 
   ADDITIONAL_HEADER_DIRS
-  ${LLVM_MAIN_INCLUDE_DIR}/llvm/CodeGenData
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/CGData
 
   DEPENDS
   intrinsics_gen
diff --git a/llvm/lib/CodeGenData/CodeGenData.cpp b/llvm/lib/CGData/CodeGenData.cpp
similarity index 96%
rename from llvm/lib/CodeGenData/CodeGenData.cpp
rename to llvm/lib/CGData/CodeGenData.cpp
index 49b7447440959f..9dd4b1674e094a 100644
--- a/llvm/lib/CodeGenData/CodeGenData.cpp
+++ b/llvm/lib/CGData/CodeGenData.cpp
@@ -12,8 +12,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Bitcode/BitcodeWriter.h"
-#include "llvm/CodeGenData/CodeGenDataReader.h"
-#include "llvm/CodeGenData/OutlinedHashTreeRecord.h"
+#include "llvm/CGData/CodeGenDataReader.h"
+#include "llvm/CGData/OutlinedHashTreeRecord.h"
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FileSystem.h"
@@ -91,18 +91,18 @@ namespace {
 const char *CodeGenDataSectNameCommon[] = {
 #define CG_DATA_SECT_ENTRY(Kind, SectNameCommon, SectNameCoff, Prefix)         \
   SectNameCommon,
-#include "llvm/CodeGenData/CodeGenData.inc"
+#include "llvm/CGData/CodeGenData.inc"
 };
 
 const char *CodeGenDataSectNameCoff[] = {
 #define CG_DATA_SECT_ENTRY(Kind, SectNameCommon, SectNameCoff, Prefix)         \
   SectNameCoff,
-#include "llvm/CodeGenData/CodeGenData.inc"
+#include "llvm/CGData/CodeGenData.inc"
 };
 
 const char *CodeGenDataSectNamePrefix[] = {
 #define CG_DATA_SECT_ENTRY(Kind, SectNameCommon, SectNameCoff, Prefix) Prefix,
-#include "llvm/CodeGenData/CodeGenData.inc"
+#include "llvm/CGData/CodeGenData.inc"
 };
 
 } // namespace
diff --git a/llvm/lib/CodeGenData/CodeGenDataReader.cpp b/llvm/lib/CGData/CodeGenDataReader.cpp
similarity index 98%
rename from llvm/lib/CodeGenData/CodeGenDataReader.cpp
rename to llvm/lib/CGData/CodeGenDataReader.cpp
index bcd61047079ffa..f7f3a8f42af7e1 100644
--- a/llvm/lib/CodeGenData/CodeGenDataReader.cpp
+++ b/llvm/lib/CGData/CodeGenDataReader.cpp
@@ -10,8 +10,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/CodeGenData/CodeGenDataReader.h"
-#include "llvm/CodeGenData/OutlinedHashTreeRecord.h"
+#include "llvm/CGData/CodeGenDataReader.h"
+#include "llvm/CGData/OutlinedHashTreeRecord.h"
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/MemoryBuffer.h"
 
diff --git a/llvm/lib/CodeGenData/CodeGenDataWriter.cpp b/llvm/lib/CGData/CodeGenDataWriter.cpp
similarity index 98%
rename from llvm/lib/CodeGenData/CodeGenDataWriter.cpp
rename to llvm/lib/CGData/CodeGenDataWriter.cpp
index a60963b8b203b6..5f638be0fefe74 100644
--- a/llvm/lib/CodeGenData/CodeGenDataWriter.cpp
+++ b/llvm/lib/CGData/CodeGenDataWriter.cpp
@@ -10,7 +10,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/CodeGenData/CodeGenDataWriter.h"
+#include "llvm/CGData/CodeGenDataWriter.h"
 
 #define DEBUG_TYPE "cg-data-writer"
 
diff --git a/llvm/lib/CodeGenData/OutlinedHashTree.cpp b/llvm/lib/CGData/OutlinedHashTree.cpp
similarity index 98%
rename from llvm/lib/CodeGenData/OutlinedHashTree.cpp
rename to llvm/lib/CGData/OutlinedHashTree.cpp
index d64098098de62b..7bf8168e5afa17 100644
--- a/llvm/lib/CodeGenData/OutlinedHashTree.cpp
+++ b/llvm/lib/CGData/OutlinedHashTree.cpp
@@ -12,7 +12,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/CodeGenData/OutlinedHashTree.h"
+#include "llvm/CGData/OutlinedHashTree.h"
 
 #define DEBUG_TYPE "outlined-hash-tree"
 
diff --git a/llvm/lib/CodeGenData/OutlinedHashTreeRecord.cpp b/llvm/lib/CGData/OutlinedHashTreeRecord.cpp
similarity index 99%
rename from llvm/lib/CodeGenData/OutlinedHashTreeRecord.cpp
rename to llvm/lib/CGData/OutlinedHashTreeRecord.cpp
index d3c67904083888..d1d57fe3fc9f4c 100644
--- a/llvm/lib/CodeGenData/OutlinedHashTreeRecord.cpp
+++ b/llvm/lib/CGData/OutlinedHashTreeRecord.cpp
@@ -13,7 +13,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/CodeGenData/OutlinedHashTreeRecord.h"
+#include "llvm/CGData/OutlinedHashTreeRecord.h"
 #include "llvm/ObjectYAML/YAML.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/EndianStream.h"
diff --git a/llvm/lib/CMakeLists.txt b/llvm/lib/CMakeLists.txt
index 638c3bd6f90f53..503c77cb13bd07 100644
--- a/llvm/lib/CMakeLists.txt
+++ b/llvm/lib/CMakeLists.txt
@@ -9,8 +9,8 @@ add_subdirectory(FileCheck)
 add_subdirectory(InterfaceStub)
 add_subdirectory(IRPrinter)
 add_subdirectory(IRReader)
+add_subdirectory(CGData)
 add_subdirectory(CodeGen)
-add_subdirectory(CodeGenData)
 add_subdirectory(CodeGenTypes)
 add_subdirectory(BinaryFormat)
 add_subdirectory(Bitcode)
diff --git a/llvm/tools/llvm-cgdata/CMakeLists.txt b/llvm/tools/llvm-cgdata/CMakeLists.txt
index 275fa107a8bbb9..556bc388306a3c 100644
--- a/llvm/tools/llvm-cgdata/CMakeLists.txt
+++ b/llvm/tools/llvm-cgdata/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(LLVM_LINK_COMPONENTS
+  CGData
   CodeGen
-  CodeGenData
   Core
   Object
   Option
diff --git a/llvm/tools/llvm-cgdata/llvm-cgdata.cpp b/llvm/tools/llvm-cgdata/llvm-cgdata.cpp
index 9cdbce71946378..3104242070f34e 100644
--- a/llvm/tools/llvm-cgdata/llvm-cgdata.cpp
+++ b/llvm/tools/llvm-cgdata/llvm-cgdata.cpp
@@ -13,8 +13,8 @@
 //
 //===----------------------------------------------------------------------===//
 #include "llvm/ADT/StringRef.h"
-#include "llvm/CodeGenData/CodeGenDataReader.h"
-#include "llvm/CodeGenData/CodeGenDataWriter.h"
+#include "llvm/CGData/CodeGenDataReader.h"
+#include "llvm/CGData/CodeGenDataWriter.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/Object/Archive.h"
 #include "llvm/Object/Binary.h"
diff --git a/llvm/unittests/CodeGenData/CMakeLists.txt b/llvm/unittests/CGData/CMakeLists.txt
similarity index 94%
rename from llvm/unittests/CodeGenData/CMakeLists.txt
rename to llvm/unittests/CGData/CMakeLists.txt
index 3d821b87e29d8c..9cedab56d3f6bc 100644
--- a/llvm/unittests/CodeGenData/CMakeLists.txt
+++ b/llvm/unittests/CGData/CMakeLists.txt
@@ -1,7 +1,7 @@
 set(LLVM_LINK_COMPONENTS
   ${LLVM_TARGETS_TO_BUILD}
+  CGData
   CodeGen
-  CodeGenData
   Core
   Support
   )
diff --git a/llvm/unittests/CodeGenData/OutlinedHashTreeRecordTest.cpp b/llvm/unittests/CGData/OutlinedHashTreeRecordTest.cpp
similarity index 98%
rename from llvm/unittests/CodeGenData/OutlinedHashTreeRecordTest.cpp
rename to llvm/unittests/CGData/OutlinedHashTreeRecordTest.cpp
index aa7ad4a33754ff..a614a48dd7a439 100644
--- a/llvm/unittests/CodeGenData/OutlinedHashTreeRecordTest.cpp
+++ b/llvm/unittests/CGData/OutlinedHashTreeRecordTest.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/CodeGenData/OutlinedHashTreeRecord.h"
+#include "llvm/CGData/OutlinedHashTreeRecord.h"
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
diff --git a/llvm/unittests/CodeGenData/OutlinedHashTreeTest.cpp b/llvm/unittests/CGData/OutlinedHashTreeTest.cpp
similarity index 98%
rename from llvm/unittests/CodeGenData/OutlinedHashTreeTest.cpp
rename to llvm/unittests/CGData/OutlinedHashTreeTest.cpp
index 637ab3cd08c1ce..2d1ec8b05ab2a9 100644
--- a/llvm/unittests/CodeGenData/OutlinedHashTreeTest.cpp
+++ b/llvm/unittests/CGData/OutlinedHashTreeTest.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/CodeGenData/OutlinedHashTree.h"
+#include "llvm/CGData/OutlinedHashTree.h"
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
diff --git a/llvm/unittests/CMakeLists.txt b/llvm/unittests/CMakeLists.txt
index 49ed6c8fb6c42f..911ede701982f6 100644
--- a/llvm/unittests/CMakeLists.txt
+++ b/llvm/unittests/CMakeLists.txt
@@ -20,8 +20,8 @@ add_subdirectory(AsmParser)
 add_subdirectory(BinaryFormat)
 add_subdirectory(Bitcode)
 add_subdirectory(Bitstream)
+add_subdirectory(CGData)
 add_subdirectory(CodeGen)
-add_subdirectory(CodeGenData)
 add_subdirectory(DebugInfo)
 add_subdirectory(Debuginfod)
 add_subdirectory(Demangle)